Spaces:
Runtime error
Runtime error
code cleaned up
Browse files- utils/helper_functions.py +130 -18
utils/helper_functions.py
CHANGED
|
@@ -19,7 +19,18 @@ openai.api_key = os.environ["OPENAI_API_KEY"]
|
|
| 19 |
|
| 20 |
|
| 21 |
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
|
| 22 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Concatenate the list of dataframes
|
| 24 |
combined_dataframe = pd.concat(
|
| 25 |
dataframes, ignore_index=True
|
|
@@ -64,21 +75,50 @@ def call_chatgpt(prompt: str) -> str:
|
|
| 64 |
|
| 65 |
|
| 66 |
def openai_text_embedding(prompt: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
|
| 68 |
"data"
|
| 69 |
-
][0][
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
# Compute sentence embeddings
|
| 74 |
embedding1 = openai_text_embedding(sentence1) # Flatten the embedding array
|
| 75 |
embedding2 = openai_text_embedding(sentence2) # Flatten the embedding array
|
| 76 |
|
| 77 |
-
# Convert to
|
| 78 |
embedding1 = np.asarray(embedding1)
|
| 79 |
embedding2 = np.asarray(embedding2)
|
| 80 |
|
| 81 |
# Calculate cosine similarity between the embeddings
|
|
|
|
| 82 |
similarity_score = 1 - cosine(embedding1, embedding2)
|
| 83 |
|
| 84 |
return similarity_score
|
|
@@ -88,11 +128,29 @@ def add_dist_score_column(
|
|
| 88 |
dataframe: pd.DataFrame,
|
| 89 |
sentence: str,
|
| 90 |
) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
dataframe["stsopenai"] = dataframe["questions"].apply(
|
| 92 |
lambda x: calculate_sts_openai_score(str(x), sentence)
|
| 93 |
)
|
| 94 |
|
|
|
|
| 95 |
sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
|
|
|
|
|
|
|
| 96 |
return sorted_dataframe.iloc[:5, :]
|
| 97 |
|
| 98 |
|
|
@@ -181,21 +239,75 @@ def llama2_7b_ysa(prompt: str) -> str:
|
|
| 181 |
|
| 182 |
|
| 183 |
def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
|
| 184 |
-
"""
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
|
| 193 |
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
|
| 194 |
-
"""
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
|
| 22 |
+
"""
|
| 23 |
+
Merges a list of pandas DataFrames into a single DataFrame.
|
| 24 |
+
|
| 25 |
+
This function concatenates the given DataFrames and filters the resulting DataFrame to only include the columns 'context', 'questions', and 'answers'.
|
| 26 |
+
|
| 27 |
+
Parameters:
|
| 28 |
+
dataframes (List[pd.DataFrame]): A list of DataFrames to be merged.
|
| 29 |
+
|
| 30 |
+
Returns:
|
| 31 |
+
pd.DataFrame: The concatenated DataFrame containing only the specified columns.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
# Concatenate the list of dataframes
|
| 35 |
combined_dataframe = pd.concat(
|
| 36 |
dataframes, ignore_index=True
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
def openai_text_embedding(prompt: str) -> List[float]:
    """
    Retrieves the text embedding for a given prompt using OpenAI's text-embedding model.

    This function calls the OpenAI API with the "text-embedding-ada-002" model and
    returns the embedding vector for the input text.

    Parameters:
        prompt (str): The text input for which to generate an embedding.

    Returns:
        List[float]: The embedding vector for the input text.
    """
    # Call OpenAI API to create a text embedding
    response = openai.Embedding.create(input=prompt, model="text-embedding-ada-002")

    # The response carries a single entry in "data"; its "embedding" field is
    # the vector of floats (the previous `-> str` annotation was incorrect).
    return response["data"][0]["embedding"]
| 96 |
|
| 97 |
|
| 98 |
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Calculates the Semantic Textual Similarity (STS) between two sentences using
    OpenAI's text-embedding model.

    Each sentence is embedded via `openai_text_embedding`, and the cosine
    similarity between the two embedding vectors is returned. A higher score
    indicates greater semantic similarity.

    Parameters:
        sentence1 (str): The first sentence for similarity comparison.
        sentence2 (str): The second sentence for similarity comparison.

    Returns:
        float: The STS score representing the similarity between the sentences.
    """
    # Embed both sentences with the OpenAI API and coerce to NumPy arrays
    vec1 = np.asarray(openai_text_embedding(sentence1))
    vec2 = np.asarray(openai_text_embedding(sentence2))

    # scipy's `cosine` returns a distance, so similarity = 1 - distance
    return 1 - cosine(vec1, vec2)
|
|
|
|
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
    top_n: int = 5,
) -> pd.DataFrame:
    """
    Adds an 'stsopenai' column with STS (Semantic Textual Similarity) scores,
    computed between `sentence` and each entry of the DataFrame's 'questions'
    column, then returns the `top_n` highest-scoring rows.

    NOTE: the 'stsopenai' column is added to `dataframe` in place, so the
    caller's DataFrame is mutated.

    Parameters:
        dataframe (pd.DataFrame): A pandas DataFrame containing a 'questions' column.
        sentence (str): The sentence against which to compute STS scores.
        top_n (int): Number of top-scoring rows to return (default 5, matching
            the previous hard-coded behavior).

    Returns:
        pd.DataFrame: The rows with the `top_n` highest 'stsopenai' scores,
        sorted by that column in descending order.
    """
    # Calculate the STS score between `sentence` and each row's question
    dataframe["stsopenai"] = dataframe["questions"].apply(
        lambda x: calculate_sts_openai_score(str(x), sentence)
    )

    # Sort by the newly added 'stsopenai' column, best matches first
    sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)

    # Return only the `top_n` best matches
    return sorted_dataframe.iloc[:top_n, :]
|
| 155 |
|
| 156 |
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
|
| 242 |
+
"""
|
| 243 |
+
Converts an array to a 4-bit representation by normalizing and scaling its values.
|
| 244 |
+
|
| 245 |
+
The function first checks if the input is an instance of numpy ndarray,
|
| 246 |
+
if not, it converts the input into a numpy ndarray. Then, it normalizes
|
| 247 |
+
the values of the array to be between 0 and 1. Finally, it scales these
|
| 248 |
+
normalized values to the range of 0-15, corresponding to 4-bit integers,
|
| 249 |
+
and returns this array of integers.
|
| 250 |
+
|
| 251 |
+
Parameters:
|
| 252 |
+
arr (Union[np.ndarray, Any]): An array or any type that can be converted to a numpy ndarray.
|
| 253 |
+
|
| 254 |
+
Returns:
|
| 255 |
+
np.ndarray: A numpy ndarray containing the input data quantized to 4-bit representation.
|
| 256 |
+
|
| 257 |
+
Examples:
|
| 258 |
+
>>> quantize_to_4bit([0, 128, 255])
|
| 259 |
+
array([ 0, 7, 15])
|
| 260 |
+
"""
|
| 261 |
+
if not isinstance(arr, np.ndarray): # Check if the input is a numpy array
|
| 262 |
+
arr = np.array(arr) # Convert to numpy array if not already
|
| 263 |
+
|
| 264 |
+
arr_min = arr.min() # Find minimum value in the array
|
| 265 |
+
arr_max = arr.max() # Find maximum value in the array
|
| 266 |
+
|
| 267 |
+
# Normalize array values to a [0, 1] range
|
| 268 |
+
normalized_arr = (arr - arr_min) / (arr_max - arr_min)
|
| 269 |
+
|
| 270 |
+
# Scale normalized values to a 0-15 range (4-bit) and convert to integer
|
| 271 |
+
return np.round(normalized_arr * 15).astype(int)
|
| 272 |
|
| 273 |
|
| 274 |
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
    """
    Calculates a weighted measure of influence between two arrays based on their
    quantized (4-bit) versions.

    Both inputs are quantized with `quantize_to_4bit`. For every unique level in
    the first array's quantized form, the matching subset of the second array's
    quantized form is compared against its global mean; each squared deviation
    is weighted by the squared size of the subset. The mean of these weighted
    terms, normalized by the standard deviation of the second quantized array,
    is returned.

    Parameters:
        arr1 (np.ndarray): The first input numpy array.
        arr2 (np.ndarray): The second input numpy array.

    Returns:
        float: The normalized weighted influence value.

    Note:
        Relies on the module-level `quantize_to_4bit` helper for the 4-bit
        conversion.
    """
    # Quantize both arrays to their 4-bit representations
    q1 = quantize_to_4bit(arr1)
    q2 = quantize_to_4bit(arr2)

    # Global mean of the second array's quantized values
    grand_mean = np.mean(q2)

    # For each distinct 4-bit level of q1, weight the squared deviation of the
    # corresponding q2 subset's mean from the global mean by the subset size squared
    weighted_terms = []
    for level in np.unique(q1):
        group = q2[q1 == level]
        weighted_terms.append((np.mean(group - grand_mean) ** 2) * len(group) ** 2)

    # Normalize the average weighted term by the spread of the quantized q2
    return np.mean(weighted_terms) / np.std(q2)
|