"""Gradio demo of word-embedding arithmetic over GloVe vectors.

The user types an expression such as ``king - man + woman``. Each word is
looked up in the ``karmiq/glove`` dataset, the vectors are added/subtracted
as written, and the vocabulary words whose embeddings are most similar
(cosine similarity) to the resulting vector are shown.
"""

import os
from dataclasses import dataclass
from operator import add, sub

import gradio as gr
import numpy as np
from datasets import load_dataset
from pyparsing import Char, ParseException, Word, alphas
from sklearn.metrics.pairwise import cosine_similarity

# Grammar: one alphabetic word followed by zero or more "<op> <word>" pairs.
term = Word(alphas)
# Only "+" and "-" are operators. Whitespace between tokens is skipped by
# pyparsing itself, so it must not be part of the operator character set.
operator = Char("+-")
expression = term + (operator + term)[...]

# Maps an operator token to the function applied to the two vectors.
operations = {"+": add, "-": sub}

# Number of nearest words returned to the UI.
TOP_K = 10

# Loaded once at start-up; every request reuses these arrays.
dataset = load_dataset("karmiq/glove", split="train")
df = dataset.to_pandas()
all_words = df["word"].to_numpy()
all_vectors = np.array(df["embeddings"].to_list())

# word -> row index in `all_vectors`. `setdefault` keeps the first occurrence
# of a word, so a lookup is O(1) instead of a scan over the whole frame.
_word_index = {}
for _position, _known_word in enumerate(all_words):
    _word_index.setdefault(_known_word, _position)


def parse_expression(input):
    """Parse an arithmetic expression into a flat token sequence.

    Args:
        input: Expression text, e.g. ``"king - man + woman"``.

    Returns:
        The pyparsing result, alternating words and operators:
        ``[word, op, word, op, word, ...]``.

    Raises:
        gr.Error: If the text does not match the grammar in full.
    """
    try:
        # parseAll=True rejects trailing text the grammar cannot consume
        # (e.g. "king - man + 123") instead of silently ignoring it.
        return expression.parseString(input, parseAll=True)
    except ParseException as e:
        raise gr.Error(f"Parsing error: {e.msg} at position [{e.loc}].") from e


def word_to_vectors(word):
    """Return the embedding vector stored for ``word``.

    Raises:
        gr.Error: If ``word`` is not present in the dataset.
    """
    try:
        return all_vectors[_word_index[word]]
    except KeyError:
        raise gr.Error("Word not found in the dictionary.") from None


def evaluate_expression(input):
    """Evaluate parsed tokens left to right into a single vector.

    Args:
        input: Token sequence as produced by ``parse_expression``.

    Returns:
        The vector obtained by applying each operator, in order, to the
        running result and the next word's embedding.

    Raises:
        gr.Error: If any word is not present in the dataset.
    """
    # Words sit at even positions, operators at odd positions.
    words = input[::2]
    ops = input[1::2]

    result = word_to_vectors(words[0])
    for op, word in zip(ops, words[1:]):
        result = operations[op](result, word_to_vectors(word))
    return result


def expression_to_vectors(input):
    """Parse ``input`` and return the vector it evaluates to."""
    return evaluate_expression(parse_expression(input))


def get_results(expression):
    """Return the words closest to the vector described by ``expression``.

    Args:
        expression: Expression text typed by the user. It is lower-cased
            before parsing.

    Returns:
        A dict mapping up to ``TOP_K`` words to their cosine similarity with
        the evaluated vector, in descending order of similarity. Words that
        appear in the expression itself are left out.

    Raises:
        gr.Error: If the expression is empty, cannot be parsed, or contains
            a word that is not present in the dataset.
    """
    if not expression or not expression.strip():
        raise gr.Error("Please provide an expression.")

    tokens = parse_expression(expression.lower())
    target = evaluate_expression(tokens)

    # Exclude the input words using the parsed tokens rather than splitting
    # on whitespace, so "king-man+woman" excludes the same words as
    # "king - man + woman".
    excluded = set(tokens[::2])

    similarity_scores = cosine_similarity([target], all_vectors)[0]
    top_indices = np.argsort(similarity_scores)[::-1]

    results = {}
    for index in top_indices:
        word = all_words[index]
        if word in excluded:
            continue
        results[word] = float(similarity_scores[index])
        # Stop early instead of materialising an entry for every word.
        if len(results) == TOP_K:
            break
    return results


examples = [
    "king - man + woman",
    "mother - woman + man",
    "berlin - germany + france",
    "saxophone - jazz + classical",
]

# Computed at start-up so the page shows results before the first click.
initial_output = get_results(examples[0])

css = """
button.gallery-item {
    color: var(--body-text-color) !important;
}
.output-class {
    color: var(--color-red-700) !important;
}
.confidence-set .label .text {
    font-weight: var(--weight-medium);
}
.confidence-set:hover .label {
    color: var(--color-red-700) !important;
}
"""

with gr.Blocks(
    css=css,
    theme=gr.themes.Monochrome(radius_size=gr.themes.sizes.radius_sm),
) as app:
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(value=examples[0], label="Expression")
            with gr.Row():
                btn = gr.Button("Run", variant="primary")
            with gr.Row():
                gr.Markdown(
                    "Demonstration of computing cosine similarity of embeddings "
                    "from the [GloVe](https://nlp.stanford.edu/projects/glove/) dataset."
                )
            with gr.Row():
                gr.Examples(examples, inputs=input)

        with gr.Column():
            output = gr.Label(label="Closest words", value=initial_output)

    btn.click(fn=get_results, inputs=input, outputs=output)

app.launch()