yanaiela committed on
Commit
7026285
1 Parent(s): 3c7a8fe
Files changed (2) hide show
  1. app.py +117 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from functools import lru_cache

import gradio as gr
import plotly.graph_objects as go

from wimbd.es import es_init, count_documents_containing_phrases


# Two Elasticsearch clients: one for the general WIMBD indices and one for
# the Dolma indices. Credentials come from environment variables; the first
# argument (None) presumably disables a config-file path — confirm against
# the wimbd.es.es_init signature.
es = es_init(None, os.getenv("lm_datasets_cloud_id"), os.getenv("lm_datasets_api_key"))
es_dolma = es_init(None, os.getenv("dolma_cloud_id"), os.getenv("dolma_api_key"))

# NOTE(review): `datasets` is not referenced anywhere else in this file and
# only partially agrees with `dataset_es_map` — looks like a leftover; confirm
# before removing.
datasets = ["OpenWebText", "C4", "OSCAR", "The Pile", "LAION-2B-en", "Dolma"]
# Display name (checkbox label) -> Elasticsearch index name or index pattern.
dataset_es_map = {
    "OSCAR": "re_oscar",
    "LAION-2B-en": "re_laion2b-en-*",
    "LAION-5B": "*laion2b*",
    "OpenWebText": "openwebtext",
    "The Pile": "re_pile",
    "C4": "c4",
    "Dolma v1.5": "docs_v1.5_2023-11-02",
    "Dolma v1.7": "docs_v1.7_2024-06-04",
    "Tulu v2": "tulu-v2-sft-mixture",
}
default_checked = ["C4", "The Pile", "Dolma v1.7"]  # Datasets to be checked by default
26
+
27
+
28
@lru_cache()
def get_counts(index_name, phrase, es):
    """Memoized count of documents in `index_name` containing `phrase`.

    Caching keys on (index_name, phrase, es), so repeated lookups of the
    same string against the same client skip the Elasticsearch round-trip.
    """
    count = count_documents_containing_phrases(index_name, phrase, es=es)
    return count
31
+
32
+
33
def process_input(phrases, *dataset_choices):
    """Count documents containing each input phrase in the selected datasets.

    Args:
        phrases: newline-separated search strings from the textbox.
        *dataset_choices: one bool per entry of `dataset_es_map` (in the
            same order), taken from the dataset checkboxes.

    Returns:
        (table_data, fig): rows of [dataset, phrase, count-as-string] for the
        Dataframe output, and a grouped plotly bar chart of the same counts.
    """
    results = []
    for (dataset_name, index_name), is_selected in zip(
        dataset_es_map.items(), dataset_choices
    ):
        if not is_selected:
            continue
        for phrase in phrases.split("\n"):
            phrase = phrase.strip()
            if not phrase:
                continue
            # Dolma indices live on a separate Elasticsearch deployment.
            client = es_dolma if "dolma" in dataset_name.lower() else es
            count = get_counts(index_name, phrase, es=client)
            results.append((dataset_name, phrase, count))

    # Format results for the Dataframe output component.
    table_data = [[dataset, phrase, str(count)] for dataset, phrase, count in results]

    # One bar trace per phrase, grouped by dataset. Group in a single pass and
    # in first-seen order: the original `set(...)` made the trace order
    # nondeterministic across runs (str hash randomization) and re-scanned
    # `results` twice per phrase.
    by_phrase = {}
    for dataset_name, phrase, count in results:
        names, counts = by_phrase.setdefault(phrase, ([], []))
        names.append(dataset_name)
        counts.append(count)

    fig = go.Figure()
    for phrase, (dataset_names, counts) in by_phrase.items():
        fig.add_trace(go.Bar(x=dataset_names, y=counts, name=phrase))

    fig.update_layout(
        title="Document Counts by Dataset and Phrase",
        xaxis_title="Dataset",
        yaxis_title="Count",
        barmode="group",
    )

    return table_data, fig
67
+
68
+
69
# BibTeX blurb rendered at the bottom of the interface (via `article=`).
# Runtime-displayed text — keep byte-identical.
citation_text = """If you find this tool useful, please kindly cite our paper:
```bibtex
@inproceedings{elazar2023s,
title={What's In My Big Data?},
author={Elazar, Yanai and Bhagia, Akshita and Magnusson, Ian Helgi and Ravichander, Abhilasha and Schwenk, Dustin and Suhr, Alane and Walsh, Evan Pete and Groeneveld, Dirk and Soldaini, Luca and Singh, Sameer and Hajishirzi, Hanna and Smith, Noah A. and Dodge, Jesse},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024}
}```"""
77
+
78
+
79
def custom_layout(input_components, output_components, citation):
    """Order the UI top-to-bottom: phrase textbox, dataset checkboxes,
    counts table, results plot, then the citation block."""
    ordered = list(input_components)      # textbox followed by the checkboxes
    ordered.append(output_components[0])  # counts Dataframe
    ordered.append(output_components[1])  # results Plot
    ordered.append(citation)              # citation Markdown
    return ordered
88
+
89
+
90
# Build the Gradio UI: one multi-line textbox plus one checkbox per entry of
# `dataset_es_map` (defaults from `default_checked`); outputs are the counts
# table and the grouped bar chart returned by `process_input`.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter phrases (one per line)", lines=5),
        *[
            gr.Checkbox(label=dataset, value=(dataset in default_checked))
            for dataset in dataset_es_map.keys()
        ],
    ],
    outputs=[
        gr.Dataframe(headers=["Dataset", "Phrase", "Count"], label="Counts Table"),
        # gr.Markdown(label="Results as Text"),
        gr.Plot(label="Results Chart"),
        # gr.Markdown(value=citation_text)
    ],
    title="What's In My Big Data? String Counts Demo",
    description="""This app connects to the WIMBD Elasticsearch instance and counts the number of documents containing a given string in the various indexed datasets.\\
The app uses the wimbd pypi package, which can be installed by simply running `pip install wimbd`.\\
Access to the indices require an API key, due to the sensitive nature of the data, but can be accessed by filling up the following [form](https://forms.gle/Mk9uwJibR9H4hh9Y9).\\
This app was created by [Yanai Elazar](https://yanaiela.github.io/), and for bugs, improvements, or feature requests, please open an issue on the [GitHub repository](https://github.com/allenai/wimbd), or send me an email.

The returned counts are the number of documents that contain each string per dataset.""",
    article=citation_text,  # This adds the citation at the bottom
    # NOTE(review): gr.Interface's `theme` parameter normally expects a theme
    # name or Theme object, not a layout callable — confirm `custom_layout`
    # actually takes effect here rather than being ignored by gradio.
    theme=custom_layout,  # This uses our custom layout function
)


# Launched unconditionally at import time (no __main__ guard) — the usual
# pattern for a Hugging Face Space entry point.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ wimbd
2
+ gradio
3
+ plotly