from collections import Counter

import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px

# TODO: clean up existing filtering code


def _contains_any(values, wanted):
    """Return True when ``values`` contains at least one entry of ``wanted``.

    ``values`` is one cell of an array-valued column; a ``None`` cell never
    matches.
    """
    if values is None:
        return False
    return any(item in values for item in wanted)


def _count_with_owner(column):
    """Count ids in an array-valued column, and count them per id prefix.

    Returns a ``(per_id, per_owner)`` pair of ``Counter`` objects, where the
    owner is the part of the id before the first ``/``. ``None`` cells are
    skipped.
    """
    items = [item for cell in column if cell is not None for item in cell]
    return Counter(items), Counter(item.split('/')[0] for item in items)


def _unique_values(column):
    """Return the sorted unique strings of an array-valued column as a list.

    Scalar cells (e.g. ``None``) are replaced by ``["None"]`` so that the
    concatenation does not break.
    """
    filled = column.apply(lambda x: np.array(["None"]) if np.ndim(x) == 0 else x)
    return np.unique(np.concatenate(filled.values)).tolist()


def _space_label(row):
    """Build the display label ``"<emoji> <url>"`` for one (url, emoji) row.

    Plain string concatenation is used on purpose: the previous nested
    same-quote f-string only parses on Python >= 3.12.
    """
    url, emoji = row.iloc[0], row.iloc[1]
    if url is not None and "/" in url:
        return str(emoji) + " " + url
    # NOTE(review): presumably a custom_domains value whose first element is
    # a string -- confirm against the parquet schema.
    return str(emoji) + " " + url[0]


def filtered_df(emoji, likes, author, hardware, tags, models, datasets, space_licenses):
    """Filter the module-level ``df`` by the values of the search widgets.

    A filter whose value is falsy (``None``, empty list, ``0``) is skipped.

    Args:
        emoji: emojis to keep (matched against the ``emoji`` column).
        likes: minimum value of the ``likes`` column.
        author: authors to keep.
        hardware: hardware values to keep.
        tags: keep rows whose ``sdk_tags`` contain any of these.
        models: keep rows whose ``models`` contain any of these.
        datasets: keep rows whose ``datasets`` contain any of these.
        space_licenses: keep rows whose ``licenses`` contain any of these.

    Returns:
        A DataFrame with the columns URL, Likes, Models, Datasets, Licenses.
    """
    _df = df
    if emoji:
        _df = _df[_df["emoji"].isin(emoji)]
    if likes:
        _df = _df[_df["likes"] >= likes]
    if author:
        _df = _df[_df["author"].isin(author)]
    if hardware:
        _df = _df[_df["hardware"].isin(hardware)]

    # Array-valued columns: keep a row when its array holds any selected value.
    array_filters = (
        ("sdk_tags", tags),
        ("models", models),
        ("datasets", datasets),
        ("licenses", space_licenses),
    )
    for column, wanted in array_filters:
        if wanted:
            mask = _df[column].apply(lambda values: _contains_any(values, wanted))
            # astype(bool) keeps this a boolean mask even when _df is empty.
            _df = _df[mask.astype(bool)]

    # Rename the columns to make them more readable.
    _df = _df.rename(
        columns={
            'url': 'URL',
            'likes': 'Likes',
            "r_models": "Models",
            "r_datasets": "Datasets",
            "r_licenses": "Licenses",
        }
    )
    return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]


# Read the parquet file once; both tabs derive their own frame from it.
raw_df = pd.read_parquet("spaces.parquet")

with gr.Blocks(fill_width=True) as demo:
    with gr.Tab(label="Spaces Overview"):
        # Plot the growth of spaces (row entries) over time: the x-axis is
        # the creation date, the y-axis the cumulative number of spaces
        # created up to that date.
        df = raw_df.sort_values("created_at")
        df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
        fig1 = px.line(
            df,
            x='created_at',
            y='cumulative_spaces',
            title='Growth of Spaces Over Time',
            labels={'created_at': 'Date', 'cumulative_spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig1)

        # Pie chart showing the distribution of spaces by SDK.
        fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
        gr.Plot(fig2)

        # Pie chart showing the distribution of spaces over the 10 most used
        # emojis. The column names are set explicitly because the names
        # produced by value_counts().reset_index() differ between pandas 1.x
        # and 2.x.
        emoji_counts = (
            df['emoji']
            .value_counts()
            .head(10)
            .rename_axis('emoji')
            .reset_index(name='count')
        )
        fig3 = px.pie(
            emoji_counts,
            names='emoji',
            values='count',
            title='Distribution of Spaces by Emoji',
            template='plotly_dark',
        )
        gr.Plot(fig3)

        # Table of the top 20 authors and the number of spaces they created.
        author_counts = df['author'].value_counts().head(20).reset_index()
        author_counts.columns = ['Author', 'Number of Spaces']
        gr.DataFrame(author_counts)

        # Scatter plot: number of spaces created by an author vs. the total
        # number of likes those spaces received.
        author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
        fig4 = px.scatter(
            author_likes,
            x='id',
            y='likes',
            title='Relationship between Number of Spaces Created and Number of Likes',
            labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'},
            hover_data={'author': True},
            template='plotly_dark',
        )
        gr.Plot(fig4)

        # Same scatter plot per emoji, restricted to the 20 emojis with the
        # most likes.
        emoji_likes = (
            df.groupby('emoji')
            .agg({'likes': 'sum', 'id': 'count'})
            .sort_values(by='likes', ascending=False)
            .head(20)
            .reset_index()
        )
        fig10 = px.scatter(
            emoji_likes,
            x='id',
            y='likes',
            title='Relationship between Number of Spaces Created and Number of Likes',
            labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'},
            hover_data={'emoji': True},
            template='plotly_dark',
        )
        gr.Plot(fig10)

        # Bar chart of hardware in use (log-scaled y-axis).
        hardware_counts = df['hardware'].value_counts().reset_index()
        hardware_counts.columns = ['Hardware', 'Number of Spaces']
        fig5 = px.bar(
            hardware_counts,
            x='Hardware',
            y='Number of Spaces',
            title='Hardware in Use',
            labels={'Hardware': 'Hardware', 'Number of Spaces': 'Number of Spaces (log scale)'},
            color='Hardware',
            template='plotly_dark',
        )
        fig5.update_layout(yaxis_type='log')
        gr.Plot(fig5)

        # Most referenced models and the owners of those models.
        model_counter, model_author_counter = _count_with_owner(df['models'])
        model_author_count = pd.DataFrame(
            list(model_author_counter.items()),
            columns=['Model Author', 'Number of Spaces'],
        )
        fig8 = px.bar(
            model_author_count.sort_values('Number of Spaces', ascending=False).head(20),
            x='Model Author',
            y='Number of Spaces',
            title='Most Popular Model Authors',
            labels={'Model Author': 'Model Author', 'Number of Spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig8)

        model_count = pd.DataFrame(
            list(model_counter.items()),
            columns=['Model', 'Number of Spaces'],
        )
        fig6 = px.bar(
            model_count.sort_values('Number of Spaces', ascending=False).head(20),
            x='Model',
            y='Number of Spaces',
            title='Most Used Models',
            labels={'Model': 'Model', 'Number of Spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig6)

        # Most referenced datasets and the owners of those datasets.
        dataset_counter, dataset_author_counter = _count_with_owner(df['datasets'])
        dataset_count = pd.DataFrame(
            list(dataset_counter.items()),
            columns=['Datasets', 'Number of Spaces'],
        )
        dataset_author_count = pd.DataFrame(
            list(dataset_author_counter.items()),
            columns=['Dataset Author', 'Number of Spaces'],
        )
        fig9 = px.bar(
            dataset_author_count.sort_values('Number of Spaces', ascending=False).head(20),
            x='Dataset Author',
            y='Number of Spaces',
            title='Most Popular Dataset Authors',
            labels={'Dataset Author': 'Dataset Author', 'Number of Spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig9)

        fig7 = px.bar(
            dataset_count.sort_values('Number of Spaces', ascending=False).head(20),
            x='Datasets',
            y='Number of Spaces',
            title='Most Used Datasets',
            labels={'Datasets': 'Datasets', 'Number of Spaces': 'Number of Spaces'},
            template='plotly_dark',
        )
        gr.Plot(fig7)

        # The most duplicated spaces.
        duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
        duplicated_spaces.columns = ['Space', 'Number of Duplicates']
        gr.DataFrame(duplicated_spaces)

        # The most liked spaces.
        liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
        liked_spaces.columns = ['Space', 'Number of Likes']
        gr.DataFrame(liked_spaces)

        # The spaces with the longest READMEs.
        readme_sizes = df[['id', 'readme_size']].sort_values(by='readme_size', ascending=False).head(20)
        readme_sizes.columns = ['Space', 'Longest READMEs']
        gr.DataFrame(readme_sizes)

    with gr.Tab(label="Spaces Search"):
        # .copy() so the column assignments below write to an independent
        # frame instead of a filtered slice of raw_df.
        df = raw_df[raw_df["stage"] == "RUNNING"].copy()

        # Combine the sdk and tags columns; sdk is a string and tags is an
        # array of strings, so first wrap sdk in a one-element array.
        df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
        df["licenses"] = df["license"].apply(
            lambda x: np.array([str(x)]) if x is None else x
        )
        # Then concatenate sdk and tags so that their elements are together.
        df["sdk_tags"] = df[["sdk", "tags"]].apply(
            lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
        )

        df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])

        # Where custom_domains is not null use it as the url, otherwise use
        # the id column.
        df["url"] = np.where(
            df["custom_domains"].isnull(),
            df["id"],
            df["custom_domains"],
        )
        df["url"] = df[["url", "emoji"]].apply(_space_label, axis=1)

        # Human-readable, comma-separated versions of the array columns.
        df["r_models"] = [', '.join(models) if models is not None else '' for models in df["models"]]
        df["r_sdk_tags"] = [', '.join(sdk_tags) if sdk_tags is not None else '' for sdk_tags in df["sdk_tags"]]
        df["r_datasets"] = [', '.join(datasets) if datasets is not None else '' for datasets in df["datasets"]]
        df["r_licenses"] = [', '.join(licenses) if licenses is not None else '' for licenses in df["licenses"]]

        emoji = gr.Dropdown(
            df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
        )
        likes = gr.Slider(
            minimum=df["likes"].min(),
            maximum=df["likes"].max(),
            step=1,
            label="Filter by Likes",
        )
        author = gr.Dropdown(
            df["author"].unique().tolist(), label="Search by Author", multiselect=True
        )

        # Dropdown over the unique strings of the sdk_tags column.
        sdk_tag_choices = np.unique(np.concatenate(df["sdk_tags"].values))
        sdk_tags = gr.Dropdown(
            sdk_tag_choices.tolist(), label="Filter by SDK/Tags", multiselect=True
        )

        # Hardware filter. A "Search by Hardware" dropdown used to be created
        # before this and was immediately shadowed by this checkbox group,
        # leaving an unwired widget on the page; only this one is kept.
        hardware = gr.CheckboxGroup(
            df["hardware"].unique().tolist(), label="Filter by Hardware"
        )

        licenses = np.unique(np.concatenate(df["licenses"].values))
        space_license = gr.CheckboxGroup(licenses.tolist(), label="Filter by license")

        models = gr.Dropdown(
            _unique_values(df["models"]),
            label="Search by Model",
            multiselect=True,
        )
        datasets = gr.Dropdown(
            _unique_values(df["datasets"]),
            label="Search by Dataset",
            multiselect=True,
        )

        devMode = gr.Checkbox(value=False, label="DevMode Enabled")
        clear = gr.ClearButton(components=[
            emoji,
            author,
            hardware,
            sdk_tags,
            models,
            datasets,
            space_license,
        ])

        # Keep only the columns filtered_df needs; filtered_df reads this
        # module-level ``df`` each time it is called.
        df = pd.DataFrame(
            df[
                [
                    "id",
                    "emoji",
                    "author",
                    "url",
                    "likes",
                    "hardware",
                    "sdk_tags",
                    "models",
                    "datasets",
                    "licenses",
                    "r_sdk_tags",
                    "r_models",
                    "r_datasets",
                    "r_licenses",
                ]
            ]
        )
        gr.DataFrame(
            filtered_df,
            inputs=[
                emoji,
                likes,
                author,
                hardware,
                sdk_tags,
                models,
                datasets,
                space_license,
            ],
            datatype="html",
            wrap=True,
            column_widths=["25%", "5%", "25%", "25%", "20%"],
        )

demo.launch()