# Hugging Face Space: jsulz/spaces-ship (status: Running)
# File size: 16,929 Bytes

from collections import Counter

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset

def _space_link(url, emoji):
    """
    Build the clickable HTML link shown for a space.

    Args:
        url: The space id (a string containing "/"), or the space's
            custom-domains value, whose first element is used as the host.
        emoji: Emoji shown in front of the link text.

    Returns:
        str: An anchor tag that opens the space in a new tab.
    """
    # The label is built outside the f-string so no quote character has to be
    # nested inside it (nesting the same quote only parses on Python 3.12+).
    if url is not None and "/" in url:
        label = str(emoji) + " " + url
        return f"<a target='_blank' href=https://huggingface.co/spaces/{url}>{label}</a>"
    # Not a space id: take the first entry of the custom-domains value as the host.
    domain = url[0]
    label = str(emoji) + " " + domain
    return f"<a target='_blank' href=https://{domain}>{label}</a>"


def load_transform_data():
    """
    Load the space statistics dataset and derive the columns the app displays.

    The 'train' split of the 'jsulz/space-stats' dataset is converted to a
    pandas DataFrame and extended with:
        - "sdk": the sdk value wrapped in a one-element array
        - "licenses": the license value, or the array ["None"] when missing
        - "sdk_tags": the sdk array concatenated with the tags array
        - "url": an HTML link to the space (custom domain if set, else the space id)
        - "r_models", "r_sdk_tags", "r_datasets", "r_licenses": comma-joined
          display strings of the corresponding list columns

    Returns:
        pandas.DataFrame: Transformed dataframe.
    """
    spaces_dataset = 'jsulz/space-stats'
    dataset = load_dataset(spaces_dataset)
    df = dataset['train'].to_pandas()
    # sdk is a single value while tags is an array of strings; wrap sdk in a
    # one-element array so the two can be concatenated below
    df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
    # a missing license becomes the placeholder array ["None"]
    df["licenses"] = df["license"].apply(
        lambda x: np.array([str(x)]) if x is None else x
    )
    # then combine the sdk and tags columns so that their elements are together
    df["sdk_tags"] = df[["sdk", "tags"]].apply(
        lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
    )

    # Fill the NaN values with an empty string
    df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])

    # where the custom_domains column is not null, use that as the url, otherwise, use the id column
    df["url"] = np.where(
        df["custom_domains"].isnull(),
        df["id"],
        df["custom_domains"],
    )

    # Build up a pretty url that's clickable with the emoji
    df["url"] = [
        _space_link(url, emoji) for url, emoji in zip(df["url"], df["emoji"])
    ]

    # Prep the models, sdk/tags, datasets, and licenses columns for display
    display_columns = (
        ("models", "r_models"),
        ("sdk_tags", "r_sdk_tags"),
        ("datasets", "r_datasets"),
        ("licenses", "r_licenses"),
    )
    for source, target in display_columns:
        df[target] = [
            ", ".join(values) if values is not None else ""
            for values in df[source]
        ]
    return df


def _matches_any(values, wanted):
    """
    Tell whether a list-valued cell contains at least one wanted entry.

    Args:
        values: The cell content (a sequence of strings), or None when the
            space has no entries in that column.
        wanted (list): Entries selected in the UI.

    Returns:
        bool: True if any element of ``wanted`` is in ``values``; False for None cells.
    """
    if values is None:
        return False
    return any(entry in values for entry in wanted)


def filtered_df(
    filtered_emojis,
    filtered_likes,
    filtered_author,
    filtered_hardware,
    filtered_tags,
    filtered_models,
    filtered_datasets,
    space_licenses,
    filtered_devmode,
):
    """
    Filter the module-level dataframe ``df`` based on the given criteria.

    A criterion that is empty or falsy is skipped, so calling this with no
    selections returns every row.

    Args:
        filtered_emojis (list): List of emojis to filter the dataframe by.
        filtered_likes (int): Minimum number of likes to filter the dataframe by.
        filtered_author (list): List of authors to filter the dataframe by.
        filtered_hardware (list): List of hardware to filter the dataframe by.
        filtered_tags (list): List of tags to filter the dataframe by.
        filtered_models (list): List of models to filter the dataframe by.
        filtered_datasets (list): List of datasets to filter the dataframe by.
        space_licenses (list | str): Licenses to filter the dataframe by. A single
            license given as a string is treated as a one-element list.
        filtered_devmode (bool): When truthy, keep only rows whose "devMode"
            column equals this value.

    Returns:
        pandas.DataFrame: Filtered dataframe with the following columns: "URL", "Likes", "Models", "Datasets", "Licenses".
    """
    # A single-select dropdown delivers one string rather than a list; wrap it
    # so the membership test below does not iterate over its characters.
    if isinstance(space_licenses, str):
        space_licenses = [space_licenses]

    _df = df
    if filtered_emojis:
        _df = _df[_df["emoji"].isin(filtered_emojis)]
    if filtered_likes:
        _df = _df[_df["likes"] >= filtered_likes]
    if filtered_author:
        _df = _df[_df["author"].isin(filtered_author)]
    if filtered_hardware:
        _df = _df[_df["hardware"].isin(filtered_hardware)]

    # Columns whose cells hold a list of strings: keep a row when its list
    # shares at least one entry with the selection.
    list_filters = (
        ("sdk_tags", filtered_tags),
        ("models", filtered_models),
        ("datasets", filtered_datasets),
        ("licenses", space_licenses),
    )
    for column, wanted in list_filters:
        if not wanted:
            continue
        # astype(bool) keeps the mask boolean even when _df is already empty;
        # apply() then returns an object-dtype Series, which pandas would
        # otherwise interpret as a list of column labels.
        keep = _df[column].apply(_matches_any, args=(wanted,)).astype(bool)
        _df = _df[keep]

    if filtered_devmode:
        _df = _df[_df["devMode"] == filtered_devmode]

    # rename the columns names to make them more readable
    _df = _df.rename(
        columns={
            "url": "URL",
            "likes": "Likes",
            "r_models": "Models",
            "r_datasets": "Datasets",
            "r_licenses": "Licenses",
        }
    )

    return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]


def count_items(items):
    """
    Count the occurrences of items and of item authors in a column of item lists.

    Each entry of ``items`` is a sequence of "author/name" strings, or None
    when a row has no items. None entries are skipped. The author of an item
    is the part of the string before the first "/".

    Args:
        items (dataframe column): A dataframe column containing lists of items.

    Returns:
        tuple: A tuple containing two dictionaries. The first dictionary contains
        the count of each item, and the second dictionary contains the count of
        each author. Both are empty when there are no items at all.
    """
    # Flatten in plain Python rather than np.concatenate, which raises when
    # every entry is None and there is nothing to concatenate.
    flat = [item for arr in items if arr is not None for item in arr]
    item_count = Counter(flat)
    item_author_count = Counter(item.split('/')[0] for item in flat)
    # Callers get plain dicts, keyed in order of first appearance.
    return dict(item_count), dict(item_author_count)

def flatten_column(_df, column):
    """
    Collect the distinct values found across a list-valued DataFrame column.

    Cells that are scalars (for example None) contribute the placeholder
    string "None"; every other cell is used as-is.

    Args:
        _df (pandas.DataFrame): The DataFrame containing the column.
        column (str): The name of the column to flatten.

    Returns:
        list: The unique values of the flattened column, as returned by ``np.unique``.
    """
    placeholder = np.array(["None"])
    pieces = []
    for cell in _df[column]:
        # np.ndim is 0 for scalars such as None, i.e. rows without a list
        if np.ndim(cell) == 0:
            pieces.append(placeholder)
        else:
            pieces.append(cell)
    merged = np.concatenate(pieces)
    return np.unique(merged).tolist()


# Build the Gradio UI: an overview tab with charts and tables computed from all
# loaded spaces, and a search tab whose table is produced by filtered_df.
with gr.Blocks(fill_width=True) as demo:
    df = load_transform_data()
    with gr.Tab(label="Spaces Overview"):

        # Plot the growth of spaces (row entries) over time.
        # The x-axis is the creation date and the y-axis is the cumulative number of spaces created up to that date.
        # Ranking created_at with method='first' gives every row a distinct position, which is used as the running total.
        df = df.sort_values("created_at")
        df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
        fig1 = px.line(
            df,
            x="created_at",
            y="cumulative_spaces",
            title="Growth of Spaces Over Time",
            labels={"created_at": "Date", "cumulative_spaces": "Number of Spaces"},
            template="plotly_dark",
        )
        gr.Plot(fig1)

        with gr.Row():
            # Create a pie chart showing the distribution of spaces by SDK
            fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
            gr.Plot(fig2)

            # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
            emoji_counts = df['emoji'].value_counts().head(10).reset_index()
            fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
            gr.Plot(fig3)

        # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
        author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
        fig4 = px.scatter(
            author_likes,
            x="id",
            y="likes",
            title="Relationship between Number of Spaces Created and Number of Likes",
            labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
            hover_data={"author": True},
            template="plotly_dark",
        )
        gr.Plot(fig4)

        # Create a scatter plot showing, for the 20 emojis with the most total likes,
        # the number of spaces using the emoji against the likes those spaces received
        emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
        fig10 = px.scatter(
            emoji_likes,
            x="id",
            y="likes",
            title="Relationship between Space Emoji and Number of Likes",
            labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
            hover_data={"emoji": True},
            template="plotly_dark",
        )
        gr.Plot(fig10)

        # Create a bar chart of hardware in use (the y-axis is switched to a log scale below)
        hardware = df['hardware'].value_counts().reset_index()
        hardware.columns = ['Hardware', 'Number of Spaces']
        fig5 = px.bar(
            hardware,
            x="Hardware",
            y="Number of Spaces",
            title="Hardware in Use",
            labels={
                "Hardware": "Hardware",
                "Number of Spaces": "Number of Spaces (log scale)",
            },
            color="Hardware",
            template="plotly_dark",
        )
        fig5.update_layout(yaxis_type="log")
        gr.Plot(fig5)

        # Count how often each model, and each model author, appears in the models column,
        # then chart the 20 most frequent authors followed by the 20 most frequent models
        model_count, model_author_count = count_items(df['models'])
        model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
        # NOTE(review): the "Model" key in labels does not match this chart's x column
        # ("Model Author"); it looks copied from the models chart below.
        fig8 = px.bar(
            model_author_count.sort_values("Number of Spaces", ascending=False).head(
                20
            ),
            x="Model Author",
            y="Number of Spaces",
            title="Most Popular Model Authors",
            labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
            template="plotly_dark",
        )
        gr.Plot(fig8)
        model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
        # then make a bar chart of the 20 most used models
        fig6 = px.bar(
            model_count.sort_values("Number of Spaces", ascending=False).head(20),
            x="Model",
            y="Number of Spaces",
            title="Most Used Models",
            labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
            template="plotly_dark",
        )
        gr.Plot(fig6)

        # Same pair of charts for the datasets column: top 20 dataset authors, then top 20 datasets
        dataset_count, dataset_author_count = count_items(df['datasets'])
        dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
        dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
        fig9 = px.bar(
            dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
                20
            ),
            x="Dataset Author",
            y="Number of Spaces",
            title="Most Popular Dataset Authors",
            labels={
                "Dataset Author": "Dataset Author",
                "Number of Spaces": "Number of Spaces",
            },
            template="plotly_dark",
        )
        gr.Plot(fig9)
        # then make a bar chart of the 20 most used datasets
        fig7 = px.bar(
            dataset_count.sort_values("Number of Spaces", ascending=False).head(20),
            x="Datasets",
            y="Number of Spaces",
            title="Most Used Datasets",
            labels={"Datasets": "Datasets", "Number of Spaces": "Number of Spaces"},
            template="plotly_dark",
        )
        gr.Plot(fig7)

        with gr.Row():
            # Get the 20 most duplicated spaces and render each as a link to the space
            duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
            duplicated_spaces["duplicated_from"] = duplicated_spaces[
                "duplicated_from"
            ].apply(
                lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
            )
            duplicated_spaces.columns = ["Space", "Number of Duplicates"]
            gr.DataFrame(duplicated_spaces, datatype="html" )

            # Get the 20 most liked spaces and render each as a link to the space
            liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
            liked_spaces["id"] = liked_spaces["id"].apply(
                lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
            )
            liked_spaces.columns = ['Space', 'Number of Likes']
            gr.DataFrame(liked_spaces, datatype="html")

        with gr.Row():
            # Create a dataframe with the top 20 authors and the number of spaces they have created
            author_counts = df['author'].value_counts().head(20).reset_index()
            author_counts["author"] = author_counts["author"].apply(
                lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
            )
            author_counts.columns = ["Author", "Number of Spaces"]
            gr.DataFrame(author_counts, datatype="html")

            # create a dataframe where we groupby author and sum their likes, keeping the top 20
            # (this rebinds author_likes, which was already consumed by the scatter plot above)
            author_likes = df.groupby('author').agg({'likes': 'sum'}).reset_index()
            author_likes = author_likes.sort_values(by='likes', ascending=False).head(20)
            author_likes["author"] = author_likes["author"].apply(
                lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
            )
            author_likes.columns = ["Author", "Number of Likes"]
            gr.DataFrame(author_likes, datatype="html")


    with gr.Tab(label="Spaces Search"):
        # Restrict the search tab to spaces whose stage is RUNNING. This rebinds the
        # module-level df, which is the frame filtered_df reads.
        df = df[df['stage'] == 'RUNNING']

        # Layout
        with gr.Row():
            emoji = gr.Dropdown(
                df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
            )  # Dropdown to select the emoji
            likes = gr.Slider(
                minimum=df["likes"].min(),
                maximum=df["likes"].max(),
                step=1,
                label="Filter by Likes",
            )  # Slider to filter by likes
        with gr.Row():
            author = gr.Dropdown(
                df["author"].unique().tolist(), label="Search by Author", multiselect=True
            )
            # get the list of unique strings in the sdk_tags column
            sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
            # create a dropdown for the sdk_tags (the name sdk_tags is rebound from the array to the component)
            sdk_tags = gr.Dropdown(
                sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
            )
        with gr.Row():
            # create a gradio checkbox group for hardware
            hardware = gr.CheckboxGroup(
                df["hardware"].unique().tolist(), label="Filter by Hardware"
            )

            # NOTE(review): unlike the other dropdowns this one does not set multiselect=True,
            # so it presumably hands filtered_df a single string rather than a list - confirm
            # against the Gradio Dropdown docs.
            licenses = np.unique(np.concatenate(df["licenses"].values))
            space_license = gr.Dropdown(licenses.tolist(), label="Filter by license")

        with gr.Row():
            # flatten_column adds the placeholder "None" for rows without models/datasets
            models = gr.Dropdown(
                flatten_column(df, "models"),
                label="Search by Model",
                multiselect=True,
            )
            datasets = gr.Dropdown(
                flatten_column(df, "datasets"),
                label="Search by Dataset",
                multiselect=True,
            )

        devmode = gr.Checkbox(label="Show Dev Mode Spaces")
        # NOTE(review): the likes slider and the dev-mode checkbox are not in this list,
        # so the clear button does not reset them.
        clear = gr.ClearButton(components=[
                emoji,
                author,
                hardware,
                sdk_tags,
                models,
                datasets,
                space_license
                ])

        # Keep only the columns that filtered_df filters on or returns
        df = pd.DataFrame(
            df[
                [
                    "id",
                    "emoji",
                    "author",
                    "url",
                    "likes",
                    "hardware",
                    "sdk_tags",
                    "models",
                    "datasets",
                    "licenses",
                    "r_sdk_tags",
                    "r_models",
                    "r_datasets",
                    "r_licenses",
                    'devMode'
                ]
            ]
        )
        # The results table: filtered_df is passed as the value together with its input
        # components, in the same order as its parameters. Per the Gradio docs a callable
        # value is re-evaluated when those inputs change (confirm for the installed version).
        # The five column widths correspond to the five columns filtered_df returns.
        gr.DataFrame(
            filtered_df,
            inputs=[
                emoji,
                likes,
                author,
                hardware,
                sdk_tags,
                models,
                datasets,
                space_license,
                devmode,
            ],
            datatype="html",
            wrap=True, 
            column_widths=["25%", "5%", "25%", "25%", "20%"]
        )


# Start the Gradio server; this runs at module level, i.e. whenever the file is executed or imported
demo.launch()