Spaces:

neggles
/

pi-tagger

Running

App Files Files

neggles committed on Mar 7

Commit

5b0d6ce

•

1 Parent(s): 92d4909

update for v3

Browse files

Files changed (4) hide show

README.md +11 -5
app.py +169 -63
data/selected_tags.csv +0 -0
tagger/common.py +56 -4

README.md CHANGED Viewed

@@ -9,12 +9,18 @@ app_file: app.py
 pinned: false
 short_description: A WD Tagger Space for pi-chan to use
 preload_from_hub:
-  - SmilingWolf/wd-v1-4-moat-tagger-v2 model.onnx
-  - SmilingWolf/wd-v1-4-swinv2-tagger-v2 model.onnx
-  - SmilingWolf/wd-v1-4-convnext-tagger-v2 model.onnx
-  - SmilingWolf/wd-v1-4-convnextv2-tagger-v2 model.onnx
-  - SmilingWolf/wd-v1-4-vit-tagger-v2 model.onnx
 models:
   - SmilingWolf/wd-v1-4-moat-tagger-v2
   - SmilingWolf/wd-v1-4-swinv2-tagger-v2
   - SmilingWolf/wd-v1-4-convnext-tagger-v2

 pinned: false
 short_description: A WD Tagger Space for pi-chan to use
 preload_from_hub:
+  - SmilingWolf/wd-vit-tagger-v3 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-swinv2-tagger-v3 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-convnext-tagger-v3 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-v1-4-moat-tagger-v2 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-v1-4-swinv2-tagger-v2 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-v1-4-convnext-tagger-v2 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-v1-4-convnextv2-tagger-v2 model.onnx,selected_tags.csv
+  - SmilingWolf/wd-v1-4-vit-tagger-v2 model.onnx,selected_tags.csv
 models:
+  - SmilingWolf/wd-vit-tagger-v3
+  - SmilingWolf/wd-swinv2-tagger-v3
+  - SmilingWolf/wd-convnext-tagger-v3
   - SmilingWolf/wd-v1-4-moat-tagger-v2
   - SmilingWolf/wd-v1-4-swinv2-tagger-v2
   - SmilingWolf/wd-v1-4-convnext-tagger-v2

app.py CHANGED Viewed

@@ -7,25 +7,41 @@ import numpy as np
 import onnxruntime as rt
 from PIL import Image
-from tagger.common import LabelData, load_labels, preprocess_image
 from tagger.model import create_session
 HF_TOKEN = getenv("HF_TOKEN", None)
-WORK_DIR = Path.cwd().resolve()
 MODEL_VARIANTS: dict[str, str] = {
-    "MOAT": "SmilingWolf/wd-v1-4-moat-tagger-v2",
-    "SwinV2": "SmilingWolf/wd-v1-4-swinv2-tagger-v2",
-    "ConvNeXT": "SmilingWolf/wd-v1-4-convnext-tagger-v2",
-    "ConvNeXTv2": "SmilingWolf/wd-v1-4-convnextv2-tagger-v2",
-    "ViT": "SmilingWolf/wd-v1-4-vit-tagger-v2",
 }
 # allowed extensions
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"]
-# model input shape
-IMAGE_SIZE = 448
 example_images = sorted(
     [
         str(x.relative_to(WORK_DIR))
@@ -33,34 +49,51 @@ example_images = sorted(
         if x.is_file() and x.suffix.lower() in IMAGE_EXTENSIONS
     ]
 )
-loaded_models: dict[str, Optional[rt.InferenceSession]] = {k: None for k, _ in MODEL_VARIANTS.items()}
-def load_model(variant: str) -> rt.InferenceSession:
     global loaded_models
     # resolve the repo name
-    model_repo = MODEL_VARIANTS.get(variant, None)
     if model_repo is None:
-        raise ValueError(f"Unknown model variant: {variant}")
-    if loaded_models.get(variant, None) is None:
         # save model to cache
-        loaded_models[variant] = create_session(model_repo, token=HF_TOKEN)
-    return loaded_models[variant]
 def predict(
     image: Image.Image,
     variant: str,
-    general_threshold: float = 0.35,
-    character_threshold: float = 0.85,
 ):
-    # Load model
-    model: rt.InferenceSession = load_model(variant)
     # load labels
-    labels: LabelData = load_labels()
     # get input size and name
     _, h, w, _ = model.get_inputs()[0].shape
@@ -85,13 +118,21 @@ def predict(
     rating_labels = dict([probs[i] for i in labels.rating])
     # General labels, pick any where prediction confidence > threshold
     gen_labels = [probs[i] for i in labels.general]
-    gen_labels = dict([x for x in gen_labels if x[1] > general_threshold])
     gen_labels = dict(sorted(gen_labels.items(), key=lambda item: item[1], reverse=True))
     # Character labels, pick any where prediction confidence > threshold
     char_labels = [probs[i] for i in labels.character]
-    char_labels = dict([x for x in char_labels if x[1] > character_threshold])
     char_labels = dict(sorted(char_labels.items(), key=lambda item: item[1], reverse=True))
     # Combine general and character labels, sort by confidence
@@ -102,64 +143,129 @@ def predict(
     caption = ", ".join(combined_names)
     booru = caption.replace("_", " ").replace("(", "\(").replace(")", "\)")
-    return image, caption, booru, rating_labels, char_labels, gen_labels
-with gr.Blocks(theme="NoCrypt/miku", analytics_enabled=False, title="pi-chan's tagger") as demo:
     with gr.Row(equal_height=False):
-        with gr.Column():
-            img_input = gr.Image(
-                label="Input",
-                type="pil",
-                image_mode="RGB",
-                sources=["upload", "clipboard"],
-            )
-            variant = gr.Radio(choices=list(MODEL_VARIANTS.keys()), label="Model Variant", value="MOAT")
-            gen_thresh = gr.Slider(0.0, 1.0, value=0.35, label="General Tag Threshold")
-            char_thresh = gr.Slider(0.0, 1.0, value=0.85, label="Character Tag Threshold")
-            show_processed = gr.Checkbox(label="Show Preprocessed", value=False)
             with gr.Row():
-                submit = gr.Button(value="Submit", variant="primary", size="lg")
                 clear = gr.ClearButton(
                     components=[],
                     variant="secondary",
                     size="lg",
                 )
-            with gr.Row():
-                examples = gr.Examples(
-                    examples=[
-                        [imgpath, var, 0.35, 0.85]
-                        for imgpath in example_images
-                        for var in ["MOAT", "ConvNeXTv2"]
-                    ],
-                    inputs=[img_input, variant, gen_thresh, char_thresh],
-                )
-        with gr.Column():
-            img_output = gr.Image(label="Preprocessed", type="pil", image_mode="RGB", scale=1, visible=False)
             with gr.Group():
-                tags_string = gr.Textbox(
-                    label="Caption", placeholder="Caption will appear here", show_copy_button=True
-                )
-                tags_booru = gr.Textbox(
-                    label="Tags", placeholder="Tag string will appear here", show_copy_button=True
-                )
-            rating = gr.Label(label="Rating")
-            character = gr.Label(label="Character")
-            general = gr.Label(label="General")
     # tell clear button which components to clear
-    clear.add([img_input, img_output, tags_string, rating, character, general])
     # show/hide processed image
-    def on_select_show_processed(evt: gr.SelectData):
-        return gr.update(visible=evt.selected)
-    show_processed.select(on_select_show_processed, inputs=[], outputs=[img_output])
     submit.click(
         predict,
-        inputs=[img_input, variant, gen_thresh, char_thresh],
-        outputs=[img_output, tags_string, tags_booru, rating, character, general],
         api_name="predict",
     )

 import onnxruntime as rt
 from PIL import Image
+from tagger.common import LabelData, load_labels_hf, preprocess_image
 from tagger.model import create_session
# page title passed to the Gradio Blocks app
TITLE = "WaifuDiffusion Tagger"
DESCRIPTION = """
Tag images with the WaifuDiffusion Tagger models!
Primarily used as a backend for a Discord bot.
"""

# Hugging Face token read from the environment (None when HF_TOKEN is unset)
HF_TOKEN = getenv("HF_TOKEN", None)

# Hub repo ids, keyed first by tagger version and then by model variant.
# The mapping is two levels deep, so the value type is itself a dict.
MODEL_VARIANTS: dict[str, dict[str, str]] = {
    "v3": {
        "SwinV2": "SmilingWolf/wd-swinv2-tagger-v3",
        "ConvNeXT": "SmilingWolf/wd-convnext-tagger-v3",
        "ViT": "SmilingWolf/wd-vit-tagger-v3",
    },
    "v2": {
        "MOAT": "SmilingWolf/wd-v1-4-moat-tagger-v2",
        "SwinV2": "SmilingWolf/wd-v1-4-swinv2-tagger-v2",
        "ConvNeXT": "SmilingWolf/wd-v1-4-convnext-tagger-v2",
        "ConvNeXTv2": "SmilingWolf/wd-v1-4-convnextv2-tagger-v2",
        "ViT": "SmilingWolf/wd-v1-4-vit-tagger-v2",
    },
}
# one empty cache slot per "<version>-<variant>" key, filled lazily by load_model
cache_keys = [
    f"{ver}-{var}"
    for ver, variants in MODEL_VARIANTS.items()
    for var in variants
]
loaded_models: dict[str, Optional[rt.InferenceSession]] = dict.fromkeys(cache_keys)

# directory containing this file, or the current working directory when
# __file__ is not defined (e.g. when running in ipython)
if "__file__" in globals():
    WORK_DIR = Path(__file__).parent.resolve()
else:
    WORK_DIR = Path().resolve()
 # allowed extensions
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"]
+# get the example images
 example_images = sorted(
     [
         str(x.relative_to(WORK_DIR))
         if x.is_file() and x.suffix.lower() in IMAGE_EXTENSIONS
     ]
 )
def load_model(version: str, variant: str) -> rt.InferenceSession:
    """
    Return the inference session for a model, creating it on first use.

    Args:
        version: top-level key of MODEL_VARIANTS (e.g. "v3").
        variant: variant key within that version (e.g. "ViT").

    Returns:
        The session cached in ``loaded_models`` under "<version>-<variant>".

    Raises:
        ValueError: if the version/variant pair is not in MODEL_VARIANTS.
    """
    # resolve the repo name; an unknown version falls through to an empty dict
    repo_id = MODEL_VARIANTS.get(version, {}).get(variant)
    if repo_id is None:
        raise ValueError(f"Unknown model variant: {version}-{variant}")

    key = f"{version}-{variant}"
    session = loaded_models.get(key)
    if session is None:
        # first request for this model: build the session and keep it in the cache
        session = create_session(repo_id, token=HF_TOKEN)
        loaded_models[key] = session
    return session
def mcut_threshold(probs: np.ndarray) -> float:
    """
    Maximum Cut Thresholding (MCut)
    Largeron, C., Moulin, C., & Gery, M. (2012). MCut: A Thresholding Strategy
     for Multi-label Classification. In 11th International Symposium, IDA 2012
     (pp. 172-183).

    Sorts the probabilities in descending order, finds the largest gap between
    two neighbouring values and returns the midpoint of that gap.

    Args:
        probs: 1-D array (or array-like) holding at least two probabilities.

    Returns:
        The threshold as a plain Python float.

    Raises:
        ValueError: if fewer than two probabilities are given, since there is
            then no gap to cut at.
    """
    probs = np.asarray(probs)
    if probs.size < 2:
        # without this guard argmax() fails on the empty gap array with an
        # opaque "attempt to get argmax of an empty sequence" error
        raise ValueError(
            f"mcut_threshold needs at least two probabilities, got {probs.size}"
        )

    ordered = np.sort(probs)[::-1]
    gaps = ordered[:-1] - ordered[1:]
    cut = int(gaps.argmax())
    return float((ordered[cut] + ordered[cut + 1]) / 2)
 def predict(
     image: Image.Image,
+    version: str,
     variant: str,
+    gen_threshold: float = 0.35,
+    gen_use_mcut: bool = False,
+    char_threshold: float = 0.85,
+    char_use_mcut: bool = False,
 ):
+    # join variant for cache key
+    model: rt.InferenceSession = load_model(version, variant)
     # load labels
+    labels: LabelData = load_labels_hf(MODEL_VARIANTS[version][variant])
     # get input size and name
     _, h, w, _ = model.get_inputs()[0].shape
     rating_labels = dict([probs[i] for i in labels.rating])
     # General labels, pick any where prediction confidence > threshold
+    if gen_use_mcut:
+        gen_array = np.array([probs[i][1] for i in labels.general])
+        gen_threshold = mcut_threshold(gen_array)
     gen_labels = [probs[i] for i in labels.general]
+    gen_labels = dict([x for x in gen_labels if x[1] > gen_threshold])
     gen_labels = dict(sorted(gen_labels.items(), key=lambda item: item[1], reverse=True))
     # Character labels, pick any where prediction confidence > threshold
+    if char_use_mcut:
+        char_array = np.array([probs[i][1] for i in labels.character])
+        char_threshold = round(mcut_threshold(char_array), 2)
     char_labels = [probs[i] for i in labels.character]
+    char_labels = dict([x for x in char_labels if x[1] > char_threshold])
     char_labels = dict(sorted(char_labels.items(), key=lambda item: item[1], reverse=True))
     # Combine general and character labels, sort by confidence
     caption = ", ".join(combined_names)
     booru = caption.replace("_", " ").replace("(", "\(").replace(")", "\)")
+    return image, caption, booru, rating_labels, char_labels, char_threshold, gen_labels, gen_threshold
# extra styling: align the Max-Cut checkboxes with their sliders and dim a
# slider while its Max-Cut checkbox is ticked (see on_change_mcut below)
css = """
#gen_mcut, #char_mcut {
    padding-top: var(--scale-3);
}
#gen_threshold.dimmed, #char_threshold.dimmed {
    filter: brightness(75%);
}
"""

with gr.Blocks(theme="NoCrypt/miku", analytics_enabled=False, title=TITLE, css=css) as demo:
    with gr.Row(equal_height=False):
        # left column: image input, model selection and thresholds
        with gr.Column(min_width=720):
            with gr.Group():
                img_input = gr.Image(
                    label="Input",
                    type="pil",
                    image_mode="RGB",
                    sources=["upload", "clipboard"],
                )
                show_processed = gr.Checkbox(label="Show Preprocessed Image", value=False)
            with gr.Row():
                version = gr.Radio(
                    choices=list(MODEL_VARIANTS.keys()),
                    label="Model Version",
                    value="v3",
                    min_width=160,
                    scale=1,
                )
                variant = gr.Radio(
                    choices=list(MODEL_VARIANTS[version.value].keys()),
                    label="Model Variant",
                    value="ConvNeXT",
                    min_width=560,
                )
            with gr.Group():
                with gr.Row():
                    gen_threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.35,
                        step=0.01,
                        label="General Tag Threshold",
                        scale=5,
                        elem_id="gen_threshold",
                    )
                    gen_mcut = gr.Checkbox(label="Use Max-Cut", value=False, scale=1, elem_id="gen_mcut")
                with gr.Row():
                    char_threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.85,
                        step=0.01,
                        label="Character Tag Threshold",
                        scale=5,
                        elem_id="char_threshold",
                    )
                    char_mcut = gr.Checkbox(label="Use Max-Cut", value=False, scale=1, elem_id="char_mcut")
            with gr.Row():
                clear = gr.ClearButton(
                    components=[],
                    variant="secondary",
                    size="lg",
                )
                submit = gr.Button(value="Submit", variant="primary", size="lg")
        # right column: preprocessed image and prediction results
        with gr.Column(min_width=720):
            img_output = gr.Image(
                label="Preprocessed Image", type="pil", image_mode="RGB", scale=1, visible=False
            )
            with gr.Group():
                caption = gr.Textbox(label="Caption", show_copy_button=True)
                tags = gr.Textbox(label="Tags", show_copy_button=True)
            with gr.Group():
                rating = gr.Label(label="Rating")
            with gr.Group():
                char_mcut_out = gr.Number(label="Max-Cut Threshold", precision=2, visible=False)
                character = gr.Label(label="Character")
            with gr.Group():
                gen_mcut_out = gr.Number(label="Max-Cut Threshold", precision=2, visible=False)
                general = gr.Label(label="General")
    with gr.Row():
        # every example image appears twice: once with fixed thresholds, once with Max-Cut
        example_rows = [[imgpath, 0.35, mc, 0.85, mc] for mc in [False, True] for imgpath in example_images]
        examples = gr.Examples(
            examples=example_rows,
            inputs=[img_input, gen_threshold, gen_mcut, char_threshold, char_mcut],
        )

    # tell clear button which components to clear
    # (the tags textbox is included so a clear does not leave stale tags behind)
    clear.add([img_input, img_output, caption, tags, rating, character, general])

    # swap the variant choices when a different model version is picked
    def on_select_variant(evt: gr.SelectData, version: str):
        if evt.selected:
            choices = list(MODEL_VARIANTS[version])
            return gr.update(choices=choices, value=choices[0])
        return gr.update()

    version.select(on_select_variant, inputs=[version], outputs=[variant])

    # show/hide processed image
    def on_change_show(val: bool):
        return gr.update(visible=val)

    show_processed.select(on_change_show, inputs=[show_processed], outputs=[img_output])

    # handle mcut thresholding (auto-calculate threshold from probs, disable slider)
    def on_change_mcut(val: bool):
        return (
            gr.update(interactive=not val, elem_classes=["dimmed"] if val else []),
            gr.update(visible=val),
        )

    gen_mcut.change(on_change_mcut, inputs=[gen_mcut], outputs=[gen_threshold, gen_mcut_out])
    char_mcut.change(on_change_mcut, inputs=[char_mcut], outputs=[char_threshold, char_mcut_out])

    # NOTE(review): the thresholds returned by predict are written back to the
    # sliders, while the "Max-Cut Threshold" number boxes are never given a
    # value here - confirm whether char_mcut_out / gen_mcut_out were intended.
    submit.click(
        predict,
        inputs=[img_input, version, variant, gen_threshold, gen_mcut, char_threshold, char_mcut],
        outputs=[img_output, caption, tags, rating, character, char_threshold, general, gen_threshold],
        api_name="predict",
    )

data/selected_tags.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

tagger/common.py CHANGED Viewed

@@ -3,10 +3,12 @@ from dataclasses import asdict, dataclass
 from functools import lru_cache
 from os import PathLike
 from pathlib import Path
-from typing import Any
 import numpy as np
 import pandas as pd
 from PIL import Image
@@ -36,10 +38,36 @@ class ImageLabels(DictJsonMixin):
 @lru_cache(maxsize=5)
-def load_labels(csv_path: PathLike = "data/selected_tags.csv") -> LabelData:
-    csv_path = Path(csv_path).resolve()
     if not csv_path.is_file():
-        raise FileNotFoundError("No selected_tags.csv found")
     df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
     tag_data = LabelData(
@@ -101,3 +129,27 @@ def preprocess_image(
         image.thumbnail(size_px, Image.BICUBIC)
     return image

 from functools import lru_cache
 from os import PathLike
 from pathlib import Path
+from typing import Any, Optional
 import numpy as np
 import pandas as pd
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
 from PIL import Image
 @lru_cache(maxsize=5)
def load_labels(version: str = "v3", data_dir: PathLike = "./data") -> LabelData:
    """
    Load tag names and per-category row indices from a local CSV file.

    Reads ``selected_tags_<version>.csv`` from ``data_dir`` (resolved to an
    absolute path) using its ``name`` and ``category`` columns.

    Args:
        version: suffix used to build the CSV file name.
        data_dir: directory that holds the CSV file.

    Returns:
        LabelData with all tag names plus the row indices whose category is
        9 (rating), 0 (general) and 4 (character).

    Raises:
        FileNotFoundError: if the CSV file does not exist in ``data_dir``.
    """
    root = Path(data_dir).resolve()
    tags_csv = root / f"selected_tags_{version}.csv"
    if not tags_csv.is_file():
        raise FileNotFoundError(f"{tags_csv.name} not found in {root}")

    frame: pd.DataFrame = pd.read_csv(tags_csv, usecols=["name", "category"])
    categories = frame["category"]

    def rows_with(code: int) -> list:
        # positional indices of the rows whose category equals `code`
        return list(np.where(categories == code)[0])

    return LabelData(
        names=frame["name"].tolist(),
        rating=rows_with(9),
        general=rows_with(0),
        character=rows_with(4),
    )
+@lru_cache(maxsize=5)
+def load_labels_hf(
+    repo_id: str,
+    revision: Optional[str] = None,
+    token: Optional[str] = None,
+) -> LabelData:
+    try:
+        csv_path = hf_hub_download(
+            repo_id=repo_id, filename="selected_tags.csv", revision=revision, token=token
+        )
+        csv_path = Path(csv_path).resolve()
+    except HfHubHTTPError as e:
+        raise FileNotFoundError(f"selected_tags.csv failed to download from {repo_id}") from e
     df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
     tag_data = LabelData(
         image.thumbnail(size_px, Image.BICUBIC)
     return image
# Kaomoji (emoticon) strings; list taken from the wd14-tagger webui extension:
# https://github.com/toriato/stable-diffusion-webui-wd14-tagger/blob/a9eacb1eff904552d3012babfa28b57e1d3e295c/tagger/ui.py#L368
kaomojis = [
    "0_0", "(o)_(o)", "+_+", "+_-", "._.",
    "<o>_<o>", "<|>_<|>", "=_=", ">_<", "3_3",
    "6_9", ">_o", "@_@", "^_^", "o_o",
    "u_u", "x_x", "|_|", "||_||"
]