"""Gradio demo that visualizes DINOv2 patch features with PCA.

Four images are pushed through a DINOv2 backbone, the per-patch features are
projected onto three principal components, the first component is thresholded
to separate background from foreground, and a second PCA restricted to the
foreground patches is rendered as an RGB image per input.
"""
import glob
from typing import List

import gradio as gr
import numpy as np
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Constants
patch_h = 40
patch_w = 40
# Side length in pixels of one patch (the "14" in dinov2_vitl14).
patch_size = 14
# The interface below is wired for exactly this many input/output images.
num_images = 4

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DINOV2
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
# Inference only: run on the selected device, in eval mode.
model = model.to(device)
model.eval()

# Transforms
transform = T.Compose([
    T.Resize((patch_h * patch_size, patch_w * patch_size)),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# Empty Tensor (CPU staging buffer, refilled on every call to query_image)
imgs_tensor = torch.zeros(
    num_images, 3, patch_h * patch_size, patch_w * patch_size
)

# PCA (refit on every call to query_image)
pca = PCA(n_components=3)

# Min-Max Scaler (refit on every call to query_image)
scaler = MinMaxScaler(clip=True)


def query_image(
    img1,
    img2,
    img3,
    img4,
    background_threshold,
    is_foreground_larger_than_threshold,
) -> List[np.ndarray]:
    """Compute a foreground-only PCA visualization for four images.

    Args:
        img1, img2, img3, img4: Input images as channel-last arrays
            (transposed with ``(2, 0, 1)`` below) with values in 0..255.
        background_threshold: Cut-off applied to the min-max normalized
            first principal component.
        is_foreground_larger_than_threshold: If true, patches whose first
            component is below the threshold are background; otherwise
            patches above the threshold are background.

    Returns:
        A list of four ``(patch_h, patch_w, 3)`` float arrays. Background
        patches are 0; foreground patches hold the min-max normalized
        foreground PCA components.

    Raises:
        gr.Error: If an image is missing, or the threshold leaves too few
            foreground patches to fit the PCA.
    """
    imgs = [img1, img2, img3, img4]
    if any(img is None for img in imgs):
        raise gr.Error(f"Please provide all {num_images} images.")

    # Transform
    for i, img in enumerate(imgs):
        img = np.transpose(img, (2, 0, 1)) / 255
        imgs_tensor[i] = transform(torch.Tensor(img))

    # Get feature from patches
    with torch.no_grad():
        features_dict = model.forward_features(imgs_tensor.to(device))
        # Drop the first token so only the per-patch tokens remain.
        features = features_dict['x_prenorm'][:, 1:]

    features = features.reshape(num_images * patch_h * patch_w, -1)
    # scikit-learn works on host memory; hand it a NumPy array explicitly.
    features = features.cpu().numpy()

    # PCA Feature
    pca_features = pca.fit_transform(features)
    # Normalize before thresholding so the threshold is applied on a 0..1
    # scale, as the description below states.
    pca_features = scaler.fit_transform(pca_features)

    # Foreground/Background
    first_component = pca_features[:, 0]
    if is_foreground_larger_than_threshold:
        pca_features_bg = first_component < background_threshold
    else:
        pca_features_bg = first_component > background_threshold
    pca_features_fg = ~pca_features_bg

    fg_features = features[pca_features_fg]
    # PCA cannot be fit with fewer samples than components.
    if fg_features.shape[0] < pca.n_components:
        raise gr.Error(
            "Too few foreground patches for this threshold; "
            "adjust the threshold or the checkbox."
        )

    # PCA with only foreground
    pca_features_rem = pca.fit_transform(fg_features)

    # Min Max Normalization
    pca_features_rem = scaler.fit_transform(pca_features_rem)

    # Background patches stay at 0; foreground patches get their colors.
    pca_features_rgb = np.zeros((num_images * patch_h * patch_w, 3))
    pca_features_rgb[pca_features_fg] = pca_features_rem
    pca_features_rgb = pca_features_rgb.reshape(
        num_images, patch_h, patch_w, 3
    )

    return list(pca_features_rgb)


description = """
DINOV2 PCA demo for DINOv2: Learning Robust Visual Features without Supervision(Figure 1)

How to Use:

1. Enter 4 images that have clean background and similar object.
2. Edit threshold and checkbox to split background/foreground.

Method:

1. Compute the features of patches from 4 images. We can get a feature that have (4 * patch_w * patch_h, feature_dim) shape.
2. PCA the feature with 3 dims. After PCA, Min-Max normalization is performed.
3. Use first component to split foreground and background. (threshold and checkbox)
4. All the feature of patches included in the background are set to 0.
5. PCA is performed based on the remaining features. Afer PCA, Min-Max normalization is performed.
6. Visualize
"""

demo = gr.Interface(
    query_image,
    inputs=[
        gr.Image(),
        gr.Image(),
        gr.Image(),
        gr.Image(),
        gr.Slider(-1, 1, value=0.1),
        gr.Checkbox(label="foreground is larger than threshold", value=True),
    ],
    outputs=[gr.Image(), gr.Image(), gr.Image(), gr.Image()],
    title="DINOV2 PCA",
    description=description,
    examples=[
        ["assets/1.png", "assets/2.png", "assets/3.png", "assets/4.png", 0.9, True],
        ["assets/5.png", "assets/6.png", "assets/7.png", "assets/8.png", 0.6, True],
        ["assets/9.png", "assets/10.png", "assets/11.png", "assets/12.png", 0.6, True],
    ],
)
demo.launch()