import torch
import torch.nn as nn
import torch.nn.functional as F

from .backbone import CNNEncoder
from .geometry import coords_grid
from .matching import (
    global_correlation_softmax_prototype,
    local_correlation_softmax_prototype,
)
from .transformer import FeatureTransformer
from .utils import feature_add_position


class UniMatch(nn.Module):
    """Single-scale matching network: CNN encoder, feature Transformer, softmax matching.

    Both images are encoded by a shared CNN backbone, enhanced by a Transformer,
    matched with a global or local correlation-softmax function, and the resulting
    low-resolution flow is upsampled (bilinearly or with a learned convex mask).
    """

    def __init__(
        self,
        num_scales=1,
        feature_channels=128,
        upsample_factor=8,
        num_head=1,
        ffn_dim_expansion=4,
        num_transformer_layers=6,
        bilinear_upsample=False,
        corr_fn="global",
    ):
        """Build the backbone, the Transformer and (optionally) the convex upsampler.

        Args:
            num_scales: Number of feature scales requested from the backbone.
                ``forward`` and ``forward_features`` only support ``1``.
            feature_channels: Channel width of the backbone output and Transformer.
            upsample_factor: Ratio between output and feature resolution.
            num_head: Number of attention heads in the Transformer.
            ffn_dim_expansion: Expansion ratio of the Transformer feed-forward layers.
            num_transformer_layers: Number of Transformer layers.
            bilinear_upsample: If true, upsample flow bilinearly and do not create
                the learned convex-upsampling head.
            corr_fn: ``"global"`` or ``"local"``; selects the matching function.

        Raises:
            NotImplementedError: If ``corr_fn`` is neither ``"global"`` nor ``"local"``.
        """
        super().__init__()

        self.feature_channels = feature_channels
        self.num_scales = num_scales
        self.upsample_factor = upsample_factor
        self.bilinear_upsample = bilinear_upsample

        if corr_fn == "global":
            self.corr_fn = global_correlation_softmax_prototype
        elif corr_fn == "local":
            self.corr_fn = local_correlation_softmax_prototype
        else:
            raise NotImplementedError(f"Correlation function {corr_fn} not implemented")

        # CNN
        self.backbone = CNNEncoder(output_dim=feature_channels, num_output_scales=num_scales)

        # Transformer
        self.transformer = FeatureTransformer(
            num_layers=num_transformer_layers,
            d_model=feature_channels,
            nhead=num_head,
            ffn_dim_expansion=ffn_dim_expansion,
        )

        # Convex upsampling similar to RAFT: the head takes the concatenation of
        # feature0 and the low-resolution flow and predicts, for every output
        # pixel, 9 weights over the 3x3 coarse neighbourhood.
        # NOTE: `self.upsampler` only exists when `bilinear_upsample` is False.
        if not bilinear_upsample:
            self.upsampler = nn.Sequential(
                nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
                nn.ReLU(inplace=True),
                nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
            )

    def extract_feature(self, img0, img1):
        """Encode both images with the shared backbone in a single batched pass.

        Args:
            img0: First image batch.
            img1: Second image batch; concatenated with ``img0`` along dim 0.

        Returns:
            Two lists ``(feature0, feature1)`` holding the per-image halves of each
            backbone output, ordered from low to high resolution.
        """
        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
        # list of [2B, C, H, W], resolution from high to low
        features = self.backbone(concat)

        feature0, feature1 = [], []
        # reverse: resolution from low to high
        for feature in features[::-1]:
            first_half, second_half = torch.chunk(feature, 2, 0)
            feature0.append(first_half)
            feature1.append(second_half)

        return feature0, feature1

    def _add_position_and_transform(self, feature0, feature1, attn_type, attn_splits):
        """Add positional encoding to both feature maps and run the Transformer."""
        feature0, feature1 = feature_add_position(
            feature0, feature1, attn_splits, self.feature_channels
        )
        feature0, feature1 = self.transformer(
            feature0,
            feature1,
            attn_type=attn_type,
            attn_num_splits=attn_splits,
        )
        return feature0, feature1

    def _extract_single_scale(self, img0, img1):
        """Run the backbone and return the only supported (index 0) feature pair."""
        feature0_list, feature1_list = self.extract_feature(img0, img1)

        # multi-scale depth model is not supported yet
        assert self.num_scales == 1, "only num_scales == 1 is supported"

        scale_idx = 0
        return feature0_list[scale_idx], feature1_list[scale_idx]

    def correlate_feature(self, feature0, feature1, attn_splits=2, attn_type="swin"):
        """Return the scaled dot-product correlation between two feature maps.

        The features first receive positional encoding and pass through the
        Transformer; the correlation is then the all-pairs dot product divided
        by ``sqrt(C)``.

        Args:
            feature0: Feature map of the first image, ``[B, C, H, W]``.
            feature1: Feature map of the second image, same shape.
            attn_splits: Number of attention splits forwarded to the Transformer.
            attn_type: Attention type forwarded to the Transformer.

        Returns:
            Correlation volume of shape ``[B, H*W, H*W]``.
        """
        feature0, feature1 = self._add_position_and_transform(
            feature0, feature1, attn_type, attn_splits
        )

        b, c, _, _ = feature0.shape
        feature0 = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]
        feature1 = feature1.view(b, c, -1)  # [B, C, H*W]

        # The batched matmul already yields [B, H*W, H*W]; no reshape needed.
        return torch.matmul(feature0, feature1) / (c**0.5)

    def forward(
        self,
        img0,
        img1,
        attn_type="swin",
        attn_splits=2,
        return_feature=False,
        bidirectional=False,
        cycle_consistency=False,
        corr_mask=None,
    ):
        """Estimate flow from ``img0`` to ``img1``.

        Args:
            img0: First image batch.
            img1: Second image batch.
            attn_type: Attention type forwarded to the Transformer.
            attn_splits: Number of attention splits forwarded to the Transformer.
            return_feature: If true, also return the Transformer features.
            bidirectional: Forwarded to the matching function as
                ``pred_bidir_flow``. When set, the coordinate grid and the
                upsampling features are duplicated along the batch dimension
                (presumably because the matching function then returns both
                directions stacked on the batch; confirm in ``.matching``).
            cycle_consistency: If true, both matching directions are stacked on
                the batch dimension before the Transformer, so all outputs have
                a doubled batch size.
            corr_mask: Optional mask forwarded unchanged to the matching function.

        Returns:
            ``(flow_up, flow, correlation)``, or
            ``(flow_up, flow, correlation, feature0, feature1)`` when
            ``return_feature`` is true. ``flow`` is at feature resolution and
            ``flow_up`` is its upsampled version.
        """
        # features at the single supported scale
        feature0, feature1 = self._extract_single_scale(img0, img1)

        if cycle_consistency:
            # get both directions of features
            feature0, feature1 = (
                torch.cat((feature0, feature1), dim=0),
                torch.cat((feature1, feature0), dim=0),
            )

        # add position to features, then Transformer
        feature0, feature1 = self._add_position_and_transform(
            feature0, feature1, attn_type, attn_splits
        )

        b, c, h, w = feature0.shape
        # downsampled_img0 = F.interpolate(img0, size=(h, w), mode="bilinear", align_corners=False)
        flow_coords = coords_grid(b, h, w).to(feature0.device)  # [B, 2, H, W]
        # values = torch.cat((downsampled_img0, flow_coords), dim=1)  # [B, 5, H, W]

        # correlation and softmax
        query_results, correlation = self.corr_fn(
            feature0, feature1, flow_coords, pred_bidir_flow=bidirectional, corr_mask=corr_mask
        )

        if bidirectional:
            flow_coords = torch.cat((flow_coords, flow_coords), dim=0)
            up_feature = torch.cat((feature0, feature1), dim=0)
        else:
            up_feature = feature0

        # Flow is the matched coordinate minus the source pixel coordinate.
        flow = query_results - flow_coords

        # Pass the configured factor explicitly so the bilinear path does not
        # silently fall back to the default of 8.
        flow_up = self.upsample_flow(
            flow,
            up_feature,
            bilinear=self.bilinear_upsample,
            upsample_factor=self.upsample_factor,
        )

        if return_feature:
            return flow_up, flow, correlation, feature0, feature1
        return flow_up, flow, correlation

    def forward_features(
        self,
        img0,
        img1,
        attn_type="swin",
        attn_splits=2,
    ):
        """Return the Transformer-enhanced features of both images.

        Runs the same backbone, positional-encoding and Transformer stages as
        ``forward`` but stops before matching.

        Args:
            img0: First image batch.
            img1: Second image batch.
            attn_type: Attention type forwarded to the Transformer.
            attn_splits: Number of attention splits forwarded to the Transformer.

        Returns:
            Tuple ``(feature0, feature1)``.
        """
        feature0, feature1 = self._extract_single_scale(img0, img1)
        return self._add_position_and_transform(feature0, feature1, attn_type, attn_splits)

    def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8, is_depth=False):
        """Upsample a low-resolution flow (or depth) map.

        Args:
            flow: Low-resolution prediction, ``[B, C, H, W]``.
            feature: Feature map concatenated with ``flow`` as input to the learned
                upsampling head; unused when ``bilinear`` is true.
            bilinear: Use bilinear interpolation instead of learned convex
                upsampling.
            upsample_factor: Scale factor for the bilinear path only. The convex
                path always uses ``self.upsample_factor`` because the upsampling
                head's output channels were sized from it at construction time.
            is_depth: If true, values are not rescaled by the upsampling factor
                (flow magnitudes scale with resolution, depth does not).

        Returns:
            The upsampled prediction.

        Note:
            The convex path requires ``self.upsampler``, which is only created
            when the model was built with ``bilinear_upsample=False``.
        """
        if bilinear:
            multiplier = 1 if is_depth else upsample_factor
            return (
                F.interpolate(
                    flow, scale_factor=upsample_factor, mode="bilinear", align_corners=False
                )
                * multiplier
            )

        concat = torch.cat((flow, feature), dim=1)
        mask = self.upsampler(concat)
        return upsample_flow_with_mask(
            flow, mask, upsample_factor=self.upsample_factor, is_depth=is_depth
        )


def upsample_flow_with_mask(flow, up_mask, upsample_factor, is_depth=False):
    """Convex upsampling following RAFT.

    Every output pixel is a softmax-weighted combination of the 3x3 coarse
    neighbourhood around its source pixel.

    Args:
        flow: Low-resolution prediction, ``[B, C, H, W]``.
        up_mask: Upsampling logits viewable as ``[B, 1, 9, K, K, H, W]`` with
            ``K = upsample_factor``.
        upsample_factor: Integer upsampling ratio ``K``.
        is_depth: If true, values are not multiplied by ``K``.

    Returns:
        Upsampled prediction of shape ``[B, C, K*H, K*W]``.
    """
    b, flow_channel, h, w = flow.shape

    mask = up_mask.view(b, 1, 9, upsample_factor, upsample_factor, h, w)  # [B, 1, 9, K, K, H, W]
    mask = torch.softmax(mask, dim=2)  # normalise over the 9 neighbours

    multiplier = 1 if is_depth else upsample_factor
    # Gather the 3x3 neighbourhood of every coarse pixel.
    up_flow = F.unfold(multiplier * flow, [3, 3], padding=1)
    up_flow = up_flow.view(b, flow_channel, 9, 1, 1, h, w)  # [B, C, 9, 1, 1, H, W]

    up_flow = torch.sum(mask * up_flow, dim=2)  # [B, C, K, K, H, W]
    up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)  # [B, C, H, K, W, K]
    up_flow = up_flow.reshape(
        b, flow_channel, upsample_factor * h, upsample_factor * w
    )  # [B, C, K*H, K*W]

    return up_flow