From 358e86ed98d3bda653965fbbb056bf0ffa490b49 Mon Sep 17 00:00:00 2001 From: shenyunhang Date: Thu, 28 Mar 2024 09:57:57 +0800 Subject: [PATCH] ef35a38ad29c@2024-03-28_09-57-57: add ape-ti with vit-ti backbone, support fsdp and vit-e --- ape/checkpoint/__init__.py | 1 + ape/checkpoint/detection_checkpoint.py | 48 +- ape/data/__init__.py | 1 + ape/data/build.py | 135 ++++ ape/data/samplers/__init__.py | 3 +- .../distributed_sampler_multi_dataset.py | 39 + ape/engine/defaults.py | 131 ++++ ape/modeling/backbone/vit.py | 3 + ape/modeling/backbone/vit_eva.py | 3 + ape/modeling/backbone/vit_eva02.py | 208 +++++- ape/modeling/backbone/vit_eva_clip.py | 33 +- ape/modeling/text/clip_wrapper_eva02.py | 10 +- ape/modeling/text/eva01_clip/eva_clip.py | 2 +- ape/modeling/text/eva01_clip/eva_model.py | 10 + ape/modeling/text/eva02_clip/model.py | 5 +- ape/modeling/text/eva02_clip/transformer.py | 2 +- .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 109 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ ...ta_vite_eva02_clip_lsj1024_cp_12ep_fsdp.py | 114 +++ ..._vite_eva02_clip_lsj1024_cp_32x90k_fsdp.py | 114 +++ ...pe_deta_vitl_eva02_clip_lsj1024_cp_12ep.py | 1 + ...ta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py | 114 +++ .../ape_deta_vitl_eva02_lsj1024_cp_12ep.py | 8 +- ...=> ape_deta_vitt_eva02_lsj1024_cp_12ep.py} | 35 +- ...ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py | 52 ++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 20 + .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ ...ta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py | 227 ++++++ ...itt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl.py | 230 ++++++ ...vitt_eva02_vlf_lsj1024_cp_64x1_270k_mdl.py | 231 ++++++ .../ape_deta_vitt_eva02_lsj1024_cp_24ep.py | 35 + ...ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep.py | 34 + .../ape_deta_vitt_eva02_vlf_lsj1024_13.py | 103 +++ .../ape_deta_vitt_eva02_vlf_lsj1024_35.py | 104 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 101 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 100 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 100 +++ .../ape_deta_vitt_eva02_vlf_lsj1024.py | 108 +++ configs/common/backbone/vitl_eva02.py | 3 + configs/common/backbone/vitl_eva02_1536.py | 3 + configs/common/backbone/vitt_eva02.py | 41 + scripts/{eval_all_A.sh => eval_APE-L_A.sh} | 7 +- scripts/{eval_all_B.sh => eval_APE-L_B.sh} | 7 +- scripts/{eval_all_C.sh => eval_APE-L_C.sh} | 5 +- scripts/{eval_all_D.sh => eval_APE-L_D.sh} | 6 +- scripts/eval_APE-Ti.sh | 39 + scripts/eval_flops.sh | 84 +++ ...val_computational_cost.sh => eval_time.sh} | 28 +- tools/eva_interpolate_patch_14to16.py | 8 + tools/eva_interpolate_patch_14to162.py | 122 +++ tools/train_net.py | 3 + tools/train_net_fsdp.py | 703 ++++++++++++++++++ 57 files changed, 4266 insertions(+), 74 deletions(-) create mode 100644 ape/data/build.py create mode 100644 configs/ADE20kFull_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/ADE20k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/ADE20k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/BDD10k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/BDD10k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 
configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_12ep_fsdp.py create mode 100644 configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_32x90k_fsdp.py create mode 100644 configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py rename configs/COCO_InstanceSegmentation/ape_deta/{ape_deta_vite_eva02_clip_lsj1536_cp_64x90k.py => ape_deta_vitt_eva02_lsj1024_cp_12ep.py} (69%) create mode 100644 configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py create mode 100644 configs/COCO_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/Cityscapes_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/D3_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py create mode 100644 configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl.py create mode 100644 configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_64x1_270k_mdl.py create mode 100644 configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_24ep.py create mode 100644 configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep.py create mode 100644 configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_13.py create mode 100644 configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_35.py create mode 100644 configs/PascalContext459_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/PascalContext59_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/PascalVOC20_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/Roboflow_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/SegInW_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py create mode 100644 configs/common/backbone/vitt_eva02.py rename scripts/{eval_all_A.sh => eval_APE-L_A.sh} (82%) rename scripts/{eval_all_B.sh => eval_APE-L_B.sh} (86%) rename scripts/{eval_all_C.sh => eval_APE-L_C.sh} (86%) rename scripts/{eval_all_D.sh => eval_APE-L_D.sh} (87%) create mode 100755 scripts/eval_APE-Ti.sh create mode 100755 scripts/eval_flops.sh rename scripts/{eval_computational_cost.sh => eval_time.sh} (69%) create mode 100644 tools/eva_interpolate_patch_14to162.py create mode 100644 tools/train_net_fsdp.py diff --git a/ape/checkpoint/__init__.py b/ape/checkpoint/__init__.py index d69773d..2f9f0ba 100644 --- a/ape/checkpoint/__init__.py +++ b/ape/checkpoint/__init__.py @@ -2,5 +2,6 @@ from .detection_checkpoint import DetectionCheckpointer +from .detection_checkpoint import FSDPDetectionCheckpointer __all__ = ["DetectionCheckpointer"] diff --git a/ape/checkpoint/detection_checkpoint.py b/ape/checkpoint/detection_checkpoint.py index a09ebe8..aef22aa 100644 --- a/ape/checkpoint/detection_checkpoint.py +++ b/ape/checkpoint/detection_checkpoint.py @@ -2,11 +2,13 @@ import logging import os import pickle -from collections import defaultdict from typing import IO, Any, Dict, Iterable, List, NamedTuple, Optional, Tuple, cast import numpy as np import torch +from torch.distributed.fsdp import 
FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import StateDictType
+from torch.distributed.fsdp import FullStateDictConfig
 
 from detectron2.checkpoint import DetectionCheckpointer as DetectionCheckpointer_d2
 
@@ -43,3 +45,47 @@ def _convert_ndarray_to_tensor(self, state_dict: Dict[str, Any]) -> None:
                 raise ValueError("Unsupported type found in checkpoint! {}: {}".format(k, type(v)))
             if not isinstance(v, torch.Tensor):
                 state_dict[k] = torch.from_numpy(v)
+
+
+class FSDPDetectionCheckpointer(DetectionCheckpointer):
+
+    # def __init__(self, skip_key="", **kwargs):
+    #     super().__init__(**kwargs)
+    #     self.skip_key = skip_key
+
+    def save(self, name: str, **kwargs: Any) -> None:
+        """
+        Dump model and checkpointables to a file.
+
+        Args:
+            name (str): name of the file.
+            kwargs (dict): extra arbitrary data to save.
+        """
+        # if not self.save_dir or not self.save_to_disk:
+        #     return
+
+        data = {}
+
+        save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+        with FSDP.state_dict_type(
+            self.model, StateDictType.FULL_STATE_DICT, save_policy
+        ):
+            data["model"] = self.model.state_dict()
+
+        if not self.save_dir or not self.save_to_disk:
+            return
+
+        # data["model"] = self.model.state_dict()
+        for key, obj in self.checkpointables.items():
+            data[key] = obj.state_dict()
+        data.update(kwargs)
+
+        basename = "{}.pth".format(name)
+        save_file = os.path.join(self.save_dir, basename)
+        assert os.path.basename(save_file) == basename, basename
+        self.logger.info("Saving checkpoint to {}".format(save_file))
+        with self.path_manager.open(save_file, "wb") as f:
+            # pyre-fixme[22]: The cast is redundant.
+            torch.save(data, cast(IO[bytes], f))
+        self.tag_last_checkpoint(basename)
+
diff --git a/ape/data/__init__.py b/ape/data/__init__.py
index 7ca826b..e02b97f 100644
--- a/ape/data/__init__.py
+++ b/ape/data/__init__.py
@@ -11,6 +11,7 @@
     build_detection_train_loader_multi_dataset_copypaste,
     get_detection_dataset_dicts_multi_dataset_copypaste,
 )
+from .build import build_detection_test_loader
 from .dataset_mapper import DatasetMapper_ape
 from .dataset_mapper_copypaste import DatasetMapper_copypaste
 from .dataset_mapper_detr_instance import DatasetMapper_detr_instance
diff --git a/ape/data/build.py b/ape/data/build.py
new file mode 100644
index 0000000..5b46ec3
--- /dev/null
+++ b/ape/data/build.py
@@ -0,0 +1,135 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+import operator
+import pickle
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+import torch.utils.data as torchdata
+from tabulate import tabulate
+from termcolor import colored
+
+from detectron2.config import configurable
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import _log_api_usage, log_first_n
+
+from detectron2.data.build import get_detection_dataset_dicts, trivial_batch_collator
+
+from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset
+from detectron2.data.dataset_mapper import DatasetMapper
+from detectron2.data.detection_utils import check_metadata_consistency
+from detectron2.data.samplers import (
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)
+
+from .samplers import (
+    InferenceSampler,
+)
+
+"""
+This file contains the default logic to build a dataloader for testing.
+""" + +__all__ = [ + "build_detection_test_loader", +] + + +def _test_loader_from_config(cfg, dataset_name, mapper=None): + """ + Uses the given `dataset_name` argument (instead of the names in cfg), because the + standard practice is to evaluate each test set individually (not combining them). + """ + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + dataset = get_detection_dataset_dicts( + dataset_name, + filter_empty=False, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + if mapper is None: + mapper = DatasetMapper(cfg, False) + return { + "dataset": dataset, + "mapper": mapper, + "num_workers": cfg.DATALOADER.NUM_WORKERS, + "sampler": InferenceSampler(len(dataset)) + if not isinstance(dataset, torchdata.IterableDataset) + else None, + } + + +@configurable(from_config=_test_loader_from_config) +def build_detection_test_loader( + dataset: Union[List[Any], torchdata.Dataset], + *, + mapper: Callable[[Dict[str, Any]], Any], + sampler: Optional[torchdata.Sampler] = None, + batch_size: int = 1, + num_workers: int = 0, + collate_fn: Optional[Callable[[List[Any]], Any]] = None, +) -> torchdata.DataLoader: + """ + Similar to `build_detection_train_loader`, with default batch size = 1, + and sampler = :class:`InferenceSampler`. This sampler coordinates all workers + to produce the exact set of all samples. + + Args: + dataset: a list of dataset dicts, + or a pytorch dataset (either map-style or iterable). They can be obtained + by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper: a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. + sampler: a sampler that produces + indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, + which splits the dataset across all workers. Sampler must be None + if `dataset` is iterable. + batch_size: the batch size of the data loader to be created. + Default to 1 image per worker since this is the standard when reporting + inference time in papers. + num_workers: number of parallel data loading workers + collate_fn: same as the argument of `torch.utils.data.DataLoader`. + Defaults to do no collation and return a list of data. + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. 
+ + Examples: + :: + data_loader = build_detection_test_loader( + DatasetRegistry.get("my_test"), + mapper=DatasetMapper(...)) + + # or, instantiate with a CfgNode: + data_loader = build_detection_test_loader(cfg, "my_test") + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if isinstance(dataset, torchdata.IterableDataset): + assert sampler is None, "sampler must be None if dataset is IterableDataset" + else: + if sampler is None: + sampler = InferenceSampler(len(dataset)) + return torchdata.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + drop_last=False, + num_workers=num_workers, + collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, + ) diff --git a/ape/data/samplers/__init__.py b/ape/data/samplers/__init__.py index 08a4a26..2d05897 100644 --- a/ape/data/samplers/__init__.py +++ b/ape/data/samplers/__init__.py @@ -1,5 +1,6 @@ -from .distributed_sampler_multi_dataset import MultiDatasetTrainingSampler +from .distributed_sampler_multi_dataset import MultiDatasetTrainingSampler, InferenceSampler __all__ = [ "MultiDatasetTrainingSampler", + "InferenceSampler", ] diff --git a/ape/data/samplers/distributed_sampler_multi_dataset.py b/ape/data/samplers/distributed_sampler_multi_dataset.py index e180a3e..aa0b8a7 100644 --- a/ape/data/samplers/distributed_sampler_multi_dataset.py +++ b/ape/data/samplers/distributed_sampler_multi_dataset.py @@ -135,3 +135,42 @@ def _infinite_indices(self): yield from indices[randperm].tolist() else: yield from indices.tolist() + + +class InferenceSampler(Sampler): + """ + Produce indices for inference across all workers. + Inference needs to run on the __exact__ set of samples, + therefore when the total number of samples is not divisible by the number of workers, + this sampler produces different number of samples on different workers. 
+    """
+
+    def __init__(self, size: int):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+        """
+        self._size = size
+        assert size > 0
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+        self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
+
+    @staticmethod
+    def _get_local_indices(total_size, world_size, rank):
+        shard_size = total_size // world_size
+        left = total_size % world_size
+        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+
+        begin = sum(shard_sizes[:rank])
+        end = min(sum(shard_sizes[: rank + 1]), total_size)
+        if end - begin < max(shard_sizes):
+            assert begin > 0
+            begin = begin - 1
+        return range(begin, end)
+
+    def __iter__(self):
+        yield from self._local_indices
+
+    def __len__(self):
+        return len(self._local_indices)
diff --git a/ape/engine/defaults.py b/ape/engine/defaults.py
index de9ac42..6a104e9 100644
--- a/ape/engine/defaults.py
+++ b/ape/engine/defaults.py
@@ -12,17 +12,148 @@
 import copy
 import os
 import sys
+import functools
 
 import torch
+try:
+    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+    from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
+    from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy, ModuleWrapPolicy
+except ImportError as e:
+    print(e, "just skip this")
 
 from ape.checkpoint import DetectionCheckpointer
 from detectron2.config import instantiate
+from detectron2.utils import comm
+
+from transformers.trainer_pt_utils import get_module_class_from_name
 
 __all__ = [
+    "create_fsdp_model",
     "DefaultPredictor",
 ]
 
 
+def create_fsdp_model(model, *, fp16_compression=False, **kwargs):
+    """
+    Wrap the given model with FullyShardedDataParallel (FSDP) if there are >1 processes.
+
+    Args:
+        model: a torch.nn.Module
+        fp16_compression: add fp16 compression hooks to the ddp object.
+            See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook
+        kwargs: other arguments of :class:`torch.distributed.fsdp.FullyShardedDataParallel`.
+    """ # noqa
+
+    sharding_strategy_dict = {
+        "NO_SHARD": ShardingStrategy.NO_SHARD,
+        "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP,
+        "FULL_SHARD": ShardingStrategy.FULL_SHARD,
+    }
+
+    dtype_dict = {
+        "fp32": torch.float32,
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+    }
+
+    auto_wrap_policy = None
+    module_name_to_wrap = kwargs.pop("module_name_to_wrap", None)
+    if module_name_to_wrap is not None:
+        module_cls_to_wrap = set()
+        for module_name in module_name_to_wrap:
+            module_cls = get_module_class_from_name(model, module_name)
+            if module_cls is None:
+                raise Exception("Could not find the layer class to wrap in the model.")
+            else:
+                module_cls_to_wrap.add(module_cls)
+
+        # print("module_cls_to_wrap", module_cls_to_wrap)
+        # auto_wrap_policy = functools.partial(
+        #     transformer_auto_wrap_policy,
+        #     # Transformer layer class to wrap
+        #     transformer_layer_cls=module_cls_to_wrap,
+        # )
+        auto_wrap_policy = ModuleWrapPolicy(module_cls_to_wrap)
+    else:
+        # auto_wrap_policy = functools.partial(
+        #     size_based_auto_wrap_policy, min_num_params=int(1e5)
+        # )
+        auto_wrap_policy = size_based_auto_wrap_policy
+
+    if comm.get_world_size() == 1:
+        return model
+    if "device_id" not in kwargs:
+        kwargs["device_id"] = comm.get_local_rank()
+
+    param_dtype = kwargs.pop("param_dtype", None)
+    reduce_dtype = kwargs.pop("reduce_dtype", None)
+    buffer_dtype = kwargs.pop("buffer_dtype", None)
+
+    if param_dtype is not None:
+        param_dtype = getattr(torch, param_dtype)
+    if reduce_dtype is not None:
+        reduce_dtype = getattr(torch, reduce_dtype)
+    if buffer_dtype is not None:
+        buffer_dtype = getattr(torch, buffer_dtype)
+
+    # from ape.layers import MultiScaleDeformableAttention
+    mp_policy = MixedPrecision(
+        param_dtype=param_dtype,
+        # Gradient communication precision.
+        reduce_dtype=reduce_dtype,
+        # Buffer precision.
+        buffer_dtype=buffer_dtype,
+        cast_forward_inputs=True,
+        # _module_classes_to_ignore=(MultiScaleDeformableAttention,),
+    )
+
+    fsdp = FSDP(
+        model,
+        auto_wrap_policy=auto_wrap_policy,
+        mixed_precision=mp_policy,
+        **kwargs,
+    )
+    return fsdp
+
+    model.model_vision.model_language = FSDP(
+        model.model_vision.model_language,
+        # auto_wrap_policy=auto_wrap_policy,
+        sharding_strategy=ShardingStrategy.NO_SHARD,
+        mixed_precision=mp_policy,
+        **kwargs,
+    )
+    model.model_vision.backbone = FSDP(
+        model.model_vision.backbone,
+        auto_wrap_policy=auto_wrap_policy,
+        mixed_precision=mp_policy,
+        **kwargs,
+    )
+    model.model_vision.transformer = FSDP(
+        model.model_vision.transformer,
+        auto_wrap_policy=auto_wrap_policy,
+        mixed_precision=mp_policy,
+        **kwargs,
+    )
+
+    # auto_wrap_policy = functools.partial(
+    #     size_based_auto_wrap_policy, min_num_params=int(1e5)
+    # )
+    fsdp = FSDP(
+        model,
+        # auto_wrap_policy=size_based_auto_wrap_policy,
+        sharding_strategy=ShardingStrategy.NO_SHARD,
+        mixed_precision=mp_policy,
+        **kwargs,
+    )
+
+    # if fp16_compression:
+    #     from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks
+
+    #     ddp.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook)
+    return fsdp
+
+
 class DefaultPredictor:
     """
     Create a simple end-to-end predictor with the given config that runs on
diff --git a/ape/modeling/backbone/vit.py b/ape/modeling/backbone/vit.py
index ded59f6..6279e74 100644
--- a/ape/modeling/backbone/vit.py
+++ b/ape/modeling/backbone/vit.py
@@ -16,6 +16,9 @@ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
     Returns:
         lr decay rate for the given parameter.
""" + if name.startswith("_fsdp_wrapped_module."): + name = name[len("_fsdp_wrapped_module.") :] + if name.startswith("model_vision."): name = name[len("model_vision."):] diff --git a/ape/modeling/backbone/vit_eva.py b/ape/modeling/backbone/vit_eva.py index 1effd59..b663e0e 100644 --- a/ape/modeling/backbone/vit_eva.py +++ b/ape/modeling/backbone/vit_eva.py @@ -630,6 +630,9 @@ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): Returns: lr decay rate for the given parameter. """ + if name.startswith("_fsdp_wrapped_module."): + name = name[len("_fsdp_wrapped_module.") :] + if name.startswith("model_vision."): name = name[len("model_vision."):] diff --git a/ape/modeling/backbone/vit_eva02.py b/ape/modeling/backbone/vit_eva02.py index 2e6f814..3cccd47 100644 --- a/ape/modeling/backbone/vit_eva02.py +++ b/ape/modeling/backbone/vit_eva02.py @@ -1,6 +1,7 @@ import logging import math from functools import partial +from typing import Dict, Optional, Sequence, Tuple, Union import fvcore.nn.weight_init as weight_init import torch @@ -23,13 +24,14 @@ try: import xformers.ops as xops except: - pass + xops = None try: from apex.normalization import FusedLayerNorm except: pass +has_sdp_kernel = hasattr(torch.backends.cuda, "sdp_kernel") logger = logging.getLogger(__name__) @@ -38,6 +40,141 @@ __all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] +class xops_SwiGLU(nn.Module): + """ + A Module that encapsulates the call to :attr:`xformers.ops.swiglu`, + and holds the weights for the 3 linear layers + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: Optional[int] = None, + bias: bool = True, + *, + _pack_weights: bool = True, + ) -> None: + """Create a SwiGLU module + + Args: + in_features (int): Number of features of the input + hidden_features (int): Number of hidden features + out_features (Optional[int], optional): Number of features of the input. Defaults to None. + bias (bool, optional): Whether linear layers also include a bias. Defaults to True. 
+ """ + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.w12: Optional[nn.Linear] + if _pack_weights: + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + else: + self.w12 = None + self.w1 = nn.Linear(in_features, hidden_features, bias=bias) + self.w2 = nn.Linear(in_features, hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + self.hidden_features = hidden_features + self.out_features = out_features + self.in_features = in_features + self.op: Optional[SwiGLUOp] = None + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Computes :attr:`swiglu` with the module's weights + + Args: + x (torch.Tensor): A Tensor of shape ``[..., in_features]`` + + Returns: + torch.Tensor: A Tensor of shape ``[..., out_features]`` + """ + + w1, b1, w2, b2, w3, b3 = self._ordered_params() + x1 = F.linear(x, w1, b1) + x2 = F.linear(x, w2, b2) + hidden = F.silu(x1) * x2 + return F.linear(hidden, w3, b3) + + if self.w12 is not None: + if self.op is not None: + assert ( + self.op.PACKED_WEIGHTS + ), "_pack_weights and self.op.PACKED_WEIGHTS should match" + return swiglu_packed(x, *self._packed_ordered_params(), op=self.op) + + return swiglu(x, *self._ordered_params(), op=self.op) + + def _ordered_params( + self, + ) -> Tuple[ + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + Optional[torch.Tensor], + ]: + """Used for testing - returns ordered arguments for operators""" + b1: Optional[torch.Tensor] + b2: Optional[torch.Tensor] + if self.w12 is not None: + w1w2 = self.w12.weight + b1b2 = self.w12.bias + # w1, w2 = xops.unbind( + # w1w2.view([2, w1w2.shape[0] // 2, w1w2.shape[1]]), + # dim=0, + # ) + w1, w2 = torch.unbind( + w1w2.view([2, w1w2.shape[0] // 2, w1w2.shape[1]]), + dim=0, + ) + if b1b2 is not None: + # b1, b2 = xops.unbind(b1b2.view([2, b1b2.shape[0] // 2]), dim=0) + b1, b2 = torch.unbind(b1b2.view([2, b1b2.shape[0] // 2]), dim=0) + else: + b1, b2 = None, None + else: + w1, w2 = self.w1.weight, self.w2.weight + b1, b2 = self.w1.bias, self.w2.bias + + return ( + w1, + b1, + w2, + b2, + self.w3.weight, + self.w3.bias, + ) + + def _packed_ordered_params( + self, + ) -> Tuple[ + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + Optional[torch.Tensor], + ]: + assert self.w12 is not None, "Packed weights are only available when using w12" + + """Used for testing - returns ordered arguments for packed operators""" + w1w2 = self.w12.weight + b1b2_param = self.w12.bias + + w1w2 = w1w2.view([2, w1w2.shape[0] // 2, w1w2.shape[1]]) + + b1b2: Optional[torch.Tensor] = None + if b1b2_param is not None: + b1b2 = b1b2_param.view([2, b1b2_param.shape[0] // 2]) + + return ( + w1w2, + b1b2, + self.w3.weight, + self.w3.bias, + ) + class SwiGLU(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0., @@ -76,6 +213,7 @@ def __init__( attn_head_dim=None, rope=None, xattn=True, + subln=False, ): super().__init__() self.num_heads = num_heads @@ -85,9 +223,13 @@ def __init__( all_head_dim = head_dim * self.num_heads self.scale = qk_scale or head_dim ** -0.5 - self.q_proj = nn.Linear(dim, all_head_dim, bias=False) - self.k_proj = nn.Linear(dim, all_head_dim, bias=False) - self.v_proj = nn.Linear(dim, all_head_dim, bias=False) + self.subln = subln + if self.subln: + self.q_proj = nn.Linear(dim, all_head_dim, bias=False) + self.k_proj = nn.Linear(dim, all_head_dim, bias=False) + 
self.v_proj = nn.Linear(dim, all_head_dim, bias=False) + else: + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) if qkv_bias: self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) @@ -105,19 +247,32 @@ def forward(self, x): x = x.view(B, -1, C) N = H * W - q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) - k = F.linear(input=x, weight=self.k_proj.weight, bias=None) - v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) - - q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C - k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + if self.subln: + q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) + k = F.linear(input=x, weight=self.k_proj.weight, bias=None) + v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) + + q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C + k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + else: + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C + q, k, v = qkv[0], qkv[1], qkv[2] ## rope q = self.rope(q).type_as(v) k = self.rope(k).type_as(v) - if self.xattn and not (torch.jit.is_scripting() or torch.jit.is_tracing()): + if has_sdp_kernel: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + x = F.scaled_dot_product_attention(q, k, v) + x = x.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C + x = x.reshape(B, N, -1) + elif self.xattn and not (torch.jit.is_scripting() or torch.jit.is_tracing()): q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) @@ -212,6 +367,9 @@ def __init__( use_residual_block=False, rope=None, xattn=True, + subln=False, + swiglu=False, + naiveswiglu=False, ): """ Args: @@ -238,18 +396,31 @@ def __init__( qkv_bias=qkv_bias, rope=rope, xattn=xattn, + subln=subln, ) from timm.models.layers import DropPath self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) - self.mlp = SwiGLU( + if swiglu: + # self.mlp = xops.SwiGLU( + # in_features=dim, + # hidden_features=int(dim * mlp_ratio), + # ) # hidden_features: 2/3 + self.mlp = xops_SwiGLU( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + ) # hidden_features: 2/3 + elif naiveswiglu: + self.mlp = SwiGLU( in_features=dim, hidden_features=int(dim * mlp_ratio), subln=True, norm_layer=norm_layer, ) + else: + assert False self.window_size = window_size @@ -320,6 +491,9 @@ def __init__( pretrain_use_cls_token=True, out_feature="last_feat", xattn=True, + subln=False, + swiglu=False, + naiveswiglu=False, frozen_stages=-1, ): """ @@ -394,7 +568,10 @@ def __init__( window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, rope=self.rope_win if i in window_block_indexes else self.rope_glb, - xattn=xattn + xattn=xattn, + subln=subln, + swiglu=swiglu, + naiveswiglu=naiveswiglu, ) if use_act_checkpoint and i > frozen_stages - 1: # TODO: use torch.utils.checkpoint @@ -611,6 +788,9 @@ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): 
Returns: lr decay rate for the given parameter. """ + if name.startswith("_fsdp_wrapped_module."): + name = name[len("_fsdp_wrapped_module.") :] + if name.startswith("model_vision."): name = name[len("model_vision."):] diff --git a/ape/modeling/backbone/vit_eva_clip.py b/ape/modeling/backbone/vit_eva_clip.py index e0d66d5..406504b 100644 --- a/ape/modeling/backbone/vit_eva_clip.py +++ b/ape/modeling/backbone/vit_eva_clip.py @@ -23,12 +23,22 @@ try: import xformers.ops as xops except: - pass + xops = None + # print("xformers not found, will use pytorch implementations") + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm (with cast back to input dtype).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + return x.to(orig_type) try: from apex.normalization import FusedLayerNorm except: - pass + FusedLayerNorm = LayerNorm + print("apex.normalization.FusedLayerNorm not found, will use pytorch implementations") logger = logging.getLogger(__name__) @@ -246,7 +256,15 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): q = self.rope(q).type_as(v) k = self.rope(k).type_as(v) - if self.xattn and not (torch.jit.is_scripting() or torch.jit.is_tracing()): + if True: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + x = F.scaled_dot_product_attention(q, k, v, dropout_p=self.xattn_drop, scale=self.scale) + x = x.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C + x = x.reshape(B, N, -1) + x = self.inner_attn_ln(x) + x = self.proj(x) + x = self.proj_drop(x) + elif self.xattn and not (torch.jit.is_scripting() or torch.jit.is_tracing()): q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) @@ -913,6 +931,9 @@ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): Returns: lr decay rate for the given parameter. """ + if name.startswith("_fsdp_wrapped_module."): + name = name[len("_fsdp_wrapped_module.") :] + if name.startswith("model_vision."): name = name[len("model_vision.") :] @@ -923,9 +944,5 @@ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): elif ".blocks." in name and ".residual." 
not in name: layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 - logger.info( - "get_vit_lr_decay_rate: name={} num_layers={} layer_id={} lr_decay_rate={}".format( - name, num_layers, layer_id, lr_decay_rate ** (num_layers + 1 - layer_id) - ) - ) + logger.info("get_vit_lr_decay_rate: name={} num_layers={} layer_id={} lr_decay_rate={}".format(name, num_layers, layer_id, lr_decay_rate ** (num_layers + 1 - layer_id))) return lr_decay_rate ** (num_layers + 1 - layer_id) diff --git a/ape/modeling/text/clip_wrapper_eva02.py b/ape/modeling/text/clip_wrapper_eva02.py index bed4492..51e439e 100644 --- a/ape/modeling/text/clip_wrapper_eva02.py +++ b/ape/modeling/text/clip_wrapper_eva02.py @@ -12,6 +12,7 @@ def __init__( cache_dir="EVA02_CLIP_B_psz16_s8B.pt", dtype="float32", max_batch_size=2560, + freeze=True, ): super().__init__() self.net, _, _ = create_model_and_transforms( @@ -27,10 +28,11 @@ def __init__( self.dtype = torch.float32 del self.net.visual - self.net.eval() - for name, param in self.net.named_parameters(): - param.requires_grad = False - param.data = param.data.to(self.dtype) + if freeze: + self.net.eval() + for name, param in self.net.named_parameters(): + param.requires_grad = False + param.data = param.data.to(self.dtype) self.register_buffer("unused_tensor", torch.zeros(1), False) diff --git a/ape/modeling/text/eva01_clip/eva_clip.py b/ape/modeling/text/eva01_clip/eva_clip.py index 42489d4..d869ad7 100644 --- a/ape/modeling/text/eva01_clip/eva_clip.py +++ b/ape/modeling/text/eva01_clip/eva_clip.py @@ -5,7 +5,7 @@ import re from copy import deepcopy from pathlib import Path -from tkinter import E +# from tkinter import E from typing import Optional, Tuple, Any, Union, List import torch diff --git a/ape/modeling/text/eva01_clip/eva_model.py b/ape/modeling/text/eva01_clip/eva_model.py index 04002e5..8d1f475 100644 --- a/ape/modeling/text/eva01_clip/eva_model.py +++ b/ape/modeling/text/eva01_clip/eva_model.py @@ -16,9 +16,19 @@ from .vit_model import VisionTransformer +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm (with cast back to input dtype).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + return x.to(orig_type) + try: from apex.normalization import FusedLayerNorm except: + FusedLayerNorm = LayerNorm + print("apex.normalization.FusedLayerNorm not found, will use pytorch implementations") pass diff --git a/ape/modeling/text/eva02_clip/model.py b/ape/modeling/text/eva02_clip/model.py index da3bbd7..3d6d9b0 100644 --- a/ape/modeling/text/eva02_clip/model.py +++ b/ape/modeling/text/eva02_clip/model.py @@ -25,13 +25,14 @@ from apex.normalization import FusedLayerNorm except: FusedLayerNorm = LayerNorm - print("Please 'pip install apex'") + # print("Please 'pip install apex'") + print("apex.normalization.FusedLayerNorm not found, will use pytorch implementations") try: import xformers.ops as xops except ImportError: xops = None - print("Please 'pip install xformers'") + # print("Please 'pip install xformers'") @dataclass class CLIPVisionCfg: diff --git a/ape/modeling/text/eva02_clip/transformer.py b/ape/modeling/text/eva02_clip/transformer.py index 33e89ff..16fea47 100644 --- a/ape/modeling/text/eva02_clip/transformer.py +++ b/ape/modeling/text/eva02_clip/transformer.py @@ -31,7 +31,7 @@ import xformers.ops as xops except ImportError: xops = None - print("Please 'pip install xformers'") + # print("Please 'pip install xformers'") class 
LayerNormFp32(nn.LayerNorm): """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).""" diff --git a/configs/ADE20kFull_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/ADE20kFull_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/ADE20kFull_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] 
+model.model_vision.vis_period = 12800 diff --git a/configs/ADE20k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/ADE20k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/ADE20k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git 
a/configs/ADE20k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/ADE20k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/ADE20k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/BDD10k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py 
b/configs/BDD10k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..3270dfa --- /dev/null +++ b/configs/BDD10k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,109 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +model.model_vision.panoptic_configs = { + "prob": 0.01, + "pano_temp": 0.06, + "transform_eval": True, + "object_mask_threshold": 0.0001, + "overlap_threshold": 0.4, +} + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git 
a/configs/BDD10k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/BDD10k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/BDD10k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_12ep_fsdp.py 
b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_12ep_fsdp.py new file mode 100644 index 0000000..9a377fe --- /dev/null +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_12ep_fsdp.py @@ -0,0 +1,114 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec + +from detectron2.model_zoo import get_config as get_config_d2 +from detrex.config import get_config as get_config_detrex +from ape.modeling.backbone.vit import get_vit_lr_decay_rate + +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vite_eva02_clip_1024 import backbone +from ...common.data.coco_instance_lsj1024_cp import dataloader +from .models.ape_deta_r50 import model + +constants = get_config_d2("common/data/constants.py").constants + +model.model_vision.pixel_mean = constants.imagenet_rgb256_mean +model.model_vision.pixel_std = constants.imagenet_rgb256_std +model.model_vision.input_format = "RGB" + +model.model_vision.backbone = backbone + +model.model_vision.neck.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} +model.model_vision.neck.in_features = ["p2", "p3", "p4", "p5", "p6"] + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +optimizer = get_config_detrex("common/optim.py").AdamW +optimizer.params.lr_factor_func = ( + lambda module_name: 0.1 + if "reference_points" in module_name or "sampling_offsets" in module_name + else get_vit_lr_decay_rate(module_name, lr_decay_rate=0.8, num_layers=64) + if "backbone.net" in module_name + else 1 +) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} +optimizer.params.weight_decay_norm = None + +optimizer.lr = 2e-4 +optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 + +train = get_config_detrex("common/train.py").train +train.max_iter = 90000 +train.eval_period = 5000 +train.log_period = 20 + +train.checkpointer.period = 5000 +train.checkpointer.max_to_keep = 2 + +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.1 +train.clip_grad.params.norm_type = 2 + +train.device = "cuda" + +train.init_checkpoint = ( + "models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14to16_plus_s9B.pt?matching_heuristics=True" +) + +train.amp.enabled = True +train.ddp.fp16_compression = True +train.fsdp = dict( + cpu_offload=False, + use_orig_params=True, + sync_module_states=True, + module_name_to_wrap=["Block",], + # module_name_to_wrap=["Block", "BaseTransformerLayer"], + param_dtype="float32", + reduce_dtype="float32", + buffer_dtype="float32", + # param_dtype="float16", + # reduce_dtype="float16", + # buffer_dtype="float16", +) + +lr_multiplier = get_config_detrex("common/coco_schedule.py").lr_multiplier_12ep +lr_multiplier.scheduler.milestones = [75000, 90000] +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.train.num_workers = 16 +dataloader.train.total_batch_size = 16 +dataloader.train.mapper.image_format = "RGB" +dataloader.train.mapper.use_instance_mask = True + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["coco_2017"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +train.output_dir = "output/" + __file__[:-3] + +model.model_language = 
L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + # dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +from ape.data.build import build_detection_test_loader +dataloader.test.update( + _target_=build_detection_test_loader, +) diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_32x90k_fsdp.py b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_32x90k_fsdp.py new file mode 100644 index 0000000..17bba0d --- /dev/null +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1024_cp_32x90k_fsdp.py @@ -0,0 +1,114 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec + +from detectron2.model_zoo import get_config as get_config_d2 +from detrex.config import get_config as get_config_detrex +from ape.modeling.backbone.vit import get_vit_lr_decay_rate + +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vite_eva02_clip_1024 import backbone +from ...common.data.coco_instance_lsj1024_cp import dataloader +from .models.ape_deta_r50 import model + +constants = get_config_d2("common/data/constants.py").constants + +model.model_vision.pixel_mean = constants.imagenet_rgb256_mean +model.model_vision.pixel_std = constants.imagenet_rgb256_std +model.model_vision.input_format = "RGB" + +model.model_vision.backbone = backbone + +model.model_vision.neck.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} +model.model_vision.neck.in_features = ["p2", "p3", "p4", "p5", "p6"] + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +optimizer = get_config_detrex("common/optim.py").AdamW +optimizer.params.lr_factor_func = ( + lambda module_name: 0.1 + if "reference_points" in module_name or "sampling_offsets" in module_name + else get_vit_lr_decay_rate(module_name, lr_decay_rate=0.8, num_layers=64) + if "backbone.net" in module_name + else 1 +) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} +optimizer.params.weight_decay_norm = None + +optimizer.lr = 2e-4 +optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 + +train = get_config_detrex("common/train.py").train +train.max_iter = 90000 +train.eval_period = 5000 +train.log_period = 20 + +train.checkpointer.period = 5000 +train.checkpointer.max_to_keep = 2 + +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.1 +train.clip_grad.params.norm_type = 2 + +train.device = "cuda" + +train.init_checkpoint = ( + "models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14to16_plus_s9B.pt?matching_heuristics=True" +) + +train.amp.enabled = True +train.ddp.fp16_compression = True +train.fsdp = dict( + cpu_offload=False, + use_orig_params=True, + sync_module_states=True, + module_name_to_wrap=["Block",], + # module_name_to_wrap=["Block", "BaseTransformerLayer"], + param_dtype="float32", + reduce_dtype="float32", + buffer_dtype="float32", + # param_dtype="float16", + # reduce_dtype="float16", + # buffer_dtype="float16", +) + +lr_multiplier = get_config_detrex("common/coco_schedule.py").lr_multiplier_12ep +lr_multiplier.scheduler.milestones = [75000, 90000] +lr_multiplier.warmup_length = 1000 / 
train.max_iter + +dataloader.train.num_workers = 32 +dataloader.train.total_batch_size = 32 +dataloader.train.mapper.image_format = "RGB" +dataloader.train.mapper.use_instance_mask = True + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["coco_2017"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +train.output_dir = "output/" + __file__[:-3] + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + # dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +from ape.data.build import build_detection_test_loader +dataloader.test.update( + _target_=build_detection_test_loader, +) diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py index 32adc0b..656075a 100644 --- a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py @@ -1,5 +1,6 @@ from detectron2.config import LazyCall as L from detectron2.layers import ShapeSpec + from detectron2.model_zoo import get_config as get_config_d2 from detrex.config import get_config as get_config_detrex from ape.modeling.backbone.vit import get_vit_lr_decay_rate diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py new file mode 100644 index 0000000..ec91757 --- /dev/null +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py @@ -0,0 +1,114 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec + +from detectron2.model_zoo import get_config as get_config_d2 +from detrex.config import get_config as get_config_detrex +from ape.modeling.backbone.vit import get_vit_lr_decay_rate + +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitl_eva02_clip import backbone +from ...common.data.coco_instance_lsj1024_cp import dataloader +from .models.ape_deta_r50 import model + +constants = get_config_d2("common/data/constants.py").constants + +model.model_vision.pixel_mean = constants.imagenet_rgb256_mean +model.model_vision.pixel_std = constants.imagenet_rgb256_std +model.model_vision.input_format = "RGB" + +model.model_vision.backbone = backbone + +model.model_vision.neck.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} +model.model_vision.neck.in_features = ["p2", "p3", "p4", "p5", "p6"] + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +optimizer = get_config_detrex("common/optim.py").AdamW +optimizer.params.lr_factor_func = ( + lambda module_name: 0.1 + if "reference_points" in module_name or "sampling_offsets" in module_name + else get_vit_lr_decay_rate(module_name, lr_decay_rate=0.8, num_layers=24) + if "backbone.net" in module_name + else 1 +) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} +optimizer.params.weight_decay_norm = None + +optimizer.lr = 2e-4 
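+# AdamW base settings; lr_factor_func above applies 0.8 layer-wise decay across
+# the 24 ViT-L blocks and a 0.1 factor on reference_points / sampling_offsets.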
+optimizer.betas = (0.9, 0.999) +optimizer.weight_decay = 1e-4 + +train = get_config_detrex("common/train.py").train +train.max_iter = 90000 +train.eval_period = 5000 +train.log_period = 20 + +train.checkpointer.period = 5000 +train.checkpointer.max_to_keep = 2 + +train.clip_grad.enabled = True +train.clip_grad.params.max_norm = 0.1 +train.clip_grad.params.norm_type = 2 + +train.device = "cuda" + +train.init_checkpoint = ( + "models/QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14to16_s6B.pt?matching_heuristics=True" +) + +train.amp.enabled = True +train.ddp.fp16_compression = True +train.fsdp = dict( + cpu_offload=False, + use_orig_params=True, + sync_module_states=True, + module_name_to_wrap=["Block",], + # module_name_to_wrap=["Block", "BaseTransformerLayer"], + param_dtype="float32", + reduce_dtype="float32", + buffer_dtype="float32", + # param_dtype="float16", + # reduce_dtype="float16", + # buffer_dtype="float16", +) + +lr_multiplier = get_config_detrex("common/coco_schedule.py").lr_multiplier_12ep +lr_multiplier.scheduler.milestones = [75000, 90000] +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.train.num_workers = 16 +dataloader.train.total_batch_size = 16 +dataloader.train.mapper.image_format = "RGB" +dataloader.train.mapper.use_instance_mask = True + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["coco_2017"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +train.output_dir = "output/" + __file__[:-3] + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + # dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +from ape.data.build import build_detection_test_loader +dataloader.test.update( + _target_=build_detection_test_loader, +) diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_12ep.py b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_12ep.py index 045e0cd..ff5c7b7 100644 --- a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_12ep.py +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_12ep.py @@ -2,7 +2,7 @@ from detectron2.layers import ShapeSpec from detectron2.model_zoo import get_config as get_config_d2 -from detrex.config import get_config +from detrex.config import get_config as get_config_detrex from ape.modeling.backbone.vit import get_vit_lr_decay_rate from ape.modeling.text import EVA01CLIP @@ -29,7 +29,7 @@ "p6": ShapeSpec(channels=256), } -optimizer = get_config("common/optim.py").AdamW +optimizer = get_config_detrex("common/optim.py").AdamW optimizer.params.lr_factor_func = ( lambda module_name: 0.1 if "reference_points" in module_name or "sampling_offsets" in module_name @@ -44,7 +44,7 @@ optimizer.betas = (0.9, 0.999) optimizer.weight_decay = 1e-4 -train = get_config("common/train.py").train +train = get_config_detrex("common/train.py").train train.max_iter = 90000 train.eval_period = 5000 train.log_period = 20 @@ -65,7 +65,7 @@ train.amp.enabled = True train.ddp.fp16_compression = True -lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_12ep +lr_multiplier = get_config_detrex("common/coco_schedule.py").lr_multiplier_12ep lr_multiplier.scheduler.milestones = [75000, 90000] lr_multiplier.warmup_length = 1000 / train.max_iter diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1536_cp_64x90k.py 
b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_12ep.py similarity index 69% rename from configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1536_cp_64x90k.py rename to configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_12ep.py index e75ac75..5ae38b2 100644 --- a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vite_eva02_clip_lsj1536_cp_64x90k.py +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_12ep.py @@ -1,21 +1,32 @@ from detectron2.config import LazyCall as L from detectron2.layers import ShapeSpec -from detrex.config import get_config + +from detectron2.model_zoo import get_config as get_config_d2 +from detrex.config import get_config as get_config_detrex from ape.modeling.backbone.vit import get_vit_lr_decay_rate + from ape.modeling.text import EVA02CLIP -from .....detectron2.configs.common.data.constants import constants -from ...common.backbone.vite_eva02_clip_1536 import backbone -from ...common.data.coco_instance_lsj1536_cp import dataloader +from ...common.backbone.vitt_eva02 import backbone +from ...common.data.coco_instance_lsj1024_cp import dataloader from .models.ape_deta_r50 import model +constants = get_config_d2("common/data/constants.py").constants + model.model_vision.pixel_mean = constants.imagenet_rgb256_mean model.model_vision.pixel_std = constants.imagenet_rgb256_std model.model_vision.input_format = "RGB" model.model_vision.backbone = backbone -model.model_vision.neck = None +model.model_vision.neck.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} +model.model_vision.neck.in_features = ["p2", "p3", "p4", "p5", "p6"] model.model_vision.mask_in_features = ["p2"] model.model_vision.input_shapes = { @@ -26,11 +37,11 @@ "p6": ShapeSpec(channels=256), } -optimizer = get_config("common/optim.py").AdamW +optimizer = get_config_detrex("common/optim.py").AdamW optimizer.params.lr_factor_func = ( lambda module_name: 0.1 if "reference_points" in module_name or "sampling_offsets" in module_name - else get_vit_lr_decay_rate(module_name, lr_decay_rate=0.8, num_layers=64) + else get_vit_lr_decay_rate(module_name, lr_decay_rate=0.8, num_layers=24) if "backbone.net" in module_name else 1 ) @@ -41,7 +52,7 @@ optimizer.betas = (0.9, 0.999) optimizer.weight_decay = 1e-4 -train = get_config("common/train.py").train +train = get_config_detrex("common/train.py").train train.max_iter = 90000 train.eval_period = 5000 train.log_period = 20 @@ -56,18 +67,18 @@ train.device = "cuda" train.init_checkpoint = ( - "models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14to16_plus_s9B.pt?matching_heuristics=True" + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" ) train.amp.enabled = True train.ddp.fp16_compression = True -lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_12ep +lr_multiplier = get_config_detrex("common/coco_schedule.py").lr_multiplier_12ep lr_multiplier.scheduler.milestones = [75000, 90000] lr_multiplier.warmup_length = 1000 / train.max_iter -dataloader.train.num_workers = 8 -dataloader.train.total_batch_size = 64 +dataloader.train.num_workers = 16 +dataloader.train.total_batch_size = 16 dataloader.train.mapper.image_format = "RGB" dataloader.train.mapper.use_instance_mask = True diff --git a/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py 
b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py new file mode 100644 index 0000000..7434ffa --- /dev/null +++ b/configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py @@ -0,0 +1,52 @@ +from detectron2.config import LazyCall as L +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) + +from .ape_deta_vitt_eva02_lsj1024_cp_12ep import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/COCO_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/COCO_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..5de3bd5 --- /dev/null +++ b/configs/COCO_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,20 @@ +from ...COCO_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep import ( + lr_multiplier, + model, + optimizer, + train, +) + +from ...common.data.coco_panoptic_lsj1024 import dataloader + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["coco_2017"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +model.model_vision.instance_on = True +model.model_vision.semantic_on = True +model.model_vision.panoptic_on = True + +model.model_vision.stuff_prob_thing = -1.0 + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/Cityscapes_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/Cityscapes_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/Cityscapes_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone 
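+# Swap the inherited ViT-L backbone for EVA-02 ViT-Tiny; weights come from the
+# patch-size-interpolated (14->16) EVA-02-Ti checkpoint loaded below.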
+ +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/D3_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/D3_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..af8f52e --- /dev/null +++ b/configs/D3_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" 
+) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_bank_reset = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 2 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py new file mode 100644 index 0000000..ad4a84b --- /dev/null +++ b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py @@ -0,0 +1,227 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.solver import WarmupParamScheduler +from detrex.modeling.neck import ChannelMapper +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from 
...common.backbone.vitt_eva02 import backbone +from ...common.data.lviscocococostuff_o365_oid_vgr_sa1b_refcoco_group_by_image_gqa_phrasecut_flickr30k_panoptic_lsj1024_cp import ( + dataloader, +) +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitl_eva02_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) +model.model_vision.transformer.encoder.use_act_checkpoint = True + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +del criterion.fed_loss_num_classes +model.model_vision.criterion = [criterion for _ in range(10)] +for criterion, num_classes in zip( + model.model_vision.criterion, [1256, 365, 601, 256, 1, 256, 256, 256, 256, 256] +): + criterion.num_classes = num_classes + +dataloader.train.mapper.max_num_phrase = 128 +dataloader.train.mapper.nms_thresh_phrase = 0.6 + +model.model_vision.criterion[0].use_fed_loss = True +model.model_vision.criterion[0].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names[0], 0.5 +) +model.model_vision.criterion[0].fed_loss_num_classes = 50 
+model.model_vision.criterion[0].fed_loss_pad_type = "cat" + +model.model_vision.criterion[2].use_fed_loss = True +model.model_vision.criterion[2].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names[2], 0.5 +) +model.model_vision.criterion[2].fed_loss_num_classes = 50 +model.model_vision.criterion[2].fed_loss_pad_type = "cat" + +model.model_vision.criterion[3].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[3].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + +for k, v in model.model_vision.criterion[4].weight_dict.items(): + if "_class" in k and "_enc" not in k: + model.model_vision.criterion[4].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[5].weight_dict["loss_class_enc"] = 0.0 + +model.model_vision.criterion[6].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[6].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[7].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[7].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[8].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[8].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = 0.9 +model.model_vision.transformer.proposal_ambiguous = 1 + +model.model_vision.instance_on = True +model.model_vision.semantic_on = True +model.model_vision.panoptic_on = False + +train.max_iter = 1080000 +train.eval_period = 1080000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[900000], + num_updates=1080000, + ), + warmup_length=2000 / 270000, + warmup_method="linear", + warmup_factor=0.001, +) + +dataloader.train.total_batch_size = 16 +dataloader.train.total_batch_size_list = [16, 16, 16, 16, 16, 16, 16, 16, 16] +dataloader.train.num_workers = 0 +train.iter_size = 4 +train.iter_loop = False + + +model.model_vision.dataset_prompts = [ + "name", + "name", + "name", + "phrase", + "name", + "phrase", + "phrase", + "phrase", + "phrase", + "expression", +] +model.model_vision.dataset_names = [ + "lvis+stuffonly", + "objects365", + "openimages", + "vgregion", + "sa1b", + "refcoco-mixed_group-by-image", + "gqa", + "phrasecut", + "flickr30k", + "refcoco", +] +model.model_vision.dataset_metas = dataloader.train.dataset.names + ["refcoco-mixed"] + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 5120 diff --git a/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl.py 
b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl.py new file mode 100644 index 0000000..1f7fd55 --- /dev/null +++ b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl.py @@ -0,0 +1,230 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.solver import WarmupParamScheduler +from detrex.modeling.neck import ChannelMapper +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from ...common.data.lviscocococostuff_o365_oid_vgr_sa1b_refcoco_group_by_image_gqa_phrasecut_flickr30k_panoptic_lsj1024_cp_mdl import ( + dataloader, +) +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitl_eva02_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) +model.model_vision.transformer.encoder.use_act_checkpoint = True + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True 
+model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +del criterion.fed_loss_num_classes +model.model_vision.criterion = [criterion for _ in range(10)] +for criterion, num_classes in zip( + model.model_vision.criterion, [1256, 365, 601, 256, 1, 256, 256, 256, 256, 256] +): + criterion.num_classes = num_classes + +model.model_vision.criterion[0].use_fed_loss = True +model.model_vision.criterion[0].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train[0].dataset.names, 0.5 +) +model.model_vision.criterion[0].fed_loss_num_classes = 50 +model.model_vision.criterion[0].fed_loss_pad_type = "cat" + +model.model_vision.criterion[2].use_fed_loss = True +model.model_vision.criterion[2].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train[2].dataset.names, 0.5 +) +model.model_vision.criterion[2].fed_loss_num_classes = 50 +model.model_vision.criterion[2].fed_loss_pad_type = "cat" + +model.model_vision.criterion[3].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[3].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + +for k, v in model.model_vision.criterion[4].weight_dict.items(): + if "_class" in k and "_enc" not in k: + model.model_vision.criterion[4].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[5].weight_dict["loss_class_enc"] = 0.0 + +model.model_vision.criterion[6].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[6].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[7].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[7].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[8].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[8].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = 0.9 +model.model_vision.transformer.proposal_ambiguous = 1 + +model.model_vision.instance_on = True +model.model_vision.semantic_on = True +model.model_vision.panoptic_on = False + +train.max_iter = 1080000 +train.eval_period = 1080000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[900000], + num_updates=1080000, + ), + warmup_length=2000 / 270000, + warmup_method="linear", + warmup_factor=0.001, +) + +for i in range(len(dataloader.train)): + dataloader.train[i].mapper.max_num_phrase = 128 + dataloader.train[i].mapper.nms_thresh_phrase = 0.6 
+ dataloader.train[i].total_batch_size = 16 + dataloader.train[i].total_batch_size_list = [16] + dataloader.train[i].num_workers = 2 + +train.iter_size = 4 +train.iter_loop = False +train.dataset_ratio = [1, 1, 1, 1, 1, 0.1, 0.1, 0.1, 0.1] + +model.model_vision.dataset_prompts = [ + "name", + "name", + "name", + "phrase", + "name", + "phrase", + "phrase", + "phrase", + "phrase", + "expression", +] +model.model_vision.dataset_names = [ + "lvis+stuffonly", + "objects365", + "openimages", + "vgregion", + "sa1b", + "refcoco-mixed_group-by-image", + "gqa", + "phrasecut", + "flickr30k", + "refcoco", +] +model.model_vision.dataset_metas = [xx for x in dataloader.train for xx in x.dataset.names] + [ + "refcoco-mixed" +] + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 5120 diff --git a/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_64x1_270k_mdl.py b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_64x1_270k_mdl.py new file mode 100644 index 0000000..0b7e5a2 --- /dev/null +++ b/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_64x1_270k_mdl.py @@ -0,0 +1,231 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.solver import WarmupParamScheduler +from detrex.modeling.neck import ChannelMapper +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from ...common.data.lviscocococostuff_o365_oid_vgr_sa1b_refcoco_group_by_image_gqa_phrasecut_flickr30k_panoptic_lsj1024_cp_mdl import ( + dataloader, +) +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitl_eva02_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 
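+# Project the ViT-Tiny backbone output to the same 256-d width as the transformer.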
+model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, + use_attention_mask_v=True, +) +model.model_vision.transformer.encoder.use_act_checkpoint = True + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +del criterion.fed_loss_num_classes +model.model_vision.criterion = [criterion for _ in range(10)] +for criterion, num_classes in zip( + model.model_vision.criterion, [1256, 365, 601, 256, 1, 256, 256, 256, 256, 256] +): + criterion.num_classes = num_classes + +model.model_vision.criterion[0].use_fed_loss = True +model.model_vision.criterion[0].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train[0].dataset.names, 0.5 +) +model.model_vision.criterion[0].fed_loss_num_classes = 50 +model.model_vision.criterion[0].fed_loss_pad_type = "cat" + +model.model_vision.criterion[2].use_fed_loss = True +model.model_vision.criterion[2].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train[2].dataset.names, 0.5 +) +model.model_vision.criterion[2].fed_loss_num_classes = 50 +model.model_vision.criterion[2].fed_loss_pad_type = "cat" + +model.model_vision.criterion[3].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[3].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[3].weight_dict.update({k: 0.0}) + +for k, v in model.model_vision.criterion[4].weight_dict.items(): + if "_class" in k and "_enc" not in k: + model.model_vision.criterion[4].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[5].weight_dict["loss_class_enc"] = 0.0 + +model.model_vision.criterion[6].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[6].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[6].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[7].weight_dict["loss_class_enc"] = 0.0 +for k, v in model.model_vision.criterion[7].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[7].weight_dict.update({k: 0.0}) + +model.model_vision.criterion[8].weight_dict["loss_class_enc"] = 
0.0 +for k, v in model.model_vision.criterion[8].weight_dict.items(): + if "_enc" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + if "_bbox" in k or "_giou" in k or "_dice" in k or "_mask" in k: + model.model_vision.criterion[8].weight_dict.update({k: 0.0}) + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = 0.9 +model.model_vision.transformer.proposal_ambiguous = 1 + +model.model_vision.instance_on = True +model.model_vision.semantic_on = True +model.model_vision.panoptic_on = False + +train.max_iter = 270000 +train.eval_period = 270000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[225000], + num_updates=270000, + ), + warmup_length=2000 / 270000, + warmup_method="linear", + warmup_factor=0.001, +) + +for i in range(len(dataloader.train)): + dataloader.train[i].mapper.max_num_phrase = 128 + dataloader.train[i].mapper.nms_thresh_phrase = 0.6 + dataloader.train[i].total_batch_size = 64 + dataloader.train[i].total_batch_size_list = [64] + dataloader.train[i].num_workers = 2 + +train.iter_size = 1 +train.iter_loop = False +train.dataset_ratio = [1, 1, 1, 1, 1, 0.1, 0.1, 0.1, 0.1] + +model.model_vision.dataset_prompts = [ + "name", + "name", + "name", + "phrase", + "name", + "phrase", + "phrase", + "phrase", + "phrase", + "expression", +] +model.model_vision.dataset_names = [ + "lvis+stuffonly", + "objects365", + "openimages", + "vgregion", + "sa1b", + "refcoco-mixed_group-by-image", + "gqa", + "phrasecut", + "flickr30k", + "refcoco", +] +model.model_vision.dataset_metas = [xx for x in dataloader.train for xx in x.dataset.names] + [ + "refcoco-mixed" +] + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 5120 diff --git a/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_24ep.py b/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_24ep.py new file mode 100644 index 0000000..c43e515 --- /dev/null +++ b/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_lsj1024_cp_24ep.py @@ -0,0 +1,35 @@ +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from ...COCO_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_lsj1024_cp_12ep import ( + lr_multiplier, + model, + optimizer, + train, +) +from ...common.data.lvis_instance_lsj1024_cp import dataloader + +model.model_vision.num_classes = 1203 +model.model_vision.select_box_nums_for_evaluation = 300 +model.model_vision.test_score_thresh = 0.0 +model.model_vision.criterion[0].num_classes = 1203 +model.model_vision.criterion[0].use_fed_loss = True +model.model_vision.criterion[0].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 +) +model.model_vision.criterion[0].fed_loss_num_classes = 50 + +del optimizer.params.weight_decay_norm + +optimizer.weight_decay = 0.05 + +train.max_iter = 180000 +train.eval_period = 20000 + +lr_multiplier.scheduler.milestones = [150000, 180000] +lr_multiplier.warmup_length = 1000 / train.max_iter + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["lvis"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep.py b/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep.py new file mode 100644 index 0000000..832729a --- /dev/null 
+++ b/configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep.py @@ -0,0 +1,34 @@ +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from ...COCO_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep import ( + lr_multiplier, + model, + optimizer, + train, +) +from ...common.data.lvis_instance_lsj1024_cp import dataloader + +model.model_vision.num_classes = 1203 +model.model_vision.select_box_nums_for_evaluation = 300 +model.model_vision.criterion[0].num_classes = 1203 +model.model_vision.criterion[0].use_fed_loss = True +model.model_vision.criterion[0].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 +) +model.model_vision.criterion[0].fed_loss_num_classes = 50 + +del optimizer.params.weight_decay_norm + +optimizer.weight_decay = 0.05 + +train.max_iter = 180000 +train.eval_period = 20000 + +lr_multiplier.scheduler.milestones = [150000, 180000] +lr_multiplier.warmup_length = 1000 / train.max_iter + +model.model_vision.dataset_prompts = ["name"] +model.model_vision.dataset_names = ["lvis"] +model.model_vision.dataset_metas = dataloader.train.dataset.names + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_13.py b/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_13.py new file mode 100644 index 0000000..c2d6864 --- /dev/null +++ b/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_13.py @@ -0,0 +1,103 @@ +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) + +from ...common.data.odinw13_instance_lsj1024 import dataloader +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +model.model_vision.criterion = [criterion for _ in range(35)] +for criterion, num_classes in zip( + model.model_vision.criterion, + [ + 1000, + ] + * 35, +): + criterion.num_classes = num_classes + +model.model_vision.instance_on = True +model.model_vision.semantic_on = False +model.model_vision.panoptic_on = False + +train.max_iter = 720000 +train.eval_period = 720000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[640000], + num_updates=720000, + ), + warmup_length=1000 / 720000, + warmup_method="linear", + warmup_factor=0.001, +) + +for i in range(len(dataloader.train)): + dataloader.train[i].total_batch_size = 16 + dataloader.train[i].total_batch_size_list = [16] + +model.model_vision.dataset_prompts = ["name" for _ in dataloader.train] +model.model_vision.dataset_names = [ + x.dataset.names[0].replace("_train", "") for x in dataloader.train +] +model.model_vision.dataset_metas = [x.dataset.names[0] for x in dataloader.train] +model.model_vision.name_prompt_fusion_text = dataloader.name_prompt_fusion_text +model.model_vision.select_box_nums_for_evaluation_list = ( + 
dataloader.select_box_nums_for_evaluation_list +) + + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_35.py b/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_35.py new file mode 100644 index 0000000..b11f0d2 --- /dev/null +++ b/configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_35.py @@ -0,0 +1,104 @@ +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) + +from ...common.data.odinw35_instance_lsj1024 import dataloader +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +model.model_vision.criterion = [criterion for _ in range(35)] +for criterion, num_classes in zip( + model.model_vision.criterion, + [ + 1000, + ] + * 35, +): + criterion.num_classes = num_classes + +model.model_vision.instance_on = True +model.model_vision.semantic_on = False +model.model_vision.panoptic_on = False + + +train.max_iter = 720000 +train.eval_period = 720000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[640000], + num_updates=720000, + ), + warmup_length=1000 / 720000, + warmup_method="linear", + warmup_factor=0.001, +) + +for i in range(len(dataloader.train)): + dataloader.train[i].total_batch_size = 16 + dataloader.train[i].total_batch_size_list = [16] + +model.model_vision.dataset_prompts = ["name" for _ in dataloader.train] +model.model_vision.dataset_names = [ + x.dataset.names[0].replace("_train", "") for x in dataloader.train +] +model.model_vision.dataset_metas = [x.dataset.names[0] for x in dataloader.train] +model.model_vision.name_prompt_fusion_text = dataloader.name_prompt_fusion_text +model.model_vision.select_box_nums_for_evaluation_list = ( + dataloader.select_box_nums_for_evaluation_list +) + + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + 
_target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/PascalContext459_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/PascalContext459_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/PascalContext459_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + 
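+# Vision-language fusion for the encoder: cross-attention between 256-d visual
+# features (v_dim) and 1024-d EVA02-CLIP text features (l_dim), with gradient
+# checkpointing enabled (use_checkpoint).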
+model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/PascalContext59_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/PascalContext59_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..8834501 --- /dev/null +++ b/configs/PascalContext59_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,101 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + 
v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/PascalVOC20_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/PascalVOC20_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..45555dc --- /dev/null +++ b/configs/PascalVOC20_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,100 @@ +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detrex.modeling.neck import ChannelMapper +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) +from ape.modeling.text import EVA02CLIP + +from ...common.backbone.vitt_eva02 import backbone +from .ape_deta_vitl_eva02_lsj1024 import dataloader, lr_multiplier, model, optimizer, train + +model.model_vision.backbone = backbone + +train.init_checkpoint = ( + "models/Yuxin-CV/EVA-02/eva02/pt/eva02_Ti_pt_in21k_p14to16.pt?matching_heuristics=True" +) + +model.model_language = L(EVA02CLIP)( + clip_model="EVA02-CLIP-bigE-14-plus", + cache_dir="models/QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt", + dtype="float16", +) +model.model_vision.embed_dim_language = 1024 + +model.model_vision.neck = L(ChannelMapper)( + input_shapes={ + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), + }, + in_features=["p2", "p3", "p4", "p5", "p6"], + out_channels=256, + num_outs=5, + kernel_size=1, + norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256), +) + +model.model_vision.mask_in_features = ["p2"] +model.model_vision.input_shapes = { + "p2": ShapeSpec(channels=256), + "p3": ShapeSpec(channels=256), + "p4": ShapeSpec(channels=256), + "p5": ShapeSpec(channels=256), + "p6": ShapeSpec(channels=256), +} + +model.model_vision.transformer.encoder.num_layers = 6 +model.model_vision.transformer.decoder.num_layers = 6 +model.model_vision.transformer.encoder.embed_dim = 256 +model.model_vision.transformer.decoder.embed_dim = 256 +model.model_vision.embed_dim = 256 +model.model_vision.backbone.out_channels = 256 + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + 
num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 +model.model_vision.transformer.proposal_ambiguous = 1 + +train.output_dir = "output/" + __file__[:-3] +model.model_vision.vis_period = 12800 diff --git a/configs/Roboflow_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/Roboflow_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..289ec7f --- /dev/null +++ b/configs/Roboflow_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,100 @@ +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) + +from ...common.data.roboflow100_instance_lsj1024 import dataloader +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_vlf_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +model.model_vision.criterion = [criterion for _ in range(100)] +for criterion, num_classes in zip( + model.model_vision.criterion, + [ + 1000, + ] + * 100, +): + criterion.num_classes = num_classes + +model.model_vision.instance_on = True +model.model_vision.semantic_on = False +model.model_vision.panoptic_on = False + + +train.max_iter = 720000 +train.eval_period = 720000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[640000], + num_updates=720000, + ), + warmup_length=1000 / 720000, + warmup_method="linear", + warmup_factor=0.001, +) + + +model.model_vision.dataset_prompts = ["name" for _ in dataloader.tests] +model.model_vision.dataset_names = [x.dataset.names for x in dataloader.tests] +model.model_vision.dataset_metas = [x.dataset.names for x in dataloader.tests] +model.model_vision.name_prompt_fusion_text = dataloader.name_prompt_fusion_text +model.model_vision.select_box_nums_for_evaluation_list = ( + dataloader.select_box_nums_for_evaluation_list +) + + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + 
clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True +model.model_vision.expression_cumulative_gt_class = True +model.model_vision.name_prompt_fusion_type = "zero" + +train.output_dir = "output/" + __file__[:-3] diff --git a/configs/SegInW_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py b/configs/SegInW_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py new file mode 100644 index 0000000..5d9fcdb --- /dev/null +++ b/configs/SegInW_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py @@ -0,0 +1,108 @@ +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from fvcore.common.param_scheduler import MultiStepParamScheduler +from ape.data.detection_utils import get_fed_loss_cls_weights +from ape.layers import VisionLanguageFusion +from ape.modeling.ape_deta import ( + DeformableDETRSegmVL, + DeformableDetrTransformerDecoderVL, + DeformableDetrTransformerEncoderVL, + DeformableDetrTransformerVL, +) + +from ...common.data.seginw_instance_lsj1024 import dataloader +from ...LVIS_InstanceSegmentation.ape_deta.ape_deta_vitt_eva02_lsj1024_cp_24ep import ( + model, + optimizer, + train, +) + +model.model_vision.num_classes = 1256 +model.model_vision.select_box_nums_for_evaluation = 300 + +criterion = model.model_vision.criterion[0] +del criterion.use_fed_loss +del criterion.get_fed_loss_cls_weights +model.model_vision.criterion = [criterion for _ in range(25)] +for criterion, num_classes in zip( + model.model_vision.criterion, + [ + 1000, + ] + * 25, +): + criterion.num_classes = num_classes + + +model.model_vision.stuff_dataset_learn_thing = False +model.model_vision.stuff_prob_thing = -1.0 + +model.model_vision.instance_on = True +model.model_vision.semantic_on = True +model.model_vision.panoptic_on = False + + +train.max_iter = 720000 +train.eval_period = 720000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1], + milestones=[640000], + num_updates=720000, + ), + warmup_length=1000 / 720000, + warmup_method="linear", + warmup_factor=0.001, +) + +for i in range(len(dataloader.train)): + dataloader.train[i].total_batch_size = 16 + dataloader.train[i].total_batch_size_list = [16] + +model.model_vision.dataset_prompts = ["name" for _ in dataloader.tests] +model.model_vision.dataset_names = [x.dataset.names.replace("_val", "") for x in dataloader.tests] +model.model_vision.dataset_metas = [x.dataset.names for x in dataloader.tests] + + +model.model_vision.update( + _target_=DeformableDETRSegmVL, +) +model.model_vision.transformer.update( + _target_=DeformableDetrTransformerVL, +) +model.model_vision.transformer.encoder.update( + _target_=DeformableDetrTransformerEncoderVL, +) +model.model_vision.transformer.decoder.update( + _target_=DeformableDetrTransformerDecoderVL, +) + + +model.model_vision.transformer.encoder.vl_layer = L(VisionLanguageFusion)( + v_dim="${....embed_dim}", + l_dim="${....embed_dim_language}", + embed_dim=2048, + num_heads=8, + dropout=0.1, + drop_path=0.0, + init_values=1.0 / 6, + stable_softmax_2d=True, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True, + use_checkpoint=True, +) + +model.model_vision.text_feature_bank = True +model.model_vision.text_feature_reduce_before_fusion = True +model.model_vision.text_feature_batch_repeat = True 

+model.model_vision.expression_cumulative_gt_class = True
+model.model_vision.name_prompt_fusion_type = "zero"
+
+train.output_dir = "output/" + __file__[:-3]
+model.model_vision.vis_period = 12800
diff --git a/configs/common/backbone/vitl_eva02.py b/configs/common/backbone/vitl_eva02.py
index baca81e..7a549cb 100644
--- a/configs/common/backbone/vitl_eva02.py
+++ b/configs/common/backbone/vitl_eva02.py
@@ -27,6 +27,9 @@
         out_feature="last_feat",
         use_act_checkpoint=True,
         xattn=True,
+        subln=True,
+        swiglu=False,
+        naiveswiglu=True,
     ),
     in_feature="${.net.out_feature}",
     out_channels=256,
diff --git a/configs/common/backbone/vitl_eva02_1536.py b/configs/common/backbone/vitl_eva02_1536.py
index e488a3b..45467f3 100644
--- a/configs/common/backbone/vitl_eva02_1536.py
+++ b/configs/common/backbone/vitl_eva02_1536.py
@@ -31,6 +31,9 @@
         out_feature="last_feat",
         use_act_checkpoint=True,
         xattn=True,
+        subln=True,
+        swiglu=False,
+        naiveswiglu=True,
     ),
     in_feature="${.net.out_feature}",
     out_channels=256,
diff --git a/configs/common/backbone/vitt_eva02.py b/configs/common/backbone/vitt_eva02.py
new file mode 100644
index 0000000..5a1a0f5
--- /dev/null
+++ b/configs/common/backbone/vitt_eva02.py
@@ -0,0 +1,41 @@
+from functools import partial
+
+import torch.nn as nn
+
+from detectron2.config import LazyCall as L
+from detectron2.modeling.backbone.fpn import LastLevelMaxPool
+from ape.modeling.backbone.vit_eva02 import SimpleFeaturePyramid, ViT
+
+# Creates Simple Feature Pyramid from ViT backbone
+backbone = L(SimpleFeaturePyramid)(
+    net=L(ViT)(  # Single-scale ViT backbone
+        img_size=1024,
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        drop_path_rate=0.8,
+        window_size=14,
+        mlp_ratio=4 * 2 / 3,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        window_block_indexes=list(range(0, 2))
+        + list(range(3, 5))
+        + list(range(6, 8))
+        + list(range(9, 11)),
+        residual_block_indexes=[],
+        use_rel_pos=True,
+        out_feature="last_feat",
+        use_act_checkpoint=False,
+        xattn=True,
+        subln=False,
+        swiglu=True,
+        naiveswiglu=False,
+    ),
+    in_feature="${.net.out_feature}",
+    out_channels=256,
+    scale_factors=(4.0, 2.0, 1.0, 0.5),
+    top_block=L(LastLevelMaxPool)(),
+    norm="LN",
+    square_pad=1024,
+)
diff --git a/scripts/eval_all_A.sh b/scripts/eval_APE-L_A.sh
similarity index 82%
rename from scripts/eval_all_A.sh
rename to scripts/eval_APE-L_A.sh
index 142da36..4df4862 100755
--- a/scripts/eval_all_A.sh
+++ b/scripts/eval_APE-L_A.sh
@@ -4,10 +4,11 @@
 set -x
 set -e
 
-init_checkpoint="output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
+kwargs=""
+init_checkpoint="output9/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
 
 num_gpus=7
-output_dir="./output2/eval_all/A/"
+output_dir="./output9/APE/eval_APE-L_A/"
 
 
 config_files=(
@@ -34,5 +35,5 @@ for config_file in ${config_files[@]}
 do
 echo "=============================================================================================="
 echo ${config_file}
- python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint}
+ python3 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} 
train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} done diff --git a/scripts/eval_all_B.sh b/scripts/eval_APE-L_B.sh similarity index 86% rename from scripts/eval_all_B.sh rename to scripts/eval_APE-L_B.sh index e19b1c9..72fbec0 100755 --- a/scripts/eval_all_B.sh +++ b/scripts/eval_APE-L_B.sh @@ -4,12 +4,11 @@ set -x set -e +kwargs="" init_checkpoint="output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth" num_gpus=7 -output_dir="./output2/eval_all/B/" - -kwargs="" +output_dir="./output9/APE/eval_APE-L_B/" config_files=( @@ -36,5 +35,5 @@ for config_file in ${config_files[@]} do echo "==============================================================================================" echo ${config_file} - python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} ${kwargs} + python3 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} ${kwargs} done diff --git a/scripts/eval_all_C.sh b/scripts/eval_APE-L_C.sh similarity index 86% rename from scripts/eval_all_C.sh rename to scripts/eval_APE-L_C.sh index 32e4f1d..3d46dd8 100755 --- a/scripts/eval_all_C.sh +++ b/scripts/eval_APE-L_C.sh @@ -4,10 +4,11 @@ set -x set -e +kwargs="" init_checkpoint="output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth" num_gpus=7 -output_dir="output2/eval_all/C/" +output_dir="output9/APE/eval_APE-L_C/" config_files=( @@ -34,5 +35,5 @@ for config_file in ${config_files[@]} do echo "==============================================================================================" echo ${config_file} - python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49194 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} + python3 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49194 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} done diff --git a/scripts/eval_all_D.sh b/scripts/eval_APE-L_D.sh similarity index 87% rename from scripts/eval_all_D.sh rename to scripts/eval_APE-L_D.sh index 3c5d571..72f6874 100755 --- a/scripts/eval_all_D.sh +++ b/scripts/eval_APE-L_D.sh @@ -3,12 +3,12 @@ set -x set -e + kwargs="model.model_vision.transformer.proposal_ambiguous=1" init_checkpoint="output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth" -output_dir="output2/eval_all/D_20230829_162438/" - num_gpus=7 +output_dir="output9/APE/eval_APE-L_D/" config_files=( @@ -35,5 +35,5 @@ for config_file in ${config_files[@]} do echo "==============================================================================================" echo ${config_file} - python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} 
train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} ${kwargs} + python3 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} ${kwargs} done diff --git a/scripts/eval_APE-Ti.sh b/scripts/eval_APE-Ti.sh new file mode 100755 index 0000000..c26c0a4 --- /dev/null +++ b/scripts/eval_APE-Ti.sh @@ -0,0 +1,39 @@ +#!/bin/bash -e + +set -x +set -e + + +kwargs="model.model_vision.transformer.proposal_ambiguous=1" +init_checkpoint="output9/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k_mdl_20240203_230000/model_final.pth" + +num_gpus=7 +output_dir="output9/APE/eval_APE-Ti/" + + +config_files=( + "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_16x4_1080k.py" + "configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_cp_12ep.py" + "configs/COCO_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_13.py" + "configs/ODinW_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024_35.py" + "configs/SegInW_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/Roboflow_Detection/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/ADE20k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/ADE20k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/ADE20kFull_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/BDD10k_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/BDD10k_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/Cityscapes_PanopticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/PascalContext459_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/PascalContext59_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/PascalVOC20_SemanticSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" + "configs/D3_InstanceSegmentation/ape_deta/ape_deta_vitt_eva02_vlf_lsj1024.py" +) + +for config_file in ${config_files[@]} +do + echo "==============================================================================================" + echo ${config_file} + python3 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" train.init_checkpoint=${init_checkpoint} ${kwargs} +done diff --git a/scripts/eval_flops.sh b/scripts/eval_flops.sh new file mode 100755 index 0000000..2813d09 --- /dev/null +++ b/scripts/eval_flops.sh @@ -0,0 +1,84 @@ +#!/bin/bash -e + +set -x +set -e + + +num_gpus=1 +output_dir="./output9/APE/eval_flops/" +mkdir -p ${output_dir} + +timestamp="`date +'%Y%m%d_%H%M%S'`" +LOG=${output_dir}/${timestamp}_log.txt +exec &> >(tee -a "$LOG") +echo Logging output to "$LOG" + + +# REC R50 +config_files=( + "configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_r50_12ep.py" # bs=16 for training + "configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_r50_vlf_12ep.py" # bs=16 for training +) + +kwargs="dataloader.train.total_batch_size=8 model.model_vision.test_mask_on=False 
model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.transformer.num_feature_levels=5 " + +for config_file in ${config_files[@]} +do + echo "==============================================================================================" + echo ${config_file} + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=1 model.model_vision.select_box_nums_for_evaluation=1 model.model_vision.test_score_thresh=0.5 + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=128 model.model_vision.select_box_nums_for_evaluation=128 model.model_vision.test_score_thresh=0.5 + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=1280 model.model_vision.select_box_nums_for_evaluation=1280 model.model_vision.test_score_thresh=0.5 +done + + +# REC ViT-L +config_files=( + "configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_12ep.py" # bs=8 for training + "configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_12ep.py" # bs=8 for training +) + +kwargs="dataloader.train.total_batch_size=8 model.model_vision.test_mask_on=False model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.neck.in_features=[\"p3\",\"p4\",\"p5\",\"p6\"] model.model_vision.mask_in_features=[\"p3\"] model.model_vision.neck.num_outs=5 model.model_vision.transformer.num_feature_levels=5 model.model_vision.backbone.scale_factors=[2.0,1.0,0.5] " +for config_file in ${config_files[@]} +do + echo "==============================================================================================" + echo ${config_file} + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=1 model.model_vision.select_box_nums_for_evaluation=1 + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=128 model.model_vision.select_box_nums_for_evaluation=128 + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} model.model_vision.num_classes=1280 model.model_vision.select_box_nums_for_evaluation=1280 +done + + +# OVD R50 +config_files=( + "configs/COCO_InstanceSegmentation/ape_deta/ape_deta_r50_12ep.py" # bs=16 for training + "configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_r50_24ep.py" # bs=16 for training + "configs/COCO_InstanceSegmentation/ape_deta/ape_deta_r50_vlf_12ep.py" # bs=16 for training + 
"configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_r50_vlf_24ep.py" # bs=16 for training +) + +kwargs="dataloader.train.total_batch_size=8 model.model_vision.test_mask_on=False model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.transformer.num_feature_levels=5 " + +for config_file in ${config_files[@]} +do + echo "==============================================================================================" + echo ${config_file} + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} +done + +# OVD ViT-L +config_files=( + "configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py" # bs=8 for training + "configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_24ep.py" # bs=8 for training + "configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_12ep.py" # bs=8 for training + "configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_24ep.py" # bs=8 for training +) + +kwargs="dataloader.train.total_batch_size=8 model.model_vision.test_mask_on=False model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.neck.in_features=[\"p3\",\"p4\",\"p5\",\"p6\"] model.model_vision.mask_in_features=[\"p3\"] model.model_vision.neck.num_outs=5 model.model_vision.transformer.num_feature_levels=5 model.model_vision.backbone.scale_factors=[2.0,1.0,0.5] " + +for config_file in ${config_files[@]} +do + echo "==============================================================================================" + echo ${config_file} + python3.9 ../detectron2/tools/analyze_model.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} --tasks flop -n 1 train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" ${kwargs} +done diff --git a/scripts/eval_computational_cost.sh b/scripts/eval_time.sh similarity index 69% rename from scripts/eval_computational_cost.sh rename to scripts/eval_time.sh index aeb6141..0d66d3f 100755 --- a/scripts/eval_computational_cost.sh +++ b/scripts/eval_time.sh @@ -10,8 +10,8 @@ output_dir="./output2/eval_computational_cost/" # REC R50 config_files=( - #"configs/REFCOCO_VisualGrounding/something/something_r50_12ep.py" # bs=16 for training - #"configs/REFCOCO_VisualGrounding/something/something_r50_vlf_12ep.py" # bs=16 for training + #"configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_r50_12ep.py" # bs=16 for training + #"configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_r50_vlf_12ep.py" # bs=16 for training ) for config_file in ${config_files[@]} @@ -20,15 +20,15 @@ do echo ${config_file} #python3.9 tools/train_net.py --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" #python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" model.model_vision.num_classes=1 model.model_vision.select_box_nums_for_evaluation=1 model.model_vision.test_score_thresh=0.5 - #python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} 
train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" model.model_vision.num_classes=128 model.model_vision.select_box_nums_for_evaluation=128 model.model_vision.test_score_thresh=0.5 - #python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" model.model_vision.num_classes=1280 model.model_vision.select_box_nums_for_evaluation=1280 model.model_vision.test_score_thresh=0.5 + #python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" model.model_vision.num_classes=128 model.model_vision.select_box_nums_for_evaluation=128 model.model_vision.test_score_thresh=0.5 + #python3.9 tools/train_net.py --eval-only --dist-url=tcp://127.0.0.1:49193 --config-file ${config_file} --num-gpus ${num_gpus} train.output_dir=${output_dir}/${config_file}/"`date +'%Y%m%d_%H%M%S'`" model.model_vision.segm_type="" model.model_vision.num_classes=1280 model.model_vision.select_box_nums_for_evaluation=1280 model.model_vision.test_score_thresh=0.5 done # REC ViT-L config_files=( - #"configs/REFCOCO_VisualGrounding/something/something_vitl_eva02_clip_lsj1024_12ep.py" # bs=8 for training - #"configs/REFCOCO_VisualGrounding/something/something_vitl_eva02_clip_vlf_lsj1024_12ep.py" # bs=8 for training + #"configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_12ep.py" # bs=8 for training + #"configs/REFCOCO_VisualGrounding/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_12ep.py" # bs=8 for training ) kwargs="dataloader.train.total_batch_size=8 model.model_vision.segm_type=\"\" model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.neck.in_features=[\"p3\",\"p4\",\"p5\",\"p6\"] model.model_vision.neck.num_outs=5 model.model_vision.transformer.num_feature_levels=5 model.model_vision.backbone.scale_factors=[2.0,1.0,0.5]" @@ -45,10 +45,10 @@ done # OVD R50 config_files=( - #"configs/COCO_InstanceSegmentation/something/something_r50_12ep.py" # bs=16 for training - #"configs/LVIS_InstanceSegmentation/something/something_r50_24ep.py" # bs=16 for training - #"configs/COCO_InstanceSegmentation/something/something_r50_vlf_12ep.py" # bs=16 for training - #"configs/LVIS_InstanceSegmentation/something/something_r50_vlf_24ep.py" # bs=16 for training + #"configs/COCO_InstanceSegmentation/ape_deta/ape_deta_r50_12ep.py" # bs=16 for training + #"configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_r50_24ep.py" # bs=16 for training + #"configs/COCO_InstanceSegmentation/ape_deta/ape_deta_r50_vlf_12ep.py" # bs=16 for training + #"configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_r50_vlf_24ep.py" # bs=16 for training ) for config_file in ${config_files[@]} @@ -61,10 +61,10 @@ done # OVD ViT-L config_files=( - #"configs/COCO_InstanceSegmentation/something/something_vitl_eva02_clip_lsj1024_cp_12ep.py" # bs=8 for training - #"configs/LVIS_InstanceSegmentation/something/something_vitl_eva02_clip_lsj1024_cp_24ep.py" # bs=8 for training - #"configs/COCO_InstanceSegmentation/something/something_vitl_eva02_clip_vlf_lsj1024_cp_12ep.py" # bs=8 for training - "configs/LVIS_InstanceSegmentation/something/something_vitl_eva02_clip_vlf_lsj1024_cp_24ep.py" # bs=8 for training + 
#"configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep.py" # bs=8 for training + #"configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_24ep.py" # bs=8 for training + #"configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_12ep.py" # bs=8 for training + "configs/LVIS_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_24ep.py" # bs=8 for training ) kwargs="dataloader.train.total_batch_size=8 model.model_vision.segm_type=\"\" model.model_vision.test_score_thresh=0.5 model.model_language.max_batch_size=128 model.model_vision.neck.in_features=[\"p3\",\"p4\",\"p5\",\"p6\"] model.model_vision.neck.num_outs=5 model.model_vision.transformer.num_feature_levels=5 model.model_vision.backbone.scale_factors=[2.0,1.0,0.5]" diff --git a/tools/eva_interpolate_patch_14to16.py b/tools/eva_interpolate_patch_14to16.py index 07c915c..312b25a 100644 --- a/tools/eva_interpolate_patch_14to16.py +++ b/tools/eva_interpolate_patch_14to16.py @@ -73,6 +73,11 @@ def interpolate_pos_embed(checkpoint_model, new_size=16, image_size=224): checkpoint = torch.load(args.input, map_location=torch.device("cpu")) + print(checkpoint.keys()) + if "module" in checkpoint: + checkpoint["model"] = checkpoint.pop("module") + print(checkpoint.keys()) + # interpolate patch_embed if "model" in checkpoint: patch_embed = checkpoint["model"]["patch_embed.proj.weight"] @@ -97,6 +102,9 @@ def interpolate_pos_embed(checkpoint_model, new_size=16, image_size=224): print("======== new state_dict ========") if "model" in checkpoint: + for k, v in list(checkpoint["model"].items()): + checkpoint["model"]["backbone.net." + k] = checkpoint["model"].pop(k) + print("rename", k, " ", "backbone.net." + k) for k, v in list(checkpoint["model"].items()): print(k, " ", v.shape) else: diff --git a/tools/eva_interpolate_patch_14to162.py b/tools/eva_interpolate_patch_14to162.py new file mode 100644 index 0000000..b9c66ab --- /dev/null +++ b/tools/eva_interpolate_patch_14to162.py @@ -0,0 +1,122 @@ +# -------------------------------------------------------- +# EVA: Exploring the Limits of Masked Visual Representation Learning at Scale (https://arxiv.org/abs/2211.07636) +# Github source: https://github.com/baaivision/EVA +# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI) +# Licensed under The MIT License [see LICENSE for details] +# By Yuxin Fang +# Based on timm, DINO, DeiT and BEiT codebases +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# https://github.com/microsoft/unilm/tree/master/beit +# --------------------------------------------------------' + +import argparse + +import torch + + +def interpolate_pos_embed(checkpoint_model, new_size=16, image_size=224): + if "pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["pos_embed"] + print("pos_embed_checkpoint", pos_embed_checkpoint.size(), pos_embed_checkpoint.dtype) + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = int(image_size / new_size) ** 2 + num_extra_tokens = 1 + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, 
orig_size, new_size, new_size) + ) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute( + 0, 3, 1, 2 + ) + ori_dtype = pos_tokens.dtype + pos_tokens = pos_tokens.to(torch.float32) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False + ) + pos_tokens = pos_tokens.to(ori_dtype) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model["pos_embed"] = new_pos_embed + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="interpolate patch_embed kernel") + parser.add_argument( + "--input", + default="/path/to/eva_psz14.pt", + type=str, + metavar="PATH", + required=True, + help="path to input EVA checkpoint with patch_embed kernel_size=14x14", + ) + parser.add_argument( + "--output", + default="/path/to/eva_psz14to16.pt", + type=str, + metavar="PATH", + required=True, + help="path to output EVA checkpoint with patch_embed kernel_size=16x16", + ) + parser.add_argument("--image_size", type=int, required=True) + args = parser.parse_args() + + checkpoint = torch.load(args.input, map_location=torch.device("cpu")) + + # interpolate patch_embed + if "model" in checkpoint: + patch_embed = checkpoint["model"]["patch_embed.proj.weight"] + elif "module" in checkpoint: + patch_embed = checkpoint["module"]["patch_embed.proj.weight"] + else: + patch_embed = checkpoint["visual.patch_embed.proj.weight"] + C_o, C_in, H, W = patch_embed.shape + patch_embed = torch.nn.functional.interpolate( + patch_embed.float(), size=(16, 16), mode="bicubic", align_corners=False + ) + if "model" in checkpoint: + checkpoint["model"]["patch_embed.proj.weight"] = patch_embed + elif "module" in checkpoint: + checkpoint["module"]["patch_embed.proj.weight"] = patch_embed + else: + checkpoint["visual.patch_embed.proj.weight"] = patch_embed + + # interpolate pos_embed too + if "model" in checkpoint: + interpolate_pos_embed(checkpoint["model"], new_size=16, image_size=args.image_size) + elif "module" in checkpoint: + interpolate_pos_embed(checkpoint["module"], new_size=16, image_size=args.image_size) + else: + checkpoint["pos_embed"] = checkpoint["visual.pos_embed"] + interpolate_pos_embed(checkpoint, new_size=16, image_size=args.image_size) + checkpoint["visual.pos_embed"] = checkpoint.pop("pos_embed") + + print("======== new state_dict ========") + if "model" in checkpoint: + for k, v in list(checkpoint["model"].items()): + print(k, " ", v.shape) + elif "module" in checkpoint: + for k, v in list(checkpoint["module"].items()): + print(k, " ", v.shape) + else: + for k, v in list(checkpoint.items()): + if k.startswith("text.") or k == "logit_scale": + checkpoint.pop(k) + print("pop", k, " ", v.shape) + if k.startswith("visual."): + checkpoint["backbone.net." + k[7:]] = checkpoint.pop(k) + print("rename", k, " ", "backbone.net." + k[7:]) + for k, v in list(checkpoint.items()): + print(k, " ", v.shape) + + torch.save(checkpoint, args.output) diff --git a/tools/train_net.py b/tools/train_net.py index 3fc0a49..90fa29c 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Training script using the new "LazyConfig" python config files. 
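For context on the tools/train_net.py hunks below: the first adds a timedelta import and the second passes timeout=timedelta(minutes=120) to detectron2's launch(), which forwards the value to torch.distributed process-group initialization; the stock default is much shorter, so long single-rank phases (building the multi-dataset loaders, writing large evaluation dumps) could otherwise trip the collective-operation timeout. A minimal, hypothetical launcher using the same knob; main, the GPU count, and the URL here are placeholders, not taken from the patch:

from datetime import timedelta

from detectron2.engine import launch


def main(args):
    # Placeholder for the real training / evaluation entry point.
    pass


if __name__ == "__main__":
    launch(
        main,
        num_gpus_per_machine=8,          # placeholder value
        num_machines=1,
        machine_rank=0,
        dist_url="auto",
        args=(None,),
        timeout=timedelta(minutes=120),  # mirrors the change in the hunk below
    )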
@@ -17,6 +18,7 @@
 import time
 from collections import abc
 from contextlib import nullcontext
+from datetime import timedelta
 
 import torch
 from torch.nn.parallel import DataParallel, DistributedDataParallel
@@ -660,4 +662,5 @@ def main(args):
         machine_rank=args.machine_rank,
         dist_url=args.dist_url,
         args=(args,),
+        timeout=timedelta(minutes=120),
     )
diff --git a/tools/train_net_fsdp.py b/tools/train_net_fsdp.py
new file mode 100644
index 0000000..004e54c
--- /dev/null
+++ b/tools/train_net_fsdp.py
@@ -0,0 +1,703 @@
+#!/usr/bin/env python
+"""
+Training script using the new "LazyConfig" python config files.
+
+This script reads a given python config file and runs the training or evaluation.
+It can be used to train any models or datasets as long as they can be
+instantiated by the recursive construction defined in the given config file.
+
+Besides lazy construction of models, dataloader, etc., this script expects a
+few common configuration parameters currently defined in "configs/common/train.py".
+To add more complicated training logic, you can easily add other configs
+in the config file and implement a new train_net.py to handle them.
+"""
+import logging
+import os
+import random
+import sys
+import time
+from collections import abc
+from contextlib import nullcontext
+
+import torch
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+from torch.distributed.fsdp import FullyShardedDataParallel
+
+import ape
+from ape.checkpoint import DetectionCheckpointer
+from ape.checkpoint import FSDPDetectionCheckpointer
+from ape.engine import SimpleTrainer
+from ape.evaluation import inference_on_dataset
+from ape.engine.defaults import create_fsdp_model
+from detectron2.config import LazyConfig, instantiate
+from detectron2.engine import default_argument_parser
+from detectron2.engine import default_setup, hooks, launch
+from detectron2.engine.defaults import create_ddp_model
+from detectron2.evaluation import print_csv_format
+from detectron2.utils import comm
+from detectron2.utils.events import (
+    CommonMetricPrinter,
+    JSONWriter,
+    TensorboardXWriter,
+    get_event_storage,
+)
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import setup_logger
+from detrex.modeling import ema
+from detrex.utils import WandbWriter
+
+from accelerate import Accelerator
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
+
+logger = logging.getLogger("ape")
+
+
+class Trainer(SimpleTrainer):
+    """
+    We've combined Simple and AMP Trainer together.
+    """
+
+    def __init__(
+        self,
+        model,
+        dataloader,
+        optimizer,
+        amp=False,
+        amp_dtype=None,
+        clip_grad_params=None,
+        grad_scaler=None,
+        iter_size=1,
+        iter_loop=True,
+        dataset_ratio=None,
+        save_memory=False,
+    ):
+        super().__init__(model=model, data_loader=dataloader, optimizer=optimizer)
+
+        unsupported = "AMPTrainer does not support single-process multi-device training!"
+ if isinstance(model, DistributedDataParallel): + assert not (model.device_ids and len(model.device_ids) > 1), unsupported + assert not isinstance(model, DataParallel), unsupported + + if amp: + if grad_scaler is None: + from torch.cuda.amp import GradScaler + + grad_scaler = GradScaler() + if isinstance(model, FullyShardedDataParallel): + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + + grad_scaler = ShardedGradScaler() + self.grad_scaler = grad_scaler + + self.amp = amp + self.amp_dtype = getattr(torch, amp_dtype) + + self.clip_grad_params = clip_grad_params + + if isinstance(model, DistributedDataParallel): + if hasattr(model.module, "model_vision"): + self.dataset_names = model.module.model_vision.dataset_names + else: + self.dataset_names = ["unknown"] + else: + if hasattr(model, "model_vision"): + self.dataset_names = model.model_vision.dataset_names + else: + self.dataset_names = ["unknown"] + self.dataset_image_counts = { + k: torch.tensor(0, dtype=torch.float).to(comm.get_local_rank()) + for k in self.dataset_names + } + self.dataset_object_counts = { + k: torch.tensor(0, dtype=torch.float).to(comm.get_local_rank()) + for k in self.dataset_names + } + + self.iter_size = iter_size + self.iter_loop = iter_loop + self.dataset_ratio = dataset_ratio + self.save_memory = save_memory + + def run_step(self): + if self.iter_size > 1: + if self.iter_loop: + return self.run_step_accumulate_iter_loop() + else: + return self.run_step_accumulate() + """ + Implement the standard training logic described above. + """ + assert self.model.training, "[Trainer] model was changed to eval mode!" + assert torch.cuda.is_available(), "[Trainer] CUDA is required for AMP training!" + from torch.cuda.amp import autocast + + start = time.perf_counter() + """ + If you want to do something with the data, you can wrap the dataloader. + """ + while True: + data = next(self._data_loader_iter) + if all([len(x["instances"]) > 0 for x in data]): + break + data_time = time.perf_counter() - start + + for d in data: + if d.get("dataloader_id", None) is not None: + d["dataset_id"] = d["dataloader_id"] + self.dataset_image_counts[self.dataset_names[d.get("dataset_id", 0)]] += 1 + self.dataset_object_counts[self.dataset_names[d.get("dataset_id", 0)]] += len( + d.get("instances", []) + ) + dataset_image_counts = {f"count_image/{k}": v for k, v in self.dataset_image_counts.items()} + dataset_object_counts = { + f"count_object/{k}": v for k, v in self.dataset_object_counts.items() + } + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics_common, dataset_image_counts, iter=self.iter + ) + self.concurrent_executor.submit( + self._write_metrics_common, dataset_object_counts, iter=self.iter + ) + else: + self._write_metrics_common(dataset_image_counts) + self._write_metrics_common(dataset_object_counts) + + """ + If you want to do something with the losses, you can wrap the model. + """ + with autocast(enabled=self.amp, dtype=self.amp_dtype): + loss_dict = self.model(data) + if isinstance(loss_dict, torch.Tensor): + losses = loss_dict + loss_dict = {"total_loss": loss_dict} + else: + losses = sum(loss_dict.values()) + + """ + If you need to accumulate gradients or do something similar, you can + wrap the optimizer with your custom `zero_grad()` method. 
+ """ + self.optimizer.zero_grad() + + if self.amp: + self.grad_scaler.scale(losses).backward() + torch.cuda.synchronize() + if self.clip_grad_params is not None: + self.grad_scaler.unscale_(self.optimizer) + self.clip_grads(self.model.parameters()) + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + else: + losses.backward() + torch.cuda.synchronize() + if self.clip_grad_params is not None: + self.clip_grads(self.model.parameters()) + self.optimizer.step() + + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics, loss_dict, data_time, iter=self.iter + ) + else: + self._write_metrics(loss_dict, data_time) + + if self.save_memory: + del losses + del loss_dict + torch.cuda.empty_cache() + + def run_step_accumulate(self): + """ + Implement the standard training logic described above. + """ + assert self.model.training, "[Trainer] model was changed to eval mode!" + assert torch.cuda.is_available(), "[Trainer] CUDA is required for AMP training!" + from torch.cuda.amp import autocast + + start = time.perf_counter() + """ + If you want to do something with the data, you can wrap the dataloader. + """ + while True: + data = next(self._data_loader_iter) + if all([len(x["instances"]) > 0 for x in data]): + break + data_time = time.perf_counter() - start + + for d in data: + if d.get("dataloader_id", None) is not None: + d["dataset_id"] = d["dataloader_id"] + self.dataset_image_counts[self.dataset_names[d.get("dataset_id", 0)]] += 1 + self.dataset_object_counts[self.dataset_names[d.get("dataset_id", 0)]] += len( + d.get("instances", []) + ) + dataset_image_counts = {f"count_image/{k}": v for k, v in self.dataset_image_counts.items()} + dataset_object_counts = { + f"count_object/{k}": v for k, v in self.dataset_object_counts.items() + } + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics_common, dataset_image_counts, iter=self.iter + ) + self.concurrent_executor.submit( + self._write_metrics_common, dataset_object_counts, iter=self.iter + ) + else: + self._write_metrics_common(dataset_image_counts) + self._write_metrics_common(dataset_object_counts) + + sync_context = self.model.no_sync if (self.iter + 1) % self.iter_size != 0 else nullcontext + """ + If you want to do something with the losses, you can wrap the model. + """ + with sync_context(): + with autocast(enabled=self.amp, dtype=self.amp_dtype): + loss_dict = self.model(data) + + if isinstance(loss_dict, torch.Tensor): + losses = loss_dict + loss_dict = {"total_loss": loss_dict} + else: + losses = sum(loss_dict.values()) + + """ + If you need to accumulate gradients or do something similar, you can + wrap the optimizer with your custom `zero_grad()` method. 
+ """ + if self.iter == self.start_iter: + self.optimizer.zero_grad() + + if self.iter_size > 1: + losses = losses / self.iter_size + + if self.amp: + self.grad_scaler.scale(losses).backward() + if (self.iter + 1) % self.iter_size == 0: + if self.clip_grad_params is not None: + self.grad_scaler.unscale_(self.optimizer) + self.clip_grads(self.model.parameters()) + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + self.optimizer.zero_grad() + else: + losses.backward() + if (self.iter + 1) % self.iter_size == 0: + if self.clip_grad_params is not None: + self.clip_grads(self.model.parameters()) + self.optimizer.step() + self.optimizer.zero_grad() + + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics, loss_dict, data_time, iter=self.iter + ) + else: + self._write_metrics(loss_dict, data_time) + + if self.save_memory: + del losses + del loss_dict + torch.cuda.empty_cache() + + def run_step_accumulate_iter_loop(self): + """ + Implement the standard training logic described above. + """ + assert self.model.training, "[Trainer] model was changed to eval mode!" + assert torch.cuda.is_available(), "[Trainer] CUDA is required for AMP training!" + from torch.cuda.amp import autocast + + self.optimizer.zero_grad() + for inner_iter in range(self.iter_size): + start = time.perf_counter() + """ + If you want to do something with the data, you can wrap the dataloader. + """ + while True: + data = next(self._data_loader_iter) + if all([len(x["instances"]) > 0 for x in data]): + break + data_time = time.perf_counter() - start + + for d in data: + if d.get("dataloader_id", None) is not None: + d["dataset_id"] = d["dataloader_id"] + self.dataset_image_counts[self.dataset_names[d.get("dataset_id", 0)]] += 1 + self.dataset_object_counts[self.dataset_names[d.get("dataset_id", 0)]] += len( + d.get("instances", []) + ) + dataset_image_counts = { + f"count_image/{k}": v for k, v in self.dataset_image_counts.items() + } + dataset_object_counts = { + f"count_object/{k}": v for k, v in self.dataset_object_counts.items() + } + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics_common, dataset_image_counts, iter=self.iter + ) + self.concurrent_executor.submit( + self._write_metrics_common, dataset_object_counts, iter=self.iter + ) + else: + self._write_metrics_common(dataset_image_counts) + self._write_metrics_common(dataset_object_counts) + + sync_context = self.model.no_sync if inner_iter != self.iter_size - 1 else nullcontext + """ + If you want to do something with the losses, you can wrap the model. + """ + with sync_context(): + with autocast(enabled=self.amp, dtype=self.amp_dtype): + loss_dict = self.model(data) + + if isinstance(loss_dict, torch.Tensor): + losses = loss_dict + loss_dict = {"total_loss": loss_dict} + else: + losses = sum(loss_dict.values()) + + """ + If you need to accumulate gradients or do something similar, you can + wrap the optimizer with your custom `zero_grad()` method. 
+ """ + + losses = losses / self.iter_size + + if self.amp: + self.grad_scaler.scale(losses).backward() + else: + losses.backward() + + if self.async_write_metrics: + self.concurrent_executor.submit( + self._write_metrics, loss_dict, data_time, iter=self.iter + ) + else: + self._write_metrics(loss_dict, data_time) + + if self.save_memory: + del losses + del loss_dict + torch.cuda.empty_cache() + + if self.amp: + if self.clip_grad_params is not None: + self.grad_scaler.unscale_(self.optimizer) + self.clip_grads(self.model.parameters()) + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + else: + if self.clip_grad_params is not None: + self.clip_grads(self.model.parameters()) + self.optimizer.step() + + def clip_grads(self, params): + return self.model.clip_grad_norm_(**self.clip_grad_params) + params = list(filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return torch.nn.utils.clip_grad_norm_( + parameters=params, + **self.clip_grad_params, + ) + + def state_dict(self): + ret = super().state_dict() + if self.grad_scaler and self.amp: + ret["grad_scaler"] = self.grad_scaler.state_dict() + return ret + + def load_state_dict(self, state_dict): + super().load_state_dict(state_dict) + if self.grad_scaler and self.amp: + self.grad_scaler.load_state_dict(state_dict["grad_scaler"]) + + @property + def _data_loader_iter(self): + if isinstance(self.data_loader, abc.MutableSequence): + if self._data_loader_iter_obj is None: + self._data_loader_iter_obj = [iter(x) for x in self.data_loader] + self._data_loader_indices = [] + + if len(self._data_loader_indices) == 0: + self._data_loader_indices = random.choices( + list(range(len(self.data_loader))), weights=self.dataset_ratio, k=10000 + ) + idx = self._data_loader_indices.pop() + return self._data_loader_iter_obj[idx] + + if self._data_loader_iter_obj is None: + self._data_loader_iter_obj = iter(self.data_loader) + return self._data_loader_iter_obj + + +def do_test(cfg, model, eval_only=False): + + if isinstance(model, FullyShardedDataParallel) and False: + accelerator = Accelerator() + model = accelerator.unwrap_model(model, keep_fp32_wrapper=False) + + if isinstance(model, FullyShardedDataParallel) and False: + model = instantiate(cfg.model) + logger = logging.getLogger("ape") + logger.info("Model:\n{}".format(model)) + model.to(cfg.train.device) + model = create_ddp_model(model) + + checkpointer = FSDPDetectionCheckpointer( + model, + cfg.train.output_dir, + ) + checkpointer.resume_or_load(cfg.train.init_checkpoint, resume=True) + + logger = logging.getLogger("ape") + if "evaluator" in cfg.dataloader: + if isinstance(model, DistributedDataParallel): + if hasattr(model.module, "set_eval_dataset"): + model.module.set_eval_dataset(cfg.dataloader.test.dataset.names) + else: + if hasattr(model, "set_eval_dataset"): + model.set_eval_dataset(cfg.dataloader.test.dataset.names) + output_dir = os.path.join( + cfg.train.output_dir, "inference_{}".format(cfg.dataloader.test.dataset.names) + ) + if "cityscapes" in cfg.dataloader.test.dataset.names: + pass + else: + if isinstance(cfg.dataloader.evaluator, abc.MutableSequence): + for evaluator in cfg.dataloader.evaluator: + evaluator.output_dir = output_dir + else: + cfg.dataloader.evaluator.output_dir = output_dir + + ret = inference_on_dataset( + model, instantiate(cfg.dataloader.test), instantiate(cfg.dataloader.evaluator) + ) + logger.info( + "Evaluation results for {} in csv format:".format(cfg.dataloader.test.dataset.names) + ) + print_csv_format(ret) 
+ ret = {f"{k}_{cfg.dataloader.test.dataset.names}": v for k, v in ret.items()} + else: + ret = {} + + if "evaluators" in cfg.dataloader: + for test, evaluator in zip(cfg.dataloader.tests, cfg.dataloader.evaluators): + if isinstance(model, DistributedDataParallel): + model.module.set_eval_dataset(test.dataset.names) + else: + model.set_eval_dataset(test.dataset.names) + output_dir = os.path.join( + cfg.train.output_dir, "inference_{}".format(test.dataset.names) + ) + if isinstance(evaluator, abc.MutableSequence): + for eva in evaluator: + eva.output_dir = output_dir + else: + evaluator.output_dir = output_dir + ret_ = inference_on_dataset(model, instantiate(test), instantiate(evaluator)) + logger.info("Evaluation results for {} in csv format:".format(test.dataset.names)) + print_csv_format(ret_) + ret.update({f"{k}_{test.dataset.names}": v for k, v in ret_.items()}) + + bbox_odinw_AP = {"AP": [], "AP50": [], "AP75": [], "APs": [], "APm": [], "APl": []} + segm_seginw_AP = {"AP": [], "AP50": [], "AP75": [], "APs": [], "APm": [], "APl": []} + bbox_rf100_AP = {"AP": [], "AP50": [], "AP75": [], "APs": [], "APm": [], "APl": []} + for k, v in ret.items(): + for kk, vv in v.items(): + if k.startswith("bbox_odinw") and kk in bbox_odinw_AP and vv == vv: + bbox_odinw_AP[kk].append(vv) + if k.startswith("segm_seginw") and kk in segm_seginw_AP and vv == vv: + segm_seginw_AP[kk].append(vv) + if k.startswith("bbox_rf100") and kk in bbox_rf100_AP and vv == vv: + bbox_rf100_AP[kk].append(vv) + + from statistics import median, mean + + logger.info("Evaluation results: {}".format(ret)) + for k, v in bbox_odinw_AP.items(): + if len(v) > 0: + logger.info( + "Evaluation results for odinw bbox {}: mean {} median {}".format( + k, mean(v), median(v) + ) + ) + for k, v in segm_seginw_AP.items(): + if len(v) > 0: + logger.info( + "Evaluation results for seginw segm {}: mean {} median {}".format( + k, mean(v), median(v) + ) + ) + for k, v in bbox_rf100_AP.items(): + if len(v) > 0: + logger.info( + "Evaluation results for rf100 bbox {}: mean {} median {}".format( + k, mean(v), median(v) + ) + ) + + return ret + + +def do_train(args, cfg): + """ + Args: + cfg: an object with the following attributes: + model: instantiate to a module + dataloader.{train,test}: instantiate to dataloaders + dataloader.evaluator: instantiate to evaluator for test set + optimizer: instantaite to an optimizer + lr_multiplier: instantiate to a fvcore scheduler + train: other misc config defined in `configs/common/train.py`, including: + output_dir (str) + init_checkpoint (str) + amp.enabled (bool) + max_iter (int) + eval_period, log_period (int) + device (str) + checkpointer (dict) + ddp (dict) + """ + model = instantiate(cfg.model) + logger = logging.getLogger("ape") + logger.info("Model:\n{}".format(model)) + model.to(cfg.train.device) + + # build training loader + if "wait_group" in cfg.dataloader: + wait = comm.get_local_rank() % cfg.dataloader.wait_group * cfg.dataloader.wait_time + logger.info("rank {} sleep {}".format(comm.get_local_rank(), wait)) + time.sleep(wait) + if isinstance(cfg.dataloader.train, abc.MutableSequence): + train_loader = [instantiate(x) for x in cfg.dataloader.train] + else: + train_loader = instantiate(cfg.dataloader.train) + + # create fsdp model + model = create_fsdp_model(model, **cfg.train.fsdp) + logger.info("Model:\n{}".format(model)) + + # build model ema + ema.may_build_model_ema(cfg, model) + + # instantiate optimizer + cfg.optimizer.params.model = model + optim = instantiate(cfg.optimizer) + + trainer = 
+        model=model,
+        dataloader=train_loader,
+        optimizer=optim,
+        amp=cfg.train.amp.enabled,
+        amp_dtype=cfg.train.fsdp.param_dtype,
+        clip_grad_params=cfg.train.clip_grad.params if cfg.train.clip_grad.enabled else None,
+        iter_size=cfg.train.iter_size if "iter_size" in cfg.train else 1,
+        iter_loop=cfg.train.iter_loop if "iter_loop" in cfg.train else True,
+        dataset_ratio=cfg.train.dataset_ratio if "dataset_ratio" in cfg.train else None,
+    )
+
+    checkpointer = FSDPDetectionCheckpointer(
+        model,
+        cfg.train.output_dir,
+        trainer=trainer,
+        **ema.may_get_ema_checkpointer(cfg, model),
+    )
+
+    if comm.is_main_process():
+        output_dir = cfg.train.output_dir
+        PathManager.mkdirs(output_dir)
+        writers = [
+            CommonMetricPrinter(cfg.train.max_iter),
+            JSONWriter(os.path.join(output_dir, "metrics.json")),
+            TensorboardXWriter(output_dir),
+        ]
+        if cfg.train.wandb.enabled:
+            PathManager.mkdirs(cfg.train.wandb.params.dir)
+            writers.append(WandbWriter(cfg))
+
+    trainer.register_hooks(
+        [
+            hooks.IterationTimer(),
+            ema.EMAHook(cfg, model) if cfg.train.model_ema.enabled else None,
+            hooks.LRScheduler(scheduler=instantiate(cfg.lr_multiplier)),
+            # hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer)
+            # if comm.is_main_process()
+            # else None,
+            # the checkpointer hook runs on every rank: saving FSDP state is a
+            # collective operation, so it cannot be restricted to the main process
+            hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer),
+            hooks.EvalHook(cfg.train.eval_period, lambda: do_test(cfg, model)),
+            hooks.PeriodicWriter(
+                writers,
+                period=cfg.train.log_period,
+            )
+            if comm.is_main_process()
+            else None,
+        ]
+    )
+
+    checkpointer.resume_or_load(cfg.train.init_checkpoint, resume=args.resume)
+    if args.resume and checkpointer.has_checkpoint():
+        start_iter = trainer.iter + 1
+    else:
+        start_iter = 0
+    trainer.train(start_iter, cfg.train.max_iter)
+
+
+def main(args):
+    cfg = LazyConfig.load(args.config_file)
+    cfg = LazyConfig.apply_overrides(cfg, args.opts)
+
+    if "output_dir" in cfg.model:
+        cfg.model.output_dir = cfg.train.output_dir
+    if "model_vision" in cfg.model and "output_dir" in cfg.model.model_vision:
+        cfg.model.model_vision.output_dir = cfg.train.output_dir
+    if "train" in cfg.dataloader:
+        if isinstance(cfg.dataloader.train, abc.MutableSequence):
+            for i in range(len(cfg.dataloader.train)):
+                if "output_dir" in cfg.dataloader.train[i].mapper:
+                    cfg.dataloader.train[i].mapper.output_dir = cfg.train.output_dir
+        else:
+            if "output_dir" in cfg.dataloader.train.mapper:
+                cfg.dataloader.train.mapper.output_dir = cfg.train.output_dir
+
+    default_setup(cfg, args)
+
+    setup_logger(cfg.train.output_dir, distributed_rank=comm.get_rank(), name="sota")
+    setup_logger(cfg.train.output_dir, distributed_rank=comm.get_rank(), name="ape")
+    setup_logger(cfg.train.output_dir, distributed_rank=comm.get_rank(), name="timm")
+
+    if cfg.train.fast_dev_run.enabled:
+        cfg.train.max_iter = 20
+        cfg.train.eval_period = 10
+        cfg.train.log_period = 1
+
+    if args.eval_only:
+        model = instantiate(cfg.model)
+        logger = logging.getLogger("ape")
+        logger.info("Model:\n{}".format(model))
+        model.to(cfg.train.device)
+        model = create_ddp_model(model)
+
+        ema.may_build_model_ema(cfg, model)
+        DetectionCheckpointer(model, **ema.may_get_ema_checkpointer(cfg, model)).load(
+            cfg.train.init_checkpoint
+        )
+        if cfg.train.model_ema.enabled and cfg.train.model_ema.use_ema_weights_for_eval_only:
+            ema.apply_model_ema(model)
+        print(do_test(cfg, model, eval_only=True))
+    else:
+        do_train(args, cfg)
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
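
A minimal launch sketch, assuming the detectron2-style argument parser and LazyConfig
overrides wired up in main() above; the GPU count and output directory below are
illustrative placeholders, and the config file is one of the FSDP configs added by
this patch.

    python3 tools/train_net_fsdp.py \
        --config-file configs/COCO_InstanceSegmentation/ape_deta/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp.py \
        --num-gpus 8 \
        train.output_dir=output/ape_deta_vitl_eva02_clip_lsj1024_cp_12ep_fsdp

Evaluation-only runs go through the same entry point with --eval-only and a
train.init_checkpoint=<path> override.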