/detectron2/solver/build.py
https://github.com/facebookresearch/detectron2 · Python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union

import torch

from detectron2.config import CfgNode

from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR

_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]]
_GradientClipper = Callable[[_GradientClipperInput], None]


class GradientClipType(Enum):
    VALUE = "value"
    NORM = "norm"


def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper:
    """
    Creates gradient clipping closure to clip by value or by norm,
    according to the provided config.
    """
    cfg = cfg.clone()

    def clip_grad_norm(p: _GradientClipperInput):
        torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE)

    def clip_grad_value(p: _GradientClipperInput):
        torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE)

    _GRADIENT_CLIP_TYPE_TO_CLIPPER = {
        GradientClipType.VALUE: clip_grad_value,
        GradientClipType.NORM: clip_grad_norm,
    }
    return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)]
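

# Illustrative sketch, not part of the upstream file: building a standalone
# clipper from a config fragment shaped like cfg.SOLVER.CLIP_GRADIENTS. The
# keys CLIP_TYPE / CLIP_VALUE / NORM_TYPE mirror detectron2's solver defaults;
# the concrete values here are assumptions chosen for illustration.
def _example_clipper_usage(model: torch.nn.Module) -> None:
    clip_cfg = CfgNode({"CLIP_TYPE": "norm", "CLIP_VALUE": 1.0, "NORM_TYPE": 2.0})
    clipper = _create_gradient_clipper(clip_cfg)
    # Clip gradients in-place after loss.backward() and before optimizer.step().
    clipper([p for p in model.parameters() if p.grad is not None])
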
def _generate_optimizer_class_with_gradient_clipping(
    optimizer_type: Type[torch.optim.Optimizer], gradient_clipper: _GradientClipper
) -> Type[torch.optim.Optimizer]:
    """
    Dynamically creates a new type that inherits the type of a given instance
    and overrides the `step` method to add gradient clipping
    """

    def optimizer_wgc_step(self, closure=None):
        for group in self.param_groups:
            for p in group["params"]:
                gradient_clipper(p)
        super(type(self), self).step(closure)

    OptimizerWithGradientClip = type(
        optimizer_type.__name__ + "WithGradientClip",
        (optimizer_type,),
        {"step": optimizer_wgc_step},
    )
    return OptimizerWithGradientClip
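

# Illustrative sketch, not part of the upstream file: what the dynamically
# generated class looks like for torch.optim.SGD (any Optimizer subclass works
# the same way). The config values below are assumptions for illustration.
def _example_generate_clipped_sgd() -> Type[torch.optim.Optimizer]:
    clipper = _create_gradient_clipper(
        CfgNode({"CLIP_TYPE": "value", "CLIP_VALUE": 0.1, "NORM_TYPE": 2.0})
    )
    # The returned type is named "SGDWithGradientClip"; its step() clips every
    # parameter's gradient before delegating to SGD.step().
    return _generate_optimizer_class_with_gradient_clipping(torch.optim.SGD, clipper)
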
def maybe_add_gradient_clipping(
    cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.Optimizer:
    """
    If gradient clipping is enabled through config options, wraps the existing
    optimizer instance of some type OptimizerType to become an instance
    of the new dynamically created class OptimizerTypeWithGradientClip
    that inherits OptimizerType and overrides the `step` method to
    include gradient clipping.

    Args:
        cfg: CfgNode
            configuration options
        optimizer: torch.optim.Optimizer
            existing optimizer instance

    Return:
        optimizer: torch.optim.Optimizer
            either the unmodified optimizer instance (if gradient clipping is
            disabled), or the same instance with adjusted __class__ to override
            the `step` method and include gradient clipping
    """
    if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
        return optimizer
    grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
    OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
        type(optimizer), grad_clipper
    )
    optimizer.__class__ = OptimizerWithGradientClip
    return optimizer
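

# Illustrative sketch, not part of the upstream file: enabling clipping through
# the config and wrapping an existing optimizer in place. The keys under
# cfg.SOLVER.CLIP_GRADIENTS follow detectron2's defaults; the optimizer
# hyperparameters here are assumptions for illustration.
def _example_maybe_clip(model: torch.nn.Module) -> torch.optim.Optimizer:
    from detectron2.config import get_cfg  # local import to keep the sketch self-contained

    cfg = get_cfg()
    cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
    cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "norm"
    cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
    opt = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.9)
    # The instance keeps its state; only __class__ changes to the clipped subtype.
    return maybe_add_gradient_clipping(cfg, opt)
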
def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build an optimizer from config.
    """
    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module in model.modules():
        for key, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if isinstance(module, norm_module_types):
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
            elif key == "bias":
                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
                # hyperparameters are by default exactly the same as for regular
                # weights.
                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

    optimizer = torch.optim.SGD(
        params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
    )
    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
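

# Illustrative sketch, not part of the upstream file: building the SGD optimizer
# from the default config. BASE_LR, MOMENTUM, and the weight-decay keys come
# from detectron2's defaults; the override below is an assumption for
# illustration.
def _example_build_optimizer(model: torch.nn.Module) -> torch.optim.Optimizer:
    from detectron2.config import get_cfg

    cfg = get_cfg()
    cfg.SOLVER.BASE_LR = 0.01  # bias parameters are still scaled by BIAS_LR_FACTOR
    return build_optimizer(cfg, model)
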
def build_lr_scheduler(
    cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.lr_scheduler._LRScheduler:
    """
    Build a LR scheduler from config.
    """
    name = cfg.SOLVER.LR_SCHEDULER_NAME
    if name == "WarmupMultiStepLR":
        return WarmupMultiStepLR(
            optimizer,
            cfg.SOLVER.STEPS,
            cfg.SOLVER.GAMMA,
            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
            warmup_method=cfg.SOLVER.WARMUP_METHOD,
        )
    elif name == "WarmupCosineLR":
        return WarmupCosineLR(
            optimizer,
            cfg.SOLVER.MAX_ITER,
            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
            warmup_method=cfg.SOLVER.WARMUP_METHOD,
        )
    else:
        raise ValueError("Unknown LR scheduler: {}".format(name))
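

# Illustrative sketch, not part of the upstream file: the optimizer and
# scheduler as they would be driven in a minimal training step. The model,
# data, and loss below are placeholders (assumptions), not detectron2 APIs.
def _example_training_step() -> None:
    from detectron2.config import get_cfg

    cfg = get_cfg()
    model = torch.nn.Linear(16, 4)  # stand-in for a detectron2 model
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    inputs, targets = torch.randn(8, 16), torch.randn(8, 4)
    loss = torch.nn.functional.mse_loss(model(inputs), targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()  # clips gradients first if SOLVER.CLIP_GRADIENTS.ENABLED is True
    scheduler.step()  # detectron2 schedulers step once per iteration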