Multi-trial Detection NAS With Efficiency Rewards

Define NAS Model

Defining a model is almost the same as defining a PyTorch model. You need to replace the import statement import torch.nn as nn with import nni.retiarii.nn.pytorch as nn and add the @model_wrapper decorator at the beginning of the model class.

from nni.retiarii import model_wrapper

@model_wrapper
class Model(nn.Module):

Define Changeable Modules

NASC3 is a variation of the original CSP block (C3 module); it can adjust the number of output channels of cv1 and cv2.

import nni.retiarii.nn.pytorch as nn

class NASC3(nn.Module):
    """Searchable variant of the CSP bottleneck with 3 convolutions (C3).

    The constructor builds one ``NASC3sub`` candidate per scale in
    (1.0, 1.5, 2.0) and wraps them in a ``LayerChoice`` labelled
    ``c3_<id>``. It also creates the cv1/cv2/cv3 convolutions and the
    bottleneck stack.
    """

    def __init__(self, c1, c2, inputshape=(), id=0, n=1, shortcut=True, g=1, e=0.5):
        """Build the candidate blocks and the base C3 layers.

        Args:
            c1: number of input channels.
            c2: number of output channels.
            inputshape: forwarded unchanged to every ``NASC3sub`` candidate.
            id: index used to build the ``LayerChoice`` label.
            n: number of ``Bottleneck`` modules stacked in ``self.m``.
            shortcut: forwarded to ``Bottleneck`` and ``NASC3sub``.
            g: groups, forwarded to ``Bottleneck`` and ``NASC3sub``.
            e: expansion ratio; the hidden width is ``int(c2 * e)``.
        """
        super().__init__()
        hidden = int(c2 * e)  # hidden channel count shared by cv1, cv2 and the bottlenecks

        # One candidate per scale; the last positional argument is the scale
        # (presumably a multiplier on the channel width -- confirm in NASC3sub).
        candidates = [
            NASC3sub(c1, c2, inputshape, id, n, shortcut, g, e, scale)
            for scale in (1.0, 1.5, 2.0)
        ]
        self.total = LayerChoice(candidates, label="c3_{}".format(id))

        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)  # optional act=FReLU(c2)
        bottlenecks = [Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*bottlenecks)

NASConv is a variation of the original Conv module; it can adjust the kernel size and padding of the convolution and choose between different activations.

import nni.retiarii.nn.pytorch as nn

class NASConv(nn.Module):
    """Searchable variant of the standard convolution block.

    The constructor collects alternative ``nn.Conv2d`` layers (different
    kernel size / padding) whose output shape matches the baseline
    convolution and exposes them through a ``LayerChoice``. A second
    ``LayerChoice`` selects the activation (SiLU, Identity or ReLU).
    """
    # Standard convolution
    def __init__(self, c1, c2, inputshape=(), id=0, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        """Build the convolution and activation candidates.

        Args:
            c1: number of input channels.
            c2: number of output channels.
            inputshape: input shape passed to ``conv_2d_output_shape``.
            id: index used to build the ``LayerChoice`` labels.
            k: baseline kernel size.
            s: stride.
            p: baseline padding; ``autopad(k, p)`` is used to resolve it.
            g: groups.
            act: not read in the code shown here.
        """
        super().__init__()

        # The baseline convolution is always the first candidate.
        choice = [nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)]
        # conv_2d_output_shape checks the output shape of convolutions (to make sure output size is the same)
        outputshape = conv_2d_output_shape(inputshape, k, s, autopad(k, p))
        # Try kernels 2 smaller and 2 larger than the baseline ...
        for offsetk in (-2 , 2):
                # ... combined with padding offsets from 0 (or -p when p is given) up to 3.
                for offsetpad in range(0 if p is None else -1*p,4):
                    # Keep only the combinations that reproduce the baseline output shape.
                    if conv_2d_output_shape(inputshape, k+offsetk , s, autopad(k, p)+offsetpad) == outputshape:
                        choice.append(nn.Conv2d(c1, c2, k+offsetk, s, autopad(k, p)+offsetpad, groups=g, bias=False))

        self.conv  = LayerChoice(choice, label="nasconv_{}".format(id))
        # Extra convolution with the baseline configuration.
        # NOTE(review): its purpose is not visible in this snippet -- confirm where self.shape is used.
        self.shape = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        act_choice = [nn.SiLU(), nn.Identity(), nn.ReLU()]  # activation choices
        self.act = LayerChoice(act_choice, label="nasconv_{}_act".format(id))

Change Yaml File

YOLOv5s NAS model’s YAML (backbone part only); see the full yolov5s_nas.yaml

# YOLOv5 v6.0 backbone
# NASConv and NASC3 are the searchable variants of the Conv and C3 modules.
backbone:
  # [from, number, module, args]
  [[-1, 1, NASConv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, NASConv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, NASC3, [128]],  # 2
   [-1, 1, NASConv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, NASC3, [256]],  # 4
   [-1, 1, NASConv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, NASC3, [512]],  # 6
   [-1, 1, NASConv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, NASC3, [1024]],  # 8
   [-1, 1, SPPF, [1024, 5]]  # 9
  ]

Set Up The User-defined NAS Model

import yaml

device = "cuda:0"
hyp_path = "data/hyps/hyp.scratch-low.yaml"  # hyper-parameters in yolov5
cfg = "./models/yolov5s_nas.yaml"  # yaml file

# Load the hyper-parameter file into a dict first: calling .get('anchors')
# on the path string itself would raise AttributeError.
with open(hyp_path, errors='ignore') as f:
    hyp = yaml.safe_load(f)

# 'anchors' is looked up with .get(), so a missing key is passed on as None.
model_space = Model(cfg=cfg, ch=3, nc=80, anchors=hyp.get('anchors')).to(device)

Explore The Defined Model Space

Pick An Exploration Strategy

NNI supports many exploration strategies. Simply choose (i.e., instantiate) one as shown below.

import nni.retiarii.strategy as strategy
# Random exploration strategy with deduplication enabled.
search_strategy = strategy.Random(dedup=True)  # dedup=False if deduplication is not wanted

Customize A Model Evaluator

Set up the parameters (e.g., batch size, epochs) for the model


def evaluate_model(model_detect):
    """Train and validate one sampled architecture and report its score to NNI.

    Args:
        model_detect: callable that returns the model to evaluate when
            called without arguments.
    """
    # Instantiate the sampled model and move it to the GPU when one is available.
    model = model_detect()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Parameters
    # Hyper-parameters are read from the YOLOv5 hyp file into a dict.
    with open('data/hyps/hyp.scratch-low.yaml', errors='ignore') as f:
        hyp = yaml.safe_load(f)
    WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1))
    from utils.general import colorstr

    # Data settings
    train_path = "/home/raytjchen/Desktop/code/datasets/coco128/images/train2017"
    imgsz = 640
    gs = 32
    single_cls = False

    # Training settings
    batch_size = 64
    nbs = 64  # nominal batch size
    epochs = 20  # how many epochs to train for a single choice

Create optimizer and scheduler for Yolov5

    from torch.optim import lr_scheduler
    from utils.torch_utils import smart_optimizer

    # Optimizer
    # Number of batches whose gradients are accumulated before each optimizer step.
    accumulate = max(round(nbs / batch_size), 1)
    # Scale weight_decay by the effective batch size relative to the nominal one.
    hyp['weight_decay'] = hyp['weight_decay'] * (batch_size * accumulate / nbs)
    optimizer = smart_optimizer(model, 'SGD', hyp['lr0'], hyp['momentum'], hyp['weight_decay'])

    # Scheduler
    def lf(x):
        """Linear LR multiplier: 1.0 at x == 0, hyp['lrf'] at x == epochs."""
        return (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf']

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)

Create the training and validation dataloaders on the COCO dataset

    from utils.dataloaders import create_dataloader

    # Batch size handled by each process.
    per_process_batch = batch_size // WORLD_SIZE

    # Create Trainloader (augmented, shuffled, non-rectangular batches)
    train_loader, dataset = create_dataloader(
        train_path, imgsz, per_process_batch, gs, single_cls,
        hyp=hyp, augment=True, cache=None, rect=False, rank=-1, workers=0,
        image_weights=False, quad=False, prefix=colorstr('train: '), shuffle=True)

    # Testloader (rectangular batches, twice the training batch size);
    # note that it points at the same image folder as the training loader.
    val_path = "/home/raytjchen/Desktop/code/datasets/coco128/images/train2017"
    val_loader = create_dataloader(
        val_path, imgsz, per_process_batch * 2, gs, single_cls,
        hyp=hyp, cache=None, rect=True, rank=-1, workers=0, pad=0.5,
        prefix=colorstr('val: '))[0]

Specify the model’s attributes

    from utils.general import labels_to_class_weights

    # Model attributes
    num_classes = 80
    hyp['obj'] = hyp['obj'] * (imgsz / 640) ** 2  # scale to image size and layers
    hyp['label_smoothing'] = 0.0
    model.nc = num_classes  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    # Class weights computed from the training labels, then scaled by the class count.
    class_weights = labels_to_class_weights(dataset.labels, num_classes).to(device)
    model.class_weights = class_weights * num_classes  # attach class weights
    model.names = "nas_yolov5s"

Start training

    import time
    import numpy as np
    from tqdm import tqdm
    from utils.loss import NASComputeLoss
    import val as validate

    start_time = time.time()  # wall-clock start, used to measure the total train + validation time
    nb = len(train_loader)  # number of batches per epoch
    nw = max(round(hyp['warmup_epochs'] * nb), 100)  # warmup iterations (not consumed by the loop shown here)
    last_opt_step = -1
    nc = 80
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = - 1  # do not move
    scaler = torch.cuda.amp.GradScaler(enabled=False)  # constructed with AMP scaling disabled
    compute_loss = NASComputeLoss(model=model, h=hyp)
    # The dataset description does not change between epochs, so load it once.
    data_dict = check_dataset('data/coco128.yaml')

    for epoch in range(epochs):

        model.train()
        mloss = torch.zeros(3, device=device)  # running mean of the three loss items
        pbar = enumerate(train_loader)
        pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')

        optimizer.zero_grad()

        for batch_idx, (imgs, targets, paths, _) in pbar:

            ni = batch_idx + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0
            pred = model(imgs)
            loss, loss_items = compute_loss(pred, targets.to(device))
            scaler.scale(loss).backward()

            # Optimizer step once `accumulate` batches have been integrated
            if ni - last_opt_step >= accumulate:
                scaler.unscale_(optimizer)  # unscale gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip gradients
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                last_opt_step = ni

            mloss = (mloss * batch_idx + loss_items) / (batch_idx + 1)  # update mean losses
            # end batch ------------------------------------------------------------------------------------------------------------------------

        # Scheduler step
        scheduler.step()

        # Validate
        results, maps, _ = validate.run(data_dict,
                                batch_size=batch_size // WORLD_SIZE * 2,
                                imgsz=imgsz,
                                half=False,
                                model=model,
                                single_cls=single_cls,
                                dataloader=val_loader,
                                save_dir="./output/",
                                plots=False,
                                #callbacks=callbacks,
                                compute_loss=compute_loss)
        # results[3] is mAP@.5-.95. Scale it BEFORE any integer conversion:
        # int(results[3]) would truncate a value in [0, 1] to 0 and every
        # intermediate report would be 0.
        nni.report_intermediate_result(float(results[3]) * 1000)

    # report final test result
    # Elapsed time is end minus start, so it is positive and can be used as a cost.
    model_time = time.time() - start_time
    nni.report_final_result(results[3] * 1000)

Add efficiency term into our decision function (optional)

    # report final test result
    # Efficiency-aware score: the scaled mAP (results[3] * 1000) minus alpha times model_time.
    # NOTE(review): this only penalises slow models if model_time is a positive
    # elapsed time (end minus start) -- confirm how model_time is computed.
    alpha = 0.03  # weight of the time term
    nni.report_final_result(results[3] * 1000 - model_time * alpha) 

Create The Evaluator

from nni.retiarii.evaluator import FunctionalEvaluator
# Wrap the evaluation function so that it can be passed to the experiment as its evaluator.
evaluator = FunctionalEvaluator(evaluate_model)

Launch An Experiment

After all the above are prepared, it is time to start an experiment to do the model search. An example is shown below.

# Wrap the evaluation function so that it can be passed to the experiment as its evaluator.
from nni.retiarii.evaluator import FunctionalEvaluator
evaluator = FunctionalEvaluator(evaluate_model)

from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
# Arguments: model space, evaluator, an empty list (presumably the applied
# mutators -- confirm against the NNI API) and the search strategy.
exp = RetiariiExperiment(model_space, evaluator, [], search_strategy)
# Configuration created with the 'local' setting.
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'yolov5s_nas_search'

The following configurations are useful to control how many trials to run at most / at the same time.

exp_config.max_trial_number = 4   # spawn 4 trials at most
exp_config.trial_concurrency = 1  # run at most one trial at a time

Remember to set the following config if you want to use a GPU. use_active_gpu should be set to True if you wish to use an occupied GPU (possibly running a GUI).

exp_config.trial_gpu_number = 1  # GPUs per trial
exp_config.training_service.use_active_gpu = True  # allow using a GPU that is already occupied (e.g. running a GUI)

Launch the experiment

# Start the search with the configuration above.
# NOTE(review): the second argument is presumably the port of the NNI web UI -- confirm against the NNI API.
exp.run(exp_config, 8083)

Export Best Model

import json

# Materialise the exported models once so they can be printed and saved
# without relying on the loop variable surviving the loop.
top_models = list(exp.export_top_models(formatter='dict'))
for model_dict in top_models:
    print(model_dict)

save_json_path = "./yolov5s_nas.json"
if top_models:
    # Save the last exported model, i.e. the one the original loop variable ended on.
    with open(save_json_path, 'w') as fp:
        json.dump(top_models[-1], fp)
else:
    # Nothing was exported: skip writing instead of failing with a NameError.
    print("No model was exported; {} was not written.".format(save_json_path))

Full Code On Github

hello_nas.py