import argparse
import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

try:
    # noinspection PyUnresolvedReferences
    from apex import amp
except ImportError:
    amp = None

from config import get_config


def parse_option():
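    """Parse command-line arguments and build the experiment config via get_config()."""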
parser = argparse.ArgumentParser('Mini-Swin training and evaluation script', add_help=False)
parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', )
parser.add_argument(
"--opts",
help="Modify config options by adding 'KEY VALUE' pairs. ",
default=None,
nargs='+',
)
# easy config modification
parser.add_argument('--batch-size', type=int, help="batch size for single GPU")
parser.add_argument('--data-path', type=str, help='path to dataset')
parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset')
    parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'],
                        help='no: no cache, '
                             'full: cache all data, '
                             'part: shard the dataset into non-overlapping pieces and cache only one piece')
parser.add_argument('--resume', help='resume from checkpoint')
parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps")
parser.add_argument('--use-checkpoint', action='store_true',
help="whether to use gradient checkpointing to save memory")
parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'],
help='mixed precision opt level, if O0, no amp is used')
parser.add_argument('--output', default='output', type=str, metavar='PATH',
help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)')
parser.add_argument('--tag', help='tag of experiment')
parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
parser.add_argument('--throughput', action='store_true', help='Test throughput only')
parser.add_argument('--load_tar', action='store_true', help='whether to load data from tar files')
# distributed training
parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel')
# Training
parser.add_argument('--total_train_epoch', default=-1, type=int, help='the total number of epochs for training')
    parser.add_argument('--resume_weight_only', action='store_true', help='whether to restore model weights only; used to initialize multi-stage training')
    parser.add_argument('--base_lr', default=-1.0, type=float, help='the base learning rate')
    parser.add_argument('--weight_decay', default=-1.0, type=float, help='the weight decay value')
    parser.add_argument('--drop_path_rate', default=-1.0, type=float, help='the drop path rate')
    parser.add_argument('--train_224to384', action='store_true', help='whether to finetune from resolution 224 to 384')
# MiniViT - Weight Distillation
parser.add_argument('--do_distill', action='store_true', help='start distillation')
parser.add_argument('--teacher', default='', type=str, metavar='PATH', help='the path for teacher model')
parser.add_argument('--temperature', default=1.0, type=float,
help='the temperature for distillation loss')
parser.add_argument('--alpha', default=0.0, type=float, help='the weight to balance the soft label loss and ground-truth label loss')
parser.add_argument('--ar', default=1, type=int, help='The number of relative heads')
    parser.add_argument('--student_layer_list', default='11', type=str, help='the indices of the student layers used for the distillation loss')
    parser.add_argument('--teacher_layer_list', default='23', type=str, help='the indices of the teacher layers used for the distillation loss')
    parser.add_argument('--attn_loss', action='store_true', help='whether to use the attention loss')
    parser.add_argument('--hidden_loss', action='store_true', help='whether to use the hidden loss along with the attention loss')
    parser.add_argument('--hidden_weight', default=1.0, type=float, help='the weight for the hidden loss')
    parser.add_argument('--hidden_relation', action='store_true', help='whether to use the hidden relation loss')
    parser.add_argument('--qkv_weight', default=1.0, type=float, help='the weight for the qkv loss')
    parser.add_argument('--is_student', action='store_true', help='if True, additional linear layers are created for the hidden MSE loss')
    parser.add_argument('--fit_size_c', default=-1, type=int, help='when positive, the output dimension of the linear layers created for the hidden MSE loss is set to this number')
## MiniViT - Weight Transformation
parser.add_argument('--is_sep_layernorm', action='store_true', help='whether to use separate layer normalization in each shared layer')
parser.add_argument('--is_transform_ffn', action='store_true', help='whether to use transformations for FFN')
parser.add_argument('--is_transform_heads', action='store_true', help='whether to use transformations for MSA')
args = parser.parse_args()
config = get_config(args)
    return args, config


def load_checkpoint(config, model, optimizer, lr_scheduler, logger):
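    """Resume training state from config.MODEL.RESUME (local path or URL).

    Loads model weights (optionally adapting the head and relative position bias
    tables for 224->384 finetuning) and, unless in EVAL_MODE / RESUME_WEIGHT_ONLY,
    the optimizer, lr_scheduler, epoch and amp states. Returns the best accuracy
    recorded in the checkpoint, or 0.0 if unavailable.
    """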
    logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
if config.MODEL.RESUME.startswith('https'):
checkpoint = torch.hub.load_state_dict_from_url(
config.MODEL.RESUME, map_location='cpu', check_hash=True)
else:
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
model_state = checkpoint['model']
if config.TRAIN.TRAIN_224TO384:
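        # 224->384 finetuning: re-initialize the classifier head if its shape differs
        # from the checkpoint, and drop window-size-dependent buffers before loading.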
mnames = ['head.weight', 'head.bias'] # (cls, 1024), (cls, )
now_model_state = model.state_dict()
if mnames[-1] in model_state:
ckpt_head_bias = model_state[mnames[-1]]
if ckpt_head_bias.shape != model.head.bias.shape:
for mname in mnames:
p = model_state[mname].new_zeros(now_model_state[mname].shape)
if mname.endswith('.weight'):
trunc_normal_(p, std=.02)
elif mname.endswith('.bias'):
nn.init.constant_(p, 0)
else:
assert 0
model_state[mname] = p
# drop attn mask
for k in list(model_state.keys()):
if 'attn_mask' in k or 'relative_position_index' in k:
model_state.pop(k)
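        # Resize relative position bias tables to the new window size via bicubic interpolation.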
mode = 'interpolate'
for key in list(model_state.keys()):
value = model_state[key]
if 'relative_position_bias_table' in key:
l, nh = value.size()
l2, nh2 = now_model_state[key].size()
l2 = int(l2 ** 0.5)
sl = int(l ** 0.5)
if sl == 13:
pad = 5
elif sl == 27:
pad = 10
else:
assert sl in [23, 47], sl
continue
if mode == "interpolate":
# table: (L, num_heads)
                    value = F.interpolate(value.permute(1, 0).view(1, nh, sl, sl), size=(l2, l2), mode='bicubic')  # (1, nh, l2, l2)
value = value.reshape(nh, l2 * l2).permute(1, 0)
model_state[key] = value
if config.TRAIN.TRAIN_224TO384:
model.load_state_dict(model_state, strict=False)
else:
model.load_state_dict(model_state, strict=True)
max_accuracy = 0.0
    if config.EVAL_MODE or config.DISTILL.RESUME_WEIGHT_ONLY:
        logger.info("==============> Loading model weights only (EVAL_MODE or RESUME_WEIGHT_ONLY)....................")
del checkpoint
torch.cuda.empty_cache()
return max_accuracy
if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        if optimizer is not None and not config.DISTILL.STOP_LOADING_OPTIMIZER:
            try:
                # Optimizer.load_state_dict() does not accept a `strict` argument
                optimizer.load_state_dict(checkpoint['optimizer'])
            except Exception:
                logger.info('==============> Inconsistency occurred! Skipping loading optimizer...')
        if lr_scheduler is not None:
            try:
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            except Exception:
                # handle checkpoints where the scheduler state was saved wrapped in a tuple
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'][0])
config.defrost()
config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
config.freeze()
if 'amp' in checkpoint and config.AMP_OPT_LEVEL != "O0" and checkpoint['config'].AMP_OPT_LEVEL != "O0":
amp.load_state_dict(checkpoint['amp'])
logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})")
if 'max_accuracy' in checkpoint:
max_accuracy = checkpoint['max_accuracy']
if not config.EVAL_MODE and 'epoch' in checkpoint:
config.defrost()
config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
config.freeze()
if not config.EVAL_MODE and 'epoch' in checkpoint:
if optimizer is not None and not config.DISTILL.STOP_LOADING_OPTIMIZER:
try:
logger.info('Try loading optimizer (2nd trial)')
optimizer.load_state_dict(checkpoint['optimizer'])
logger.info('=> optimizer loaded successfully')
            except Exception:
logger.info('==============> Inconsistency occurred! Skipping loading optimizer...')
del checkpoint
torch.cuda.empty_cache()
    return max_accuracy


def save_checkpoint(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger):
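    """Save model, optimizer, lr_scheduler (and amp) state to config.OUTPUT/ckpt_epoch_<epoch>.pth."""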
save_state = {'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'max_accuracy': max_accuracy,
'epoch': epoch,
'config': config}
if config.AMP_OPT_LEVEL != "O0":
save_state['amp'] = amp.state_dict()
    if lr_scheduler is not None:
        save_state['lr_scheduler'] = lr_scheduler.state_dict()
save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
logger.info(f"{save_path} saving......")
torch.save(save_state, save_path)
    logger.info(f"{save_path} saved !!!")


def get_grad_norm(parameters, norm_type=2):
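    """Return the overall gradient norm of `parameters` (norm of the per-parameter gradient norms)."""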
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))
norm_type = float(norm_type)
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item() ** norm_type
total_norm = total_norm ** (1. / norm_type)
    return total_norm


def auto_resume_helper(output_dir):
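    """Return the most recently modified .pth checkpoint in `output_dir`, or None if none exists."""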
checkpoints = os.listdir(output_dir)
checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth')]
    print(f"All checkpoints found in {output_dir}: {checkpoints}")
if len(checkpoints) > 0:
latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime)
        print(f"The latest checkpoint found: {latest_checkpoint}")
resume_file = latest_checkpoint
else:
resume_file = None
    return resume_file


def reduce_tensor(tensor, n=None):
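    """All-reduce `tensor` (sum) across processes and divide by `n` (defaults to the world size)."""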
if n is None:
n = dist.get_world_size()
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt = rt / n
return rt