train_size = min(8, len(train))  # check if the model overfits on small data, to ensure the DNN is actually effective
dev_size = min(8, len(dev))
min_training_batches = 4
train_batch_size = min(32, max(1, train_size // min_training_batches))
evaluation_batch_size = min(1_024, dev_size)
def get_max_len(arrays):
    # length of the longest array in a collection
    return max([len(array) for array in arrays])


def pad(array, max_len):
    # right-pad an array with NaNs up to max_len
    return list(np.pad(array, pad_width=(0, max_len - len(array)), constant_values=np.nan))


def get_all_nodes(model):
    # collect the nodes of every named child layer into a nested list
    network_nodes = []
    layers = model.named_children()
    for i, layer in enumerate(layers):
        layer_nodes_formatted = []
        sub_layer = layer[-1]  # (name, module) tuple -> module
        for sub_layer_node in sub_layer:
            layer_nodes_formatted.append(sub_layer_node)
        network_nodes.append(layer_nodes_formatted)
    return network_nodes


def get_summary_agg(df, agg=["mean"], precision=2):
    # aggregate per-sample records to one row per (Epoch, Train_Time, Subset) ...
    df = (
        df
        .groupby(["Epoch", "Train_Time", "Subset"])
        .agg({"Loss": agg, "Accuracy": ["mean"]})
        .round(precision)
    )
    df.columns = list(map("_".join, df.columns.values))
    # ... then pivot the subsets into columns such as Loss_mean_Train / Accuracy_mean_Dev
    df = (
        df
        .reset_index()
        .pivot(
            index=["Epoch", "Train_Time"],
            columns="Subset",
            # values="Accuracy"
        )
    )
    df.columns = list(map("_".join, df.columns.values))
    # should not be part of data collection:
    # df["Generalization_Gap"] = df["Loss_mean_Dev"] - df["Loss_mean_Train"]
    df = df.reset_index()
    return df
# @torch.compile(mode="reduce-overhead")
def train_batch(model, optimizer, loss, x, y, train_dl_len, batch_idx, device, accum_iter=1, k_frac=None):
    x = x.half()
    y = y.half()
    model.train()
    # forward pass
    proba = model(x)
    loss_array = loss(proba, y)
    loss_scalar = loss_array.mean()
    # backward pass
    optimizer.zero_grad(set_to_none=True)  # clear gradients accumulated by backpropagation
    loss_scalar.backward()
    # weights update; if accum_iter != 1 -> gradient accumulation
    batch_num = batch_idx + 1
    if (batch_num % accum_iter == 0) or (batch_num == train_dl_len):
        optimizer.step()


# @torch.compile(mode="reduce-overhead")
def train_epoch(dl, model, optimizer, loss, train_dl_len, device, eval=False, k_frac=None):
    epoch_losses = []
    epoch_accuracies = []
    for batch_idx, (x, y) in enumerate(dl):
        train_batch(model, optimizer, loss, x, y, train_dl_len, batch_idx, device, accum_iter=1, k_frac=k_frac)
        if eval:
            temp = eval_batch(model, x, y, loss, device)
            epoch_losses += temp[0]
            epoch_accuracies += temp[1]
    return epoch_losses, epoch_accuracies


# @torch.compile(mode="reduce-overhead")
def eval_batch(model, x, y, loss, device):
    x = x.half()
    y = y.half()
    model.eval()
    with torch.inference_mode():  # turn off history tracking
        # forward pass
        proba = model(x)
        loss_value = loss(proba, y)
        epoch_loss_array = loss_value.detach()  # per-sample losses for this batch
        true = model.predict_from_proba(y)
        pred = model.predict_from_proba(proba)
        epoch_accuracy_array = (pred == true)
    return epoch_loss_array, epoch_accuracy_array


# @torch.compile(mode="reduce-overhead")
def eval_epoch(dl, model, loss, device):
    epoch_losses = []
    epoch_accuracies = []
    for batch_idx, (x, y) in enumerate(dl):
        temp = eval_batch(model, x, y, loss, device)
        epoch_losses += temp[0]
        epoch_accuracies += temp[1]
    return epoch_losses, epoch_accuracies


def train_model(train_dl, dev_dl, model, loss, optimizer, n_epochs, device,
                train_eval_every=10, dev_eval_every=10, agg=None, k_frac=None, log=False):
    args = (train_dl, dev_dl, model, loss, optimizer, n_epochs, device,
            train_eval_every, dev_eval_every, agg, k_frac, log)
    print(f"\nTraining with {args}")
    model = model.to(device).half()
    model.train()
    summary_list = []
    train_dl_len = len(train_dl)
    print_epoch_every = dev_eval_every
    train_time = 0
    for epoch in range(1, n_epochs + 1):
        print_epoch = False
        eval_train = False
        eval_dev = False
        if epoch == 1 or epoch == n_epochs:
            eval_train = True
            eval_dev = True
            if log:
                print_epoch = True
        if epoch % train_eval_every == 0:
            eval_train = True
        if epoch % dev_eval_every == 0:
            eval_dev = True
        if epoch % print_epoch_every == 0:
            print_epoch = True
        if print_epoch:
            print(f"Epoch {epoch}/{n_epochs} started", end="")
        start_time = time.time()
        epoch_train_losses, epoch_train_accuracies = train_epoch(
            train_dl, model, optimizer, loss, train_dl_len, device, eval=eval_train, k_frac=k_frac
        )
        end_time = time.time()
        duration = end_time - start_time
        train_time += duration
        if eval_dev:
            epoch_dev_losses, epoch_dev_accuracies = eval_epoch(dev_dl, model, loss, device)
        else:
            epoch_dev_losses, epoch_dev_accuracies = [], []
        for e, a in zip(epoch_train_losses, epoch_train_accuracies):
            summary_list.append([epoch, train_time, "Train", float(e), float(a)])
        for e, a in zip(epoch_dev_losses, epoch_dev_accuracies):
            summary_list.append([epoch, train_time, "Dev", float(e), float(a)])
        if print_epoch:
            print(", completed")
    model.eval()
    summary = pd.DataFrame(columns=["Epoch", "Train_Time", "Subset", "Loss", "Accuracy"], data=summary_list)
    if agg is not None:
        summary = summary.pipe(get_summary_agg, agg)
    return summary
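For reference, a minimal invocation sketch (not part of the code above). The criterion and optimizer choices here are assumptions for illustration; note the criterion must use `reduction="none"`, because `train_batch` reduces the per-sample losses itself and `eval_batch` keeps them per-sample:

loss = nn.CrossEntropyLoss(reduction="none")              # per-sample losses; an assumed choice
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # assumed optimizer/lr
summary = train_model(
    train_dl, dev_dl, model, loss, optimizer,
    n_epochs=100, device="cuda",
    train_eval_every=10, dev_eval_every=10,
    agg=["mean"], log=True,
)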
Create a way to bind the loss function to the network once, so that accumulated gradients are cleared automatically whenever the backward pass is performed.
class Loss(Value):
    def __init__(self, bound_network):
        self.bound_network = bound_network

    def __call__(self, batch_size=None):
        # loss function definition (sketch: data_loss and reg_loss come from the batch)
        self.data = data_loss + reg_loss

    def backward(self):
        # clear the gradients of the bound network before backpropagating
        self.bound_network.zero_grad()
        super().backward()


total_loss = Loss(bound_network=model)
for k in range(100):
    # ...
    # model.zero_grad() is no longer needed here: since total_loss is bound to
    # the network, backward() performs the zero_grad() automatically
    total_loss.backward()
    # ...
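To make the exercise concrete, here is a minimal, self-contained toy version. The `Value` internals follow the standard micrograd pattern; `BoundLoss` and `TinyNet` are hypothetical names introduced only for this sketch:

class Value:
    # minimal micrograd-style autograd node
    def __init__(self, data, _children=()):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda: None

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other))
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def backward(self):
        # topological order, then apply the chain rule in reverse
        topo, seen = [], set()
        def build(v):
            if v not in seen:
                seen.add(v)
                for child in v._prev:
                    build(child)
                topo.append(v)
        build(self)
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()


class BoundLoss(Value):
    # identity node on top of the loss graph that zeroes the bound
    # network's gradients before every backward pass
    def __init__(self, inner, bound_network):
        super().__init__(inner.data, (inner,))
        def _backward():
            inner.grad += self.grad
        self._backward = _backward
        self.bound_network = bound_network

    def backward(self):
        self.bound_network.zero_grad()  # cleared automatically
        super().backward()


class TinyNet:
    # stand-in network: one parameter and a zero_grad() method
    def __init__(self):
        self.w = Value(2.0)

    def zero_grad(self):
        self.w.grad = 0.0


net = TinyNet()
for k in range(3):
    data_loss = net.w + net.w                             # stand-in for a real forward pass
    total_loss = BoundLoss(data_loss, bound_network=net)
    total_loss.backward()                                 # no manual net.zero_grad() needed
print(net.w.grad)                                         # 2.0 each iteration, not accumulated to 6.0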
def plot_summary(df, x, y):
    df = df.copy()
    c = "Optimizer"
    if "Accuracy" in y and "Generalization" not in y:
        sub_title = "Higher is better"
        percentage = True
    else:
        sub_title = "Lower is better"
        percentage = False
    if percentage:
        df[y] *= 100
    if "Accuracy" in y and "Generalization" not in y:
        range_y = [0, 100]
    else:
        range_y = [0, df[df[y] > 0][y].quantile(0.90) * 1.1]
    title = f'{y.replace("_", " ")}'
    title += f"<br><sup>{sub_title}</sup>"
    facet_row = "Train_Batch_Size"
    fig = px.line(
        data_frame=df,
        x=x,
        y=y,
        facet_col="Learning_Rate",
        facet_row=facet_row,
        facet_row_spacing=0.1,
        color=c,
        title=title,
        range_x=[df[x].values.min(), df[x].values.max()],
        range_y=range_y,
        markers=True,
    )
    n_rows = df[facet_row].unique().shape[0]
    fig.update_layout(height=300 * n_rows)
    fig.update_traces(
        patch={
            "marker": {"size": 5},
            "line": {"width": 1},
        }
    )
    fig.update_traces(connectgaps=True)  # required for connecting dev accuracies
    st.plotly_chart(fig, use_container_width=True)
    return fig
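A typical call might look as follows (a sketch: `summary.csv` is the file written by `train_models` below, and the column name `Accuracy_mean_Dev` assumes `agg=["mean"]` was passed through to `get_summary_agg`):

summaries = pd.read_csv("summary.csv")
fig = plot_summary(summaries, x="Epoch", y="Accuracy_mean_Dev")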
import inspect


def train_models(loss, model, n_epochs, optimizer_names, learning_rates, train_batch_sizes, device,
                 agg=["mean"], train_eval_every=10, dev_eval_every=10, log=False, output_path="summary.csv"):
    train_size = min(2_048, len(train))  # check if the model overfits on small data, to ensure the DNN is actually effective
    dev_size = min(2_048, len(dev))
    train_random_sampler = RandomSampler(train, num_samples=train_size)
    dev_random_sampler = RandomSampler(dev, num_samples=dev_size)
    evaluation_batch_size = 2_048
    if evaluation_batch_size > dev_size:
        raise Exception("Evaluation batch size > dev size")
    for train_batch_size in train_batch_sizes:
        if evaluation_batch_size > train_size:
            raise Exception("Evaluation batch size > train size")
        train_dl = DataLoader(
            train,
            sampler=train_random_sampler,
            batch_size=train_batch_size,
            drop_last=True,
            # num_workers=1,
        )
        dev_dl = DataLoader(
            dev,
            sampler=dev_random_sampler,
            batch_size=evaluation_batch_size,
            drop_last=True,
            # num_workers=1,
        )
        for learning_rate in learning_rates:
            if learning_rate > 0.0100:
                raise Exception("Very high learning rate")
            for optimizer_name in optimizer_names:
                model_copy = copy.deepcopy(model)
                optimizer = getattr(optim_class, optimizer_name)
                optimizer_kwargs = dict(params=model_copy.parameters(), lr=learning_rate)
                if "eps" in list(inspect.getfullargspec(optimizer.__init__)[0]):
                    optimizer_kwargs.update(eps=1e-4)  # the default eps would underflow in half precision
                optimizer = optimizer(**optimizer_kwargs)
                # move any optimizer state tensors to the device, in half precision
                for state in optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = torch.as_tensor(v, device=device).half()
                summary = train_model(
                    train_dl, dev_dl, model_copy, loss, optimizer, n_epochs,
                    device=device,
                    train_eval_every=train_eval_every,
                    dev_eval_every=dev_eval_every,
                    log=log,
                    agg=agg,
                )
                summary["Model"] = str(get_all_nodes(model_copy))
                summary["Optimizer"] = optimizer_name
                summary["Learning_Rate"] = learning_rate
                summary["Train_Batch_Size"] = train_batch_size
                # keeping all summaries in one DataFrame was disabled due to its
                # space complexity; append each run to the CSV instead
                summary.to_csv(output_path, index=False, mode="a", header=not os.path.exists(output_path))
                gc.collect(0)
    return None
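A sketch of launching the grid search (the optimizer names must exist as attributes of `optim_class`, and the specific values below are illustrative, chosen to stay under the `0.0100` learning-rate guard):

train_models(
    loss=loss,
    model=model,
    n_epochs=100,
    optimizer_names=["SGD", "Adam", "RMSprop"],  # resolved via getattr(optim_class, name)
    learning_rates=[1e-4, 1e-3, 1e-2],
    train_batch_sizes=[32, 256],
    device=device,
)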
model = NeuralNet(
    init_data=train,
    hidden_layers=[
        nn.Flatten(),
        nn.LazyLinear(10),
        nn.ReLU(),
        # nn.LazyLinear(10),
        # nn.ReLU(),
        # nn.Sigmoid()  # not required
    ],
)