s2vt-pipelines

Last updated on July 17, 2023

A few pipelines for S2VT (sequence to sequence – video to text).

utils
import numpy as np
import torch
import torch.nn as nn
import json
import imageio.v3 as iiov3
import torchvision.transforms as transforms
from torch.utils.data import Dataset

def load_vocab(if_reverse=False):
    """Load the vocabulary used to index caption words.

    Args:
        if_reverse: False -> the dict maps words to indices
                    True  -> the dict maps indices to words
    Returns:
        vocab: a dict containing every word from the train/val split and its index
               (index 0 is reserved for padding)
        count: the total number of entries in the vocabulary
    """
    vocab = {}
    vocab['<pad>'] = 0
    vocab[' '] = 1
    vocab['<BOS>'] = 2
    vocab['<EOS>'] = 3
    vocab['<UNK>'] = 4
    count = 5
    with open('./dset/msrvtt/train_val_annotation/train_val_videodatainfo.json', mode='r') as p:
        sentences = json.load(p)['sentences']
    for sentence in sentences:
        for word in sentence['caption'].split():
            if word not in vocab:
                vocab[word] = count
                count += 1
    if if_reverse:
        vocab = {idx: word for word, idx in vocab.items()}  # index 0 still maps to <pad>
    return vocab, count


def load_annotation(max_lens):
    """Load the train and validation annotations.

    Args:
        max_lens: maximum number of word indices per caption
    Returns:
        train: a list of (caption_indices, video_id) pairs for the training split
        val:   a list of (caption_indices, video_id) pairs for the validation split
        Each caption is truncated to max_lens - 1 words, terminated with <EOS>,
        and zero-padded to exactly max_lens indices.
    """
    vocab, _ = load_vocab(if_reverse=False)
    train = []
    val = []
    with open('./dset/msrvtt/train_val_annotation/train_val_videodatainfo.json', mode='r') as p:
        load = json.load(p)
        sentence_json = load['sentences']
        video_json = load['videos']
    for i in sentence_json:
        str_caption = i['caption']
        str_id = i['video_id']
        caption_indice = [vocab[word] for word in str_caption.split()]
        caption_indice = caption_indice[:max_lens - 1]
        caption_indice.append(3)  # <EOS>
        caption_indice = torch.tensor(np.array(caption_indice), dtype=torch.int64)
        caption_indice = torch.nn.functional.pad(caption_indice, (0, max_lens - caption_indice.size(0)), mode='constant')  # pads with 0 = <pad>
        num = int(str_id[5:])  # "videoXXXX" -> XXXX
        if video_json[num]['split'] == 'train':
            train.append((caption_indice, str_id))
        else:
            val.append((caption_indice, str_id))
    return train, val

def transform_initial_img(img):
    """Pre-process a single frame: resize to 256x256, then random-crop to 224x224."""
    process = torch.nn.Sequential(
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
    )
    return process(img)

def load_videos(name, max_frames):
    """Load a video and return a fixed-length tensor of frames.

    Args:
        name: str, id of the video to load (e.g. "video1234")
        max_frames: int, maximum number of frames to keep
    Returns:
        tensorimg: tensor of shape (max_frames, C, H, W), zero-padded along time
    """
    address = './dset/msrvtt/train_val_video/' + name + '.mp4'
    tmp = iiov3.imread(address, plugin='pyav')  # (T, H, W, C)
    tmp_shape = tmp.shape[0]
    index = np.arange(0, min(tmp_shape, max_frames * 10), 10)  # keep every 10th frame
    tmp = tmp[index]
    tmp = torch.tensor(tmp)
    tmp = torch.swapaxes(tmp, 1, 3)  # (T, C, W, H)
    tmp = torch.swapaxes(tmp, 2, 3)  # (T, C, H, W)
    frames = []
    for per in range(tmp.shape[0]):
        cur = transform_initial_img(tmp[per])
        frames.append(torch.reshape(cur, (1, 3, 224, 224)))
    tensorimg = torch.cat(frames, dim=0)
    if tensorimg.size(0) < max_frames:
        pad = max_frames - tensorimg.size(0)
        padtensor = torch.zeros([pad, 3, 224, 224])
        tensorimg = torch.cat((tensorimg, padtensor), dim=0)
    tensorimg = tensorimg.type(dtype=torch.float32)  # conv2d expects float32
    return tensorimg

class msrvtt_train_dataloader(Dataset):
    """Training dataset for MSR-VTT (a subclass of Dataset).

    __getitem__ returns:
        img:     (T, C, H, W) frame tensor with T = max_frames
        caption: (max_lens,) tensor of word indices
    The DataLoader then batches these into (N, T, C, H, W) and (N, max_lens).
    """
    def __init__(self, max_frames, max_lens):
        self.annotation, _ = load_annotation(max_lens)
        self.max_frames = max_frames

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        curr_caption, curr_name = self.annotation[index]
        curr_img_sequence = load_videos(curr_name, self.max_frames)
        return curr_img_sequence, curr_caption

class msrvtt_val_dataloader(Dataset):
    """Validation dataset for MSR-VTT (a subclass of Dataset).

    __getitem__ returns the same (img, caption) pair as the training dataset,
    built from the validation split.
    """
    def __init__(self, max_lens, max_frames):
        _, self.annotation = load_annotation(max_lens)
        self.max_frames = max_frames

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        curr_caption, curr_name = self.annotation[index]
        curr_img_sequence = load_videos(curr_name, self.max_frames)
        return curr_img_sequence, curr_caption
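
A quick way to sanity-check this pipeline (my addition; it assumes the MSR-VTT videos and annotation JSON sit under ./dset/msrvtt/ exactly as in the paths above) is to pull one sample and one batch and confirm the shapes the model will see:

from torch.utils.data import DataLoader
from utils import msrvtt_train_dataloader

# one sample: frames padded to max_frames, caption padded to max_lens
dset = msrvtt_train_dataloader(max_frames=40, max_lens=28)
img, cap = dset[0]
print(img.shape)  # expected: torch.Size([40, 3, 224, 224])
print(cap.shape)  # expected: torch.Size([28])

# one batch: the DataLoader stacks samples into (N, T, C, H, W) and (N, max_lens)
loader = DataLoader(dset, batch_size=2, shuffle=True)
batch_img, batch_cap = next(iter(loader))
print(batch_img.shape, batch_cap.shape)  # (2, 40, 3, 224, 224), (2, 28)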
Model
s2vt
class s2vtmodel(nn.Module):
    """S2VT (sequence to sequence -- video to text) model, a subclass of nn.Module.

    Args:
        vocab_size: size of the vocabulary
        batch_size: batch size (the model pre-allocates padding tensors per batch)
        CNN: a CNN instance (an object, not a class) used to extract frame features
        device: torch device the padding tensors live on
        max_lens: maximum number of words in a caption
        max_frames: maximum number of frames per video
        img_feature_size: size of the per-frame feature produced by the CNN
        hidden_size: size of the LSTM hidden state
        embedding_dim: dimension of the word embeddings

    Inputs:
        img_seq: (N, T, C, H, W), T limited to max_frames (80 by default)
        cap_seq: (N, T) word indices to be embedded, T limited to max_lens (28 by default)
    """
    def __init__(self,
                 vocab_size,
                 batch_size,
                 CNN,
                 device,
                 max_lens=28,
                 max_frames=80,
                 img_feature_size=500,
                 hidden_size=1000,
                 embedding_dim=500,
                 ):
        super().__init__()

        self.CNNmodel = CNN
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.input_feature_size = img_feature_size
        self.hidden_size = hidden_size
        self.max_lens = max_lens
        self.max_frames = max_frames
        self.batch_size = batch_size
        self.device = device

        self.bos = 2
        self.eos = 3

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.projection = nn.Linear(hidden_size, vocab_size)

        # pre-allocated padding inputs; note they assume every batch has exactly batch_size samples
        self.padding_frame = torch.zeros((batch_size, 1, img_feature_size), device=device)       # (N, 1, img_feature_size)
        self.padding_words = torch.zeros((batch_size, max_frames, embedding_dim), device=device)  # (N, T, embedding_dim)

        self.lstm1 = nn.LSTM(
            input_size=img_feature_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.lstm2 = nn.LSTM(
            input_size=hidden_size + embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
    def forward(self, img_seq, cap_seq, mode):
        """Forward pass for the input frame tensor and caption.

        Args:
            img_seq (tensor): (N, max_frames, C, H, W)
            cap_seq (tensor): (N, max_lens)
            mode (str): 'train' or 'test'
        """
        features = self.CNNmodel(img_seq)  # (N, max_frames, img_feature_size)
        h1_out, h1_state = self.lstm1(features)  # h1_out: (N, T, hidden); h1_state: ((1, N, hidden), (1, N, hidden))
        h2_in = torch.cat((h1_out, self.padding_words), dim=2)  # (N, T, hidden + embedding_dim)
        _, h2_state = self.lstm2(h2_in)

        if mode == 'train':
            seq_train_pred = []
            for i in range(self.max_lens - 1):
                cur_word = self.embedding(cap_seq[:, i])  # (N, embedding_dim)
                cur_word = torch.reshape(cur_word, (self.batch_size, 1, self.embedding_dim))  # (N, 1, embedding_dim)
                h1_out, h1_state = self.lstm1(self.padding_frame, h1_state)
                h2_in = torch.cat((h1_out, cur_word), dim=2)
                h2_out, h2_state = self.lstm2(h2_in, h2_state)  # h2_out: (N, 1, hidden)
                h2_out = torch.squeeze(h2_out, dim=1)  # (N, hidden)
                out_scores = self.projection(h2_out)
                seq_train_pred.append(out_scores)
            return torch.stack(seq_train_pred, dim=1)
        else:
            seq_test_pred = []
            cur_word = self.embedding(torch.tensor(self.batch_size * [self.bos], device=self.device))  # start from <BOS>
            for i in range(self.max_lens - 1):
                cur_word = torch.reshape(cur_word, (self.batch_size, 1, self.embedding_dim))
                h1_out, h1_state = self.lstm1(self.padding_frame, h1_state)
                h2_in = torch.cat((h1_out, cur_word), dim=2)
                h2_out, h2_state = self.lstm2(h2_in, h2_state)
                h2_out = torch.squeeze(h2_out, dim=1)
                out_scores = self.projection(h2_out)
                out_word = torch.argmax(out_scores, dim=1)  # greedy decoding at test time
                cur_word = self.embedding(out_word)
                seq_test_pred.append(out_word)
            return torch.stack(seq_test_pred, dim=1)
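
The CNN argument is expected to map a whole clip (N, T, C, H, W) to per-frame features of shape (N, T, img_feature_size). A minimal shape check with a stand-in feature extractor (DummyCNN below is only an illustration of that interface, not part of the original code):

import torch
import torch.nn as nn
from model import s2vtmodel

class DummyCNN(nn.Module):
    """Stand-in feature extractor: maps (N, T, C, H, W) to (N, T, 500)."""
    def forward(self, x):
        n, t = x.shape[0], x.shape[1]
        return torch.zeros(n, t, 500)

device = torch.device('cpu')
model = s2vtmodel(vocab_size=100, batch_size=2, CNN=DummyCNN(), device=device, max_frames=40)
img = torch.zeros(2, 40, 3, 224, 224)
cap = torch.zeros(2, 28, dtype=torch.int64)
print(model(img, cap, 'train').shape)  # expected: (2, 27, 100) -- vocabulary scores per step
print(model(img, cap, 'test').shape)   # expected: (2, 27)      -- greedily decoded word indices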
Pre-trained CNN (VGG16)
import torch
import torch.nn as nn
from torchvision.models import vgg16

class vggmodel(nn.Module):
    """A pre-trained VGG16 feature extractor.

    The backbone is frozen; the last classifier layer is replaced by a learnable
    linear layer mapping 4096 -> 500 features. The forward pass is applied per
    frame, so the module maps (N, T, C, H, W) to (N, T, 500), which is the shape
    s2vtmodel expects from its CNN argument.
    """
    def __init__(self):
        super().__init__()
        self.model = vgg16(weights='DEFAULT', progress=True)
        for p in self.model.parameters():
            p.requires_grad = False
        self.model.classifier[6] = nn.Linear(4096, 500)

    def forward(self, x):
        # x: (N, T, C, H, W) -> fold frames into the batch dimension for the 2D CNN
        n, t = x.shape[0], x.shape[1]
        x = x.reshape(n * t, *x.shape[2:])
        features = self.model(x)
        return features.reshape(n, t, -1)  # (N, T, 500)
test
test part
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model import s2vtmodel, vggmodel
from utils import *

vocab,vocab_size=load_vocab(if_reverse=False)
batch_size=12
max_lens=28
max_frames=40
img_feature_size=500
hidden_size=1000
embedding_dim=500
learning_rate=3e-3
betas=(0.9,0.999)
eps=1e-8
num_epoch=5
prt_perup=16
grad_accu=1

train_data=DataLoader(dataset=msrvtt_train_dataloader(max_frames=max_frames,max_lens=max_lens),batch_size=batch_size,shuffle=True)
val_data=DataLoader(dataset=msrvtt_val_dataloader(max_frames=max_frames,max_lens=max_lens),batch_size=batch_size,shuffle=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#size(N,T,C,H,W) N->batch_size T->max_length(frames) C,H,W->img frame
model=s2vtmodel(vocab_size,batch_size,vggmodel(),device,max_lens,max_frames,img_feature_size,hidden_size,embedding_dim).to(device)
optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),lr=learning_rate,betas=betas,eps=eps)
# optimizer=torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),lr=learning_rate,)
loss_fn=nn.CrossEntropyLoss()
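
One detail worth flagging: with the vocabulary above, index 0 is <pad>, and CrossEntropyLoss as configured also scores the padded caption positions. A possible alternative (my suggestion, not in the original code) is to mask padding out of the loss:

loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # 0 is <pad> in load_vocab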
model.train()
loss_history = []
loss_sum = 0
for epoch in range(num_epoch):
    num_iter = 0
    for img, cap in train_data:
        num_iter += 1

        img = img.to(device)
        cap = cap.to(device)
        seq = model(img, cap, 'train')
        seq_out = torch.reshape(seq, (-1, vocab_size))
        cap = cap[:, 1:]                 # targets are the caption shifted by one step
        cap = torch.reshape(cap, (-1,))  # flatten to (N * (max_lens - 1),)

        if grad_accu != 1:
            # gradient accumulation: average the loss over grad_accu mini-batches
            loss = loss_fn(seq_out, cap) / grad_accu
            loss.backward()
            loss_sum += loss.item() * grad_accu

            if (num_iter // grad_accu) % prt_perup == 0:
                print("------------------")
                print("cur_iter: ", num_iter)
                print("cur_epoch: ", epoch)
                print(loss_sum)  # printed once per optimizer update

            if num_iter % grad_accu == 0:
                optimizer.step()
                optimizer.zero_grad()
                loss_history.append(loss_sum)
                loss_sum = 0
        else:
            loss = loss_fn(seq_out, cap)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_history.append(loss.item())
            if num_iter % prt_perup == 0:
                print("------------------")
                print("cur_iter: ", num_iter)
                print("cur_epoch: ", epoch)
                print(loss.item())  # printed once per optimizer update

    torch.save(model.state_dict(), 'net_paras_v2' + str(epoch) + '.pth')
model.eval()
vocab_reverse, _ = load_vocab(if_reverse=True)
for img, cap in val_data:
    img = img.to(device)
    out_word = model(img, None, 'test')  # cap_seq is ignored in test mode
    print(out_word)
    break
print(img.size())
print(cap)
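
The loop above prints raw index tensors, and vocab_reverse is loaded but never used. A small decoding sketch (my addition, relying on the <pad>=0 and <EOS>=3 indices from load_vocab) turns one predicted sequence back into words:

def decode_caption(indices, vocab_reverse):
    """Map a 1-D tensor of word indices to a caption string, stopping at <EOS>."""
    words = []
    for idx in indices.tolist():
        if idx == 3:   # <EOS>
            break
        if idx == 0:   # <pad>
            continue
        words.append(vocab_reverse.get(idx, '<UNK>'))
    return ' '.join(words)

print(decode_caption(out_word[0].cpu(), vocab_reverse))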
train
train
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model import s2vtmodel, vggmodel
from utils import msrvtt_train_dataloader, load_vocab

def train_forcap(train_data, model, num_epoch, device, loss_fn, optimizer, vocab_size, saving_path, grad_accu=1, prt_perup=128):
    model.train()
    loss_history = []
    loss_sum = 0
    for epoch in range(num_epoch):
        num_iter = 0
        for img, cap in train_data:
            num_iter += 1

            img = img.to(device)
            cap = cap.to(device)
            seq = model(img, cap, 'train')
            seq_out = torch.reshape(seq, (-1, vocab_size))
            cap = cap[:, 1:]                 # targets are the caption shifted by one step
            cap = torch.reshape(cap, (-1,))  # flatten to (N * (max_lens - 1),)

            if grad_accu != 1:
                # gradient accumulation: average the loss over grad_accu mini-batches
                loss = loss_fn(seq_out, cap) / grad_accu
                loss.backward()
                loss_sum += loss.item() * grad_accu

                if (num_iter // grad_accu) % prt_perup == 0:
                    print("------------------")
                    print("cur_iter: ", num_iter)
                    print("cur_epoch: ", epoch)
                    print(loss_sum)  # printed once per optimizer update

                if num_iter % grad_accu == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    loss_history.append(loss_sum)
                    loss_sum = 0
            else:
                loss = loss_fn(seq_out, cap)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss_history.append(loss.item())
                if num_iter % prt_perup == 0:
                    print("------------------")
                    print("cur_iter: ", num_iter)
                    print("cur_epoch: ", epoch)
                    print(loss.item())  # printed once per optimizer update

        torch.save(model.state_dict(), saving_path)
    return model

def main():
    vocab, vocab_size = load_vocab(if_reverse=False)
    batch_size = 2
    max_lens = 28
    max_frames = 40
    img_feature_size = 500
    hidden_size = 1000
    embedding_dim = 500
    learning_rate = 1e-3
    betas = (0.9, 0.999)
    eps = 1e-8
    num_epoch = 5
    prt_perup = 16
    grad_accu = 1

    train_data = DataLoader(dataset=msrvtt_train_dataloader(max_frames=max_frames, max_lens=max_lens), batch_size=batch_size, shuffle=True)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # frames are batched as (N, T, C, H, W): N -> batch_size, T -> max_frames, C,H,W -> a single frame
    model = s2vtmodel(vocab_size, batch_size, vggmodel(), device, max_lens, max_frames, img_feature_size, hidden_size, embedding_dim).to(device)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, betas=betas, eps=eps)
    loss_fn = nn.CrossEntropyLoss()

    trained_model = train_forcap(train_data, model, num_epoch, device, loss_fn, optimizer, vocab_size, 'saving.pth', grad_accu, prt_perup)

if __name__ == '__main__':
    main()
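
To reuse the checkpoint later (for the test-mode decoder, say), the saved state dict can be restored into a model built with the same hyperparameters as in main; a minimal sketch, assuming 'saving.pth' is the path written above:

import torch
from model import s2vtmodel, vggmodel
from utils import load_vocab

vocab, vocab_size = load_vocab(if_reverse=False)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# hyperparameters must match the ones used for training (batch_size=2, max_frames=40 in main)
model = s2vtmodel(vocab_size, 2, vggmodel(), device, max_frames=40).to(device)
model.load_state_dict(torch.load('saving.pth', map_location=device))
model.eval()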