s2vt-pipelines

Last updated on July 17, 2023

A few pipelines for S2VT (sequence to sequence – video to text).

utils
import numpy as np
import torch
import torch.nn as nn
import json
import imageio.v3 as iiov3
import torchvision.transforms as transforms
from torch.utils.data import Dataset

def load_vocab(if_reverse=False):
    """Load the vocabulary used to index caption words.

    Args:
        if_reverse: False -> the dict maps words to indices
                    True  -> the dict maps indices to words
    Returns:
        vocab: a dict containing every word from the train/val split and its index
               (index 0 is reserved for padding)
        count: the total number of entries in the vocabulary
    """
    vocab = {}
    vocab['<pad>'] = 0
    vocab[' '] = 1
    vocab['<BOS>'] = 2
    vocab['<EOS>'] = 3
    vocab['<UNK>'] = 4
    count = 5
    with open('./dset/msrvtt/train_val_annotation/train_val_videodatainfo.json', mode='r') as p:
        sentences = json.load(p)['sentences']
    for sentence in sentences:
        for word in sentence['caption'].split():
            if word not in vocab:
                vocab[word] = count
                count += 1
    if if_reverse:
        vocab = {idx: word for word, idx in vocab.items()}  # index 0 still maps to <pad>
    return vocab, count


def load_annotation(max_lens):
    """Load the train and validation annotations.

    Args:
        max_lens: maximum number of word indices per caption
    Returns:
        train: a list of (caption_indices, video_id) pairs for the training split
        val:   a list of (caption_indices, video_id) pairs for the validation split
        Each caption is truncated to max_lens - 1 words, terminated with <EOS>,
        and zero-padded to exactly max_lens indices.
    """
    vocab, _ = load_vocab(if_reverse=False)
    train = []
    val = []
    with open('./dset/msrvtt/train_val_annotation/train_val_videodatainfo.json', mode='r') as p:
        load = json.load(p)
        sentence_json = load['sentences']
        video_json = load['videos']
    for i in sentence_json:
        str_caption = i['caption']
        str_id = i['video_id']
        caption_indice = [vocab[word] for word in str_caption.split()]
        caption_indice = caption_indice[:max_lens - 1]
        caption_indice.append(3)  # <EOS>
        caption_indice = torch.tensor(np.array(caption_indice), dtype=torch.int64)
        caption_indice = torch.nn.functional.pad(caption_indice, (0, max_lens - caption_indice.size(0)), mode='constant')  # pads with 0 = <pad>
        num = int(str_id[5:])  # "videoXXXX" -> XXXX
        if video_json[num]['split'] == 'train':
            train.append((caption_indice, str_id))
        else:
            val.append((caption_indice, str_id))
    return train, val

def transform_initial_img(img):
    """Pre-process a single frame: resize to 256x256, then random-crop to 224x224."""
    process = torch.nn.Sequential(
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
    )
    return process(img)

def load_videos(name, max_frames):
    """Load a video and return a fixed-length tensor of frames.

    Args:
        name: str, id of the video to load (e.g. "video1234")
        max_frames: int, maximum number of frames to keep
    Returns:
        tensorimg: tensor of shape (max_frames, C, H, W), zero-padded along time
    """
    address = './dset/msrvtt/train_val_video/' + name + '.mp4'
    tmp = iiov3.imread(address, plugin='pyav')  # (T, H, W, C)
    tmp_shape = tmp.shape[0]
    index = np.arange(0, min(tmp_shape, max_frames * 10), 10)  # keep every 10th frame
    tmp = tmp[index]
    tmp = torch.tensor(tmp)
    tmp = torch.swapaxes(tmp, 1, 3)  # (T, C, W, H)
    tmp = torch.swapaxes(tmp, 2, 3)  # (T, C, H, W)
    frames = []
    for per in range(tmp.shape[0]):
        cur = transform_initial_img(tmp[per])
        frames.append(torch.reshape(cur, (1, 3, 224, 224)))
    tensorimg = torch.cat(frames, dim=0)
    if tensorimg.size(0) < max_frames:
        pad = max_frames - tensorimg.size(0)
        padtensor = torch.zeros([pad, 3, 224, 224])
        tensorimg = torch.cat((tensorimg, padtensor), dim=0)
    tensorimg = tensorimg.type(dtype=torch.float32)  # conv2d expects float32
    return tensorimg

class msrvtt_train_dataloader(Dataset):
    """Training dataset for MSR-VTT (a subclass of Dataset).

    __getitem__ returns:
        img:     (T, C, H, W) frame tensor with T = max_frames
        caption: (max_lens,) tensor of word indices
    The DataLoader then batches these into (N, T, C, H, W) and (N, max_lens).
    """
    def __init__(self, max_frames, max_lens):
        self.annotation, _ = load_annotation(max_lens)
        self.max_frames = max_frames

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        curr_caption, curr_name = self.annotation[index]
        curr_img_sequence = load_videos(curr_name, self.max_frames)
        return curr_img_sequence, curr_caption

class msrvtt_val_dataloader(Dataset):
    """Validation dataset for MSR-VTT (a subclass of Dataset).

    __getitem__ returns the same (img, caption) pair as the training dataset,
    built from the validation split.
    """
    def __init__(self, max_lens, max_frames):
        _, self.annotation = load_annotation(max_lens)
        self.max_frames = max_frames

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        curr_caption, curr_name = self.annotation[index]
        curr_img_sequence = load_videos(curr_name, self.max_frames)
        return curr_img_sequence, curr_caption
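
A quick way to sanity-check this pipeline (my addition; it assumes the MSR-VTT videos and annotation JSON sit under ./dset/msrvtt/ exactly as in the paths above) is to pull one sample and one batch and confirm the shapes the model will see:

from torch.utils.data import DataLoader
from utils import msrvtt_train_dataloader

# one sample: frames padded to max_frames, caption padded to max_lens
dset = msrvtt_train_dataloader(max_frames=40, max_lens=28)
img, cap = dset[0]
print(img.shape)  # expected: torch.Size([40, 3, 224, 224])
print(cap.shape)  # expected: torch.Size([28])

# one batch: the DataLoader stacks samples into (N, T, C, H, W) and (N, max_lens)
loader = DataLoader(dset, batch_size=2, shuffle=True)
batch_img, batch_cap = next(iter(loader))
print(batch_img.shape, batch_cap.shape)  # (2, 40, 3, 224, 224), (2, 28)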
Model
s2vt
class s2vtmodel(nn.Module):
    """S2VT (sequence to sequence -- video to text) model, a subclass of nn.Module.

    Args:
        vocab_size: size of the vocabulary
        batch_size: batch size (the model pre-allocates padding tensors per batch)
        CNN: a CNN instance (an object, not a class) used to extract frame features
        device: torch device the padding tensors live on
        max_lens: maximum number of words in a caption
        max_frames: maximum number of frames per video
        img_feature_size: size of the per-frame feature produced by the CNN
        hidden_size: size of the LSTM hidden state
        embedding_dim: dimension of the word embeddings

    Inputs:
        img_seq: (N, T, C, H, W), T limited to max_frames (80 by default)
        cap_seq: (N, T) word indices to be embedded, T limited to max_lens (28 by default)
    """
    def __init__(self,
                 vocab_size,
                 batch_size,
                 CNN,
                 device,
                 max_lens=28,
                 max_frames=80,
                 img_feature_size=500,
                 hidden_size=1000,
                 embedding_dim=500,
                 ):
        super().__init__()

        self.CNNmodel = CNN
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.input_feature_size = img_feature_size
        self.hidden_size = hidden_size
        self.max_lens = max_lens
        self.max_frames = max_frames
        self.batch_size = batch_size
        self.device = device

        self.bos = 2
        self.eos = 3

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.projection = nn.Linear(hidden_size, vocab_size)

        # pre-allocated padding inputs; note they assume every batch has exactly batch_size samples
        self.padding_frame = torch.zeros((batch_size, 1, img_feature_size), device=device)       # (N, 1, img_feature_size)
        self.padding_words = torch.zeros((batch_size, max_frames, embedding_dim), device=device)  # (N, T, embedding_dim)

        self.lstm1 = nn.LSTM(
            input_size=img_feature_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.lstm2 = nn.LSTM(
            input_size=hidden_size + embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
    def forward(self, img_seq, cap_seq, mode):
        """Forward pass for the input frame tensor and caption.

        Args:
            img_seq (tensor): (N, max_frames, C, H, W)
            cap_seq (tensor): (N, max_lens)
            mode (str): 'train' or 'test'
        """
        features = self.CNNmodel(img_seq)  # (N, max_frames, img_feature_size)
        h1_out, h1_state = self.lstm1(features)  # h1_out: (N, T, hidden); h1_state: ((1, N, hidden), (1, N, hidden))
        h2_in = torch.cat((h1_out, self.padding_words), dim=2)  # (N, T, hidden + embedding_dim)
        _, h2_state = self.lstm2(h2_in)

        if mode == 'train':
            seq_train_pred = []
            for i in range(self.max_lens - 1):
                cur_word = self.embedding(cap_seq[:, i])  # (N, embedding_dim)
                cur_word = torch.reshape(cur_word, (self.batch_size, 1, self.embedding_dim))  # (N, 1, embedding_dim)
                h1_out, h1_state = self.lstm1(self.padding_frame, h1_state)
                h2_in = torch.cat((h1_out, cur_word), dim=2)
                h2_out, h2_state = self.lstm2(h2_in, h2_state)  # h2_out: (N, 1, hidden)
                h2_out = torch.squeeze(h2_out, dim=1)  # (N, hidden)
                out_scores = self.projection(h2_out)
                seq_train_pred.append(out_scores)
            return torch.stack(seq_train_pred, dim=1)
        else:
            seq_test_pred = []
            cur_word = self.embedding(torch.tensor(self.batch_size * [self.bos], device=self.device))  # start from <BOS>
            for i in range(self.max_lens - 1):
                cur_word = torch.reshape(cur_word, (self.batch_size, 1, self.embedding_dim))
                h1_out, h1_state = self.lstm1(self.padding_frame, h1_state)
                h2_in = torch.cat((h1_out, cur_word), dim=2)
                h2_out, h2_state = self.lstm2(h2_in, h2_state)
                h2_out = torch.squeeze(h2_out, dim=1)
                out_scores = self.projection(h2_out)
                out_word = torch.argmax(out_scores, dim=1)  # greedy decoding at test time
                cur_word = self.embedding(out_word)
                seq_test_pred.append(out_word)
            return torch.stack(seq_test_pred, dim=1)
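
The CNN argument is expected to map a whole clip (N, T, C, H, W) to per-frame features of shape (N, T, img_feature_size). A minimal shape check with a stand-in feature extractor (DummyCNN below is only an illustration of that interface, not part of the original code):

import torch
import torch.nn as nn
from model import s2vtmodel

class DummyCNN(nn.Module):
    """Stand-in feature extractor: maps (N, T, C, H, W) to (N, T, 500)."""
    def forward(self, x):
        n, t = x.shape[0], x.shape[1]
        return torch.zeros(n, t, 500)

device = torch.device('cpu')
model = s2vtmodel(vocab_size=100, batch_size=2, CNN=DummyCNN(), device=device, max_frames=40)
img = torch.zeros(2, 40, 3, 224, 224)
cap = torch.zeros(2, 28, dtype=torch.int64)
print(model(img, cap, 'train').shape)  # expected: (2, 27, 100) -- vocabulary scores per step
print(model(img, cap, 'test').shape)   # expected: (2, 27)      -- greedily decoded word indices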
Pre-trained CNN (VGG16)
import torch
import torch.nn as nn
from torchvision.models import vgg16

class vggmodel(nn.Module):
    """A pre-trained VGG16 feature extractor.

    The backbone is frozen; the last classifier layer is replaced by a learnable
    linear layer mapping 4096 -> 500 features. The forward pass is applied per
    frame, so the module maps (N, T, C, H, W) to (N, T, 500), which is the shape
    s2vtmodel expects from its CNN argument.
    """
    def __init__(self):
        super().__init__()
        self.model = vgg16(weights='DEFAULT', progress=True)
        for p in self.model.parameters():
            p.requires_grad = False
        self.model.classifier[6] = nn.Linear(4096, 500)

    def forward(self, x):
        # x: (N, T, C, H, W) -> fold frames into the batch dimension for the 2D CNN
        n, t = x.shape[0], x.shape[1]
        x = x.reshape(n * t, *x.shape[2:])
        features = self.model(x)
        return features.reshape(n, t, -1)  # (N, T, 500)
test
test part
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model import s2vtmodel, vggmodel
from utils import *

vocab,vocab_size=load_vocab(if_reverse=False)
batch_size=12
max_lens=28
max_frames=40
img_feature_size=500
hidden_size=1000
embedding_dim=500
learning_rate=3e-3
betas=(0.9,0.999)
eps=1e-8
num_epoch=5
prt_perup=16
grad_accu=1

train_data=DataLoader(dataset=msrvtt_train_dataloader(max_frames=max_frames,max_lens=max_lens),batch_size=batch_size,shuffle=True)
val_data=DataLoader(dataset=msrvtt_val_dataloader(max_frames=max_frames,max_lens=max_lens),batch_size=batch_size,shuffle=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#size(N,T,C,H,W) N->batch_size T->max_length(frames) C,H,W->img frame
model=s2vtmodel(vocab_size,batch_size,vggmodel(),device,max_lens,max_frames,img_feature_size,hidden_size,embedding_dim).to(device)
optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),lr=learning_rate,betas=betas,eps=eps)
# optimizer=torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),lr=learning_rate,)
loss_fn=nn.CrossEntropyLoss()
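
One detail worth flagging: with the vocabulary above, index 0 is <pad>, and CrossEntropyLoss as configured also scores the padded caption positions. A possible alternative (my suggestion, not in the original code) is to mask padding out of the loss:

loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # 0 is <pad> in load_vocab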
model.train()
loss_history = []
loss_sum = 0
for epoch in range(num_epoch):
    num_iter = 0
    for img, cap in train_data:
        num_iter += 1

        img = img.to(device)
        cap = cap.to(device)
        seq = model(img, cap, 'train')
        seq_out = torch.reshape(seq, (-1, vocab_size))
        cap = cap[:, 1:]                 # targets are the caption shifted by one step
        cap = torch.reshape(cap, (-1,))  # flatten to (N * (max_lens - 1),)

        if grad_accu != 1:
            # gradient accumulation: average the loss over grad_accu mini-batches
            loss = loss_fn(seq_out, cap) / grad_accu
            loss.backward()
            loss_sum += loss.item() * grad_accu

            if (num_iter // grad_accu) % prt_perup == 0:
                print("------------------")
                print("cur_iter: ", num_iter)
                print("cur_epoch: ", epoch)
                print(loss_sum)  # printed once per optimizer update

            if num_iter % grad_accu == 0:
                optimizer.step()
                optimizer.zero_grad()
                loss_history.append(loss_sum)
                loss_sum = 0
        else:
            loss = loss_fn(seq_out, cap)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_history.append(loss.item())
            if num_iter % prt_perup == 0:
                print("------------------")
                print("cur_iter: ", num_iter)
                print("cur_epoch: ", epoch)
                print(loss.item())  # printed once per optimizer update

    torch.save(model.state_dict(), 'net_paras_v2' + str(epoch) + '.pth')
model.eval()
vocab_reverse, _ = load_vocab(if_reverse=True)
for img, cap in val_data:
    img = img.to(device)
    out_word = model(img, None, 'test')  # cap_seq is ignored in test mode
    print(out_word)
    break
print(img.size())
print(cap)
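
The loop above prints raw index tensors, and vocab_reverse is loaded but never used. A small decoding sketch (my addition, relying on the <pad>=0 and <EOS>=3 indices from load_vocab) turns one predicted sequence back into words:

def decode_caption(indices, vocab_reverse):
    """Map a 1-D tensor of word indices to a caption string, stopping at <EOS>."""
    words = []
    for idx in indices.tolist():
        if idx == 3:   # <EOS>
            break
        if idx == 0:   # <pad>
            continue
        words.append(vocab_reverse.get(idx, '<UNK>'))
    return ' '.join(words)

print(decode_caption(out_word[0].cpu(), vocab_reverse))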
train
train
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model import s2vtmodel, vggmodel
from utils import msrvtt_train_dataloader, load_vocab

def train_forcap(train_data, model, num_epoch, device, loss_fn, optimizer, vocab_size, saving_path, grad_accu=1, prt_perup=128):
    model.train()
    loss_history = []
    loss_sum = 0
    for epoch in range(num_epoch):
        num_iter = 0
        for img, cap in train_data:
            num_iter += 1

            img = img.to(device)
            cap = cap.to(device)
            seq = model(img, cap, 'train')
            seq_out = torch.reshape(seq, (-1, vocab_size))
            cap = cap[:, 1:]                 # targets are the caption shifted by one step
            cap = torch.reshape(cap, (-1,))  # flatten to (N * (max_lens - 1),)

            if grad_accu != 1:
                # gradient accumulation: average the loss over grad_accu mini-batches
                loss = loss_fn(seq_out, cap) / grad_accu
                loss.backward()
                loss_sum += loss.item() * grad_accu

                if (num_iter // grad_accu) % prt_perup == 0:
                    print("------------------")
                    print("cur_iter: ", num_iter)
                    print("cur_epoch: ", epoch)
                    print(loss_sum)  # printed once per optimizer update

                if num_iter % grad_accu == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    loss_history.append(loss_sum)
                    loss_sum = 0
            else:
                loss = loss_fn(seq_out, cap)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss_history.append(loss.item())
                if num_iter % prt_perup == 0:
                    print("------------------")
                    print("cur_iter: ", num_iter)
                    print("cur_epoch: ", epoch)
                    print(loss.item())  # printed once per optimizer update

        torch.save(model.state_dict(), saving_path)
    return model

def main():
    vocab, vocab_size = load_vocab(if_reverse=False)
    batch_size = 2
    max_lens = 28
    max_frames = 40
    img_feature_size = 500
    hidden_size = 1000
    embedding_dim = 500
    learning_rate = 1e-3
    betas = (0.9, 0.999)
    eps = 1e-8
    num_epoch = 5
    prt_perup = 16
    grad_accu = 1

    train_data = DataLoader(dataset=msrvtt_train_dataloader(max_frames=max_frames, max_lens=max_lens), batch_size=batch_size, shuffle=True)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # frames are batched as (N, T, C, H, W): N -> batch_size, T -> max_frames, C,H,W -> a single frame
    model = s2vtmodel(vocab_size, batch_size, vggmodel(), device, max_lens, max_frames, img_feature_size, hidden_size, embedding_dim).to(device)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, betas=betas, eps=eps)
    loss_fn = nn.CrossEntropyLoss()

    trained_model = train_forcap(train_data, model, num_epoch, device, loss_fn, optimizer, vocab_size, 'saving.pth', grad_accu, prt_perup)

if __name__ == '__main__':
    main()
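
To reuse the checkpoint later (for the test-mode decoder, say), the saved state dict can be restored into a model built with the same hyperparameters as in main; a minimal sketch, assuming 'saving.pth' is the path written above:

import torch
from model import s2vtmodel, vggmodel
from utils import load_vocab

vocab, vocab_size = load_vocab(if_reverse=False)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# hyperparameters must match the ones used for training (batch_size=2, max_frames=40 in main)
model = s2vtmodel(vocab_size, 2, vggmodel(), device, max_frames=40).to(device)
model.load_state_dict(torch.load('saving.pth', map_location=device))
model.eval()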