app.py
# coding=utf8
"""================================
@Author: Mr.Chang
@Date : 2022/12/22 9:56 AM
==================================="""
# A Flask API that renders a text-to-motion result as an mp4 file and returns its path.
import os
from os.path import join as pjoin
import spacy
import torch
from flask import Flask, send_file, request, jsonify, Response
import sys
import numpy as np
sys.path.append(os.path.dirname(os.getcwd()))
from torch.utils.data import DataLoader
from data.dataset import Motion2TextEvalDataset
from networks.modules import Seq2SeqText2MotModel, VQDecoderV3
from networks.quantizer import Quantizer, EMAVectorQuantizer
from options.evaluate_options import TestT2MOptions
from utils import paramUtil
from utils.plot_script import plot_3d_motion
from utils.utils import motion_temporal_filter
from utils.word_vectorizer import WordVectorizerV2
from scripts.motion_process import recover_from_ric
app = Flask(__name__)

def plot_t2m(data, captions, save_dir):
    data = data * std + mean  # (1, 192, 263)
    for i in range(len(data)):
        joint_data = data[i]
        caption = captions[i]
        joint = recover_from_ric(torch.from_numpy(joint_data).float(), opt.joints_num).numpy()
        joint = motion_temporal_filter(joint)
        save_path = '%s_%02d.mp4' % (save_dir, i)
        np.save('%s_%02d.npy' % (save_dir, i),
                {'motion': [data], 'text': [captions], 'lengths': [len(data)],
                 'num_samples': 1, 'num_repetitions': 1})
        # np.save('%s_%02d.npy' % (save_dir, i), joint)
        plot_3d_motion(save_path, kinematic_chain, joint, title=caption, fps=fps, radius=radius)
    return save_path

class DataProcess:
    """Turns a raw text prompt into the word/POS embeddings expected by the model."""

    def __init__(self, opt, mean, std, w_vectorizer, nlp):
        self.opt = opt
        self.mean = mean
        self.std = std
        self.w_vectorizer = w_vectorizer
        self.nlp = nlp

    def process(self, data):
        # Tokenize the sentence with spaCy; keep only alphabetic tokens and
        # lemmatize nouns/verbs (except the word 'left').
        sentence = data.strip()
        doc = self.nlp(sentence)
        word_list = []
        pos_list = []
        for token in doc:
            word = token.text
            if not word.isalpha():
                continue
            if (token.pos_ == 'NOUN' or token.pos_ == 'VERB') and (word != 'left'):
                word_list.append(token.lemma_)
            else:
                word_list.append(word)
            pos_list.append(token.pos_)
        return word_list, pos_list

    def inv_transform(self, data):
        data = data * self.std + self.mean
        return data

    def embeded(self, sentence):
        word_list, pos_list = self.process(sentence.strip())
        tokens = ['%s/%s' % (word_list[i], pos_list[i]) for i in range(len(word_list))]
        if len(tokens) < self.opt.max_text_len:
            # Wrap with sos/eos, then pad with unk up to max_text_len + 2 tokens.
            tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
            sent_len = len(tokens)
            tokens = tokens + ['unk/OTHER'] * (self.opt.max_text_len + 2 - sent_len)
        else:
            # Crop to max_text_len, then wrap with sos/eos.
            tokens = tokens[:self.opt.max_text_len]
            sent_len = len(tokens)
            tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
        pos_one_hots = []
        word_embeddings = []
        for token in tokens:
            word_emb, pos_oh, _ = self.w_vectorizer[token]
            pos_one_hots.append(pos_oh[None, :])
            word_embeddings.append(word_emb[None, :])
        pos_one_hots = np.concatenate(pos_one_hots, axis=0)
        word_embeddings = np.concatenate(word_embeddings, axis=0)
        return word_embeddings, pos_one_hots, [sentence.strip()], torch.tensor([sent_len])

def get_chunk(file_name, start, end):
    file_size = os.path.getsize(file_name)
    start = 0 if start is None else int(start)
    length = file_size - start if end is None else int(end) - start
    with open(file_name, 'rb') as f:
        f.seek(start)
        data = f.read(length)
    return data, start, length, file_size

def build_models(opt, dec_channels):
    vq_decoder = VQDecoderV3(opt.dim_vq_latent, dec_channels, opt.n_resblk, opt.n_down)
    quantizer = None
    if opt.q_mode == 'ema':
        quantizer = EMAVectorQuantizer(opt.codebook_size, opt.dim_vq_latent, opt.lambda_beta)
    elif opt.q_mode == 'cmt':
        quantizer = Quantizer(opt.codebook_size, opt.dim_vq_latent, opt.lambda_beta)

    # Load the pretrained VQ decoder and quantizer.
    checkpoint = torch.load(pjoin(opt.checkpoints_dir, opt.dataset_name, opt.tokenizer_name, 'model', 'finest.tar'),
                            map_location=opt.device)
    vq_decoder.load_state_dict(checkpoint['vq_decoder'])
    quantizer.load_state_dict(checkpoint['quantizer'])

    # Load the pretrained text-to-motion-token translation model.
    t2m_model = Seq2SeqText2MotModel(300, n_mot_vocab, opt.dim_txt_hid, opt.dim_mot_hid,
                                     opt.n_mot_layers, opt.device, opt.early_or_late)
    checkpoint = torch.load(pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name, 'model', 'finest.tar'),
                            map_location=opt.device)
    t2m_model.load_state_dict(checkpoint['t2m_model'])
    print('Loading t2m_model: Epoch %03d Total_Iter %03d' % (checkpoint['ep'], checkpoint['total_it']))
    return vq_decoder, quantizer, t2m_model

@app.route('/t2video', methods=['POST'])
def video():
    params = request.get_json()
    text = params.get('text', None)
    if text is None:
        return jsonify({'status': 'error', 'msg': 'text is None'}), 400

    vq_decoder.to(opt.device)
    quantizer.to(opt.device)
    t2m_model.to(opt.device)
    vq_decoder.eval()
    quantizer.eval()
    t2m_model.eval()

    # Embed the prompt and move the inputs to the target device.
    word_emb, pos_one_hots, captions, cap_lens = data_process.embeded(text)
    word_emb = torch.from_numpy(word_emb).float().unsqueeze(0).to(opt.device)
    pos_one_hots = torch.from_numpy(pos_one_hots).float().unsqueeze(0).to(opt.device)
    cap_lens = cap_lens.to(opt.device)

    # Sample motion tokens from the text, then decode them back into a motion sequence.
    pred_tokens, len_map = t2m_model.sample_batch(word_emb, cap_lens, trg_sos=opt.mot_start_idx,
                                                  trg_eos=opt.mot_end_idx, max_steps=49, top_k=opt.top_k)
    pred_tokens = pred_tokens[:, 1:len_map[0] + 1]  # drop the sos token
    vq_latent = quantizer.get_codebook_entry(pred_tokens)
    gen_motion = vq_decoder(vq_latent)

    sub_dict = {}
    sub_dict['motion'] = gen_motion.cpu().detach().numpy()
    sub_dict['length'] = len(gen_motion[0])

    joint_save_path = pjoin(opt.joint_dir, 'joint')
    animation_save_path = pjoin(opt.animation_dir, 'animation')
    os.makedirs(joint_save_path, exist_ok=True)
    os.makedirs(animation_save_path, exist_ok=True)

    # Render the generated motion to an mp4 and return its path to the client
    # (the chunk read below is not used; only the file path is returned).
    file_name = plot_t2m(sub_dict['motion'], captions,
                         pjoin(animation_save_path, 'gen_motion_L%03d' % (sub_dict['motion'].shape[1])))
    data, start, length, file_size = get_chunk(file_name, 0, 0)
    current_path = os.getcwd()
    data_path = os.path.join(current_path, file_name)
    return jsonify({'status': 'success', 'data': data_path}), 200
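
# Example request (a sketch, not part of the original file): once the server is
# running, a client can POST a JSON body with a "text" field. The endpoint and
# port match the route and app.run call in this file; the prompt is illustrative.
#
#   curl -X POST http://localhost:8800/t2video \
#        -H "Content-Type: application/json" \
#        -d '{"text": "a person walks forward and waves"}'
#
# The response is JSON whose "data" field holds the path of the rendered mp4.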

if __name__ == '__main__':
    parser = TestT2MOptions()
    opt = parser.parse()
    opt.device = torch.device("cpu" if opt.gpu_id == -1 else "cuda:" + str(opt.gpu_id))
    torch.autograd.set_detect_anomaly(True)
    if opt.gpu_id != -1:
        torch.cuda.set_device(opt.gpu_id)

    opt.result_dir = pjoin(opt.result_path, opt.dataset_name, opt.name, opt.ext)
    opt.joint_dir = pjoin(opt.result_dir, 'joints')
    opt.animation_dir = pjoin(opt.result_dir, 'animations')

    if opt.dataset_name == 't2m':
        opt.data_root = './dataset/HumanML3D/'
        opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
        opt.m_token_dir = pjoin(opt.data_root, 'VQVAEV3_CB1024_CMT_H1024_NRES3')
        opt.text_dir = pjoin(opt.data_root, 'texts')
        opt.joints_num = 22
        opt.max_motion_token = 55
        opt.max_motion_frame = 196
        dim_pose = 263
        radius = 4
        fps = 20
        kinematic_chain = paramUtil.t2m_kinematic_chain
    elif opt.dataset_name == 'kit':
        opt.data_root = './dataset/KIT'
        opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
        opt.m_token_dir = pjoin(opt.data_root, 'VQVAEV3_CB1024_CMT_H1024_NRES3')
        opt.text_dir = pjoin(opt.data_root, 'texts')
        opt.joints_num = 21
        radius = 240 * 8
        fps = 12.5
        dim_pose = 251
        opt.max_motion_token = 55
        opt.max_motion_frame = 196
        kinematic_chain = paramUtil.kit_kinematic_chain
    else:
        raise KeyError('Dataset Does Not Exist')

    mean = np.load(pjoin(opt.checkpoints_dir, opt.dataset_name, opt.tokenizer_name, 'meta', 'mean.npy'))
    std = np.load(pjoin(opt.checkpoints_dir, opt.dataset_name, opt.tokenizer_name, 'meta', 'std.npy'))

    # Motion-token vocabulary: codebook entries plus sos/eos/pad.
    n_mot_vocab = opt.codebook_size + 3
    opt.mot_start_idx = opt.codebook_size
    opt.mot_end_idx = opt.codebook_size + 1
    opt.mot_pad_idx = opt.codebook_size + 2

    enc_channels = [1024, opt.dim_vq_latent]
    dec_channels = [opt.dim_vq_latent, 1024, dim_pose]

    w_vectorizer = WordVectorizerV2('./text2motion/glove', 'our_vab')
    n_txt_vocab = len(w_vectorizer) + 1
    _, _, opt.txt_start_idx = w_vectorizer['sos/OTHER']
    _, _, opt.txt_end_idx = w_vectorizer['eos/OTHER']
    opt.txt_pad_idx = len(w_vectorizer)

    vq_decoder, quantizer, t2m_model = build_models(opt, dec_channels)
    split_file = pjoin(opt.data_root, opt.split_file)
    nlp = spacy.load('en_core_web_sm')
    data_process = DataProcess(opt, mean, std, w_vectorizer, nlp)
    app.run(debug=False, host='0.0.0.0', port=8800)
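
# Launch sketch (an assumption, not from the original file): the flag names below
# are guesses based on the option fields this script reads (opt.dataset_name,
# opt.name, opt.tokenizer_name, opt.gpu_id); check TestT2MOptions for the real CLI.
#
#   python app.py --dataset_name t2m --name <experiment_name> \
#       --tokenizer_name <tokenizer_checkpoint_name> --gpu_id 0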