# video_action_preprocess.py
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from collections.abc import Sequence

import cv2
import numpy as np
import paddle
from PIL import Image
  19. class Sampler(object):
  20. """
  21. Sample frames id.
  22. NOTE: Use PIL to read image here, has diff with CV2
  23. Args:
  24. num_seg(int): number of segments.
  25. seg_len(int): number of sampled frames in each segment.
  26. valid_mode(bool): True or False.
  27. Returns:
  28. frames_idx: the index of sampled #frames.
  29. """
  30. def __init__(self,
  31. num_seg,
  32. seg_len,
  33. frame_interval=None,
  34. valid_mode=True,
  35. dense_sample=False,
  36. linspace_sample=False,
  37. use_pil=True):
  38. self.num_seg = num_seg
  39. self.seg_len = seg_len
  40. self.frame_interval = frame_interval
  41. self.valid_mode = valid_mode
  42. self.dense_sample = dense_sample
  43. self.linspace_sample = linspace_sample
  44. self.use_pil = use_pil
  45. def _get(self, frames_idx, results):
  46. data_format = results['format']
  47. if data_format == "frame":
  48. frame_dir = results['frame_dir']
  49. imgs = []
  50. for idx in frames_idx:
  51. img = Image.open(
  52. os.path.join(frame_dir, results['suffix'].format(
  53. idx))).convert('RGB')
  54. imgs.append(img)
  55. elif data_format == "video":
  56. if results['backend'] == 'cv2':
  57. frames = np.array(results['frames'])
  58. imgs = []
  59. for idx in frames_idx:
  60. imgbuf = frames[idx]
  61. img = Image.fromarray(imgbuf, mode='RGB')
  62. imgs.append(img)
  63. elif results['backend'] == 'decord':
  64. container = results['frames']
  65. if self.use_pil:
  66. frames_select = container.get_batch(frames_idx)
  67. # dearray_to_img
  68. np_frames = frames_select.asnumpy()
  69. imgs = []
  70. for i in range(np_frames.shape[0]):
  71. imgbuf = np_frames[i]
  72. imgs.append(Image.fromarray(imgbuf, mode='RGB'))
  73. else:
  74. if frames_idx.ndim != 1:
  75. frames_idx = np.squeeze(frames_idx)
  76. frame_dict = {
  77. idx: container[idx].asnumpy()
  78. for idx in np.unique(frames_idx)
  79. }
  80. imgs = [frame_dict[idx] for idx in frames_idx]
  81. elif results['backend'] == 'pyav':
  82. imgs = []
  83. frames = np.array(results['frames'])
  84. for idx in frames_idx:
  85. imgbuf = frames[idx]
  86. imgs.append(imgbuf)
  87. imgs = np.stack(imgs) # thwc
  88. else:
  89. raise NotImplementedError
  90. else:
  91. raise NotImplementedError
  92. results['imgs'] = imgs # all image data
  93. return results
  94. def _get_train_clips(self, num_frames):
  95. ori_seg_len = self.seg_len * self.frame_interval
  96. avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
  97. if avg_interval > 0:
  98. base_offsets = np.arange(self.num_seg) * avg_interval
  99. clip_offsets = base_offsets + np.random.randint(
  100. avg_interval, size=self.num_seg)
  101. elif num_frames > max(self.num_seg, ori_seg_len):
  102. clip_offsets = np.sort(
  103. np.random.randint(
  104. num_frames - ori_seg_len + 1, size=self.num_seg))
  105. elif avg_interval == 0:
  106. ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
  107. clip_offsets = np.around(np.arange(self.num_seg) * ratio)
  108. else:
  109. clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
  110. return clip_offsets
  111. def _get_test_clips(self, num_frames):
  112. ori_seg_len = self.seg_len * self.frame_interval
  113. avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
  114. if num_frames > ori_seg_len - 1:
  115. base_offsets = np.arange(self.num_seg) * avg_interval
  116. clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)
  117. else:
  118. clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
  119. return clip_offsets
  120. def __call__(self, results):
  121. """
  122. Args:
  123. frames_len: length of frames.
  124. return:
  125. sampling id.
  126. """
  127. frames_len = int(results['frames_len']) # total number of frames
  128. frames_idx = []
  129. if self.frame_interval is not None:
  130. assert isinstance(self.frame_interval, int)
  131. if not self.valid_mode:
  132. offsets = self._get_train_clips(frames_len)
  133. else:
  134. offsets = self._get_test_clips(frames_len)
  135. offsets = offsets[:, None] + np.arange(self.seg_len)[
  136. None, :] * self.frame_interval
  137. offsets = np.concatenate(offsets)
  138. offsets = offsets.reshape((-1, self.seg_len))
  139. offsets = np.mod(offsets, frames_len)
  140. offsets = np.concatenate(offsets)
  141. if results['format'] == 'video':
  142. frames_idx = offsets
  143. elif results['format'] == 'frame':
  144. frames_idx = list(offsets + 1)
  145. else:
  146. raise NotImplementedError
  147. return self._get(frames_idx, results)
  148. print("self.frame_interval:", self.frame_interval)
  149. if self.linspace_sample: # default if False
  150. if 'start_idx' in results and 'end_idx' in results:
  151. offsets = np.linspace(results['start_idx'], results['end_idx'],
  152. self.num_seg)
  153. else:
  154. offsets = np.linspace(0, frames_len - 1, self.num_seg)
  155. offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
  156. if results['format'] == 'video':
  157. frames_idx = list(offsets)
  158. frames_idx = [x % frames_len for x in frames_idx]
  159. elif results['format'] == 'frame':
  160. frames_idx = list(offsets + 1)
  161. else:
  162. raise NotImplementedError
  163. return self._get(frames_idx, results)
  164. average_dur = int(frames_len / self.num_seg)
  165. print("results['format']:", results['format'])
  166. if self.dense_sample: # For ppTSM, default is False
  167. if not self.valid_mode: # train
  168. sample_pos = max(1, 1 + frames_len - 64)
  169. t_stride = 64 // self.num_seg
  170. start_idx = 0 if sample_pos == 1 else np.random.randint(
  171. 0, sample_pos - 1)
  172. offsets = [(idx * t_stride + start_idx) % frames_len + 1
  173. for idx in range(self.num_seg)]
  174. frames_idx = offsets
  175. else:
  176. sample_pos = max(1, 1 + frames_len - 64)
  177. t_stride = 64 // self.num_seg
  178. start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int)
  179. offsets = []
  180. for start_idx in start_list.tolist():
  181. offsets += [(idx * t_stride + start_idx) % frames_len + 1
  182. for idx in range(self.num_seg)]
  183. frames_idx = offsets
  184. else:
  185. for i in range(self.num_seg):
  186. idx = 0
  187. if not self.valid_mode:
  188. if average_dur >= self.seg_len:
  189. idx = random.randint(0, average_dur - self.seg_len)
  190. idx += i * average_dur
  191. elif average_dur >= 1:
  192. idx += i * average_dur
  193. else:
  194. idx = i
  195. else:
  196. if average_dur >= self.seg_len:
  197. idx = (average_dur - 1) // 2
  198. idx += i * average_dur
  199. elif average_dur >= 1:
  200. idx += i * average_dur
  201. else:
  202. idx = i
  203. for jj in range(idx, idx + self.seg_len):
  204. if results['format'] == 'video':
  205. frames_idx.append(int(jj % frames_len))
  206. elif results['format'] == 'frame':
  207. frames_idx.append(jj + 1)
  208. elif results['format'] == 'MRI':
  209. frames_idx.append(jj)
  210. else:
  211. raise NotImplementedError
  212. return self._get(frames_idx, results)
  213. class Scale(object):
  214. """
  215. Scale images.
  216. Args:
  217. short_size(float | int): Short size of an image will be scaled to the short_size.
  218. fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True
  219. do_round(bool): Whether to round up when calculating the zoom ratio. default: False
  220. backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow'
  221. """
  222. def __init__(self,
  223. short_size,
  224. fixed_ratio=True,
  225. keep_ratio=None,
  226. do_round=False,
  227. backend='pillow'):
  228. self.short_size = short_size
  229. assert (fixed_ratio and not keep_ratio) or (
  230. not fixed_ratio
  231. ), "fixed_ratio and keep_ratio cannot be true at the same time"
  232. self.fixed_ratio = fixed_ratio
  233. self.keep_ratio = keep_ratio
  234. self.do_round = do_round
  235. assert backend in [
  236. 'pillow', 'cv2'
  237. ], "Scale's backend must be pillow or cv2, but get {backend}"
  238. self.backend = backend
  239. def __call__(self, results):
  240. """
  241. Performs resize operations.
  242. Args:
  243. imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.
  244. For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
  245. return:
  246. resized_imgs: List where each item is a PIL.Image after scaling.
  247. """
  248. imgs = results['imgs']
  249. resized_imgs = []
  250. for i in range(len(imgs)):
  251. img = imgs[i]
  252. if isinstance(img, np.ndarray):
  253. h, w, _ = img.shape
  254. elif isinstance(img, Image.Image):
  255. w, h = img.size
  256. else:
  257. raise NotImplementedError
  258. if w <= h:
  259. ow = self.short_size
  260. if self.fixed_ratio: # default is True
  261. oh = int(self.short_size * 4.0 / 3.0)
  262. elif not self.keep_ratio: # no
  263. oh = self.short_size
  264. else:
  265. scale_factor = self.short_size / w
  266. oh = int(h * float(scale_factor) +
  267. 0.5) if self.do_round else int(h *
  268. self.short_size / w)
  269. ow = int(w * float(scale_factor) +
  270. 0.5) if self.do_round else int(w *
  271. self.short_size / h)
  272. else:
  273. oh = self.short_size
  274. if self.fixed_ratio:
  275. ow = int(self.short_size * 4.0 / 3.0)
  276. elif not self.keep_ratio: # no
  277. ow = self.short_size
  278. else:
  279. scale_factor = self.short_size / h
  280. oh = int(h * float(scale_factor) +
  281. 0.5) if self.do_round else int(h *
  282. self.short_size / w)
  283. ow = int(w * float(scale_factor) +
  284. 0.5) if self.do_round else int(w *
  285. self.short_size / h)
  286. if type(img) == np.ndarray:
  287. img = Image.fromarray(img, mode='RGB')
  288. if self.backend == 'pillow':
  289. resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
  290. elif self.backend == 'cv2' and (self.keep_ratio is not None):
  291. resized_imgs.append(
  292. cv2.resize(
  293. img, (ow, oh), interpolation=cv2.INTER_LINEAR))
  294. else:
  295. resized_imgs.append(
  296. Image.fromarray(
  297. cv2.resize(
  298. np.asarray(img), (ow, oh),
  299. interpolation=cv2.INTER_LINEAR)))
  300. results['imgs'] = resized_imgs
  301. return results
  302. class CenterCrop(object):
  303. """
  304. Center crop images
  305. Args:
  306. target_size(int): Center crop a square with the target_size from an image.
  307. do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. default: True
  308. """
  309. def __init__(self, target_size, do_round=True, backend='pillow'):
  310. self.target_size = target_size
  311. self.do_round = do_round
  312. self.backend = backend
  313. def __call__(self, results):
  314. """
  315. Performs Center crop operations.
  316. Args:
  317. imgs: List where each item is a PIL.Image.
  318. For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
  319. return:
  320. ccrop_imgs: List where each item is a PIL.Image after Center crop.
  321. """
  322. imgs = results['imgs']
  323. ccrop_imgs = []
  324. th, tw = self.target_size, self.target_size
  325. if isinstance(imgs, paddle.Tensor):
  326. h, w = imgs.shape[-2:]
  327. x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2
  328. y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2
  329. ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]
  330. else:
  331. for img in imgs:
  332. if self.backend == 'pillow':
  333. w, h = img.size
  334. elif self.backend == 'cv2':
  335. h, w, _ = img.shape
  336. else:
  337. raise NotImplementedError
  338. assert (w >= self.target_size) and (h >= self.target_size), \
  339. "image width({}) and height({}) should be larger than crop size".format(
  340. w, h, self.target_size)
  341. x1 = int(round((w - tw) / 2.0)) if self.do_round else (
  342. w - tw) // 2
  343. y1 = int(round((h - th) / 2.0)) if self.do_round else (
  344. h - th) // 2
  345. if self.backend == 'cv2':
  346. ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])
  347. elif self.backend == 'pillow':
  348. ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))
  349. results['imgs'] = ccrop_imgs
  350. return results
  351. class Image2Array(object):
  352. """
  353. transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.
  354. Args:
  355. transpose: whether to transpose or not, default True, False for slowfast.
  356. """
  357. def __init__(self, transpose=True, data_format='tchw'):
  358. assert data_format in [
  359. 'tchw', 'cthw'
  360. ], "Target format must in ['tchw', 'cthw'], but got {data_format}"
  361. self.transpose = transpose
  362. self.data_format = data_format
  363. def __call__(self, results):
  364. """
  365. Performs Image to NumpyArray operations.
  366. Args:
  367. imgs: List where each item is a PIL.Image.
  368. For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
  369. return:
  370. np_imgs: Numpy array.
  371. """
  372. imgs = results['imgs']
  373. if 'backend' in results and results[
  374. 'backend'] == 'pyav': # [T,H,W,C] in [0, 1]
  375. if self.transpose:
  376. if self.data_format == 'tchw':
  377. t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw
  378. else:
  379. t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw
  380. results['imgs'] = t_imgs
  381. else:
  382. t_imgs = np.stack(imgs).astype('float32')
  383. if self.transpose:
  384. if self.data_format == 'tchw':
  385. t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw
  386. else:
  387. t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw
  388. results['imgs'] = t_imgs
  389. return results
  390. class VideoDecoder(object):
  391. """
  392. Decode mp4 file to frames.
  393. Args:
  394. filepath: the file path of mp4 file
  395. """
  396. def __init__(self,
  397. backend='cv2',
  398. mode='train',
  399. sampling_rate=32,
  400. num_seg=8,
  401. num_clips=1,
  402. target_fps=30):
  403. self.backend = backend
  404. # params below only for TimeSformer
  405. self.mode = mode
  406. self.sampling_rate = sampling_rate
  407. self.num_seg = num_seg
  408. self.num_clips = num_clips
  409. self.target_fps = target_fps
  410. def __call__(self, results):
  411. """
  412. Perform mp4 decode operations.
  413. return:
  414. List where each item is a numpy array after decoder.
  415. """
  416. file_path = results['filename']
  417. results['format'] = 'video'
  418. results['backend'] = self.backend
  419. if self.backend == 'cv2': # here
  420. cap = cv2.VideoCapture(file_path)
  421. videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
  422. sampledFrames = []
  423. for i in range(videolen):
  424. ret, frame = cap.read()
  425. # maybe first frame is empty
  426. if ret == False:
  427. continue
  428. img = frame[:, :, ::-1]
  429. sampledFrames.append(img)
  430. results['frames'] = sampledFrames
  431. results['frames_len'] = len(sampledFrames)
  432. elif self.backend == 'decord':
  433. container = de.VideoReader(file_path)
  434. frames_len = len(container)
  435. results['frames'] = container
  436. results['frames_len'] = frames_len
  437. else:
  438. raise NotImplementedError
  439. return results
  440. class Normalization(object):
  441. """
  442. Normalization.
  443. Args:
  444. mean(Sequence[float]): mean values of different channels.
  445. std(Sequence[float]): std values of different channels.
  446. tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]
  447. """
  448. def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):
  449. if not isinstance(mean, Sequence):
  450. raise TypeError(
  451. 'Mean must be list, tuple or np.ndarray, but got {type(mean)}')
  452. if not isinstance(std, Sequence):
  453. raise TypeError(
  454. 'Std must be list, tuple or np.ndarray, but got {type(std)}')
  455. self.inplace = inplace
  456. if not inplace:
  457. self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)
  458. self.std = np.array(std).reshape(tensor_shape).astype(np.float32)
  459. else:
  460. self.mean = np.array(mean, dtype=np.float32)
  461. self.std = np.array(std, dtype=np.float32)
  462. def __call__(self, results):
  463. """
  464. Performs normalization operations.
  465. Args:
  466. imgs: Numpy array.
  467. return:
  468. np_imgs: Numpy array after normalization.
  469. """
  470. if self.inplace: # default is False
  471. n = len(results['imgs'])
  472. h, w, c = results['imgs'][0].shape
  473. norm_imgs = np.empty((n, h, w, c), dtype=np.float32)
  474. for i, img in enumerate(results['imgs']):
  475. norm_imgs[i] = img
  476. for img in norm_imgs: # [n,h,w,c]
  477. mean = np.float64(self.mean.reshape(1, -1)) # [1, 3]
  478. stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3]
  479. cv2.subtract(img, mean, img)
  480. cv2.multiply(img, stdinv, img)
  481. else:
  482. imgs = results['imgs']
  483. norm_imgs = imgs / 255.0
  484. norm_imgs -= self.mean
  485. norm_imgs /= self.std
  486. if 'backend' in results and results['backend'] == 'pyav':
  487. norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)
  488. results['imgs'] = norm_imgs
  489. return results