# inference_benchmark.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
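
"""Benchmark the inference speed of an exported PaddlePaddle detection model.

The script loads an exported inference model (model.pdmodel /
model.pdiparams plus its infer_cfg.yml), optionally runs it through
TensorRT at FP32/FP16/INT8 precision, and reports throughput (img/s)
and per-image latency (ms/img) over a directory of test images.
"""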

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import glob
import time
import argparse

import six
import yaml
import cv2
import numpy as np
import paddle
import paddle.version as paddle_version
from paddle.inference import Config, create_predictor, PrecisionType, \
    get_trt_runtime_version

TUNED_TRT_DYNAMIC_MODELS = {'DETR'}
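
# Architectures in TUNED_TRT_DYNAMIC_MODELS use tuned TensorRT dynamic
# shapes: on the first run init_predictor() records tensor shape ranges
# into the file named by --tuned_trt_shape_file, and a second run feeds
# that file to enable_tuned_tensorrt_dynamic_shape (see init_predictor).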


def check_version(version='2.2'):
    err = "PaddlePaddle version {} or higher is required, " \
          "or a suitable develop version is satisfied as well. \n" \
          "Please make sure the installed version is compatible with this code.".format(version)
    version_installed = [
        paddle_version.major, paddle_version.minor, paddle_version.patch,
        paddle_version.rc
    ]
    # a develop build reports 0.0.0.0 and is always accepted
    if version_installed == ['0', '0', '0', '0']:
        return
    if version == 'develop':
        raise Exception("PaddlePaddle develop version is required!")
    # compare the version components one by one
    version_split = version.split('.')
    length = min(len(version_installed), len(version_split))
    for i in six.moves.range(length):
        if version_installed[i] > version_split[i]:
            return
        if version_installed[i] < version_split[i]:
            raise Exception(err)


def check_trt_version(version='8.2'):
    err = "TensorRT version {} or higher is required. " \
          "Please make sure the installed version is compatible with this code.".format(version)
    version_split = list(map(int, version.split('.')))
    # get_trt_runtime_version() returns an integer (major, minor, patch) tuple
    version_installed = get_trt_runtime_version()
    length = min(len(version_installed), len(version_split))
    for i in six.moves.range(length):
        if version_installed[i] > version_split[i]:
            return
        if version_installed[i] < version_split[i]:
            raise Exception(err)


# preprocess ops
def decode_image(im_file, im_info):
    if isinstance(im_file, str):
        with open(im_file, 'rb') as f:
            im_read = f.read()
        data = np.frombuffer(im_read, dtype='uint8')
        im = cv2.imdecode(data, 1)  # cv2 decodes to BGR; the model expects RGB
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    else:
        im = im_file
    im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
    im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
    return im, im_info


class Resize(object):
    def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
        if isinstance(target_size, int):
            target_size = [target_size, target_size]
        self.target_size = target_size
        self.keep_ratio = keep_ratio
        self.interp = interp

    def __call__(self, im, im_info):
        assert len(self.target_size) == 2
        assert self.target_size[0] > 0 and self.target_size[1] > 0
        im_scale_y, im_scale_x = self.generate_scale(im)
        im = cv2.resize(
            im,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=self.interp)
        im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
        im_info['scale_factor'] = np.array(
            [im_scale_y, im_scale_x]).astype('float32')
        return im, im_info

    def generate_scale(self, im):
        origin_shape = im.shape[:2]
        if self.keep_ratio:
            # scale by the short side, but never let the long side exceed
            # the target's long side
            im_size_min = np.min(origin_shape)
            im_size_max = np.max(origin_shape)
            target_size_min = np.min(self.target_size)
            target_size_max = np.max(self.target_size)
            im_scale = float(target_size_min) / float(im_size_min)
            if np.round(im_scale * im_size_max) > target_size_max:
                im_scale = float(target_size_max) / float(im_size_max)
            im_scale_x = im_scale
            im_scale_y = im_scale
        else:
            resize_h, resize_w = self.target_size
            im_scale_y = resize_h / float(origin_shape[0])
            im_scale_x = resize_w / float(origin_shape[1])
        return im_scale_y, im_scale_x
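
# A worked example for Resize with keep_ratio=True (illustrative numbers):
# for a 720x1280 image and target_size=[640, 640], the short-side scale is
# 640 / 720 ≈ 0.889, but round(0.889 * 1280) = 1138 exceeds the long-side
# limit of 640, so the scale falls back to 640 / 1280 = 0.5 and the image
# is resized to 360x640.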


class Permute(object):
    def __init__(self):
        super(Permute, self).__init__()

    def __call__(self, im, im_info):
        # HWC -> CHW
        im = im.transpose((2, 0, 1))
        return im, im_info


class NormalizeImage(object):
    def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
        self.mean = mean
        self.std = std
        self.is_scale = is_scale
        self.norm_type = norm_type

    def __call__(self, im, im_info):
        im = im.astype(np.float32, copy=False)
        if self.is_scale:
            scale = 1.0 / 255.0
            im *= scale
        if self.norm_type == 'mean_std':
            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
            std = np.array(self.std)[np.newaxis, np.newaxis, :]
            im -= mean
            im /= std
        return im, im_info


class PadStride(object):
    def __init__(self, stride=0):
        self.coarsest_stride = stride

    def __call__(self, im, im_info):
        # pad H and W up to multiples of the coarsest feature stride
        coarsest_stride = self.coarsest_stride
        if coarsest_stride <= 0:
            return im, im_info
        im_c, im_h, im_w = im.shape
        pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
        pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
        padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
        padding_im[:, :im_h, :im_w] = im
        return padding_im, im_info


def preprocess(im, preprocess_ops):
    # process image by preprocess_ops
    im_info = {
        'scale_factor': np.array(
            [1., 1.], dtype=np.float32),
        'im_shape': None,
    }
    im, im_info = decode_image(im, im_info)
    for operator in preprocess_ops:
        im, im_info = operator(im, im_info)
    return im, im_info


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_dir', type=str, help='directory of the inference model')
    parser.add_argument(
        '--run_mode',
        type=str,
        default='paddle',
        help='running mode: paddle, trt_fp32, trt_fp16 or trt_int8')
    parser.add_argument('--batch_size', type=int, default=1, help='batch size')
    parser.add_argument(
        '--image_dir',
        type=str,
        default='/paddle/data/DOTA_1024_ss/test1024/images',
        help='directory of test images')
    parser.add_argument(
        '--warmup_iter', type=int, default=5, help='number of warmup iters')
    parser.add_argument(
        '--total_iter', type=int, default=2000, help='total number of iters')
    parser.add_argument(
        '--log_iter', type=int, default=50, help='logging interval in iters')
    parser.add_argument(
        '--tuned_trt_shape_file',
        type=str,
        default='shape_range_info.pbtxt',
        help='dynamic shape range info file')
    args = parser.parse_args()
    return args
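
# Example invocation (model directory and image path are illustrative):
#   python inference_benchmark.py \
#       --model_dir=./output_inference/my_detector \
#       --run_mode=trt_fp16 \
#       --batch_size=1 \
#       --image_dir=/path/to/test/images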


def init_predictor(FLAGS):
    model_dir, run_mode, batch_size = FLAGS.model_dir, FLAGS.run_mode, FLAGS.batch_size
    yaml_file = os.path.join(model_dir, 'infer_cfg.yml')
    with open(yaml_file) as f:
        yml_conf = yaml.safe_load(f)
    config = Config(
        os.path.join(model_dir, 'model.pdmodel'),
        os.path.join(model_dir, 'model.pdiparams'))

    # initial GPU memory pool size (MB) and device id
    config.enable_use_gpu(200, 0)
    # optimize graph and fuse ops
    config.switch_ir_optim(True)

    precision_map = {
        'trt_int8': PrecisionType.Int8,
        'trt_fp32': PrecisionType.Float32,
        'trt_fp16': PrecisionType.Half
    }
    arch = yml_conf['arch']
    tuned_trt_shape_file = os.path.join(model_dir, FLAGS.tuned_trt_shape_file)
    if run_mode in precision_map.keys():
        if arch in TUNED_TRT_DYNAMIC_MODELS and not os.path.exists(
                tuned_trt_shape_file):
            print('Dynamic shape range info will be saved to {}. '
                  'Rerun this script afterwards to use it.'.format(
                      tuned_trt_shape_file))
            config.collect_shape_range_info(tuned_trt_shape_file)
        config.enable_tensorrt_engine(
            workspace_size=(1 << 25) * batch_size,
            max_batch_size=batch_size,
            min_subgraph_size=yml_conf['min_subgraph_size'],
            precision_mode=precision_map[run_mode],
            use_static=True,
            use_calib_mode=False)
        if yml_conf['use_dynamic_shape']:
            if arch in TUNED_TRT_DYNAMIC_MODELS and os.path.exists(
                    tuned_trt_shape_file):
                config.enable_tuned_tensorrt_dynamic_shape(
                    tuned_trt_shape_file, True)
            else:
                min_input_shape = {
                    'image': [batch_size, 3, 640, 640],
                    'scale_factor': [batch_size, 2]
                }
                max_input_shape = {
                    'image': [batch_size, 3, 1280, 1280],
                    'scale_factor': [batch_size, 2]
                }
                opt_input_shape = {
                    'image': [batch_size, 3, 1024, 1024],
                    'scale_factor': [batch_size, 2]
                }
                config.set_trt_dynamic_shape_info(
                    min_input_shape, max_input_shape, opt_input_shape)

    # disable glog output during prediction
    config.disable_glog_info()
    # reuse temporary tensors to reduce memory use
    config.enable_memory_optim()
    # disable feed/fetch ops, required for zero-copy run
    config.switch_use_feed_fetch_ops(False)
    predictor = create_predictor(config)
    return predictor, yml_conf


def create_preprocess_ops(yml_conf):
    preprocess_ops = []
    for op_info in yml_conf['Preprocess']:
        new_op_info = op_info.copy()
        op_type = new_op_info.pop('type')
        # look up the op class defined above by its name
        preprocess_ops.append(eval(op_type)(**new_op_info))
    return preprocess_ops
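
# create_preprocess_ops() instantiates the op classes above by name, so the
# 'Preprocess' section of infer_cfg.yml lists one entry per op. A minimal
# sketch (values are illustrative, not taken from a particular model):
#
#   Preprocess:
#   - type: Resize
#     target_size: [640, 640]
#     keep_ratio: true
#   - type: NormalizeImage
#     mean: [0.485, 0.456, 0.406]
#     std: [0.229, 0.224, 0.225]
#     is_scale: true
#   - type: Permute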


def get_test_images(image_dir):
    images = set()
    infer_dir = os.path.abspath(image_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)
    return images


def create_inputs(image_files, preprocess_ops):
    inputs = dict()
    im_list, im_info_list = [], []
    for im_path in image_files:
        im, im_info = preprocess(im_path, preprocess_ops)
        im_list.append(im)
        im_info_list.append(im_info)
    inputs['im_shape'] = np.stack(
        [e['im_shape'] for e in im_info_list], axis=0).astype('float32')
    inputs['scale_factor'] = np.stack(
        [e['scale_factor'] for e in im_info_list], axis=0).astype('float32')
    inputs['image'] = np.stack(im_list, axis=0).astype('float32')
    return inputs


def measure_speed(FLAGS):
    predictor, yml_conf = init_predictor(FLAGS)
    input_names = predictor.get_input_names()
    preprocess_ops = create_preprocess_ops(yml_conf)
    # note: image_dir is expected to hold at least total_iter images
    image_files = get_test_images(FLAGS.image_dir)
    batch_size = FLAGS.batch_size
    warmup_iter, log_iter, total_iter = FLAGS.warmup_iter, FLAGS.log_iter, FLAGS.total_iter
    total_time = 0
    fps = 0
    for i in range(0, total_iter, batch_size):
        # prepare the input batch
        inputs = create_inputs(image_files[i:i + batch_size], preprocess_ops)
        for name in input_names:
            input_tensor = predictor.get_input_handle(name)
            input_tensor.copy_from_cpu(inputs[name])
        paddle.device.cuda.synchronize()
        # run and time the forward pass; warmup iterations are excluded
        start_time = time.perf_counter()
        predictor.run()
        paddle.device.cuda.synchronize()
        if i >= warmup_iter:
            total_time += time.perf_counter() - start_time
            if (i + 1) % log_iter == 0:
                fps = (i + 1 - warmup_iter) / total_time
                print(
                    f'Done image [{i + 1:<3}/{total_iter}], '
                    f'fps: {fps:.1f} img/s, '
                    f'time per image: {1000 / fps:.1f} ms/img',
                    flush=True)
        if (i + 1) == total_iter:
            fps = (i + 1 - warmup_iter) / total_time
            print(
                f'Overall fps: {fps:.1f} img/s, '
                f'time per image: {1000 / fps:.1f} ms/img',
                flush=True)
            break


if __name__ == '__main__':
    FLAGS = parse_args()
    if 'trt' in FLAGS.run_mode:
        # TensorRT modes require a develop build of PaddlePaddle
        check_version('develop')
        check_trt_version('8.2')
    else:
        check_version('2.4')
    measure_speed(FLAGS)