files_fetching.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. import argparse
  2. import os
  3. import shutil
  4. import concurrent.futures
  5. import random
# Script purpose: randomly pick files under a given directory; the file
# suffixes to match and the number of files to pick can both be specified.
version = "1.0.0"
  8. def is_in_whitelist(file_name, whitelist):
  9. # 获取文件的后缀名
  10. _, ext = os.path.splitext(file_name)
  11. # 判断后缀名是否在白名单中
  12. return ext.lower() in whitelist
  13. def list_files(directory, whitelist, blacklist):
  14. # 遍历目录中的文件和文件夹
  15. res = []
  16. for root, dirs, files in os.walk(directory):
  17. # 只输出文件名,不包括文件夹
  18. for file in files:
  19. if not file.startswith(".") and is_in_whitelist(file, whitelist):
  20. file_path = os.path.join(root, file)
  21. res.append(file_path)
  22. return res
  23. def check_path(path_to_check):
  24. try:
  25. if not os.path.exists(path_to_check):
  26. os.makedirs(path_to_check)
  27. except OSError as e:
  28. print(f"发生错误: {e}")
  29. def copy_file(src_file, dst_file):
  30. if os.path.exists(dst_file):
  31. print(f"文件 {dst_file} 已存在")
  32. else:
  33. try:
  34. shutil.copy(src_file, dst_file)
  35. print(f"文件已成功从 {src_file} 拷贝到 {dst_file}")
  36. except FileNotFoundError:
  37. print(f"源文件 {src_file} 不存在")
  38. except PermissionError:
  39. print(f"没有权限拷贝文件到 {dst_file}")
  40. except Exception as e:
  41. print(f"发生错误: {e}")
  42. def copy_files_concurrently(src_files, dst_dir):
  43. with concurrent.futures.ThreadPoolExecutor() as executor:
  44. futures = [executor.submit(copy_file, src, os.path.join(dst_dir, os.path.basename(src))) for src in src_files]
  45. for future in concurrent.futures.as_completed(futures):
  46. try:
  47. # 你可以在这里处理返回的结果(如果有的话),但在这个例子中,copy_file没有返回值
  48. pass
  49. except Exception as exc:
  50. print(f'Generated an exception: {exc}')
  51. def main():
  52. parse = argparse.ArgumentParser("可以随机抓取指定目录下的文件,可指定后缀名,指定要抓取的数量\n")
  53. parse.add_argument("-v", "--version", action="version", version=version)
  54. parse.add_argument("input", help="输入路径")
  55. parse.add_argument("output", help="输出路径")
  56. parse.add_argument("--count", type=int, help="随机抓取的文件数量,不设置则抓取所有文件")
  57. parse.add_argument("--whitelist", nargs='+', help="文件后缀名白名单,如果设置白名单,则只会选取白名单中的文件格式")
  58. parse.add_argument("--blacklist", nargs='+', help="[未实装]文件后缀名黑名单,如果设置黑名单,则会选取过滤黑名单中的文件格式")
  59. args = parse.parse_args()
  60. whitelist = []
  61. blacklist = []
  62. count = -1
  63. if args.whitelist:
  64. for suffix in args.whitelist:
  65. if not suffix.startswith("."):
  66. whitelist.append("." + suffix)
  67. else:
  68. whitelist.append(suffix)
  69. if args.blacklist:
  70. for suffix in args.blacklist:
  71. if not suffix.startswith("."):
  72. blacklist.append("." + suffix)
  73. else:
  74. blacklist.append(suffix)
  75. if args.count:
  76. count = args.count
  77. files = list_files(args.input, whitelist, blacklist)
  78. files_to_copy = []
  79. check_path(args.output)
  80. if count == -1 or len(files) <= count:
  81. # 拷贝全部
  82. files_to_copy = files
  83. else:
  84. files_to_copy = random.sample(files, count)
  85. copy_files_concurrently(files_to_copy, args.output)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()