|
@@ -0,0 +1,308 @@
|
|
|
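# Description:
# 1. Recursively read all txt files under the given input folder(s)
# 2. Read each file line by line and flag lines according to a set of rules
# 3. When fewer than 2% of a file's lines are flagged as stray numeric lines,
#    write the file with all flagged lines removed to the success_files folder
# 4. Copy files that could not be cleaned (too many stray numeric lines, empty
#    after cleaning, or larger than 2 MB) to the failed_files folder
# 5. Write a summary log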
|
|
|
+
|
|
|
+import os
|
|
|
+import argparse
|
|
|
+from datetime import datetime
|
|
|
+import re
|
|
|
+from pathlib import Path
|
|
|
+import shutil
|
|
|
+
|
|
|
def list_files(dir_path):
    """Return the paths of every file found beneath ``dir_path``.

    The directory tree is traversed with ``os.walk``; each returned entry is
    the walked directory joined with the file name. Directories themselves
    are not included in the result.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir_path)
        for name in names
    ]
|
|
|
+
|
|
|
+
|
|
|
def get_files_in_directory(dir_path):
    """Return the names of the regular files located directly in ``dir_path``.

    Only the immediate children are inspected (no recursion), and the bare
    entry names are returned rather than full paths.
    """
    names = []
    for entry in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path, entry)):
            names.append(entry)
    return names
|
|
|
+
|
|
|
+
|
|
|
def read_file_by_line(file_path):
    """Read a text file and return its lines.

    Undecodable bytes are skipped (``errors='ignore'``) so that a badly
    encoded file does not abort processing.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line (line endings preserved). When the
        file does not exist a message is printed and an empty list is
        returned, so callers can always iterate over the result.
    """
    try:
        with open(file_path, 'r', errors='ignore') as file:
            return file.readlines()
    except FileNotFoundError:
        print(f"文件 '{file_path}' 不存在")
        # Return an empty list instead of an implicit None: callers iterate
        # over the result and take its length.
        return []
|
|
|
+
|
|
|
# Module-wide running totals, updated through ``global`` statements by the
# functions below.
summary_line_count = 0  # lines scanned by filter_lines
catched_line_count = 0  # lines flagged by the append_to_* helpers
handled_line_count = 0  # flagged lines counted by delete_catched_lines
handled_file_count = 0  # files passed to delete_catched_lines

# Line indices flagged in the file currently being processed; both lists are
# reset at the start of every filter_lines call.
lines_nessary_catched = list()
lines_optional_catched = list()

# Outcome flags for the most recently processed file.
is_missed = False
is_empty = False
|
|
|
+
|
|
|
def find_non_empty_line(lines, index, direction):
    """Return the nearest non-blank line before or after ``index``.

    Args:
        lines: Sequence of line strings.
        index: Position to start from; the line at ``index`` itself is not
            considered.
        direction: ``'next'`` to search towards the end, ``'previous'`` to
            search towards the start.

    Returns:
        The first line in the chosen direction whose stripped content is not
        empty, or ``None`` when there is no such line or ``direction`` is
        neither of the two accepted values.
    """
    if direction not in ('next', 'previous'):
        return None
    step = 1 if direction == 'next' else -1

    cursor = index + step
    while 0 <= cursor < len(lines):
        candidate = lines[cursor]
        if candidate.strip() != '':
            return candidate
        cursor += step

    return None
|
|
|
+
|
|
|
def filter_lines(file_path, output_success_files):
    """Flag the unwanted lines of one file and, when safe, write a cleaned copy.

    A line is flagged as "necessary" when it contains none of the exception
    signs and either contains one of the keyword tags or ends with one of the
    tail tags. A line is flagged as "optional" when it consists only of a
    number (optionally surrounded by ``,``, ``.``, ``%`` or ``-``) and the
    nearest non-blank line above or below it is such a number too.

    The flagged indices are stored in the module-level lists
    ``lines_nessary_catched`` and ``lines_optional_catched``. The module-level
    flags ``is_empty`` and ``is_missed`` are updated, and
    ``delete_catched_lines`` is invoked when fewer than 2% of the lines were
    flagged as optional.

    Args:
        file_path: Path of the text file to analyse.
        output_success_files: Root folder handed to ``delete_catched_lines``
            for the cleaned copy.

    Returns:
        The set of all flagged line indices (0-based).
    """
    global summary_line_count
    global lines_nessary_catched
    global lines_optional_catched
    global is_missed
    global is_empty

    tags = ("图", "来源", "请参阅")
    tail_tags = ("走势", "趋势")
    # Lines containing one of these signs are kept, so that long paragraphs
    # and table-of-contents entries are not dropped by the tag rules.
    except_signs = ('。', '.....')
    # Plain numbers, numbers with commas / decimal points, percentages.
    numeric_pattern = re.compile(r'^[,\.\%-]*\d+[,\.\%-]*$')

    def is_numeric(text):
        return text is not None and numeric_pattern.match(text.strip()) is not None

    lines_nessary_catched = []
    lines_optional_catched = []

    # Tolerate a reader that yields None for a missing file.
    lines = read_file_by_line(file_path) or []
    # enumerate() supplies the true position of each line; looking the index
    # up by value would report the first occurrence for duplicated lines.
    for index, line in enumerate(lines):
        summary_line_count += 1

        if not any(sign in line for sign in except_signs):
            # Lines keep their line ending, so strip trailing whitespace
            # before testing how the line ends.
            ends_with_tail_tag = line.rstrip().endswith(tail_tags)
            if any(tag in line for tag in tags) or ends_with_tail_tag:
                append_to_necessary_array(index)
                continue

        # A numeric-only line next to another numeric-only line (blank lines
        # in between are skipped) is most likely meaningless table residue.
        # find_non_empty_line returns None past either end of the file, so
        # the first and last lines need no special handling.
        if is_numeric(line) and (
            is_numeric(find_non_empty_line(lines, index, 'next'))
            or is_numeric(find_non_empty_line(lines, index, 'previous'))
        ):
            append_to_optional_array(index)

    if lines:
        is_empty = False
        # Too many stray numeric lines: treat the file as not cleanable.
        is_missed = len(lines_optional_catched) / len(lines) >= 0.02
        if not is_missed:
            delete_catched_lines(file_path, output_success_files)
    else:
        is_empty = True
        is_missed = True
    return set(lines_nessary_catched) | set(lines_optional_catched)
|
|
|
+
|
|
|
def remove_output_next_level(file_path):
    """Drop the path component that directly follows the first ``output`` component.

    For example ``a/output/b/c.txt`` becomes ``a/output/c.txt``.

    Args:
        file_path: Path to rewrite.

    Returns:
        The rewritten path as a string. When the path has no component named
        ``output`` it is returned unchanged (normalised through ``Path``)
        instead of raising ``ValueError``.
    """
    parts = Path(file_path).parts
    try:
        output_index = parts.index('output')
    except ValueError:
        # Nothing to strip: the path does not pass through an "output" folder.
        return str(Path(file_path))
    # Keep everything up to and including "output", skip the next component.
    new_parts = parts[:output_index + 1] + parts[output_index + 2:]
    return str(Path(*new_parts))
|
|
|
+
|
|
|
def delete_catched_lines(file_path, output_success_files):
    """Write a copy of ``file_path`` without the currently flagged lines.

    The indices to drop are taken from the module-level lists
    ``lines_nessary_catched`` and ``lines_optional_catched``. The copy is
    written to the location returned by ``convert_with_relative_path`` for
    the path produced by ``remove_output_next_level``. The module-level
    counters ``handled_file_count`` and ``handled_line_count`` are updated.
    When the source file is empty, or nothing is left after filtering (in
    which case the written file is removed again), ``is_missed`` is set.

    Args:
        file_path: Path of the source text file.
        output_success_files: Root folder for cleaned files.
    """
    global lines_nessary_catched
    global handled_line_count
    global handled_file_count
    global is_missed

    handled_file_count += 1
    handled_line_count += len(lines_nessary_catched)

    with open(file_path, 'r', errors='ignore') as file:
        read_lines = file.readlines()
    if not read_lines:
        is_missed = True
        return

    output_success_file = convert_with_relative_path(remove_output_next_level(file_path), output_success_files, False)
    dropped = set(lines_nessary_catched) | set(lines_optional_catched)
    with open(output_success_file, 'w', errors='ignore') as file:
        # Filter by position, not by value: identical lines at different
        # positions must be kept or dropped independently.
        file.writelines(
            line for index, line in enumerate(read_lines) if index not in dropped
        )

    # Do not keep a file that is empty after processing.
    if os.path.getsize(output_success_file) == 0:
        os.remove(output_success_file)
        is_missed = True
        print(f"文件 {file_path} 处理后为空,已删除\n")
|
|
|
+
|
|
|
def convert_with_relative_path(path, output_path, is_dir=True):
    """Map ``path`` into ``output_path`` and make sure its folder exists.

    The location of ``path`` relative to the parent directory of the
    module-level ``input_dir_path`` is appended to ``output_path``.

    Args:
        path: Source file or directory path.
        output_path: Root folder under which the relative path is placed.
        is_dir: When true the resulting path itself is created as a
            directory; otherwise only its parent directory is created.

    Returns:
        The resulting path below ``output_path``.
    """
    global input_dir_path

    base_dir = os.path.dirname(input_dir_path)
    mirrored = os.path.join(output_path, os.path.relpath(path, base_dir))

    # Nothing to create when the target is already there.
    if os.path.exists(mirrored):
        return mirrored

    if is_dir:
        os.makedirs(mirrored)
    else:
        os.makedirs(os.path.dirname(mirrored), exist_ok=True)
    return mirrored
|
|
|
+
|
|
|
def append_to_necessary_array(line):
    """Record ``line`` in ``lines_nessary_catched`` and bump ``catched_line_count``."""
    global catched_line_count
    global lines_nessary_catched
    catched_line_count = catched_line_count + 1
    lines_nessary_catched += [line]
|
|
|
+
|
|
|
def append_to_optional_array(line):
    """Record ``line`` in ``lines_optional_catched`` and bump ``catched_line_count``."""
    global catched_line_count
    global lines_optional_catched
    catched_line_count = catched_line_count + 1
    lines_optional_catched += [line]
|
|
|
+
|
|
|
# Input root currently being processed; assigned by process() and read by
# convert_with_relative_path() to compute relative output locations.
input_dir_path = ''
|
|
|
+
|
|
|
def process(dir_path, output_dir_path):
    """Filter every txt file below ``dir_path`` and write a summary log.

    Each ``.txt`` file (except this tool's own ``filter_lines*`` log) is
    passed to ``filter_lines``. Files larger than 2 MB are copied to the
    ``failed_files`` folder without being filtered. Files flagged as missed
    by the filtering step are copied to ``failed_files`` as well. Progress is
    printed every 1000 files and a summary is appended to
    ``filter_lines.txt`` inside ``output_dir_path``.

    Args:
        dir_path: Root folder to scan recursively.
        output_dir_path: Folder receiving ``success_files``, ``failed_files``
            and the log file.
    """
    global is_missed
    global handled_file_count
    global input_dir_path
    global catched_line_count

    # Oversized files take very long to process, so they are not filtered.
    max_file_size = 2 * 1024 * 1024

    input_dir_path = dir_path
    output_success_files = os.path.join(output_dir_path, "success_files")
    output_failed_files = os.path.join(output_dir_path, "failed_files")

    os.makedirs(output_dir_path, exist_ok=True)
    os.makedirs(output_failed_files, exist_ok=True)

    files = list_files(dir_path)
    output_name = "filter_lines.txt"

    summary_file_count = 0
    catched_file_count = 0
    handled_file_count = 0
    too_big_file_count = 0
    collected_file_count = 0
    empty_file_count = 0

    # The log lives in the output folder that was just created; it must not
    # be re-rooted under dir_path.
    output_path = os.path.join(output_dir_path, output_name)

    with open(output_path, "a") as output_txt:
        output_txt.write(f"{dir_path}\n")
        for file in files:
            # Only txt files are handled, and the tool's own log is skipped.
            if not file.endswith(".txt") or os.path.basename(file).startswith("filter_lines"):
                continue

            summary_file_count += 1

            # Check the size before filtering so the expensive work is
            # actually avoided for oversized files.
            if os.path.getsize(file) > max_file_size:
                output_failed_file = convert_with_relative_path(remove_output_next_level(file), output_failed_files, False)
                shutil.copy(file, output_failed_file)
                too_big_file_count += 1
                print(f"文件 {file} 大于2M,不处理\n")
                continue

            lines_catched = filter_lines(file, output_success_files)
            if is_empty:
                empty_file_count += 1
                print(f"文件 {file} 为空,当前 {empty_file_count} 个空文件\n")
                continue

            if lines_catched:
                catched_file_count += 1

            if is_missed:
                output_failed_file = convert_with_relative_path(remove_output_next_level(file), output_failed_files, False)
                # Keep the untouched original for later inspection.
                shutil.copy(file, output_failed_file)
                collected_file_count += 1

            # Print a progress report every 1000 scanned files.
            if summary_file_count % 1000 == 0:
                print(f"已扫描txt文件 {summary_file_count} 个,其中有问题的文件 {catched_file_count} 个,已处理 {handled_file_count} 个,已收集 {collected_file_count} 个问题文件。\n")
                print(f"已扫描行数 {summary_line_count} 行,其中有问题的行数 {catched_line_count} 行,已处理 {handled_line_count} 行\n")

        print(f"已扫描txt文件 {summary_file_count} 个,其中有问题的文件 {catched_file_count} 个,已处理 {handled_file_count} 个,已收集 {collected_file_count} 个问题文件。\n")
        print(f"已扫描行数 {summary_line_count} 行,其中有问题的行数 {catched_line_count} 行,已处理 {handled_line_count} 行\n")

        # Write the summary to the log.
        output_txt.write(f"\n===============================================\n \n")
        output_txt.write(f"已扫描文件 {summary_file_count} 个,其中有问题的文件 {catched_file_count} 个,已处理 {handled_file_count} 个,已收集 {collected_file_count} 个问题文件, 空文件 {empty_file_count} 个。\n \n")
        output_txt.write(f"已扫描行数 {summary_line_count} 行,其中有问题的行数 {catched_line_count} 行,已处理 {handled_line_count} 行\n \n")

        # Processing rates: handled / flagged, guarded against division by zero.
        file_process_rate = 0
        line_process_rate = 0
        if catched_file_count > 0:
            file_process_rate = handled_file_count / catched_file_count
        if catched_line_count > 0:
            line_process_rate = handled_line_count / catched_line_count
        output_txt.write(f"文件处理率 {file_process_rate} 行处理率 {line_process_rate}\n \n")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Command-line entry point: run process() for every --input folder.
    parser = argparse.ArgumentParser(description='批量转档程序')
    parser.add_argument("--input", type=str, nargs='+', help='输入文件夹根路径')
    parser.add_argument("--output", type=str, help='输出文件夹根路径')

    args = parser.parse_args()

    # Both options are needed; show the help text and stop instead of
    # failing later on a None value.
    if not args.input or not args.output:
        parser.print_help()
        raise SystemExit(1)

    input_dir_paths = args.input
    output_dir_path = args.output
    for input_dir_path in input_dir_paths:
        process(input_dir_path, output_dir_path)
|