clean_label_la_0.1.4l.py 3.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. import json
  2. import os
  3. import random
  4. import shutil
  5. import argparse
  6. from tqdm import tqdm
  7. def clean_label_files(labels_dir, output_labels_dir, log_file):
  8. if not os.path.exists(output_labels_dir):
  9. os.makedirs(output_labels_dir)
  10. processed_files = []
  11. label_files = os.listdir(labels_dir)
  12. with tqdm(total=len(label_files), desc="Processing files", unit="file") as pbar:
  13. for label_file in label_files: # 遍历每一个txt文件名
  14. file_path = os.path.join(labels_dir, label_file) # 拼接得到旧的txt文件地址
  15. output_file_path = os.path.join(output_labels_dir, label_file) # 新的txt文件地址
  16. _, file_extension = os.path.splitext(file_path)
  17. if not file_extension.lower() == ".txt":
  18. continue
  19. try:
  20. with open(file_path, 'r') as f, open(log_file, 'a+') as log: # 打开旧的txt文件
  21. lines = f.readlines() # 得到文件内容
  22. new_lines = []
  23. file_needs_logging = False
  24. for line_number, line in enumerate(lines, start=1):
  25. parts = line.strip().split()
  26. if parts:
  27. first_num = int(parts[0])
  28. other_nums = [float(num) for num in parts[1:]]
  29. if not all(0 <= num <= 1 for num in other_nums): # label 数据有误
  30. log.write(f"{label_file}:\n")
  31. log.write(f"\terror line: {line_number}\n")
  32. elif first_num in [4, 20]: # 4 和 20 是暂时不需要的数据
  33. log.write(f"{label_file}:\n")
  34. log.write(f"\tdelete line: {line_number}\n")
  35. elif first_num in [5, 9, 13, 21, 25]: # 修改为标题
  36. log.write(f"{label_file}:\n")
  37. log.write(f"\tmodify line {line_number}: {first_num} -> 26\n")
  38. new_content = '26 '
  39. new_content += ' '.join(map(str, other_nums))
  40. new_content += '\n'
  41. new_lines.append(new_content)
  42. else:
  43. new_lines.append(line)
  44. # if first_num not in [4, 20] and all(
  45. # 0 <= num <= 1 for num in other_nums): # 只有当一行内容的第一个数字不是4或20,其他数字都在[0,1]之间时这一行内容才能被保留
  46. # new_lines.append(line)
  47. # elif not all(0 <= num <= 1 for num in other_nums): # 如果不满足上诉条件且有数字不在【0,1】的,要记录文件名
  48. # file_needs_logging = True
  49. # =====
  50. with open(output_file_path, 'w') as out: # 将文本内容写入新文件夹中
  51. out.writelines(new_lines)
  52. except Exception as e:
  53. print(f"发生错误: {e}")
  54. pbar.update(1) # 更新进度条
  55. # with open(log_file, 'w') as log:
  56. # for file_name in processed_files:
  57. # log.write(f"{file_name}\n")
  58. if __name__ == '__main__':
  59. parser = argparse.ArgumentParser("删除特定标注信息\n")
  60. parser.add_argument("dataset_dir", help="图片文件夹路径")
  61. parser.add_argument("--log_file", default="processed_files.log", help="记录被处理文件的日志文件路径")
  62. args = parser.parse_args()
  63. output_labels_dir = os.path.join(args.dataset_dir, 'processed_labels') # 生成的新的labels文件夹
  64. log_file_dir = os.path.join(args.dataset_dir, args.log_file) # 日志文件
  65. clean_label_files(os.path.join(args.dataset_dir, 'labels'), output_labels_dir, log_file_dir)