# Others / DocumentAIKit
							import json
import os
import random
import shutil
import argparse
from tqdm import tqdm


# Class ids whose annotations are not needed for now and are dropped.
_DROPPED_CLASS_IDS = frozenset({4, 20})
# Class ids that are relabelled as the title class.
_TITLE_SOURCE_CLASS_IDS = frozenset({5, 9, 13, 21, 25})
# Class id written in place of the ids above.
_TITLE_CLASS_ID = 26


def _clean_label_line(line):
    """Decide what to do with one label line.

    Returns a ``(action, output_line, class_id)`` tuple. ``action`` is one of
    ``"blank"``, ``"error"``, ``"delete"``, ``"modify"`` or ``"keep"``.
    ``output_line`` is the text to write for ``"modify"``/``"keep"`` and
    ``None`` otherwise. ``class_id`` is ``None`` when it could not be parsed.
    """
    parts = line.split()
    if not parts:
        return "blank", None, None

    try:
        class_id = int(parts[0])
        values = [float(token) for token in parts[1:]]
    except ValueError:
        # A non-numeric token is bad label data, not a reason to abandon
        # the whole file.
        return "error", None, None

    if not all(0 <= value <= 1 for value in values):
        return "error", None, class_id
    if class_id in _DROPPED_CLASS_IDS:
        return "delete", None, class_id
    if class_id in _TITLE_SOURCE_CLASS_IDS:
        rewritten = f"{_TITLE_CLASS_ID} {' '.join(map(str, values))}\n"
        return "modify", rewritten, class_id
    return "keep", line, class_id


def clean_label_files(labels_dir, output_labels_dir, log_file):
    """Write cleaned copies of every ``.txt`` label file in ``labels_dir``.

    Each non-blank line is read as an integer class id followed by float
    values and handled as follows:

    * a line with a value outside ``[0, 1]``, or with a token that is not a
      number, is dropped and logged as ``error line``;
    * a line whose class id is 4 or 20 is dropped and logged as
      ``delete line``;
    * a line whose class id is 5, 9, 13, 21 or 25 is rewritten with class id
      26 and logged as ``modify line``;
    * any other line is copied unchanged.

    Blank lines are not copied. Entries without a ``.txt`` extension are
    ignored. A file that cannot be read or written is reported and skipped so
    the remaining files are still processed.

    Args:
        labels_dir: Directory containing the original label files.
        output_labels_dir: Directory that receives the cleaned files; created
            if it does not exist.
        log_file: Path of the log file; entries are appended to it.

    Raises:
        OSError: If ``labels_dir`` cannot be listed, ``output_labels_dir``
            cannot be created, or ``log_file`` cannot be opened.
    """
    os.makedirs(output_labels_dir, exist_ok=True)

    # Filter first so the progress-bar total matches the files actually
    # processed; sorting keeps the log order reproducible between runs.
    label_files = sorted(
        name for name in os.listdir(labels_dir)
        if os.path.splitext(name)[1].lower() == ".txt"
    )

    # The log is opened once for the whole run instead of once per file.
    with open(log_file, "a", encoding="utf-8") as log:
        with tqdm(total=len(label_files), desc="Processing files", unit="file") as pbar:
            for label_file in label_files:
                file_path = os.path.join(labels_dir, label_file)
                output_file_path = os.path.join(output_labels_dir, label_file)

                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        lines = f.readlines()

                    new_lines = []
                    for line_number, line in enumerate(lines, start=1):
                        action, output_line, class_id = _clean_label_line(line)

                        if action == "error":
                            log.write(f"{label_file}:\n")
                            log.write(f"\terror line: {line_number}\n")
                        elif action == "delete":
                            log.write(f"{label_file}:\n")
                            log.write(f"\tdelete line: {line_number}\n")
                        elif action == "modify":
                            log.write(f"{label_file}:\n")
                            log.write(
                                f"\tmodify line {line_number}: "
                                f"{class_id} -> {_TITLE_CLASS_ID}\n"
                            )

                        if output_line is not None:
                            new_lines.append(output_line)

                    with open(output_file_path, "w", encoding="utf-8") as out:
                        out.writelines(new_lines)
                except (OSError, UnicodeError) as e:
                    # tqdm.write prints without breaking the progress bar;
                    # the file name is included so the failure can be traced.
                    tqdm.write(f"发生错误: {label_file}: {e}")
                finally:
                    pbar.update(1)


if __name__ == '__main__':
    # Command-line entry point: clean <dataset_dir>/labels into
    # <dataset_dir>/processed_labels and append a log under <dataset_dir>.
    #
    # The text is passed as ``description`` explicitly: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # would show it as the program name in the usage line.
    parser = argparse.ArgumentParser(description="删除特定标注信息\n")
    parser.add_argument("dataset_dir", help="图片文件夹路径")
    parser.add_argument("--log_file", default="processed_files.log", help="记录被处理文件的日志文件路径")

    args = parser.parse_args()

    labels_dir = os.path.join(args.dataset_dir, 'labels')  # original label files
    output_labels_dir = os.path.join(args.dataset_dir, 'processed_labels')  # cleaned label files
    log_file_path = os.path.join(args.dataset_dir, args.log_file)  # log file path
    clean_label_files(labels_dir, output_labels_dir, log_file_path)