Переглянути джерело

[feat] 新增清除特定标签的脚本

WangChao 5 місяців тому
батько
коміт
e6cd5126b8

+ 1 - 0
data_collection/check_repeat.py

@@ -2,6 +2,7 @@ import os
 import shutil
 import argparse
 
+# 脚本功能:可以查询目标路径下面多个文件夹中的重复文件,并将多余的重复文件删除
 
 def find_and_delete_duplicate_files(path):
     # 字典,用于跟踪文件名及其出现的路径

+ 1 - 0
data_collection/files_fetching.py

@@ -4,6 +4,7 @@ import shutil
 import concurrent.futures
 import random
 
+# 脚本功能:可以随机抓取指定目录下的文件,可指定后缀名,指定要抓取的数量
 
 version = "1.0.0"
 

+ 101 - 0
data_collection/images_fetching.py

@@ -0,0 +1,101 @@
+import argparse
+import os
+import shutil
+import fitz
+import uuid
+import random
+import json
+
+
+# Module-level accumulator: pyMuPDF_fitz appends one record per PDF
+# (file name and selected page indices) and main() dumps the list to JSON.
+json_data = []
+
+
+def is_in_whitelist(file_name, whitelist):
+    """Report whether the extension of ``file_name`` appears in ``whitelist``.
+
+    The extension (leading dot included) is lowercased before the membership
+    test, so a name ending in ".PDF" matches a whitelist entry ".pdf".
+    """
+    suffix = os.path.splitext(file_name)[1]
+    return suffix.lower() in whitelist
+
+
+def generate_unique_random_numbers(start, end, count):
+    """Return up to ``count`` distinct random integers from ``[start, end)``.
+
+    Args:
+        start: Inclusive lower bound.
+        end: Exclusive upper bound.
+        count: How many distinct values are wanted.
+
+    Returns:
+        A list of distinct integers. Its length is ``count`` clamped to the
+        number of integers available in the range (and to zero for a
+        non-positive ``count``), so the call always terminates.
+    """
+    # The previous rejection loop never finished when ``count`` exceeded the
+    # size of the range; clamp the request and let random.sample draw without
+    # replacement instead.
+    available = max(0, end - start)
+    wanted = max(0, min(count, available))
+    return random.sample(range(start, end), wanted)
+
+
+def get_render_pages(count, need):
+    """Choose which zero-based page indices of a document to render.
+
+    Args:
+        count: Total number of pages in the document.
+        need: Number of pages requested, or ``None`` for no limit.
+
+    Returns:
+        Every page index when the document has fewer than three pages, when
+        ``need`` is ``None``, or when ``need`` covers the whole document;
+        otherwise ``need`` distinct randomly chosen indices.
+    """
+    # A request for at least as many pages as exist can only be satisfied by
+    # all of them; answering directly avoids asking the random helper for
+    # more unique values than the range holds.
+    if count < 3 or need is None or need >= count:
+        return list(range(count))
+    return generate_unique_random_numbers(0, count, need)
+
+
+def pyMuPDF_fitz(pdfPath, imagePath, need):
+    """Render selected pages of one PDF to JPEG files.
+
+    A record holding the PDF's base name and the selected page indices is
+    appended to the module-level ``json_data`` list before rendering starts.
+    Failures while opening or rendering are printed and do not propagate.
+
+    Args:
+        pdfPath: Path of the PDF file to open.
+        imagePath: Directory that receives the images; created on demand.
+        need: Number of pages requested, forwarded to ``get_render_pages``.
+    """
+    # Bound before the try so the cleanup in ``finally`` is safe even when
+    # opening the document fails (close() used to raise NameError then).
+    pdfDoc = None
+    try:
+        pdfDoc = fitz.open(pdfPath)
+
+        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
+        page_array = get_render_pages(pdfDoc.page_count, need)
+
+        json_data.append({
+            "file_name": file_name,
+            "page_numbers": page_array,
+        })
+
+        # Loop-invariant render matrix with zero rotation. The two zoom
+        # factors are kept exactly as they were so image sizes do not change.
+        mat = fitz.Matrix(1.33333, 1.333333).prerotate(0)
+
+        for pg in page_array:
+            page = pdfDoc[pg]
+            try:
+                pix = page.get_pixmap(matrix=mat, alpha=False)
+            except Exception as e:
+                # One unrenderable page must not stop the remaining pages.
+                print("pdfPath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
+                continue
+
+            # Created lazily so no directory appears when nothing renders.
+            os.makedirs(imagePath, exist_ok=True)
+
+            # A short uuid fragment keeps equally named PDFs from colliding.
+            save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
+            pix.save(save_path)
+
+    except Exception as e:
+        print(f"发生错误: {e}")
+    finally:
+        if pdfDoc is not None:
+            pdfDoc.close()
+
+
+def process_files(input, output, need):
+    """Walk ``input`` recursively and render every visible PDF found.
+
+    Files whose names start with a dot are skipped, as are files whose
+    extension is not ".pdf". Each remaining path is handed to
+    ``pyMuPDF_fitz`` together with ``output`` and ``need``.
+    """
+    for folder, _subdirs, names in os.walk(input):
+        for name in names:
+            # Hidden files are rejected first, before the extension check.
+            if name.startswith("."):
+                continue
+            if not is_in_whitelist(name, [".pdf"]):
+                continue
+            pyMuPDF_fitz(os.path.join(folder, name), output, need)
+
+
+def main():
+    """Command-line entry point: sample page images from PDFs.
+
+    Renders pages of every PDF under the input path into
+    ``<output>/images`` and writes the collected per-file page selection to
+    ``<output>/files_info.json``.
+    """
+    # Pass the text as ``description``: the first positional parameter of
+    # ArgumentParser is ``prog``, which would put it in the usage line.
+    parse = argparse.ArgumentParser(description="从PDF当中随机抽取图片\n")
+    parse.add_argument("input", help="输入路径")
+    parse.add_argument("output", help="输出路径")
+    parse.add_argument("--count", type=int, help="每份PDF需要提取的图片数量")
+
+    args = parse.parse_args()
+    process_files(args.input, os.path.join(args.output, "images"), args.count)
+
+    # The images directory only appears once a page has been rendered, so
+    # make sure the output directory exists before writing the summary.
+    os.makedirs(args.output, exist_ok=True)
+    json_path = os.path.join(args.output, "files_info.json")
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(json_data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == "__main__":
+    main()

+ 65 - 0
delete_anno_by_label/delete_annot.py

@@ -0,0 +1,65 @@
+import argparse
+import json
+import os
+from pycocotools.coco import COCO
+
+
+def new_path(old_path):
+    """Build a sibling path whose file stem carries a ``_filter`` suffix.
+
+    The directory and extension of ``old_path`` are kept. When ``old_path``
+    has no directory component, ``'.'`` is used as the directory.
+    """
+    stem, suffix = os.path.splitext(os.path.basename(old_path))
+    # Fall back to the current directory for a bare file name.
+    parent = os.path.dirname(old_path) or '.'
+    return os.path.join(parent, stem + '_filter' + suffix)
+
+
+def process_coco(file, annot_name, output=""):
+    """Write a copy of a COCO annotation file without the given categories.
+
+    Args:
+        file: Path of the COCO JSON file to load.
+        annot_name: Category name, or collection of category names, whose
+            annotations are removed. ``None`` is treated as no names.
+        output: Destination path. When empty, ``new_path(file)`` is used,
+            i.e. the input name with ``_filter`` appended to its stem.
+
+    If none of the names matches a category, a message is printed and no
+    file is written.
+    """
+    # Normalise the names: None would make the ``in`` test raise, and a bare
+    # string would turn it into a substring match.
+    if annot_name is None:
+        annot_name = ()
+    elif isinstance(annot_name, str):
+        annot_name = (annot_name,)
+
+    coco = COCO(file)
+    categories = coco.loadCats(coco.getCatIds())
+    # Ids of every category selected for removal.
+    cat_ids = {cat['id'] for cat in categories if cat['name'] in annot_name}
+
+    # An empty collection (not None) is what signals that nothing matched.
+    if not cat_ids:
+        print("Category not found in annotations.")
+        return
+
+    # Keep only annotations that belong to the remaining categories.
+    coco.dataset['annotations'] = [
+        ann for ann in coco.dataset['annotations']
+        if ann['category_id'] not in cat_ids
+    ]
+
+    if not output:
+        output = new_path(file)
+
+    with open(output, 'w') as f:
+        json.dump(coco.dataset, f, ensure_ascii=True, indent=2)
+
+
+def main():
+    """Command-line entry point: delete annotations that carry given labels.
+
+    Parses the input path, annotation format, labels and optional output
+    path, then dispatches to the handler for that format. Only ``coco`` is
+    handled; any other format is reported as a usage error.
+    """
+    # Pass the text as ``description``: the first positional parameter of
+    # ArgumentParser is ``prog``, which would put it in the usage line.
+    parser = argparse.ArgumentParser(description="删除特定标注信息\n")
+    parser.add_argument("input", help="输入路径")
+    parser.add_argument("type", help="数据标注格式类型,目前支持coco")
+    # Without labels there is nothing to delete, so fail at parse time
+    # instead of crashing later on a None value.
+    parser.add_argument("--labels", nargs='+', required=True, help="需要清除的标签")
+    parser.add_argument("--output", help="[可选]输出路径")
+
+    args = parser.parse_args()
+    if args.type == 'coco':
+        # Forward the optional output path; it used to be parsed and ignored.
+        process_coco(args.input, args.labels, args.output or "")
+    else:
+        parser.error(f"unsupported annotation type: {args.type}")
+
+
+if __name__ == "__main__":
+    main()