5 tháng trước cách đây · e6cd5126b8
--- a/data_collection/check_repeat.py
+++ b/data_collection/check_repeat.py
@@ -2,6 +2,7 @@ import os
 
																 import shutil
															
 
																 import argparse
															
 
																+# 脚本功能：可以查询目标路径下面多个文件夹中的重复文件，并将多余的重复文件删除
															
 
																 def find_and_delete_duplicate_files(path):
															
 
																     # 字典，用于跟踪文件名及其出现的路径
															
--- a/data_collection/files_fetching.py
+++ b/data_collection/files_fetching.py
@@ -4,6 +4,7 @@ import shutil
 
																 import concurrent.futures
															
 
																 import random
															
 
																+# 脚本功能：可以随机抓取指定目录下的文件，可指定后缀名，指定要抓取的数量
															
 
																 version = "1.0.0"
															
--- a/data_collection/images_fetching.py
+++ b/data_collection/images_fetching.py
@@ -0,0 +1,101 @@
 
																+import argparse
															
 
																+import os
															
 
																+import shutil
															
 
																+import fitz
															
 
																+import uuid
															
 
																+import random
															
 
																+import json
															
 
																+
															
 
																+
															
 
# Accumulates one {"file_name", "page_numbers"} record per processed PDF.
# Appended to by pyMuPDF_fitz() and dumped to files_info.json by main().
json_data = []
															
 
																+
															
 
																+
															
 
																+def is_in_whitelist(file_name, whitelist):
															
 
																+    # 获取文件的后缀名
															
 
																+    _, ext = os.path.splitext(file_name)
															
 
																+    # 判断后缀名是否在白名单中
															
 
																+    return ext.lower() in whitelist
															
 
																+
															
 
																+
															
 
																+def generate_unique_random_numbers(start, end, count):
															
 
																+    numbers = set()
															
 
																+    while len(numbers) < count:
															
 
																+        numbers.add(random.randint(start, end-1))
															
 
																+    return list(numbers)[:count]
															
 
																+
															
 
																+
															
 
																+def get_render_pages(count, need):
															
 
																+    if count < 3:
															
 
																+        return list(range(count))
															
 
																+    else:
															
 
																+        return generate_unique_random_numbers(0, count, need)
															
 
																+
															
 
																+
															
 
																+def pyMuPDF_fitz(pdfPath, imagePath, need):
															
 
																+    try:
															
 
																+        pdfDoc = fitz.open(pdfPath)
															
 
																+
															
 
																+        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
															
 
																+        page_array = get_render_pages(pdfDoc.page_count, need)
															
 
																+
															
 
																+        file_info = {
															
 
																+            "file_name" : file_name,
															
 
																+            "page_numbers" : page_array
															
 
																+        }
															
 
																+        json_data.append(file_info)
															
 
																+
															
 
																+        for pg in page_array:
															
 
																+            page = pdfDoc[pg]
															
 
																+            info = page.bound()
															
 
																+            radio = 0.0
															
 
																+            if info.width > info.height:
															
 
																+                radio = 800.0 / info.width
															
 
																+            else:
															
 
																+                radio = 800.0 / info.height
															
 
																+            rotate = int(0)
															
 
																+            zoom_x = 1.33333
															
 
																+            zoom_y = 1.333333
															
 
																+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
															
 
																+            try:
															
 
																+                pix = page.get_pixmap(matrix=mat, alpha=False)
															
 
																+            except Exception as e:
															
 
																+                print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
															
 
																+                continue
															
 
																+            if not os.path.exists(imagePath):
															
 
																+                os.makedirs(imagePath)
															
 
																+
															
 
																+            save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
															
 
																+            pix.save(save_path)
															
 
																+
															
 
																+    except Exception as e:
															
 
																+        print(f"发生错误: {e}")
															
 
																+
															
 
																+    pdfDoc.close()
															
 
																+
															
 
																+
															
 
																+def process_files(input, output, need):
															
 
																+    # 遍历目录中的文件和文件夹
															
 
																+    for root, dirs, files in os.walk(input):
															
 
																+        # 只输出文件名，不包括文件夹
															
 
																+        for file in files:
															
 
																+            if not file.startswith(".") and is_in_whitelist(file, [".pdf"]):
															
 
																+                file_path = os.path.join(root, file)
															
 
																+                pyMuPDF_fitz(file_path, output, need)
															
 
																+
															
 
																+
															
 
																+def main():
															
 
																+    parse = argparse.ArgumentParser("从PDF当中随机抽取图片\n")
															
 
																+    parse.add_argument("input", help="输入路径")
															
 
																+    parse.add_argument("output", help="输出路径")
															
 
																+    parse.add_argument("--count", type=int, help="每份PDF需要提取的图片数量")
															
 
																+
															
 
																+    args = parse.parse_args()
															
 
																+    process_files(args.input, os.path.join(args.output, "images"), args.count)
															
 
																+
															
 
																+    json_path = os.path.join(args.output, "files_info.json")
															
 
																+    with open(json_path, 'w', encoding='utf-8') as f:
															
 
																+        json.dump(json_data, f, ensure_ascii=False, indent=4)
															
 
																+
															
 
																+
															
 
																+if __name__ == "__main__":
															
 
																+    main()
															
--- a/delete_anno_by_label/delete_annot.py
+++ b/delete_anno_by_label/delete_annot.py
@@ -0,0 +1,65 @@
 
																+import argparse
															
 
																+import json
															
 
																+import os
															
 
																+from pycocotools.coco import COCO
															
 
																+
															
 
																+
															
 
																+def new_path(old_path):
															
 
																+    # 使用os.path.dirname获取目录部分
															
 
																+    directory = os.path.dirname(old_path)
															
 
																+    # 使用os.path.basename获取文件名部分（包括扩展名）
															
 
																+    filename_with_extension = os.path.basename(old_path)
															
 
																+    # 使用os.path.splitext分割文件名和扩展名
															
 
																+    filename, extension = os.path.splitext(filename_with_extension)
															
 
																+    # 修改文件名部分，添加_filter并重新组合扩展名
															
 
																+    new_filename = filename + '_filter' + extension
															
 
																+    # 如果directory为空（即文件名只包含一个文件名没有目录），则使用'.'作为目录
															
 
																+    if not directory:
															
 
																+        directory = '.'
															
 
																+    # 使用os.path.join组合目录和新的文件名
															
 
																+    return os.path.join(directory, new_filename)
															
 
																+
															
 
																+
															
 
																+def process_coco(file, annot_name, output=""):
															
 
																+    # 初始化COCO对象
															
 
																+    coco = COCO(file)
															
 
																+    # 获取所有的类别信息
															
 
																+    categories = coco.loadCats(coco.getCatIds())
															
 
																+    # 找到"line"类别的ID
															
 
																+    cat_id = []
															
 
																+    for cat in categories:
															
 
																+        if cat['name'] in annot_name:
															
 
																+            cat_id.append(cat['id'])
															
 
																+
															
 
																+    if cat_id is None:
															
 
																+        print("Category not found in annotations.")
															
 
																+    else:
															
 
																+        # 获取所有的标注
															
 
																+        annotations = coco.dataset['annotations']
															
 
																+
															
 
																+        # 创建一个新的标注列表，不包含"line"类别的标注
															
 
																+        new_annotations = [ann for ann in annotations if ann['category_id'] not in cat_id]
															
 
																+
															
 
																+        # 将新的标注列表写回到一个新的JSON文件中
															
 
																+        if output == "":
															
 
																+            output = new_path(file)
															
 
																+
															
 
																+        with open(output, 'w') as f:
															
 
																+            coco.dataset['annotations'] = new_annotations
															
 
																+            json.dump(coco.dataset, f, ensure_ascii=True, indent=2)
															
 
																+
															
 
																+
															
 
																+def main():
															
 
																+    parser = argparse.ArgumentParser("删除特定标注信息\n")
															
 
																+    parser.add_argument("input", help="输入路径")
															
 
																+    parser.add_argument("type", help="数据标注格式类型，目前支持coco")
															
 
																+    parser.add_argument("--labels", nargs='+', help="需要清除的标签")
															
 
																+    parser.add_argument("--output", help="[可选]输出路径")
															
 
																+
															
 
																+    args = parser.parse_args()
															
 
																+    if args.type == 'coco':
															
 
																+        process_coco(args.input, args.labels)
															
 
																+
															
 
																+
															
 
																+if __name__ == "__main__":
															
 
																+    main()