
[feat] Add a script for removing specific labels

WangChao, 3 months ago
Commit e6cd5126b8

+ 1 - 0
data_collection/check_repeat.py

@@ -2,6 +2,7 @@ import os
 import shutil
 import argparse
 
+# Purpose: find duplicate files across the folders under a target path and delete the redundant copies
 
 def find_and_delete_duplicate_files(path):
     # dictionary tracking file names and the paths where they appear
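
The hunk cuts off at the function header; as a rough sketch (not the committed implementation), the duplicate-removal logic the new comment describes could look like the following. The keep-the-first-copy policy is an assumption:

    import os

    def find_and_delete_duplicate_files(path):
        # map each file name to the first path where it was seen
        seen = {}
        for root, _dirs, files in os.walk(path):
            for name in files:
                full = os.path.join(root, name)
                if name in seen:
                    # assumed policy: keep the first occurrence, delete later ones
                    os.remove(full)
                else:
                    seen[name] = full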

+ 1 - 0
data_collection/files_fetching.py

@@ -4,6 +4,7 @@ import shutil
 import concurrent.futures
 import random
 
+# Purpose: randomly fetch files from a given directory; the file extension and the number of files to fetch can be specified
 
 version = "1.0.0"
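
Only the comment and the version constant appear in this hunk; a minimal sketch of the random-fetch behaviour the comment describes (fetch_random_files and the src/dst/ext/count parameters are illustrative names, not from the commit):

    import os
    import random
    import shutil

    def fetch_random_files(src, dst, ext, count):
        # gather candidate files with the requested extension
        candidates = [os.path.join(root, f)
                      for root, _dirs, files in os.walk(src)
                      for f in files if f.lower().endswith(ext)]
        # sample without replacement, capped at what is available
        for path in random.sample(candidates, min(count, len(candidates))):
            shutil.copy(path, dst)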
 

+ 101 - 0
data_collection/images_fetching.py

@@ -0,0 +1,101 @@
+import argparse
+import os
+import shutil
+import fitz  # PyMuPDF
+import uuid
+import random
+import json
+
+
+# per-file records of which pages were rendered; dumped to files_info.json at the end
+json_data = []
+
+
+def is_in_whitelist(file_name, whitelist):
+    # get the file extension
+    _, ext = os.path.splitext(file_name)
+    # check whether the extension is in the whitelist
+    return ext.lower() in whitelist
+
+
+def generate_unique_random_numbers(start, end, count):
+    # sample without replacement; clamping count avoids an endless loop
+    # when more numbers are requested than the range contains
+    return random.sample(range(start, end), min(count, end - start))
+
+
+def get_render_pages(count, need):
+    if count < 3:
+        return list(range(count))
+    else:
+        return generate_unique_random_numbers(0, count, need)
+
+
+def pyMuPDF_fitz(pdfPath, imagePath, need):
+    try:
+        pdfDoc = fitz.open(pdfPath)
+    except Exception as e:
+        print(f"Failed to open {pdfPath}: {e}")
+        return
+
+    try:
+        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
+        page_array = get_render_pages(pdfDoc.page_count, need)
+
+        file_info = {
+            "file_name" : file_name,
+            "page_numbers" : page_array
+        }
+        json_data.append(file_info)
+
+        for pg in page_array:
+            page = pdfDoc[pg]
+            info = page.bound()
+            # zoom so that the longer side renders at 800 px
+            ratio = 800.0 / max(info.width, info.height)
+            mat = fitz.Matrix(ratio, ratio)
+            try:
+                pix = page.get_pixmap(matrix=mat, alpha=False)
+            except Exception as e:
+                print("pdfPath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
+                continue
+            if not os.path.exists(imagePath):
+                os.makedirs(imagePath)
+
+            save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
+            pix.save(save_path)
+    except Exception as e:
+        print(f"Error while rendering {pdfPath}: {e}")
+    finally:
+        pdfDoc.close()
+
+
+def process_files(input, output, need):
+    # walk the directory tree under the input path
+    for root, dirs, files in os.walk(input):
+        # files only; hidden files and non-PDFs are skipped
+        for file in files:
+            if not file.startswith(".") and is_in_whitelist(file, [".pdf"]):
+                file_path = os.path.join(root, file)
+                pyMuPDF_fitz(file_path, output, need)
+
+
+def main():
+    parse = argparse.ArgumentParser(description="Randomly extract images from PDFs")
+    parse.add_argument("input", help="input path")
+    parse.add_argument("output", help="output path")
+    # default of 3 is an assumption matching the small-document threshold in
+    # get_render_pages; without it, omitting --count passes None to the sampler
+    parse.add_argument("--count", type=int, default=3, help="number of images to extract per PDF")
+
+    args = parse.parse_args()
+    process_files(args.input, os.path.join(args.output, "images"), args.count)
+
+    json_path = os.path.join(args.output, "files_info.json")
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(json_data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == "__main__":
+    main()
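
Typical invocation (paths illustrative):

    python data_collection/images_fetching.py ./pdfs ./out --count 3

This writes the rendered JPEGs to ./out/images and the page-selection record to ./out/files_info.json.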

+ 65 - 0
delete_anno_by_label/delete_annot.py

@@ -0,0 +1,65 @@
+import argparse
+import json
+import os
+from pycocotools.coco import COCO
+
+
+def new_path(old_path):
+    # directory part of the path
+    directory = os.path.dirname(old_path)
+    # file name including the extension
+    filename_with_extension = os.path.basename(old_path)
+    # split the name from the extension
+    filename, extension = os.path.splitext(filename_with_extension)
+    # append _filter to the name and restore the extension
+    new_filename = filename + '_filter' + extension
+    # a bare file name has no directory part; fall back to the current directory
+    if not directory:
+        directory = '.'
+    # recombine directory and new file name
+    return os.path.join(directory, new_filename)
+
+
+def process_coco(file, annot_name, output=""):
+    # initialise the COCO API on the annotation file
+    coco = COCO(file)
+    # load all category information
+    categories = coco.loadCats(coco.getCatIds())
+    # collect the IDs of the categories to be removed
+    cat_id = []
+    for cat in categories:
+        if cat['name'] in annot_name:
+            cat_id.append(cat['id'])
+
+    if not cat_id:
+        print("Category not found in annotations.")
+    else:
+        # all annotations in the dataset
+        annotations = coco.dataset['annotations']
+
+        # keep only annotations whose category is not being removed
+        new_annotations = [ann for ann in annotations if ann['category_id'] not in cat_id]
+
+        # write the filtered annotations to a new JSON file
+        if output == "":
+            output = new_path(file)
+
+        with open(output, 'w') as f:
+            coco.dataset['annotations'] = new_annotations
+            json.dump(coco.dataset, f, ensure_ascii=True, indent=2)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Remove specific annotations")
+    parser.add_argument("input", help="input path")
+    parser.add_argument("type", help="annotation format; currently only coco is supported")
+    parser.add_argument("--labels", nargs='+', help="labels to remove")
+    parser.add_argument("--output", help="[optional] output path")
+
+    args = parser.parse_args()
+    if args.type == 'coco':
+        # forward --output so the optional argument actually takes effect
+        process_coco(args.input, args.labels, args.output or "")
+
+
+if __name__ == "__main__":
+    main()
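
Typical invocation (file and label names illustrative):

    python delete_anno_by_label/delete_annot.py annotations.json coco --labels line

Without --output, the filtered annotations are written next to the input as annotations_filter.json.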