gyrojeff committed on
Commit
3daa9d7
·
1 Parent(s): eb77f3e

feat: add broken detection script

Browse files
Files changed (1) hide show
  1. font_ds_detect_broken.py +96 -0
font_ds_detect_broken.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import traceback
3
+ import pickle
4
+ import os
5
+ import concurrent.futures
6
+ from tqdm import tqdm
7
+ import time
8
+ from font_dataset.font import load_fonts
9
+ import cv2
10
+
11
# Multiplier applied to the per-font image count when a font's language is "CJK".
cjk_ratio = 3

# Expected number of images per (non-CJK) font in each split.
train_cnt, val_cnt, test_cnt = 100, 5, 30

# The same counts scaled for CJK fonts.
train_cnt_cjk, val_cnt_cjk, test_cnt_cjk = (
    int(base_cnt * cjk_ratio) for base_cnt in (train_cnt, val_cnt, test_cnt)
)

# Root directory holding one sub-directory per split; created if absent.
dataset_path = "./dataset/font_img"
os.makedirs(dataset_path, exist_ok=True)

# Timestamped log file name and the in-process exclusion list.
# NOTE(review): nothing visible here writes to either of them - confirm whether
# they are still needed by this script.
unqualified_log_file_name = f"unqualified_font_{time.time()}.txt"
runtime_exclusion_list = []

# All known fonts plus a predicate telling whether a font must be skipped.
fonts, exclusion_rule = load_fonts()
28
+
29
+
30
def generate_dataset(dataset_type: str, cnt: int):
    """Scan one dataset split and delete samples that are missing or unreadable.

    Despite its name this function generates nothing: for every font and every
    expected image index it checks that both ``font_{i}_img_{j}.jpg`` and
    ``font_{i}_img_{j}.bin`` exist under ``dataset_path/dataset_type`` and can
    be loaded. Incomplete or unreadable pairs are reported on stdout and their
    files are removed.

    Args:
        dataset_type: Name of the split sub-directory under ``dataset_path``
            (this script calls it with "train", "val" and "test").
        cnt: Expected number of images per font. Fonts whose ``language`` is
            "CJK" are expected to have ``cnt * cjk_ratio`` images.
    """
    dataset_base_dir = os.path.join(dataset_path, dataset_type)
    os.makedirs(dataset_base_dir, exist_ok=True)

    def _remove_if_exists(path):
        # Tolerate an already-absent file so one bad sample cannot abort the scan.
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _generate_single(args):
        """Check a single (font index, image index, font) work item."""
        i, j, font = args
        print(
            f"Checking {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}",
            end="\r",
        )

        if exclusion_rule(font):
            print(f"Excluded font: {font.path}")
            return
        if font.path in runtime_exclusion_list:
            print(f"Excluded font: {font.path}")
            return

        image_file_name = f"font_{i}_img_{j}.jpg"
        label_file_name = f"font_{i}_img_{j}.bin"

        image_file_path = os.path.join(dataset_base_dir, image_file_name)
        label_file_path = os.path.join(dataset_base_dir, label_file_name)

        # detect missing: an incomplete pair cannot be validated, so report it,
        # drop whichever half is left over, and stop here instead of falling
        # through to the load attempt (which used to crash in os.remove).
        if not (os.path.exists(image_file_path) and os.path.exists(label_file_path)):
            print(
                f"Missing {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            _remove_if_exists(image_file_path)
            _remove_if_exists(label_file_path)
            return

        # detect broken
        try:
            # cv2.imread signals failure by returning None rather than raising,
            # so the result has to be inspected explicitly.
            image = cv2.imread(image_file_path)
            # NOTE: pickle.load executes arbitrary code from the file; only run
            # this against label files produced by the project's own generator.
            with open(label_file_path, "rb") as f:
                pickle.load(f)
            is_broken = image is None
        except Exception:
            # Unpickling a corrupt file can raise many unrelated exception
            # types, so any failure here is treated as "broken".
            is_broken = True

        if is_broken:
            print(
                f"Broken {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
            )
            _remove_if_exists(image_file_path)
            _remove_if_exists(label_file_path)

    # Build one work item per expected image of every font.
    work_list = []
    for i, font in enumerate(fonts):
        true_cnt = cnt * cjk_ratio if font.language == "CJK" else cnt
        work_list.extend((i, j, font) for j in range(true_cnt))

    for work_item in tqdm(work_list):
        _generate_single(work_item)
92
+
93
+
94
# Run the check over every split, in order, with that split's per-font count.
for _split_name, _split_cnt in (
    ("train", train_cnt),
    ("val", val_cnt),
    ("test", test_cnt),
):
    generate_dataset(_split_name, _split_cnt)