Gabor Cselle committed on
Commit
95ccd40
·
1 Parent(s): 41d52be

Train and test splitter. Clean up comments and code a bit (but not too much).

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +3 -0
  3. arrange_train_test_images.py +38 -0
  4. gen_sample_data.py +5 -8
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  font_images
 
2
  .DS_Store
 
1
  font_images
2
+ train_test_images
3
  .DS_Store
README.md CHANGED
@@ -6,3 +6,6 @@ Follow along:
6
  - [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
7
  - [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
8
  - [On Twitter](https://twitter.com/gabor/status/1722300841691103467)
 
 
 
 
6
  - [On Pebble.social](https://pebble.social/@gabor/111376050835874755)
7
  - [On Threads.net](https://www.threads.net/@gaborcselle/post/CzZJpJCpxTz)
8
  - [On Twitter](https://twitter.com/gabor/status/1722300841691103467)
9
+
10
+ Generate sample images (note: this only works on macOS): [gen_sample_data.py]
11
+ Arrange the generated images into train and test sets: [arrange_train_test_images.py]
arrange_train_test_images.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # moves the font images into train and test folders
2
+ # TODO(gabor): maybe we should copy these instead, so we don't have to regenerate the images every time?
3
+ import os
4
+ import shutil
5
+ import random
6
+
7
+ source_dir = './font_images'
8
+ organized_dir = './train_test_images'
9
+ train_dir = os.path.join(organized_dir, 'train')
10
+ test_dir = os.path.join(organized_dir, 'test')
11
+
12
+ # create directories if they don't exist
13
+ os.makedirs(train_dir, exist_ok=True)
14
+ os.makedirs(test_dir, exist_ok=True)
15
+
16
+ # make a list of all the font names
17
+ fonts = [f.split('_')[0] for f in os.listdir(source_dir) if f.endswith('.png')]
18
+ fonts = list(set(fonts)) # getting unique font names
19
+
20
+ for font in fonts:
21
+ font_train_dir = os.path.join(train_dir, font)
22
+ font_test_dir = os.path.join(test_dir, font)
23
+ os.makedirs(font_train_dir, exist_ok=True)
24
+ os.makedirs(font_test_dir, exist_ok=True)
25
+
26
+ font_files = [f for f in os.listdir(source_dir) if f.startswith(font)]
27
+ random.shuffle(font_files)
28
+
29
+ train_files = font_files[:int(0.8 * len(font_files))]
30
+ test_files = font_files[int(0.8 * len(font_files)):]
31
+
32
+ # Moving training files
33
+ for train_file in train_files:
34
+ shutil.move(os.path.join(source_dir, train_file), font_train_dir)
35
+
36
+ # Moving test files
37
+ for test_file in test_files:
38
+ shutil.move(os.path.join(source_dir, test_file), font_test_dir)
gen_sample_data.py CHANGED
@@ -10,9 +10,6 @@ import random
10
  # Download the necessary data from nltk
11
  nltk.download('brown')
12
 
13
- # Sample text for prose and code
14
- prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
15
-
16
  # Note that this will only work on macOS, where this is the default font directory
17
  font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
18
  output_dir = './font_images'
@@ -24,17 +21,17 @@ all_brown_words = sorted(set(brown.words(categories='news')))
24
  FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
25
 
26
  def wrap_text(text, line_length=10):
27
- """
28
- Wraps the provided text every 'line_length' words.
29
- """
30
  words = text.split()
31
  return "\n".join([" ".join(words[i:i+line_length]) for i in range(0, len(words), line_length)])
32
 
33
- def random_prose_text(words, num_words=200): # Sample random words
 
34
  random_words = " ".join(random.sample(words, num_words))
35
  return wrap_text(random_words)
36
 
37
- def random_code_text(base_code, num_lines=15): # Increase number of lines
 
38
  lines = base_code.split("\n")
39
  return "\n".join(random.sample(lines, min(num_lines, len(lines))))
40
 
 
10
  # Download the necessary data from nltk
11
  nltk.download('brown')
12
 
 
 
 
13
  # Note that this will only work on macOS, where this is the default font directory
14
  font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
15
  output_dir = './font_images'
 
21
  FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
22
 
23
  def wrap_text(text, line_length=10):
24
+ """Wraps the provided text every 'line_length' words."""
 
 
25
  words = text.split()
26
  return "\n".join([" ".join(words[i:i+line_length]) for i in range(0, len(words), line_length)])
27
 
28
+ def random_prose_text(words, num_words=200):
29
+ """Returns a random selection of 'num_words' words from the provided list of words."""
30
  random_words = " ".join(random.sample(words, num_words))
31
  return wrap_text(random_words)
32
 
33
+ def random_code_text(base_code, num_lines=15):
34
+ """Returns a random selection of 'num_lines' lines from the provided code."""
35
  lines = base_code.split("\n")
36
  return "\n".join(random.sample(lines, min(num_lines, len(lines))))
37