Gabor Cselle
committed on
Commit
·
41d52be
1
Parent(s):
d0f419a
Font allowlist, deal with .ttc files, let's just generate 10 per font
Browse files
- .gitignore +2 -1
- gen_sample_data.py +36 -22
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
font_images
|
|
|
|
|
|
| 1 |
+
font_images
|
| 2 |
+
.DS_Store
|
gen_sample_data.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# Generate sample data with 800x400 images of fonts in /System/Library/Fonts
|
| 2 |
# 50 images per font, 1 font per image
|
| 3 |
|
| 4 |
-
|
| 5 |
import os
|
| 6 |
from PIL import Image, ImageDraw, ImageFont
|
| 7 |
import nltk
|
|
@@ -14,12 +13,16 @@ nltk.download('brown')
|
|
| 14 |
# Sample text for prose and code
|
| 15 |
prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
output_dir = './font_images'
|
| 19 |
os.makedirs(output_dir, exist_ok=True)
|
| 20 |
|
| 21 |
all_brown_words = sorted(set(brown.words(categories='news')))
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
def wrap_text(text, line_length=10):
|
| 24 |
"""
|
| 25 |
Wraps the provided text every 'line_length' words.
|
|
@@ -35,27 +38,38 @@ def random_code_text(base_code, num_lines=15): # Increase number of lines
|
|
| 35 |
lines = base_code.split("\n")
|
| 36 |
return "\n".join(random.sample(lines, min(num_lines, len(lines))))
|
| 37 |
|
| 38 |
-
for
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
font_size = random.choice(range(32, 128)) # Increased minimum font size
|
| 52 |
font = ImageFont.truetype(font_path, font_size)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Generate sample data with 800x400 images of fonts in /System/Library/Fonts
|
| 2 |
# 50 images per font, 1 font per image
|
| 3 |
|
|
|
|
| 4 |
import os
|
| 5 |
from PIL import Image, ImageDraw, ImageFont
|
| 6 |
import nltk
|
|
|
|
| 13 |
# Sample text for prose and code
|
| 14 |
prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
|
| 15 |
|
| 16 |
+
# Note that this will only work on macOS, where these are the default font directories
|
| 17 |
+
font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
|
| 18 |
output_dir = './font_images'
|
| 19 |
os.makedirs(output_dir, exist_ok=True)
|
| 20 |
|
| 21 |
all_brown_words = sorted(set(brown.words(categories='news')))
|
| 22 |
|
| 23 |
+
# This is a list of fonts that we want to use for our sample data
|
| 24 |
+
FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
|
| 25 |
+
|
| 26 |
def wrap_text(text, line_length=10):
|
| 27 |
"""
|
| 28 |
Wraps the provided text every 'line_length' words.
|
|
|
|
| 38 |
lines = base_code.split("\n")
|
| 39 |
return "\n".join(random.sample(lines, min(num_lines, len(lines))))
|
| 40 |
|
| 41 |
+
for font_dir in font_dirs:
|
| 42 |
+
for font_file in os.listdir(font_dir):
|
| 43 |
+
if font_file.endswith('.ttf') or font_file.endswith('.ttc'):
|
| 44 |
+
font_path = os.path.join(font_dir, font_file)
|
| 45 |
+
font_name = font_file.split('.')[0]
|
| 46 |
+
if font_name not in FONT_ALLOWLIST:
|
| 47 |
+
continue
|
| 48 |
+
# Output the font name so we can see the progress
|
| 49 |
+
print(font_path, font_name)
|
| 50 |
|
| 51 |
+
if font_file.endswith('.ttc'):
|
| 52 |
+
# ttc fonts have multiple fonts in one file, so we need to specify which one we want
|
| 53 |
+
font = ImageFont.truetype(font_path, random.choice(range(32, 128)), index=0)
|
| 54 |
+
else:
|
| 55 |
+
# ttf fonts have only one font in the file
|
| 56 |
font_size = random.choice(range(32, 128)) # Increased minimum font size
|
| 57 |
font = ImageFont.truetype(font_path, font_size)
|
| 58 |
|
| 59 |
+
# Counter for the image filename
|
| 60 |
+
j = 0
|
| 61 |
+
for i in range(10): # Generate 50 images per font - reduced to 10 for now to make things faster
|
| 62 |
+
prose_sample = random_prose_text(all_brown_words)
|
| 63 |
+
|
| 64 |
+
for text in [prose_sample]:
|
| 65 |
+
img = Image.new('RGB', (800, 400), color="white") # Canvas size
|
| 66 |
+
draw = ImageDraw.Draw(img)
|
| 67 |
+
|
| 68 |
+
# Random offsets, but ensuring that text isn't too far off the canvas
|
| 69 |
+
offset_x = random.randint(-20, 10)
|
| 70 |
+
offset_y = random.randint(-20, 10)
|
| 71 |
+
draw.text((offset_x, offset_y), text, fill="black", font=font)
|
| 72 |
+
|
| 73 |
+
j += 1
|
| 74 |
+
output_file = os.path.join(output_dir, f"{font_name}_{j}.png")
|
| 75 |
+
img.save(output_file)
|