from PIL import ImageFont, ImageDraw, Image, ImageOps
import matplotlib.pyplot as plt
import os
import numpy as np
from datetime import datetime as dt
import arabic_reshaper
from bidi.algorithm import get_display
def data_gen(font_folder, font_list,
             data_str=("0 1 2 3 4 5 6 7 8 9",),
             img_size=(12000, 64), font_size=(20,),
             out_dir="OCR/seg/"):
    """Render every string in *data_str* with every font and font size.

    Each string is passed through ``arabic_reshaper.reshape`` and
    ``get_display``, drawn in white (255) on a black 8-bit grayscale
    canvas, saved as ``letter <n>.png`` inside *out_dir*, and collected
    together with its source string.

    Args:
        font_folder: Directory that holds the font files.
        font_list: File names (relative to *font_folder*) of the fonts.
        data_str: Iterable of strings to render, one image per string.
        img_size: ``(width, height)`` of the canvas in pixels.
        font_size: Iterable of font sizes handed to ``ImageFont.truetype``.
        out_dir: Directory the PNG files are written to; created if missing.

    Returns:
        A ``(images, labels)`` pair of equally long lists: ``images[k]`` is
        the rendered canvas as a NumPy array and ``labels[k]`` is the
        original (un-reshaped) string it was rendered from.
    """
    if out_dir:
        # Image.save does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)

    canvas_size = tuple(img_size)
    all_roi = []
    all_labels = []
    count = 0  # running index used in the output file names

    for i, f in enumerate(font_list):
        print("\nfont", i + 1, ":", f)
        font_path = os.path.join(font_folder, str(f))

        for size in font_size:
            # The font depends only on (file, size), so load it once per size
            # instead of once per word. The original passed
            # encoding='UTF-16', which is not one of Pillow's documented
            # encoding names; the default is used instead.
            font = ImageFont.truetype(font_path, size)

            # Text origin: fixed left margin, roughly vertically centred.
            x = 20
            y = canvas_size[1] // 2 - size // 2

            for word in data_str:
                # Reshape, then reorder for display.
                # NOTE(review): whether arabic_reshaper covers every Jawi
                # letter is not visible from this file - confirm.
                reshaped = arabic_reshaper.reshape(word)
                display_text = get_display(reshaped)

                img = Image.new('L', canvas_size, "black")
                ImageDraw.Draw(img).text((x, y), display_text,
                                         fill=255, font=font)

                # Saved with Pillow: the original called cv2.imwrite, but
                # cv2 is never imported in this file.
                img.save(os.path.join(out_dir,
                                      "letter " + str(count) + ".png"))
                count += 1

                all_roi.append(np.array(img))
                all_labels.append(word)

    return all_roi, all_labels
# Space-separated standalone letters.
all_let = " ا إ ب ت ث ج چ ح خ د ذ ر ز س ش ص ض ط ظ ع غ ڠ ف ڤ ق ک ݢ ل م ن و ۏ ه ة ء ي ڽ ى "
# The same letters combined with the tatweel character (ـ) on one or both
# sides - presumably to obtain their joined shapes; confirm with the author.
all_let_styles = "ـا بـ ـبـ ــب تـ ـتـ ـت ثـ ـثـ ـث جـ ـجـ ـج چـ ـچـ ـچ حـ ـح ـحـ خـ ـخـ ـخ \
ـد ـذ ـر ـز سـ ـسـ ـس شـ ـشـ ـش صـ ـصـ ـص ضـ ـضـ ـض طـ ـطـ ـط ظـ ـظـ ـظ عـ ـعـ ـع غـ ـغـ ـغ ڠـ ـڠـ ـڠ فـ ـفـ ـف ڤـ ـڤـ ـڤ \
قـ ـقـ ـق کـ ـکـ ـک ݢـ ـݢـ ـݢ لـ ـلـ ـل مـ ـمـ ـم نـ ـنـ ـن ـو ـۏ هـ ـهـ ـه يـ ـيـ ـي ـى ڽـ ـڽـ ـڽ ـة"
digit = ' 0 1 2 3 4 5 6 7 8 9'
# The backslash is written as "\\": a bare "\ " is an invalid escape
# sequence. The resulting string is unchanged.
signs = ''' ! @ # % ^ & ? / ( ) { } [ ] < > * - + = \\ : ; ' . '''
# Every character group concatenated; not used further in this script.
ds = all_let + all_let_styles + digit + signs
# split() without an argument drops the empty tokens that split(' ') would
# produce for leading, trailing or doubled spaces.
dataset = all_let_styles.split()

# Change this to the current user folder that holds the fonts
font_folder = r'/fonts/'
# os.listdir returns entries in arbitrary order; sorting keeps the numbering
# of the generated files reproducible between runs.
font_list = sorted(os.listdir(font_folder))
ims = [64, 64]

data, labels = data_gen(font_folder, font_list,
                        data_str=dataset,
                        img_size=ims)
print("\nlength of image list:", len(data))
print("length of label list:", len(labels))
我正在尝试构建一个 OCR 系统。上面的代码片段用于生成 Jawi(一种基于阿拉伯字母的书写系统)字符数据集:把每个字符绘制在黑色背景的图像上。目前的问题是,一些不属于标准阿拉伯字母表的字母无法正确显示,另一些字母的显示效果很奇怪(见所附示例)。有人知道问题出在哪里吗?我是编程新手,如有不便,敬请谅解。示例 1、示例 2
暂无答案!
目前还没有任何答案,快来回答吧!