numpy Pytesseract结果包含意外内容“\n\x0c”

wnrlj8wa  于 2022-11-24  发布在  其他
关注(0)|答案(1)|浏览(174)

我正在用 Python 做 OCR(图像转文本),并比较识别结果中是否有重复;我是逐个进行检查的,这样更容易定位重复项的位置。
图片链接:https://imgur.com/a/0BGmtEV

主要问题:我把(图片链接中的原始图片里)每个小图的识别结果转成文本,例如 CAT4B5、CA7T4BB、CATAAF……,并保存到一个列表中;但打印该列表时,每个元素都带有多余的字符(见下方列表)。
如何去掉这些 \n\n\n\x0c?

我查了一些资料,找到了这个问题:Remove '\n\n\n', '\n' from python list。但我希望从一开始(生成文本时)就避免出现这些字符,而不是事后再清理;应该有办法在源头上避免。
['\n\nCAT4B5\n\x0c','CA7T4BB\n\x0c','CATAAF\n\x0c','CAT4C1\n\x0c','CAT4C7\n\x0c','CAT4B6\n\x0c','CAT4B0\n\x0c','CAT4BC\n\x0c','CAT4C2\n\x0c','CAT4C8\n\x0c','\n\nCAT4B7\n\x0c','CATAC3\n\x0c','CAT4C9\n\x0c','\n\nCAT4B2\n\x0c','\n\nCAT7T4B8\n\x0c','\n\nCATACS\n\x0c','CATAC4\n\x0c','CATACA\n\x0c','\n\nCATABS\n\x0c','\n\nCAT4B9\n\x0c','\n\nCAT4BF\n\x0c','CAT4CS\n\x0c','CAT4CB\n\x0c','\n\nCAT4B4\n\x0c','CATABA\n\x0c','\n\nCATACE\n\x0c','CATACE\n\x0c','CAT4CC\n\x0c']

  • 整个.py脚本:
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract

# Script: find the labelled cells in a sheet image, OCR a fixed window inside
# each one with Tesseract, and report the positions whose text has already been
# seen at an earlier position.

INPUT_PATH = "/home/student_DC/desktop/optimization_11_10/original_duplicate.png"
OUTPUT_PATH = "/home/student_DC/desktop/optimization_11_10/original_duplicate_output.png"

# Size of the fixed window cropped at each snapped position and passed to OCR.
CROP_W = 59
CROP_H = 23

image = cv2.imread(INPUT_PATH)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None, which
    # would otherwise surface later as an opaque error inside cvtColor.
    raise FileNotFoundError("cannot read image: " + INPUT_PATH)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Inverse-threshold the grayscale image, then dilate with a 10x20 kernel before
# looking for contours.
_, thresh = cv2.threshold(gray, 60, 255, cv2.THRESH_BINARY_INV)
blur = cv2.medianBlur(thresh, 1)
kernel = np.ones((10, 20), np.uint8)
img_dilation = cv2.dilate(blur, kernel, iterations=1)

# findContours returns (image, contours, hierarchy) on OpenCV 3.x but
# (contours, hierarchy) on 2.x/4.x; the contour list is always second from last.
ctrs = cv2.findContours(img_dilation.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

# Visit contours left to right (by the x of their bounding box).
sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

xy_list = []                      # every snapped [x, y] that was OCR'd
listOfElems = []                  # distinct OCR texts, in first-seen order
listOfDuplicate = []
list_for_duplicate_x_and_y = []   # snapped [x, y] of each repeated text

for ctr in sorted_ctrs:
    x, y, w, h = cv2.boundingRect(ctr)
    # Only boxes wider than 50 px and between 50 and 200 px tall are processed.
    if not (50 < h < 200 and w > 50):
        continue

    # NOTE: this draws onto the same image the OCR crops are taken from below.
    cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), 1)

    print(x, y)

    # Snap x to a fixed value when it lies within 20 px of a listed position
    # (presumably the six column positions - confirm against the sheet layout);
    # 0 marks "no match".
    for xc in (45, 150, 255, 360, 465, 570):
        if xc - 20 < x < xc + 20:
            x = xc + 26
            break
    else:
        x = 0

    # Same snapping for y (presumably the five row positions - confirm).
    for yc in (132, 243, 586, 357, 470):
        if yc - 20 < y < yc + 20:
            y = yc + 48
            break
    else:
        y = 0

    print("new number", x, y)

    # Boxes that did not snap on both axes are ignored.
    if x == 0 or y == 0:
        continue

    xy_list.append([x, y])
    new_crop = image[y:y + CROP_H, x:x + CROP_W]

    # Tesseract pads its result with whitespace and ends it with a form feed
    # ('\x0c'). Strip it here so that the stored values are clean and so that
    # the same code is recognised as a duplicate regardless of padding.
    text = pytesseract.image_to_string(new_crop, lang='eng').strip()

    if not text:
        # Nothing was recognised: there is no code to compare, and storing ""
        # would make every later unreadable cell look like a duplicate.
        print("No text recognized:")
        print("x :", x, "y :", y)
        print("=  =  =  =  =  ")
        continue

    if text not in listOfElems:
        listOfElems.append(text)
        print(text)
        print("=  =  =  =  =  ")
    else:
        print("Duplicate text is here:")
        print("x :", x, "y :", y)
        list_for_duplicate_x_and_y.append([x, y])
        print("=  =  =  =  =  ")

print("len is : ", len(xy_list))

# Row-major order: sort by y first, then x.
aaa_list = sorted(xy_list, key=lambda k: [k[1], k[0]])
print(aaa_list)

print("list_for_duplicate_x_and_y is :")

print(list_for_duplicate_x_and_y)

print("listOfElems is :")
print(listOfElems)

# Reload a clean copy of the input and write it out. The rectangle calls below
# are still disabled, so the output is currently an unmodified copy.
img = cv2.imread(INPUT_PATH)

#               cv2.rectangle(img, (duplicate_x, duplicate_y), (duplicate_x + 92, duplicate_y + 82), (0, 255, 5), 5)              # thin green box
# cv2.rectangle(img, (120, 120), (150, 150), (255, 0, 0), 5)         # thick blue box

cv2.imwrite(OUTPUT_PATH, img)
  • 完整输出(来自未对 OCR 结果调用 strip() 的原始脚本):
0 0
new number 0 0
44 472
new number 71 518
 

CAT4B5

=  =  =  =  =  
44 357
new number 71 405
CA7T4BB

=  =  =  =  =  
45 586
new number 71 634
CATAAF

=  =  =  =  =  
46 242
new number 71 291
CAT4C1

=  =  =  =  =  
50 132
new number 71 180
‘CAT4C7

=  =  =  =  =  
148 472
new number 176 518
CAT4B6

=  =  =  =  =  
149 587
new number 176 634
CAT4B0

=  =  =  =  =  
149 357
new number 176 405
CAT4BC

=  =  =  =  =  
150 243
new number 176 291
CAT4C2

=  =  =  =  =  
153 132
new number 176 180
CAT4C8

=  =  =  =  =  
253 588
new number 281 634
 

=  =  =  =  =  
253 473
new number 281 518
 

CAT4B7

=  =  =  =  =  
254 357
new number 281 405
Duplicate text is here:
x : 281 y : 405
=  =  =  =  =  
255 243
new number 281 291
CATAC3

=  =  =  =  =  
257 132
new number 281 180
CAT4C9

=  =  =  =  =  
357 588
new number 386 634
 

CAT4B2

=  =  =  =  =  
358 473
new number 386 518
 

CA7T4B8

=  =  =  =  =  
358 361
new number 386 405
 

CATACS

=  =  =  =  =  
359 243
new number 386 291
CATAC4

=  =  =  =  =  
360 132
new number 386 180
CATACA

=  =  =  =  =  
461 589
new number 491 634
 

CATABS

=  =  =  =  =  
462 474
new number 491 518
 

CAT4B9

=  =  =  =  =  
463 358
new number 491 405
 

CAT4BF

=  =  =  =  =  
463 243
new number 491 291
CAT4CS

=  =  =  =  =  
464 131
new number 491 180
CAT4CB

=  =  =  =  =  
566 589
new number 596 634
 

CAT4B4

=  =  =  =  =  
567 474
new number 596 518
CATABA

=  =  =  =  =  
567 361
new number 596 405
 

CATACE

=  =  =  =  =  
568 244
new number 596 291
CATACE

=  =  =  =  =  
568 131
new number 596 180
CAT4CC

=  =  =  =  =  
len is :  30
[[71, 180], [176, 180], [281, 180], [386, 180], [491, 180], [596, 180], [71, 291], [176, 291], [281, 291], [386, 291], [491, 291], [596, 291], [71, 405], [176, 405], [281, 405], [386, 405], [491, 405], [596, 405], [71, 518], [176, 518], [281, 518], [386, 518], [491, 518], [596, 518], [71, 634], [176, 634], [281, 634], [386, 634], [491, 634], [596, 634]]
list_for_duplicate_x_and_y is :
[[281, 405]]
listOfElems is :
[' \n\nCAT4B5\n\x0c', 'CA7T4BB\n\x0c', 'CATAAF\n\x0c', 'CAT4C1\n\x0c', '‘CAT4C7\n\x0c', 'CAT4B6\n\x0c', 'CAT4B0\n\x0c', 'CAT4BC\n\x0c', 'CAT4C2\n\x0c', 'CAT4C8\n\x0c', ' \n\x0c', ' \n\nCAT4B7\n\x0c', 'CATAC3\n\x0c', 'CAT4C9\n\x0c', ' \n\nCAT4B2\n\x0c', ' \n\nCA7T4B8\n\x0c', ' \n\nCATACS\n\x0c', 'CATAC4\n\x0c', 'CATACA\n\x0c', ' \n\nCATABS\n\x0c', ' \n\nCAT4B9\n\x0c', ' \n\nCAT4BF\n\x0c', 'CAT4CS\n\x0c', 'CAT4CB\n\x0c', ' \n\nCAT4B4\n\x0c', 'CATABA\n\x0c', ' \n\nCATACE\n\x0c', 'CATACE\n\x0c', 'CAT4CC\n\x0c']
xfb7svmp

xfb7svmp1#

在把识别结果赋值给 text 变量时,使用字符串的 strip() 方法去掉首尾不需要的空白字符(包括换行符 \n 和换页符 \x0c)。

# Strip the OCR result as soon as it is produced, so leading/trailing
# whitespace ('\n', '\x0c', spaces) never reaches the list.
text = pytesseract.image_to_string(new_crop, lang='eng').strip()

示例:

# A raw OCR result as it appears in the printed list: padding before the code,
# newline and form feed after it.
t = ' \n\nCAT4B5\n\x0c'
# Called without arguments, strip() removes whitespace from both ends.
t.strip()
# 'CAT4B5'

相关问题