초보적인 질문이라 죄송합니다 - 저는 정말 파이썬 초보자입니다. 표준화된 이미지에서 테두리가 있는 섹션을 식별하고 파이썬으로 자르기
첨부 이미지는 UB-04 양식(또는 CMS-1450)으로, 기관 의료 제공자가 보험 청구에 사용하는 통일된 청구서 양식입니다. 메디케어·메디케이드 서비스 센터(Centers for Medicare and Medicaid Services, CMS)에서 개발했지만 지금은 모든 보험 회사가 사용하는 표준 양식이 되었습니다.
각 의료 제공자는 제공한 치료에 대해 정부 또는 민간 보험으로부터 지급을 받기 위해 정보를 기입한 이 양식을 PDF로 보내옵니다. 예전에는 이 PDF에서 수동으로 정보를 추출했지만, 이 작업을 더 빠르게 진행할 수 있는 프로세스를 만들었습니다. 그러나 이 프로세스는 충분히 견고하지 않아서 가능하다면 개선하고 싶습니다.
임의로 편집한 텍스트가 들어간 이 양식의 샘플을 첨부했습니다.
#I need to clean up these imports -_-
import pytesseract
from PIL import Image as IMG
import PIL
import cv2
from wand.image import Image
import PythonMagick
import pandas as pd
import operator
import functools
import csv
from openpyxl import Workbook, load_workbook
import numpy as np
import math
from matplotlib import pyplot as plt
import os
import xlsxwriter
from random import randint
import glob
import datetime
# Choose the PDF to convert and render every page to its own TIFF file.
# Output files are named 'Cropped Images/<pdf name><page index>.tiff'.
pdfclaimtoconvert = "UB04 PDF"

# The per-page save below fails if the target folder is missing.
os.makedirs('Cropped Images', exist_ok=True)

with Image(filename=pdfclaimtoconvert + ".pdf", resolution=200) as source:
    images = source.sequence
    pages = len(images)
    for i in range(pages):
        # Each page is copied into its own Image; close it as soon as it has
        # been written so its resources are not held for the rest of the run.
        with Image(images[i]) as single_page:
            single_page.save(
                filename='Cropped Images/' + pdfclaimtoconvert + str(i) + '.tiff')
# Loop through all pages.
# NOTE(review): the cropping / OCR / export steps further down read test.tiff
# and use `pagenumber`, so they presumably belong inside this loop as well -
# confirm when restoring the file's indentation.
for page in range(pages):
    # Select page number to parse text from
    pagenumber = str(page)
    filetoworkon = 'Cropped Images/' + pdfclaimtoconvert + pagenumber + '.tiff'

    # cv2.imread returns None instead of raising when the file cannot be read.
    image = cv2.imread(filetoworkon)
    if image is None:
        raise IOError('Could not read page image: ' + filetoworkon)

    # Convert red pixels to white (this helps eliminate noise so tesseract
    # works properly). OpenCV stores channels as B, G, R, so index 2 is red
    # and index 0 is blue; the thresholds are the same as before
    # (r > 130 and b < 240), applied to the whole array at once instead of
    # one getpixel/putpixel call per pixel.
    red_mask = (image[:, :, 2] > 130) & (image[:, :, 0] < 240)
    image[red_mask] = 255

    # Working copy consumed by the cropping step.
    cv2.imwrite("test.tiff", image)
다음 섹션이 현재 저의 가장 큰 걸림돌입니다. 이상적으로는 영역을 좌표로 고정해서 지정하는 대신, 각 테두리를 하나씩 식별할 수 있는 프로세스를 만들고 싶습니다(예: 왼쪽 상단은 테두리 1, 그 옆에 있는 것은 테두리 2 등 - 이 문서에는 수백 개가 있을 것으로 예상합니다). 그런 다음 그 테두리를 따라 정확히 분할하는 스크립트를 실행하고, 필요에 따라 정보를 파싱하고 싶습니다.
# Read the cleaned page image and cut out every fixed form region.
img = cv2.imread('test.tiff')

# One entry per crop, in output order: (top, bottom, left, right) in pixels.
# Entry N is written to "Cropped Images/testN.tiff".
crop_regions = (
    (20, 170, 20, 530),       # 1  Address
    (200, 240, 20, 530),      # 2  Patient name
    (260, 310, 20, 205),      # 3  Birthdate
    (400, 570, 20, 860),      # 4  Payer address
    (600, 1340, 20, 120),     # 5  Treatment codes
    (600, 1340, 121, 620),    # 6  Treatment descriptions
    (600, 1340, 620, 910),    # 7  HCPCS
    (600, 1340, 910, 1059),   # 8  Service dates
    (600, 1340, 1059, 1220),  # 9  Service units
    (600, 1340, 1214, 1365),  # 10 Service charges
    (600, 1340, 1355, 1420),  # 11 Service charges decimals
    (1400, 1510, 20, 480),    # 12 Payer name
    (20, 75, 1070, 1580),     # 13 Patient control no
    (75, 105, 1070, 1400),    # 14 Med rec
    (130, 175, 1015, 1220),   # 15 Fed tax no
    (135, 175, 1220, 1510),   # 16 Statement from and to
    (1340, 1372, 900, 1050),  # 17 Creation date
    (1662, 1700, 630, 1160),  # 18 Document control no
    (1340, 1372, 130, 280),   # 19 Pages #1
    (1340, 1372, 280, 500),   # 20 Pages #2
)

for crop_number, (top, bottom, left, right) in enumerate(crop_regions, start=1):
    crop_img = img[top:bottom, left:right]
    cv2.imwrite("Cropped Images/test" + str(crop_number) + ".tiff", crop_img)
# Use Tesseract to read the text of each crop; results are stored in
# TextExtract under the key "test<N>".
# Digit-only crops are kept separate from mixed/string crops so that tesseract
# can be restricted to digits for them.
TextExtract = {}

# Crops 9, 10, 11, 19 and 20 (service units, service charges, charge decimals
# and the two page counters) are the ones that received the whitelist config
# before; the two list names were inverted relative to what they hold.
numericals = [9, 10, 11, 19, 20]
nonnumericals = [1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18]

# NOTE(review): '-psm' is the spelling the original used; newer tesseract
# releases may expect '--psm' - confirm against the installed version.
for i in nonnumericals:
    # 'YOURPATH' is a placeholder for the folder that contains 'Cropped Images'.
    with IMG.open('YOURPATH/Cropped Images/test' + str(i) + '.tiff') as img:
        TextExtract["test{0}".format(i)] = pytesseract.image_to_string(
            img, config='-psm 6')

for i in numericals:
    with IMG.open('YOURPATH/Cropped Images/test' + str(i) + '.tiff') as img:
        # The previous config ('-c tessedit_char_whitelist=-psm 6') had no
        # whitelist value, so "-psm" was taken as the whitelist and the page
        # segmentation mode was lost.
        # NOTE(review): the digit set is inferred from the "recognize them as
        # digits" intent - extend it if separators such as '.' or ',' are needed.
        TextExtract["test{0}".format(i)] = pytesseract.image_to_string(
            img, config='-psm 6 -c tessedit_char_whitelist=0123456789')
# Break every OCR result into its lines (split on newline) and drop the
# empty ones.
ParsedText = {}
for crop_number in range(1, 21):
    crop_key = 'test' + str(crop_number)
    ParsedText[crop_key] = [
        line for line in TextExtract[crop_key].split('\n') if line != ''
    ]

# Single-value fields are joined back into one space-separated string;
# every other field keeps its list of lines.
CollapsedLists = {}
collapsablelist = (1, 2, 3, 4, 12, 13, 14, 15, 16, 17, 18, 19, 20)
for crop_number in range(1, 21):
    crop_key = 'test' + str(crop_number)
    lines = ParsedText[crop_key]
    CollapsedLists[crop_key] = ' '.join(lines) if crop_number in collapsablelist else lines
# Build one DataFrame for the current page from CollapsedLists and append it
# to 'Claim Data Extractions.csv'.

# Crop numbers to export; each becomes a column named "test<N>".
extractionlist = tuple(range(1, 21))

# Create empty Pandas DataFrame with one column per crop
extractionframe = pd.DataFrame(columns=['test' + str(x) for x in extractionlist])

# Populate dataframe: a collapsed (string) value goes into row 1, a list of
# lines is spread over rows 1..len(list). CollapsedLists only ever holds a
# joined string or a list, so the plain assignment needs no blanket
# try/except that would hide real errors.
for x in extractionlist:
    column = 'test' + str(x)
    value = CollapsedLists[column]
    if isinstance(value, list):
        for row, item in enumerate(value, start=1):
            extractionframe.loc[row, column] = item
    else:
        extractionframe.loc[1, column] = value

# Populate source
extractionframe.loc[1, 'source'] = pdfclaimtoconvert + '_page_' + pagenumber

# Fill NaN values forward so the single-value fields repeat on every row.
# (.ffill() replaces the deprecated fillna(method='ffill').)
# NOTE(review): this also forward-fills a multi-row column that is shorter
# than the others - confirm that is intended.
extractionframe = extractionframe.ffill()

# Convert every cell to text, then wrap it as an Excel formula ="..." in order
# to preserve any leading zeroes. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
extractionframe = extractionframe.apply(lambda column: column.map(str))
extractionframe2 = '="' + extractionframe + '"'

# Append to the CSV export. newline='' stops the text layer from translating
# the line endings pandas writes, which otherwise yields blank rows on Windows.
with open('Claim Data Extractions.csv', 'a', newline='') as f:
    extractionframe2.to_csv(f, header=False, index=False)
매우 도움이 될 것 같습니다. 이 내용을 차근차근 살펴보고, 이를 바탕으로 해결책을 만들어 보겠습니다! –