"""Handwritten-text OCR pipeline.

Deskews a page image, segments it into horizontal text lines via contour
detection, and runs Microsoft's TrOCR model on each line.

Usage: python linetotext.py <image_path>
"""
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import numpy as np
from PIL import Image
import requests
import sys
import cv2

# Neighborhood size (pixels, must be odd) for the adaptive thresholding calls.
blocksize = 51
mname = 'microsoft/trocr-large-handwritten'
processor = TrOCRProcessor.from_pretrained(mname)
model = VisionEncoderDecoderModel.from_pretrained(mname)


def linetotext(img):
    """OCR a single line image.

    Args:
        img: BGR image (numpy array, as produced by OpenCV) containing one
            line of handwritten text.

    Returns:
        The recognized text string (also printed to stdout).
    """
    # Show the image in a window
    # cv2.imshow('Line of Text', image)
    # cv2.waitKey(0)
    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids,
                                            skip_special_tokens=True)[0]
    print(generated_text)
    # Return the text so callers can collect results (previously print-only;
    # backward compatible since existing callers ignore the return value).
    return generated_text


def rotimg(img, angle):
    """Return a copy of img rotated by `angle` degrees about its center.

    The output has the same width/height as the input; edge pixels are
    replicated to fill the uncovered corners.
    """
    img_center = tuple(np.array(img.shape[1::-1]) / 2)
    rot_mat = cv2.getRotationMatrix2D(img_center, angle, 1.0)
    rotated = cv2.warpAffine(img, rot_mat, img.shape[1::-1],
                             flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated


def deskew(img):
    """Estimate page skew from contour orientations and return the rotated image.

    Thresholds the image, takes the minAreaRect angle of every contour, and
    rotates the page by the median of those angles.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                   cv2.THRESH_BINARY_INV, blocksize, 10)
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_SIMPLE)
    angles = []
    for cnt in contours:
        rect = cv2.minAreaRect(cnt)
        # NOTE(review): cv2.minAreaRect reports angles within a 90-degree
        # range; the -45 offset (and the disabled +90 correction below)
        # suggests this heuristic was tuned empirically — verify on sample
        # pages before relying on it.
        angle = rect[2] - 45
        #if angle < -45:
        #    angle += 90
        angles.append(angle)
    median_angle = np.median(angles)
    return rotimg(img, median_angle)


def find_lines(image_path):
    """Load an image, deskew it, segment it into text lines, and OCR each line.

    Args:
        image_path: Filesystem path to the page image to process.

    Side effects:
        Prints each recognized line of text to stdout (via linetotext).
    """
    # Load the image
    image = deskew(cv2.imread(image_path))
    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, blocksize, 20)
    # Blend the binarized image back onto the grayscale image; this is the
    # contrast-boosted source the line crops are taken from.
    blend = cv2.addWeighted(gray, 0.8, bw, 0.2, 0)
    # Apply a Gaussian blur to the image
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    # Apply adaptive thresholding to the image
    thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                   cv2.THRESH_BINARY_INV, blocksize, 10)
    # Find contours in the image
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
    # Sort the contours from top to bottom
    contours = sorted(contours, key=lambda c: cv2.boundingRect(c)[1])
    # Keep track of the last y-coordinate of a contour
    last_y = -1
    # Iterate over the contours and call linetotext for each one
    for contour in contours:
        # Get the bounding box for the contour
        x, y, w, h = cv2.boundingRect(contour)
        if h < 8:
            # Too short to be a text line — treat as noise.
            continue
        # Pad the box by ~25% of its height above and below when there is
        # room to shift upward.
        if y > h / 4:
            y = y - int(h / 4)
            h = h + int(h / 2)
        # Reject overlapping contours
        if last_y >= 0 and y < last_y + h // 2:
            continue
        # Stretch the contour to be the full width of the image
        x = 0
        w = image.shape[1]
        # Extract the line of text from the image
        line_image = blend[y:y + h, x:x + w]
        # BUG FIX: rotimg returns a new array rather than rotating in place;
        # the original discarded the result, making the per-line deskew a
        # no-op. Assign it back so the rotation actually takes effect.
        line_image = rotimg(line_image, cv2.minAreaRect(contour)[2])
        # Call linetotext on the line of text
        linetotext(line_image)
        # Update last_y
        last_y = y


if __name__ == '__main__':
    # Get the image path from the command line arguments
    if len(sys.argv) < 2:
        # BUG FIX: the usage message was missing the argument placeholder.
        print('Usage: python linetotext.py <image_path>')
        sys.exit(1)
    image_path = sys.argv[1]
    find_lines(image_path)
    # Destroy all windows
    cv2.destroyAllWindows()