"""
OpenCV for matching historical documents test code

Author: Alexander Persaud
        Department of Economics
        University of Richmond
        apersaud@richmond.edu

What the script does:
    Every '*.png' file in ``templateP`` is treated as a template and every
    '*.jpg' file in ``searchP`` as a comparison (search) image. SIFT
    descriptors of each template are matched against each (cropped)
    comparison image with a FLANN k-nearest-neighbour search, and the share
    of template descriptors that pass the ratio test is written to
    'matches.csv' in ``writeP`` (one row per comparison image, one column
    per template).

Suggestions for users of this file:
1. Identify a template image or multiple images. It is not necessary, though
   it's easier, to keep all images in the same directory.
2. Put all image files to search in one folder (directory). It is not
   necessary, though it's easier, to keep all images in the same directory.
3. Assign directory parameters for both types of image
4. Change the loop structure as needed to reflect image types.

Test comparison data available from the National Library of Australia
https://nla.gov.au/nla.obj-2787566579/view?partId=nla.obj-2787578098#page/n0/mode/1up
Ship: "Hereford" from 1888

Suggested additional readings:
https://docs.opencv.org/master/dc/dc3/tutorial_py_matcher.html
https://opencv24-python-tutorials.readthedocs.io/en/latest/py_tutorials/py_feature2d/py_matcher/py_matcher.html
"""

from datetime import datetime
startTime = datetime.now()  # Start timing the duration of the script (imports below included)

import numpy as np                  # Numpy / statistical
import cv2 as cv                    # Computer vision (CV) library
import matplotlib.pyplot as plt     # Plotting library
#import os                          # Required to change the working directory; not recommended.
import pathlib                      # Better than working directory changes
import glob                         # Used for looping over images IF pathlib is not used
import csv

#to do
# figure out what each command means
# create a value for matches (or set of values) to quantify the quality of the match
# create some kind of output file to hold match quality
#end to do

# Set directory/ies depending on the user's file locations:
templateP = pathlib.Path('')
searchP = pathlib.Path('')
writeP = pathlib.Path('')

# Set parameters for the entire program
index_params = dict(algorithm=1, trees=5)   # FLANN parameters: k-d tree algorithm (=1) with 5 trees
search_params = dict(checks=50)             # FLANN parameter: number of times to search
plot_match_img_yn = 0   # Set 1 to plot match points or 0 otherwise
verbose_yn = 0          # Set to 1 to print more intermediate output or 0 otherwise
                        # Note that the verbose here does NOT automatically plot (see above)
                        # More intermediate output may slow overall processing
match_dist = 0.8        # Lower indicates a stricter match criterion.
                        # Lowe (2004) suggests ~0.8. The CDF of true matches drops off between 0.7 and 0.8.
                        # Additionally, the CDF of false matches grows rapidly after 0.8.
height_crop = 1.55      # Value to crop sample images from top (1 = no crop and is the min)
width_crop = 1          # Value to crop sample images from left (1 = no crop and is the min)


def _passes_ratio_test(pair, ratio):
    """Return True when the best neighbour in *pair* clearly beats the runner-up.

    *pair* is one entry of a ``knnMatch(..., k=2)`` result. The search can
    return fewer than two neighbours for a query descriptor; such entries
    cannot be ratio-tested and are counted as non-matches.
    """
    if len(pair) < 2:
        return False
    best, runner_up = pair[0], pair[1]
    return best.distance < ratio * runner_up.distance


# Implement
# Define sift for Scale-Invariant Feature Transform (SIFT)
sift = cv.SIFT_create()

# Define flann for Fast Library for Approximate Nearest Neighbors (FLANN).
# The matcher only depends on the fixed parameters above, so it is built once
# instead of once per template/comparison pair.
flann = cv.FlannBasedMatcher(index_params, search_params)

# Load images in grayscale to reduce comparison issues
template_img = []                                   # Initialize the template image list.
template_files = sorted(templateP.glob('*.png'))    # Create list of template files
header_csv = ['Image file']                         # Start the header for the .csv output
template_kp = []                                    # Key points, one entry per usable template
template_des = []                                   # Descriptors, one entry per usable template
for template_file in template_files:                # Loop over template files
    if verbose_yn == 1:
        print(template_file)
    img1 = cv.imread(str(template_file),            # Import the template file
                     cv.IMREAD_GRAYSCALE)           # Load it in grayscale for ease.
    if img1 is None:
        # imread signals an unreadable file by returning None instead of raising.
        print("Skipping unreadable template: ", template_file)
        continue
    kp1, des1 = sift.detectAndCompute(img1, None)   # Key points and descriptors of the template
    if des1 is None or len(des1) == 0:
        # Without descriptors there is nothing to match (and nothing to divide by later).
        print("Skipping template without SIFT features: ", template_file)
        continue
    # Only now register the template, so the header and the three lists stay aligned.
    template_img.append(img1)
    header_csv.append(template_file.stem)           # Append file name to the header
    template_kp.append(kp1)
    template_des.append(des1)

if verbose_yn == 1:
    print(header_csv)

# The file stays open for the whole run: the header is written first and one
# row is appended per comparison image.
with open(writeP / 'matches.csv', 'w', newline='') as csvfile:
    match_writer = csv.writer(csvfile)
    match_writer.writerow(header_csv)               # Export the header to the .csv file

    # Sorted (like the templates) so the row order is reproducible between runs.
    comparison_files = sorted(searchP.glob('*.jpg'))
    for comparison_file in comparison_files:        # Loop over comparison files
        if verbose_yn == 1:
            print(comparison_file)
        img2_raw = cv.imread(str(comparison_file),
                             cv.IMREAD_GRAYSCALE)   # Comparison (search) image in grayscale
        if img2_raw is None:
            print("Skipping unreadable comparison image: ", comparison_file)
            continue
        # Keep the top-left part of the image per the crop parameters above.
        img2 = img2_raw[0:int(img2_raw.shape[0] / height_crop),
                        0:int(img2_raw.shape[1] / width_crop)]

        match_line = [comparison_file.stem]         # Row starts with the file name
        kp2, des2 = sift.detectAndCompute(img2, None)   # No mask to pass so None is specified.
        # A k=2 neighbour search needs at least two descriptors to search in.
        searchable = des2 is not None and len(des2) >= 2

        # Loop over the templates (image, key points, descriptors) saved previously.
        for img1, kp1, des1 in zip(template_img, template_kp, template_des):
            if not searchable:
                match_line.append(0.0)              # Nothing in this image can match
                continue

            matches = flann.knnMatch(des1,          # template descriptors from SIFT
                                     des2,          # comparison descriptors from SIFT
                                     k=2)           # number of best matches set to 2
            match_calib = len(matches)              # Total number of candidate matches

            # Ratio test: one flag per candidate, True where it counts as a match.
            is_match = [_passes_ratio_test(pair, match_dist) for pair in matches]
            match_num = sum(is_match)               # Number of matches found

            if verbose_yn == 1:
                print("Actual/Total possible matches: ", match_num, "/", match_calib)

            if plot_match_img_yn == 1:              # Parameter set at top to plot or not
                # Mask telling drawMatchesKnn which of the two neighbours to draw:
                # only the best neighbour of candidates that passed the ratio test.
                matchesMask = [[1, 0] if ok else [0, 0] for ok in is_match]
                draw_params = dict(matchColor=(60, 0, 250),           # Colour of connector lines
                                   singlePointColor=(155, 110, 30),   # Colour of unmatched points
                                   matchesMask=matchesMask,
                                   flags=cv.DrawMatchesFlags_DEFAULT)
                img3 = cv.drawMatchesKnn(img1, kp1,     # Information from template
                                         img2, kp2,     # Information from search image
                                         matches,       # Match array created above
                                         None,          # No pre-allocated output image
                                         **draw_params)
                plt.imshow(img3)
                plt.show()

            # Share of candidates that matched, used as the match quality measure.
            match_line.append(match_num / match_calib if match_calib else 0.0)

        # Export data down here
        match_writer.writerow(match_line)

print("Total time: ", datetime.now() - startTime)