This post will help you to get output using fuzzy lookup in python.
Another way to write the same code which provide % of match which is additional information.
In this example I have used 2 datasets, input dataset and master dataset.
import pandas as pd import fuzzywuzzy as fuzz from fuzzywuzzy import fuzz from fuzzywuzzy import process import difflib dfinput = pd.read_excel("E:/006 Projects/005 Dataset/fuzzygroup.xlsx", sheet="input") dfinput.describe() dfinput.head() dfiCorrect = pd.read_excel("E:/006 Projects/005 Dataset/fuzzygroupcorrect.xlsx",
sheet="Correct") dfiCorrect.describe() dfiCorrect.head() dfiCorrect.columns dfinput.columns compare = pd.MultiIndex.from_product([dfinput['name'], dfiCorrect['TextMat']]).to_series() def metrics(tup): return pd.Series([fuzz.ratio(*tup),
fuzz.token_sort_ratio(*tup)],
['ratio', 'token']) result = compare.apply(metrics).unstack().idxmax().unstack(0) result.to_csv("E:/006 Projects/005 Dataset/output.csv")
using fuzzywuzzy python library we can generate the correct output with few lines of code.
Another way to write the same code which provide % of match which is additional information.
# second option wrong_names=dfinput['name'].dropna().values #Correct country names dataset correct_names=dfiCorrect['TextMat'].values names_array=[] ratio_array=[] def match_names(wrong_names,correct_names): for row in wrong_names: x=process.extractOne(row, correct_names) names_array.append(x[0]) ratio_array.append(x[1]) return names_array,ratio_array name_match,ratio_match=match_names(wrong_names,correct_names) dfinput['correct_country_name']=pd.Series(name_match) dfinput['country_names_ratio']=pd.Series(ratio_match) dfinput.columns dfinput.to_csv("E:/006 Projects/005 Dataset/string_matched_country_names.csv") result.to_csv("E:/006 Projects/005 Dataset/output.csv") print(dfinput[['name','correct_country_name','country_names_ratio']].head(10))
Refer this screenshot
Comments
Post a Comment