python - Pythonic Name Matching -
I have database names of football teams (for instance, in the first entry below, Marshall and Southern Methodist). Then I have names matched to those database names that are different, yet recognizable (in the first entry below, SMU and Marshall):
[u'houston', u'alabama'] [u'houst', u'alab'] [u'florida state', u'north carolina state'] [u'ncst', u'flast'] [u'penn state', u'iowa'] [u'pnst', u'iowa'] [u'oklahoma', u'texas'] [u'texas', u'okla'] [u'florida atlantic', u'south florida'] [u'sfla', u'flatl'] [u'georgia', u'tennessee'] [u'geo', u'tenn'] [u'san jose state', u'idaho'] [u'ui', u'sjsu'] [u'washington state', u'arizona state'] [u'arzst', u'wshst'] [u'fresno state', u'nevada'] [u'nevad', u'frsst'] [u'oregon state', u'arizona'] [u'ariz', u'osu'] [u'clemson', u'virginia tech'] [u'vtech', u'clem'] [u'chattanooga', u'arkansas'] [u'utc', u'ar'] [u'usc', u'stanford'] [u'usc', u'stanf'] [u'baylor', u'colorado'] [u'bu', u'cu'] [u'north texas', u'louisiana-lafayette'] [u'notex', u'lalaf'] [u'tulane', u'army'] [u'tln', u'army'] [u'troy', u'florida international'] [u'troy', u'fiu'] [u'louisiana-monroe', u'arkansas state'] [u'asu', u'ulm'] [u'texas tech', u'iowa state'] [u'tt', u'isu'] [u'akron', u'western michigan'] [u'akron', u'wmu'] [u'liberty', u'toledo'] [u'liberty', u'toledo'] [u'virginia', u'middle tennessee'] [u'virg', u'mtnst'] [u'oklahoma state', u'texas a&m'] [u'texam', u'okst'] [u'notre dame', u'ucla'] [u'ndame', u'ucla'] [u'rutgers', u'cincinnati'] [u'cincy', u'rutgr'] [u'ohio state', u'purdue'] [u'prdue', u'ohst'] [u'lsu', u'florida'] [u'fla', u'lsu'] [u'air force', u'unlv'] [u'afa', u'unlv'] [u'nebraska', u'missouri'] [u'misso', u'neb'] [u'new mexico state', u'boise state'] [u'nmxst', u'boist'] [u'pittsburgh', u'navy'] [u'navy', u'pitt'] [u'wake forest', u'florida state'] [u'wfrst', u'flast'] [u'san jose state', u'hawaii'] [u'hawa', u'sjst'] [u'ucf', u'south florida'] [u'ucf', u'sfla']
For each group of 4 names, I need to match each database name to the correct new name. Right now I am using a lot of if statements, but that would take a lot of code and wouldn't be particularly elegant. Is there a better way to do the matching here?
from difflib import SequenceMatcher

# Each entry holds two database team names and the two short labels that refer
# to the same two teams, in unknown order.
# NOTE(review): every string below is lower-case, yet the ratios quoted further
# down this page (e.g. 0.142857 for liberty/liberty) can only arise when names
# and labels differ in letter case -- confirm the casing against the real data.
li = [
    ([u'houston', u'alabama'], [u'houst', u'alab']),
    ([u'florida state', u'north carolina state'], [u'ncst', u'flast']),
    ([u'penn state', u'iowa'], [u'pnst', u'iowa']),
    ([u'oklahoma', u'texas'], [u'texas', u'okla']),
    ([u'florida atlantic', u'south florida'], [u'sfla', u'flatl']),
    ([u'georgia', u'tennessee'], [u'geo', u'tenn']),
    ([u'san jose state', u'idaho'], [u'ui', u'sjsu']),
    ([u'washington state', u'arizona state'], [u'arzst', u'wshst']),
    ([u'fresno state', u'nevada'], [u'nevad', u'frsst']),
    ([u'oregon state', u'arizona'], [u'ariz', u'osu']),
    ([u'clemson', u'virginia tech'], [u'vtech', u'clem']),
    ([u'chattanooga', u'arkansas'], [u'utc', u'ar']),
    ([u'usc', u'stanford'], [u'usc', u'stanf']),
    ([u'baylor', u'colorado'], [u'bu', u'cu']),
    ([u'north texas', u'louisiana-lafayette'], [u'notex', u'lalaf']),
    ([u'tulane', u'army'], [u'tln', u'army']),
    ([u'troy', u'florida international'], [u'troy', u'fiu']),
    ([u'louisiana-monroe', u'arkansas state'], [u'asu', u'ulm']),
    ([u'texas tech', u'iowa state'], [u'tt', u'isu']),
    ([u'akron', u'western michigan'], [u'akron', u'wmu']),
    ([u'liberty', u'toledo'], [u'liberty', u'toledo']),
    ([u'virginia', u'middle tennessee'], [u'virg', u'mtnst']),
    ([u'oklahoma state', u'texas a&m'], [u'texam', u'okst']),
    ([u'notre dame', u'ucla'], [u'ndame', u'ucla']),
    ([u'rutgers', u'cincinnati'], [u'cincy', u'rutgr']),
    ([u'ohio state', u'purdue'], [u'prdue', u'ohst']),
    ([u'lsu', u'florida'], [u'fla', u'lsu']),
    ([u'air force', u'unlv'], [u'afa', u'unlv']),
    ([u'nebraska', u'missouri'], [u'misso', u'neb']),
    ([u'new mexico state', u'boise state'], [u'nmxst', u'boist']),
    ([u'pittsburgh', u'navy'], [u'navy', u'pitt']),
    ([u'wake forest', u'florida state'], [u'wfrst', u'flast']),
    ([u'san jose state', u'hawaii'], [u'hawa', u'sjst']),
    ([u'ucf', u'south florida'], [u'ucf', u'sfla']),
]


def comp(n, d, sq=None):
    """Pair each of two database names with the label that resembles it most.

    Arguments:
        n:  sequence of two database names.
        d:  sequence of the two labels for the same teams, in either order.
        sq: optional ``difflib.SequenceMatcher`` to reuse; a fresh one is
            created when omitted.

    Returns:
        Two ``(name, label)`` tuples.  The straight pairing
        ``(n[0], d[0]), (n[1], d[1])`` is returned only when BOTH straight
        ratios strictly beat their crossed counterparts; otherwise the crossed
        pairing ``(n[0], d[1]), (n[1], d[0])`` is returned.
    """
    if sq is None:
        sq = SequenceMatcher(None)

    # Straight pairing: n[0]<->d[0] and n[1]<->d[1].
    sq.set_seqs(n[0], d[0])
    a = sq.ratio()
    sq.set_seqs(n[1], d[1])
    b = sq.ratio()

    # Crossed pairing: n[0]<->d[1] and n[1]<->d[0].
    sq.set_seqs(n[0], d[1])
    x = sq.ratio()
    sq.set_seqs(n[1], d[0])
    y = sq.ratio()

    if a > x and b > y:
        return (n[0], d[0]), (n[1], d[1])
    else:
        return (n[0], d[1]), (n[1], d[0])


# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('\n'.join('%-30s %s' % comp(n, d) for n, d in li))
result
(u'houston', u'houst') (u'alabama', u'alab') (u'florida state', u'flast') (u'north carolina state', u'ncst') (u'penn state', u'pnst') (u'iowa', u'iowa') (u'oklahoma', u'okla') (u'texas', u'texas') (u'florida atlantic', u'flatl') (u'south florida', u'sfla') (u'georgia', u'geo') (u'tennessee', u'tenn') (u'san jose state', u'sjsu') (u'idaho', u'ui') (u'washington state', u'wshst') (u'arizona state', u'arzst') (u'fresno state', u'frsst') (u'nevada', u'nevad') (u'oregon state', u'osu') (u'arizona', u'ariz') (u'clemson', u'clem') (u'virginia tech', u'vtech') (u'chattanooga', u'utc') (u'arkansas', u'ar') (u'usc', u'usc') (u'stanford', u'stanf') (u'baylor', u'bu') (u'colorado', u'cu') (u'north texas', u'notex') (u'louisiana-lafayette', u'lalaf') (u'tulane', u'tln') (u'army', u'army') (u'troy', u'troy') (u'florida international', u'fiu') (u'louisiana-monroe', u'ulm') (u'arkansas state', u'asu') (u'texas tech', u'tt') (u'iowa state', u'isu') (u'akron', u'akron') (u'western michigan', u'wmu') (u'liberty', u'toledo') (u'toledo', u'liberty') (u'virginia', u'virg') (u'middle tennessee', u'mtnst') (u'oklahoma state', u'okst') (u'texas a&m', u'texam') (u'notre dame', u'ndame') (u'ucla', u'ucla') (u'rutgers', u'rutgr') (u'cincinnati', u'cincy') (u'ohio state', u'ohst') (u'purdue', u'prdue') (u'lsu', u'lsu') (u'florida', u'fla') (u'air force', u'afa') (u'unlv', u'unlv') (u'nebraska', u'neb') (u'missouri', u'misso') (u'new mexico state', u'nmxst') (u'boise state', u'boist') (u'pittsburgh', u'pitt') (u'navy', u'navy') (u'wake forest', u'wfrst') (u'florida state', u'flast') (u'san jose state', u'sjst') (u'hawaii', u'hawa') (u'ucf', u'ucf') (u'south florida', u'sfla')
.
edit
from difflib import SequenceMatcher

# The three problem cases examined in detail.  Names are Title-Case and labels
# UPPER-CASE: that casing is what makes the raw and the lower-cased ratios
# differ, which is the whole point of this diagnostic.
li = [
    ([u'Liberty', u'Toledo'],
     [u'LIBERTY', u'TOLEDO']),
    ([u'Chattanooga', u'Arkansas'],
     [u'UTC', u'AR']),
    ([u'Texas Tech', u'Iowa State'],
     [u'TT', u'ISU']),
]

# One report per entry: straight pairing (raw, then lower-cased) followed by
# crossed pairing (raw, then lower-cased).  Each pairing line is followed by
# its two ratios, their product ("x") and their sum ("+").
REPORT = (
    ('=====' * 14) +
    '\n'
    '%-25s %s\n'
    ' %-10f %f --> x%f +%f\n'
    '%-25s %s\n'
    ' %-10f %f --> x%f +%f\n\n'
    '%-25s %s\n'
    ' %-10f %f --> x%f +%f\n'
    '%-25s %s\n'
    ' %-10f %f --> x%f +%f\n'
)


def _ratio(sq, first, second):
    """Return the SequenceMatcher similarity ratio of two strings."""
    sq.set_seqs(first, second)
    return sq.ratio()


def comp(n, d, sq=None):
    """Compute every similarity figure needed to diagnose one entry.

    Arguments:
        n:  sequence of two database names.
        d:  sequence of the two labels for the same teams.
        sq: optional ``difflib.SequenceMatcher`` to reuse; a fresh one is
            created when omitted.

    Returns:
        A flat 24-item tuple made of four groups of six items, each group
        being ``(pair_1, pair_2, ratio_1, ratio_2, product, sum)``:
        straight raw, straight lower-cased, crossed raw, crossed lower-cased.
        The layout matches the 24 placeholders of ``REPORT``.
    """
    if sq is None:
        sq = SequenceMatcher(None)

    n0, n1 = n[0], n[1]
    d0, d1 = d[0], d[1]
    n0l, n1l = n0.lower(), n1.lower()
    d0l, d1l = d0.lower(), d1.lower()

    # Straight pairing, raw and lower-cased.
    a = _ratio(sq, n0, d0)
    b = _ratio(sq, n1, d1)
    al = _ratio(sq, n0l, d0l)
    bl = _ratio(sq, n1l, d1l)

    # Crossed pairing, raw and lower-cased.
    x = _ratio(sq, n0, d1)
    y = _ratio(sq, n1, d0)
    xl = _ratio(sq, n0l, d1l)
    yl = _ratio(sq, n1l, d0l)

    return ((n0, d0), (n1, d1),
            a, b, a * b, a + b,
            (n0l, d0l), (n1l, d1l),
            al, bl, al * bl, al + bl,
            (n0, d1), (n1, d0),
            x, y, x * y, x + y,
            (n0l, d1l), (n1l, d0l),
            xl, yl, xl * yl, xl + yl)


# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('\n'.join(REPORT % comp(n, d) for n, d in li))
result
====================================================================== (u'liberty', u'liberty') (u'toledo', u'toledo') 0.142857 0.166667 --> x0.023810 +0.309524 (u'liberty', u'liberty') (u'toledo', u'toledo') 1.000000 1.000000 --> x1.000000 +2.000000 (u'liberty', u'toledo') (u'toledo', u'liberty') 0.153846 0.153846 --> x0.023669 +0.307692 (u'liberty', u'toledo') (u'toledo', u'liberty') 0.307692 0.153846 --> x0.047337 +0.461538 ====================================================================== (u'chattanooga', u'utc') (u'arkansas', u'ar') 0.142857 0.200000 --> x0.028571 +0.342857 (u'chattanooga', u'utc') (u'arkansas', u'ar') 0.142857 0.400000 --> x0.057143 +0.542857 (u'chattanooga', u'ar') (u'arkansas', u'utc') 0.000000 0.000000 --> x0.000000 +0.000000 (u'chattanooga', u'ar') (u'arkansas', u'utc') 0.153846 0.000000 --> x0.000000 +0.153846 ====================================================================== (u'texas tech', u'tt') (u'iowa state', u'isu') 0.333333 0.307692 --> x0.102564 +0.641026 (u'texas tech', u'tt') (u'iowa state', u'isu') 0.333333 0.307692 --> x0.102564 +0.641026 (u'texas tech', u'isu') (u'iowa state', u'tt') 0.000000 0.000000 --> x0.000000 +0.000000 (u'texas tech', u'isu') (u'iowa state', u'tt') 0.153846 0.333333 --> x0.051282 +0.487179
The above result of this code shows:
1/
The ratio a = 0.142857 of the association (u'liberty', u'liberty') is inferior to the ratio x = 0.153846 of the association (u'liberty', u'toledo')!
That is enough to make the condition a>x and b>y evaluate to False, which leads to the association (u'liberty', u'toledo') being returned as part of the result although it is an undesired association, and even though, besides, the ratios of the associations (u'toledo', u'toledo') and (u'toledo', u'liberty') correctly indicate that the first one, (u'toledo', u'toledo'), is the desired one.
When the lower() method is applied to the strings, it evidently solves this flaw, since the associations (u'liberty', u'liberty') and (u'toledo', u'toledo') then have ratios of 1.000000 (the name and its label differ only in letter case).
2/
However, the intervention of lower() provokes flaws in 2 other cases that were formerly correct.
Without lower(), the incorrect associations (u'chattanooga', u'ar') and (u'arkansas', u'utc') had ratios of 0.000000, and the winning associations (u'chattanooga', u'utc') (u'arkansas', u'ar') were the correct result.
With lower(), the lowered correct association (u'chattanooga', u'utc') has the same ratio 0.142857 as the unlowered version, but it is now compared to the lowered incorrect association (u'chattanooga', u'ar'), valued 0.153846.
It happens that the correct (u'chattanooga', u'utc') is then inferior to the incorrect (u'chattanooga', u'ar'), so the condition is evaluated False and the associations (u'chattanooga', u'ar') (u'arkansas', u'utc') are returned although they are incorrect.
.
It is the same for the incorrect associations (u'texas tech', u'isu') (u'iowa state', u'tt'): unlowered, they have ratios of 0.000000, inferior to the ratios 0.333333 and 0.307692 of the correct associations (u'texas tech', u'tt') (u'iowa state', u'isu').
When lowered, the ratio of (u'iowa state', u'tt') increases from 0.000000 to 0.333333, while the other association (u'iowa state', u'isu') keeps the same, now inferior, ratio 0.307692. The condition is again evaluated False.
3/
It is clear that the new flaws are due to the fact that the labels u'ar' and u'tt' are short. 1 or 2 lowered letters match the long lowered names 'chattanooga' and u'texas tech', while there is no match between the unlowered versions of these strings, and so the situation is overturned.
It is clear that the problems emerge because the boolean expression a>x and b>y gives a lot of weight to each of the 2 expressions a>x and b>y separately.
I consider that a condition must be found that combines the result of a>x and the result of b>y. Multiplying them doesn't give me the impression that it's the way.
In the following code, I chose to add the ratios and to perform more than 1 evaluation of the condition.
from difflib import SequenceMatcher

# Each entry holds two database team names and the two short labels that refer
# to the same two teams, in unknown order.
# NOTE(review): every string below is lower-case, yet comp() compares raw and
# lower-cased ratios, which only differ for mixed-case input -- confirm the
# casing against the real data.
li = [
    ([u'houston', u'alabama'], [u'houst', u'alab']),
    ([u'florida state', u'north carolina state'], [u'ncst', u'flast']),
    ([u'penn state', u'iowa'], [u'pnst', u'iowa']),
    ([u'oklahoma', u'texas'], [u'texas', u'okla']),
    ([u'florida atlantic', u'south florida'], [u'sfla', u'flatl']),
    ([u'georgia', u'tennessee'], [u'geo', u'tenn']),
    ([u'san jose state', u'idaho'], [u'ui', u'sjsu']),
    ([u'washington state', u'arizona state'], [u'arzst', u'wshst']),
    ([u'fresno state', u'nevada'], [u'nevad', u'frsst']),
    ([u'oregon state', u'arizona'], [u'ariz', u'osu']),
    ([u'clemson', u'virginia tech'], [u'vtech', u'clem']),
    ([u'chattanooga', u'arkansas'], [u'utc', u'ar']),
    ([u'usc', u'stanford'], [u'usc', u'stanf']),
    ([u'baylor', u'colorado'], [u'bu', u'cu']),
    ([u'north texas', u'louisiana-lafayette'], [u'notex', u'lalaf']),
    ([u'tulane', u'army'], [u'tln', u'army']),
    ([u'troy', u'florida international'], [u'troy', u'fiu']),
    ([u'louisiana-monroe', u'arkansas state'], [u'asu', u'ulm']),
    ([u'texas tech', u'iowa state'], [u'tt', u'isu']),
    ([u'akron', u'western michigan'], [u'akron', u'wmu']),
    ([u'liberty', u'toledo'], [u'liberty', u'toledo']),
    ([u'virginia', u'middle tennessee'], [u'virg', u'mtnst']),
    ([u'oklahoma state', u'texas a&m'], [u'texam', u'okst']),
    ([u'notre dame', u'ucla'], [u'ndame', u'ucla']),
    ([u'rutgers', u'cincinnati'], [u'cincy', u'rutgr']),
    ([u'ohio state', u'purdue'], [u'prdue', u'ohst']),
    ([u'lsu', u'florida'], [u'fla', u'lsu']),
    ([u'air force', u'unlv'], [u'afa', u'unlv']),
    ([u'nebraska', u'missouri'], [u'misso', u'neb']),
    ([u'new mexico state', u'boise state'], [u'nmxst', u'boist']),
    ([u'pittsburgh', u'navy'], [u'navy', u'pitt']),
    ([u'wake forest', u'florida state'], [u'wfrst', u'flast']),
    ([u'san jose state', u'hawaii'], [u'hawa', u'sjst']),
    ([u'ucf', u'south florida'], [u'ucf', u'sfla']),
]


def _ratio(sq, first, second):
    """Return the SequenceMatcher similarity ratio of two strings."""
    sq.set_seqs(first, second)
    return sq.ratio()


def comp(n, d, sq=None):
    """Pair each of two database names with the label that resembles it most.

    The decision is based on the SUM of the two ratios of each candidate
    pairing rather than on each ratio separately, and is evaluated in several
    passes of decreasing strictness.

    Arguments:
        n:  sequence of two database names.
        d:  sequence of the two labels for the same teams, in either order.
        sq: optional ``difflib.SequenceMatcher`` to reuse; a fresh one is
            created when omitted.

    Returns:
        Two ``(name, label)`` tuples: either the straight pairing
        ``(n[0], d[0]), (n[1], d[1])`` or the crossed pairing
        ``(n[0], d[1]), (n[1], d[0])``.  When the evidence is perfectly
        balanced the straight pairing is returned.
    """
    if sq is None:
        sq = SequenceMatcher(None)

    straight = (n[0], d[0]), (n[1], d[1])
    crossed = (n[0], d[1]), (n[1], d[0])

    n0l, n1l = n[0].lower(), n[1].lower()
    d0l, d1l = d[0].lower(), d[1].lower()

    # Raw ratios: a, b for the straight pairing; x, y for the crossed one.
    a = _ratio(sq, n[0], d[0])
    b = _ratio(sq, n[1], d[1])
    x = _ratio(sq, n[0], d[1])
    y = _ratio(sq, n[1], d[0])

    # Same four ratios on lower-cased strings.
    al = _ratio(sq, n0l, d0l)
    bl = _ratio(sq, n1l, d1l)
    xl = _ratio(sq, n0l, d1l)
    yl = _ratio(sq, n1l, d0l)

    # Pass 1: one pairing is convincing on its own, raw or lower-cased.
    if ((a > 0.5 and b > 0.5 and a + b > 1.4)
            or (al > 0.5 and bl > 0.5 and al + bl > 1.4)):
        return straight
    elif ((x > 0.4 and y > 0.4 and x + y > 1.4)
            or (xl > 0.4 and yl > 0.4 and xl + yl > 1.4)):
        return crossed
    # Pass 2: one pairing has no raw match at all while the other has some.
    elif x + y == 0.0 and a + b > 0.1:
        return straight
    elif a + b == 0.00 and x + y > 0.1:
        return crossed
    # Pass 3: one raw sum clearly dominates the other.
    elif a + b > x + y + 0.5:
        return straight
    elif x + y > a + b + 0.5:
        return crossed
    # Pass 4: any strict difference between the raw sums.
    elif a + b > x + y:
        return straight
    elif x + y > a + b:
        return crossed
    # Exact tie on the raw sums: without a final branch the function would
    # return None and the caller's '%' formatting would raise TypeError.
    # Try the lower-cased sums, then keep the given order.
    elif xl + yl > al + bl:
        return crossed
    else:
        return straight


# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('\n'.join('%-30s %s' % comp(n, d) for n, d in li))
result
(u'houston', u'houst') (u'alabama', u'alab') (u'florida state', u'flast') (u'north carolina state', u'ncst') (u'penn state', u'pnst') (u'iowa', u'iowa') (u'oklahoma', u'okla') (u'texas', u'texas') (u'florida atlantic', u'flatl') (u'south florida', u'sfla') (u'georgia', u'geo') (u'tennessee', u'tenn') (u'san jose state', u'sjsu') (u'idaho', u'ui') (u'washington state', u'wshst') (u'arizona state', u'arzst') (u'fresno state', u'frsst') (u'nevada', u'nevad') (u'oregon state', u'osu') (u'arizona', u'ariz') (u'clemson', u'clem') (u'virginia tech', u'vtech') (u'chattanooga', u'utc') (u'arkansas', u'ar') (u'usc', u'usc') (u'stanford', u'stanf') (u'baylor', u'bu') (u'colorado', u'cu') (u'north texas', u'notex') (u'louisiana-lafayette', u'lalaf') (u'tulane', u'tln') (u'army', u'army') (u'troy', u'troy') (u'florida international', u'fiu') (u'louisiana-monroe', u'ulm') (u'arkansas state', u'asu') (u'texas tech', u'tt') (u'iowa state', u'isu') (u'akron', u'akron') (u'western michigan', u'wmu') (u'liberty', u'liberty') (u'toledo', u'toledo') (u'virginia', u'virg') (u'middle tennessee', u'mtnst') (u'oklahoma state', u'okst') (u'texas a&m', u'texam') (u'notre dame', u'ndame') (u'ucla', u'ucla') (u'rutgers', u'rutgr') (u'cincinnati', u'cincy') (u'ohio state', u'ohst') (u'purdue', u'prdue') (u'lsu', u'lsu') (u'florida', u'fla') (u'air force', u'afa') (u'unlv', u'unlv') (u'nebraska', u'neb') (u'missouri', u'misso') (u'new mexico state', u'nmxst') (u'boise state', u'boist') (u'pittsburgh', u'pitt') (u'navy', u'navy') (u'wake forest', u'wfrst') (u'florida state', u'flast') (u'san jose state', u'sjst') (u'hawaii', u'hawa') (u'ucf', u'ucf') (u'south florida', u'sfla')
All results seem correct.
Comments
Post a Comment