diff --git a/build/lib/cleanco.py b/build/lib/cleanco.py index 3970d8d..97a3df4 100644 --- a/build/lib/cleanco.py +++ b/build/lib/cleanco.py @@ -1,770 +1,256 @@ # Note that this script is geard towards identifying businesses in terms of the US/UK from collections import OrderedDict - class cleanco(): - def __init__(self, corpname): + def __init__(self, business_name): - # Throw this in there so that I know it's there - self.corpname = corpname - - ## Business Type ## + self.business_name = business_name + + # Business Types + type_dict = {} + type_dict['Professional Limited Liability Company'] = ["pllc", "p.l.l.c."] + type_dict['Limited Liability Limited Partnership'] = ["lllp", "l.l.l.p."] + type_dict['Limited Partnership'] = ["gmbh & co. kg", "gmbh & co. kg", "lp", "l.p.", "s.c.s.", "s.c.p.a", "comm.v", "k.d.", "k.d.a.", "s. en c.", "e.e.", "s.a.s.", "s. en c.", "c.v.", "s.k.a.", "sp.k.", "s.cra.", "ky", "scs", "kg", "kd", "k/s", "ee", "secs", "kda", "ks", "kb", "kt"] + type_dict['Corporation'] = ["company", "incorporated", "corporation", "corp", "inc", "& co.", "& co", "inc.", "s.p.a.", "n.v.", "a.g.", "ag", "nuf", "s.a.", "s.f.", "oao", "co."] + type_dict['General Partnership'] = ["soc.col.", "stg", "d.n.o.", "ltda.", "v.o.s.", "kgaa", "o.e.", "s.f.", "s.n.c.", "s.a.p.a.", "j.t.d.", "v.o.f.", "sp.j.", "og", "sd", "vos", " i/s", "ay", "snc", "oe", "bt.", "s.s.", "mb", "ans", "da", "o.d.", "hb", "pt"] + type_dict['Limited Liability Company'] = ["pllc", "llc", "l.l.c.", "plc.", "plc", "hf.", "oyj", "a.e.", "nyrt.", "p.l.c.", "sh.a.", "s.a.", "s.r.l.", "srl.", "aat", "3at", "d.d.", "akc. spol.", "a.s.", "s.r.o.", "s.m.b.a.", "smba", "sarl", "nv", "sa", "aps", "a/s", "p/s", "sae", "sasu", "eurl", "ae", "cpt", "as", "ab", "asa", "ooo", "dat", "vat", "zat", "mchj", "a.d."] + type_dict['Limited Liability Partnership'] = ["llp", "l.l.p.", "sp.p.", "s.c.a.", "s.c.s."] + type_dict['Limited'] = ["pty. ltd.", "pty ltd", "ltd", "l.t.d.", "bvba", "d.o.o.", "ltda", "gmbh", "g.m.b.h", "kft.", "kht.", "zrt.", "ehf.", "s.a.r.l.", "d.o.o.e.l.", "s. de r.l.", "b.v.", "tapui", "sp. z.o.o.", "s.r.l.", "s.l.", "s.l.n.e.", "ood", "oy", "rt.", "teo", "uab", "scs", "sprl", "limited", "bhd.", "sdn. bhd.", "sdn bhd", "as", "lda.", "tov", "pp"] + type_dict['Professional Corporation'] = ["p.c.", "vof", "snc"] + type_dict['No Liability'] = ["nl"] + type_dict['Sole Proprietorship'] = ["e.u.", "s.p.", "t:mi", "e.v.", "e.c.", "et", "obrt", "fie", "ij", "fop", "xt"] + type_dict['Joint Stock / Unlimited'] = ["unltd", "ultd", "sal", "unlimited", "saog", "saoc", "aj", "yoaj", "oaj"] + type_dict['Joint Venture'] = ["esv", "gie", "kv.", "qk"] + type_dict['Non-Profit'] = ["vzw", "ses.", "gte."] + type_dict['Mutual Fund'] = ["sicav"] + type_dict['Private Company'] = ["private", "pte", " xk"] - #Weird US ones - # Professional Limited Liability Company - PLLC - self.pllc = [" pllc", " p.l.l.c."] + # Countries that can be identified due to specific business types in the name -- thanks Wikipedia! + country_dict = {} + country_dict['Albania'] = ["sh.a.", "sh.p.k."] + country_dict['Argentina'] = ["s.a.", "s.r.l.", "s.c.p.a", "scpa", "s.c.e i.", "s.e.", "s.g.r", "soc.col."] + country_dict['Australia'] = ["nl", "pty. ltd.", "pty ltd"] + country_dict['Austria'] = ["e.u.", "stg", "gesbr", "a.g.", "ag", "og", "kg"] + country_dict['Belarus'] = ["aat", "3at"] + country_dict['Belgium'] = ["esv", "vzw", "vof", "snc", "comm.v", "scs", "bvba", "sprl", "cbva", "cvoa", "sca", "sep", "gie"] + country_dict['Bosnia / Herzegovina'] = ["d.d.", "a.d.", "d.n.o.", "d.o.o.", "k.v.", "s.p."] + country_dict['Bulgaria'] = ["ad", "adsitz", "ead", "et", "kd", "kda", "sd"] + country_dict['Brazil'] = ["ltda", "s.a.", "pllc", "ad", "adsitz", "ead", "et", "kd", "kda", "sd"] + country_dict['Cambodia'] = ["gp", "sm pte ltd.", "pte ltd.", "plc ltd.", "peec", "sp"] + country_dict['Canada'] = ["gp", "lp", "sp"] + country_dict['Chile'] = ["eirl", "s.a.", "sgr", "s.g.r.", "ltda", "s.p.a.", "sa", "s. en c.", "ltda."] + country_dict['Columbia'] = ["s.a.", "e.u.", "s.a.s.", "suc. de descendants", "sca"] + country_dict['Croatia'] = ["d.d.", "d.d.o.", "obrt"] + country_dict['Czech Republic'] = ["a.s.", "akc. spol.", "s.r.o.", "v.o.s.", "k.s.", "sro", "vos"] + country_dict['Denmark'] = ["i/s", "a/s", "k/s", "p/s", "amba", "a.m.b.a.", "fmba", "f.m.b.a.", "smba", "s.m.b.a.", "g/s"] + country_dict['Dominican Republic'] = ["c. por a.", "cxa", "s.a.", "s.a.s.", "srl.", "eirl.", "sa", "sas"] + country_dict['Ecuador'] = ["s.a.", "c.a.", "sa", "ep"] + country_dict['Egypt'] = ["sae"] + country_dict['Estonia'] = ["fie"] + country_dict['Finland'] = ["t:mi", "ay", "ky", "oy", "oyj", "ok"] + country_dict['France'] = ["sicav", "sarl", "sogepa", "ei", "eurl", "sasu", "fcp", "gie", "sep", "snc", "scs", "sca", "scop", "sem", "sas"] + country_dict['Germany'] = ["gmbh & co. kg", "gmbh & co. kg", "e.g.", "e.v.", "gbr", "ohg", "partg", "kgaa", "gmbh", "g.m.b.h.", "ag"] + country_dict['Greece'] = ["a.e.", "ae", "e.e.", "ee", "epe", "e.p.e.", "mepe", "m.e.p.e.", "o.e.", "oe", "ovee", "o.v.e.e."] + country_dict['Guatemala'] = ["s.a.", "sa"] + country_dict['Haiti'] = ["sa"] + country_dict['Hong Kong'] = ["ltd", "unltd", "ultd"] + country_dict['Hungary'] = ["e.v.", "e.c.", "bt.", "kft.", "kht.", "kkt.", "k.v.", "zrt.", "nyrt", "ev", "ec", "rt."] + country_dict['Iceland'] = ["ehf.", "hf.", "ohf.", "s.f.", "ses."] + country_dict['India'] = ["pvt. ltd.", "ltd.", "psu", "pse"] + country_dict['Indonesia'] = ["ud", "fa", "pt"] + country_dict['Ireland'] = ["cpt", "teo"] + country_dict['Israel'] = ["b.m.", "bm", "ltd"] + country_dict['Italy'] = ["s.n.c.", "s.a.s.", "s.p.a.", "s.a.p.a.", "s.r.l.", "s.c.r.l.", "s.s."] + country_dict['Latvia'] = ["as", "sia", "ik", "ps", "ks"] + country_dict['Lebanon'] = ["sal"] + country_dict['Lithuania'] = ["uab", "ab", "ij", "mb"] + country_dict['Luxemborg'] = ["s.a.", "s.a.r.l.", "secs"] + country_dict['Macedonia'] = ["d.o.o.", "d.o.o.e.l", "k.d.a.", "j.t.d.", "a.d.", "k.d."] + country_dict['Malaysia'] = ["bhd.", "sdn. bhd."] + country_dict['Mexico'] = ["s.a.", "s. de. r.l.", "s. en c.", "s.a.b.", "s.a.p.i."] + country_dict['Mongolia'] = ["xk", "xxk"] + country_dict['Netherlands'] = ["v.o.f.", "c.v.", "b.v.", "n.v."] + country_dict['New Zealand'] = ["tapui", "ltd"] + country_dict['Nigeria'] = ["gte.", "plc", "ltd.", "ultd."] + country_dict['Norway'] = ["asa", "as", "ans", "ba", "bl", "da", "etat", "fkf", "hf", "iks", "kf", "ks", "nuf", "rhf", "sf"] + country_dict['Oman'] = ["saog", "saoc"] + country_dict['Pakistan'] = ["ltd.", "pvt. ltd.", "ltd"] + country_dict['Peru'] = ["sa", "s.a.", "s.a.a."] + country_dict['Philippines'] = ["coop.", "corp.", "corp", "ent.", "inc.", "inc", "llc", "l.l.c.", "ltd."] + country_dict['Poland'] = ["p.p.", "s.k.a.", "sp.j.", "sp.k.", "sp.p.", "sp. z.o.o.", "s.c.", "s.a."] + country_dict['Portugal'] = ["lda.", "crl", "s.a.", "s.f.", "sgps"] + country_dict['Romania'] = ["s.c.a.", "s.c.s.", "s.n.c.", "s.r.l.", "o.n.g.", "s.a."] + country_dict['Russia'] = ["ooo", "oao", "zao", "3ao"] + country_dict['Serbia'] = ["d.o.o.", "a.d.", "k.d.", "o.d."] + country_dict['Singapore'] = ["bhd", "pte ltd", "sdn bhd", "llp", "l.l.p.", "ltd.", "pte"] + country_dict['Slovokia'] = ["a.s.", "s.r.o.", "k.s.", "v.o.s."] + country_dict['Slovenia'] = ["d.d.", "d.o.o.", "d.n.o.", "k.d.", "s.p."] + country_dict['Spain'] = ["s.a.", "s.a.d.", "s.l.", "s.l.l.", "s.l.n.e.", "s.c.", "s.cra", "s.coop", "sal", "sccl"] + country_dict['Sweden'] = ["ab", "hb", "kb"] + country_dict['Switzerland'] = ["ab", "sa", "gmbh", "g.m.b.h.", "sarl", "sagl"] + country_dict['Turkey'] = ["koop."] + country_dict['Ukraine'] = ["dat", "fop", "kt", "pt", "tdv", "tov", "pp", "vat", "zat", "at"] + country_dict['United Kingdon'] = ["plc.", "plc", "uk", "cic", "cio", "l.l.p.", "llp", "l.p.", "lp", "ltd.", "ltd"] + country_dict['United States of America'] = ["llc", "inc.", "corporation", "incorporated", "company", "limited", "corp.", "inc.", "inc", "llp", "l.l.p.", "pllc", "and company", "& company", "usa", "inc", "inc.", "corp.", "corp", "ltd.", "ltd", "& co.", "& co", "co.", "co", "lp", "us"] + country_dict['Uzbekistan'] = ["mchj", "qmj", "aj", "oaj", "yoaj", "xk", "xt", "ok", "uk", "qk"] - # Limited Liability Limited Partnership - self.lllp = [" lllp", " l.l.l.p."] + ## Abbreviations ## + self.abbv = {'intl.':'International', 'intl':'International', 'co.':'Company', 'mfg':'Manufacturing', ' med ':' Medical ', 'ctr':'Center'} - # LP - Limited Partnership - self.lp = [" gmbh & co. kg", " gmbh & co. kg", " lp", " l.p.", "s.c.s.", "s.c.p.a", "comm.v", "k.d.", "k.d.a.", "s. en c.", "e.e.", "s.a.s.", "s. en c.", "c.v.", "s.k.a.", "sp.k.", "s.cra.", " ky", " scs", " kg", " kd", " k/s", " ee", " secs", " kda", " ks", " kb", " kt"] + # Abbreviations when strings end with these + self.abbvend = {' co':'Company'} - # Corporation - self.corporation = [" company", "incorporated", "corporation", "corp", " inc", " & co.", " & co", "inc.", "s.p.a.", "n.v.", " a.g.", " ag", " nuf", " s.a.", " s.f.", " oao"] + # Industry + industry_dict = {} + industry_dict['Pharmaceutical'] = ["therapeutic", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "pharmaceuticals", "pharmaceutical", "pharma"] + industry_dict['Biotechnology'] = ["therapeutic", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "biotechnology", "biotechnologies", "bioventures", "biolabs", "biosciences", "bioscience", "biotech"] + industry_dict['Engineering'] = ["engineer"] + industry_dict['Education'] = ["education", "university", "school of", "academy"] - # GP - General Partnership - self.gp = ["soc.col.", "stg", "d.n.o.", "ltda.", "v.o.s.", "kgaa", "o.e.", "s.f.", "s.n.c.", "s.a.p.a.", "j.t.d.", "v.o.f.", "sp.j.", " og", " sd", " vos", " i/s", " ay", " snc", " oe", " bt.", " s.s.", " mb", " ans", " da", " o.d.", " hb", " pt"] + # Sorted business types / abbreviation by length of business type + sorted_types = [] + for business_type in type_dict: + for item in type_dict[business_type]: + temp_tuple = [business_type, item] + sorted_types.append(temp_tuple) - # LLC - Limited Liability Company (PLC - UK) - self.llc = [" pllc", " llc", " l.l.c.", "plc.", " plc", " hf.", " oyj", " a.e.", " nyrt.", " p.l.c.", " sh.a.", " s.a.", " s.r.l.", " srl.", " aat", " 3at", " d.d.", " akc. spol.", " a.s.", " s.r.o.", " s.m.b.a.", "smba", "sarl", " nv", " sa", " aps", " a/s", " p/s", " sae", " sasu", "eurl", " ae", " cpt", " as", " ab", " asa", " ooo", " dat", " vat", " zat", " mchj", " a.d."] - - # LLP - Limited Liability Partnership - self.llp = [" llp", " l.l.p.", " sp.p.", " s.c.a.", " s.c.s."] - - # Ltd - Private Company Limited By Shares - UK - self.ltd = [" pty. ltd.", " pty ltd", " ltd", " l.t.d.", " bvba", " d.o.o.", "ltda", "gmbh", "g.m.b.h", "kft.", "kht.", "zrt.", "ehf.", "s.a.r.l.", "d.o.o.e.l.", "s. de r.l.", "b.v.", "tapui", "sp. z.o.o.", "s.r.l.", "s.l.", "s.l.n.e.", " ood", " oy", " rt.", " teo", " uab", " scs", " sprl", " limited", " bhd.", " sdn. bhd.", " sdn bhd", " as", " lda.", " tov", " pp"] - - # PC - Professional Corporation - self.pc_comma = ["p.c.", ", pc", " vof", " snc"] + self.sorted_types = sorted(sorted_types, key=lambda part: len(part[1]), reverse=True) - # NL - No Liability - Australia - self.nl = [" nl"] + # Sorted business countries / type abbrviations by length of business type abbreviations + sorted_countries = [] + for country in country_dict: + for item in country_dict[country]: + temp_tuple = [country, item] + sorted_countries.append(temp_tuple) - # SP - Sole Proprietorship - self.sp = [" e.u.", " s.p.", " t:mi", "e.v.", "e.c.", " et", " obrt", " fie", " ij", " fop", " xt"] + self.sorted_countries = sorted(sorted_countries, key=lambda part: len(part[1]), reverse=True) - # Joint Stock - Unlimited - self.js = [" unltd", " ultd", " sal", " unlimited", " saog", " saoc", " aj", " yoaj", " oaj"] + # All of the suffixes sorted by length + all_sorted = sorted_types + sorted_countries + suffix_sort = [] + for item in all_sorted: + suffix_sort.append(item[1]) - # Joint Venture - self.jv = [" esv", " gie", " kv.", " qk"] + self.suffix_sort = sorted(suffix_sort, key=lambda part: len(part), reverse=True) - # Non-Profit - self.np = [" vzw", " ses.", " gte."] + # Industries put into a giant listing + industry_list = [] + for industry in industry_dict: + for item in industry_dict[industry]: + temp_tuple = [industry, item] + industry_list.append(temp_tuple) - # Mutual Fund - self.mf = [" sicav"] + self.industry_list = industry_list - # Countries that can be identified due to specific business types in the name -- thanks Wikipedia! - self.albania = ["sh.a.", "sh.p.k."] - self.argentina = ["s.a.", "s.r.l.", "s.c.p.a", " scpa", "s.c.e i.", "s.e.", "s.g.r", "soc.col."] - self.australia = ["nl", "pty. ltd.", "pty ltd"] - self.austria = ["e.u.", "stg", "gesbr", "a.g.", "ag", "og", "kg"] - self.belarus = ["aat", "3at"] - self.belgium = ["esv", "vzw", "vof", "snc", "comm.v", "scs", "bvba", "sprl", "cbva", "cvoa", " sca", "sep", "gie"] - self.bosherz = ["d.d.", "a.d.", "d.n.o.", "d.o.o.", "k.v.", "s.p."] - self.bulgaria = [" ad", "adsitz", "ead", "et", "kd", "kda", "sd"] - self.brazil = ["ltda", "s.a.", "pllc", "ad", "adsitz", "ead", "et", "kd", "kda", "sd"] - self.cambodia = ["gp", "sm pte ltd.", "pte ltd.", "plc ltd.", "peec", "sp"] - self.canada = ["gp", "lp", "sp"] - self.chile = ["eirl", "s.a.", "sgr", "s.g.r.", "ltda", "s.p.a.", "sa", "s. en c.", "ltda."] - self.columbia = ["s.a.", "e.u.", "s.a.s.", "suc. de descendants", "sca"] - self.croatia = ["d.d.", "d.d.o.", "obrt"] - self.czech = ["a.s.", "akc. spol.", "s.r.o.", "v.o.s.", "k.s.", "sro", "vos"] - self.denmark = ["i/s", "a/s", "k/s", "p/s", "amba", "a.m.b.a.", "fmba", "f.m.b.a.", "smba", "s.m.b.a.", "g/s"] - self.domrep = ["c. por a.", "cxa", "s.a.", "s.a.s.", "srl.", "eirl.", "sa", "sas"] - self.ecuador = ["s.a.", "c.a.", "sa", "ep"] - self.egypt = ["sae"] - self.estonia = ["fie"] - self.finland = ["t:mi", "ay", "ky", " oy", " oyj", " ok"] - self.france = ["sicav", "sarl", "sogepa", " ei", " eurl", "sasu", "fcp", "gie", "sep", "snc", "scs", "sca", "scop", "sem", "sas"] - self.germany = ["gmbh & co. kg", "gmbh & co. kg", "e.g.", "e.v.", "gbr", "ohg", "partg", "kgaa", "gmbh", "g.m.b.h.", "ag"] - self.greece = ["a.e.", "ae", "e.e.", "ee", "epe", "e.p.e.", "mepe", "m.e.p.e.", "o.e.", " oe", "ovee", "o.v.e.e."] - self.guatemala = ["s.a.", "sa"] - self.haiti = [" sa"] - self.hongkong = ["ltd", "unltd", "ultd"] - self.hungary = ["e.v.", "e.c.", "bt.", "kft.", "kht.", "kkt.", "k.v.", "zrt.", "nyrt", " ev", " ec", " rt."] - self.iceland = ["ehf.", "hf.", "ohf.", "s.f.", "ses."] - self.india = ["pvt. ltd.", "ltd.", "psu", "pse"] - self.indonesia = [" ud", "fa", "pt"] - self.ireland = [" cpt", " teo"] - self.israel = ["b.m.", "bm", "ltd"] - self.italy = ["s.n.c.", "s.a.s.", "s.p.a.", "s.a.p.a.", "s.r.l.", "s.c.r.l.", "s.s."] - self.latvia = ["as", "sia", "ik", "ps", "ks"] - self.lebanon = ["sal"] - self.lithuania = ["uab", "ab", "ij", "mb"] - self.luxemborg = ["s.a.", "s.a.r.l.", " secs"] - self.macedonia = ["d.o.o.", "d.o.o.e.l", "k.d.a.", "j.t.d.", " a.d.", " k.d."] - self.malaysia = [" bhd.", "sdn. bhd."] - self.mexico = ["s.a.", "s. de. r.l.", "s. en c.", "s.a.b.", "s.a.p.i."] - self.mongolia = [" xk", " xxk"] - self.netherlands = ["v.o.f.", "c.v.", "b.v.", "n.v."] - self.newzealand = ["tapui", "ltd"] - self.nigeria = ["gte.", "plc", " ltd.", "ultd."] - self.norway = ["asa", "as", "ans", "ba", "bl", "da", "etat", "fkf", "hf", "iks", "kf", "ks", "nuf", "rhf", "sf"] - self.oman = ["saog", "saoc"] - self.pakistan = ["ltd.", "pvt. ltd.", "ltd"] - self.peru = ["sa", "s.a.", "s.a.a."] - self.philippines = ["coop.", "corp.", "corp", "ent.", "inc.", "inc", "llc", "l.l.c.", "ltd."] - self.poland = ["p.p.", "s.k.a.", "sp.j.", "sp.k.", "sp.p.", "sp. z.o.o.", "s.c.", "s.a."] - self.portugal = ["lda.", " crl", " s.a.", " s.f.", " sgps"] - self.romania = ["s.c.a.", "s.c.s.", "s.n.c.", "s.r.l.", "o.n.g.", "s.a."] - self.russia = ["ooo", "oao", "zao", "3ao"] - self.serbia = ["d.o.o.", "a.d.", "k.d.", "o.d."] - self.singapore = ["bhd", "pte ltd", "sdn bhd", "llp", " l.l.p.", " ltd.", "pte"] - self.slovokia = [" a.s.", " s.r.o.", " k.s.", " v.o.s."] - self.slovenia = ["d.d.", "d.o.o.", "d.n.o.", "k.d.", "s.p."] - self.spain = ["s.a.", "s.a.d.", "s.l.", "s.l.l.", "s.l.n.e.", "s.c.", "s.cra", "s.coop", "sal", "sccl"] - self.sweden = [" ab", " hb", " kb"] - self.switzerland = ["ab", "sa", " gmbh", "g.m.b.h.", "sarl", "sagl"] - self.turkey = ["koop."] - self.ukraine = ["dat", "fop", "kt", "pt", "tdv", "tov", "pp", "vat", "zat", "at"] - self.uk = ["plc.", "plc", "uk", "cic", "cio", "l.l.p.", "llp", "l.p.", "lp", "ltd.", "ltd"] - self.usa = [" llc", " inc.", "corporation", "incorporated", "company", "limited", "corp.", "inc.", "inc", "llp", "l.l.p.", "pllc", "and company", "& company", "usa", "inc", "inc.", "corp.", "corp", "ltd.", "ltd", "& co.", "& co", "co.", "co", "lp", ", pc", "us"] - self.uzbekistan = ["mchj", "qmj", "aj", "oaj", "yoaj", "xk", "xt", "ok" "uk", "qk"] - - # Private company? - self.pte = ["private", "pte", " xk"] + def string_stripper(self, business_name): - ## Abbreviations ## - self.abbv = {'intl.':'International', 'intl':'International', 'co.':'Company', 'mfg':'Manufacturing', ' med ':' Medical ', 'ctr':'Center'} + # Get rid of everything in parenthesis + if " (" and ")" in business_name: + beginpar = business_name.find(" (") + endpar = business_name.find(")") + business_name = business_name.replace(business_name[beginpar:endpar+1],"") - # Abbreviations when strings end with these - self.abbvend = {' co':'Company'} + # Strip out commas + if "," in business_name: + business_name = business_name.replace(",", " ") + # Strip spaces on the left + business_name = business_name.lstrip() - ## Industry ## - self.busind = ["Pharmaceutical, Biotechnology", "Engineering"] - self.pharma = [" therap", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "pharmaceuticals", "pharmaceutical", "pharma"] - self.biotech = [" therap", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "biotechnology", "biotechnologies", "bioventures", "biolabs", "biosciences", "bioscience", "biotech"] - self.engineering = ["engineer"] - self.education = ["education", "university", "school of", "academy"] + #Strip spaces on the right + business_name = business_name.strip() - def masterlist(self): - mlist = self.albania + self.argentina + self.australia + self.austria + self.belarus + self.belgium + self.bosherz + self.bulgaria + self.brazil + self.cambodia + self.canada + self.chile + self.columbia + self.croatia + self.czech + self.denmark + self.domrep + self.ecuador + self.egypt + self.estonia + self.finland + self.france + self.germany + self.greece + self.guatemala + self.haiti + self.hongkong + self.hungary + self.iceland + self.india + self.indonesia + self.ireland + self.israel + self.italy + self.latvia + self.lebanon + self.lithuania + self.luxemborg + self.macedonia + self.malaysia + self.mexico + self.mongolia + self.netherlands + self.newzealand + self.nigeria + self.norway + self.oman + self.pakistan + self.peru + self.philippines + self.poland + self.portugal + self.romania + self.russia + self.serbia + self.singapore + self.slovokia + self.slovenia + self.spain + self.sweden + self.switzerland + self.turkey + self.ukraine + self.uk + self.usa + self.uzbekistan - mlist = list(OrderedDict.fromkeys(mlist)) - mlist.sort(key=len, reverse=True) - - mlist2 = [] - for item in mlist: - mlist2.append(" " + item) + # Get rid of misc spaces in between + business_name = " ".join(business_name.split()) - return mlist + return business_name - def type(self): - - type_set = [] - - # Limited Partnership - for item in self.lp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Partnership") - break - - # Professional Limited Liability Company - PLLC - for item in self.pllc: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Professional Limited Liability Company") - break - - # Limited Liability Limited Partnership - for item in self.lllp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Limited Partnership") - break - - # Corporation - for item in self.corporation: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Corporation") - break - - # General Partnership - for item in self.gp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("General Partnership") - break - - # Limited Liability Company - for item in self.llc: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Company") - break - - # Limited Liability Partnership - for item in self.llp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Partnership") - break - - # Limited Company -- LTD - for item in self.ltd: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Company") - break - - # Professional Corporation - for item in self.pc_comma: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Professional Corporation") - break - - # No Liability - Aus - for item in self.nl: - if (((self.corpname).lower()).endswith(item)): - type_set.append("No Liability") - break - - # Sole Proprietorship - for item in self.sp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Sole Proprietorship") - break - - # Joint Venture - for item in self.jv: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Sole Proprietorship") - break - - # Non-Profit - for item in self.np: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Non-Profit") - break - - # Joint Stock - Unlimited - for item in self.js: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Joint Stock") - break - - # Mutual Fund - for item in self.mf: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Mutual Fund") - break - - if type_set == []: - return None - else: - return type_set + def end_strip(self, a_set): - - def industry(self): - - industry_set = [] + end_set = [] - # Pharmaceutical - for item in self.pharma: - if item in (self.corpname).lower(): - industry_set.append("Pharmaceutical") - break - - # Biotechnology - for item in self.biotech: - if item in (self.corpname).lower(): - industry_set.append("Biotechnology") - break - - # Education - for item in self.education: - if item in (self.corpname).lower(): - industry_set.append("Education") - break - - # Engineering - for item in self.engineering: - if item in (self.corpname).lower(): - industry_set.append("Engineering") - break - - if industry_set == []: - return None - else: - return industry_set + business_name = self.business_name + business_name = self.string_stripper(business_name) - def country(self): - - country_set = [] - - # Albania - for item in self.albania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Albania") - break - - # Argentina - for item in self.argentina: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Argentina") - break - - # Australia - for item in self.australia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Australia") - break - - # Austria - for item in self.austria: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Austria") - break - - # Belarus - for item in self.belarus: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Belarus") - break - - # Belgium - for item in self.belgium: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Belgium") - break - - # Bosnia / "Herzegovina" - for item in self.bosherz: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Bosnia") - country_set.append("Herzegovina") - break - - # Brazil - for item in self.brazil: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Brazil") - break - - # Bulgaria - for item in self.bulgaria: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Bulgaria") - break - - # Cambodia - for item in self.cambodia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Cambodia") - break - - # Canada - for item in self.canada: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Canada") - break - - # Chile - for item in self.chile: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Chile") - break - - # Columbia - for item in self.columbia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Columbia") - break - - # Croatia - for item in self.croatia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Croatia") - break - - # Czech Republic - for item in self.czech: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Czech Republic") - break - - # Denmark - for item in self.denmark: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Denmark") - break - - # Dominican Republic - for item in self.domrep: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Dominican Republic") - break - - # Ecuador - for item in self.ecuador: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ecuador") - break - - # Egypt - for item in self.egypt: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Egypt") - break - - # Estonia - for item in self.estonia: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Estonia") - break - - # Finland - for item in self.finland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Finland") - break - - # France - for item in self.france: - if (((self.corpname).lower()).endswith(item)): - country_set.append("France") - break - - # Germany - for item in self.germany: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Germany") - break - - # Greece - for item in self.greece: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Greece") - break - - # Guatemala - for item in self.guatemala: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Guatemala") - break - - # Haiti - for item in self.haiti: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Haiti") - break - - # Hong Kong - for item in self.hongkong: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Hong Kong") - break - - # Hungary - for item in self.hungary: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Hungary") - break - - # Iceland - for item in self.iceland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Iceland") - break - - # India - for item in self.india: - if (((self.corpname).lower()).endswith(item)): - country_set.append("India") - break - - # Indonesia - for item in self.indonesia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Indonesia") - break - - # Ireland - for item in self.ireland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ireland") - break - - # Israel - for item in self.israel: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Israel") - break - - # Italy - for item in self.italy: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Italy") - break - - # Latvia - for item in self.latvia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Latvia") - break - - # Lebanon - for item in self.lebanon: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Lebanon") - break - - # Lithuania - for item in self.lithuania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Lithuania") - break - - # Luxemborg - for item in self.luxemborg: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Luxemborg") - break - - # Macedonia - for item in self.macedonia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Macedonia") - break - - # Malaysia - for item in self.malaysia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Malaysia") - break - - # Mexico - for item in self.mexico: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Mexico") - break - - # Mongolia - for item in self.mongolia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Mongolia") - break - - # Netherlands - for item in self.netherlands: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Netherlands") - break - - # New Zealand - for item in self.newzealand: - if (((self.corpname).lower()).endswith(item)): - country_set.append("New Zealand") - break - - # Nigeria - for item in self.nigeria: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Nigeria") - break - - # Norway - for item in self.norway: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Norway") - break - - # Oman - for item in self.oman: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Oman") - break - - # Pakistan - for item in self.pakistan: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Pakistan") - break - - # Peru - for item in self.peru: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Peru") - break - - # Philippines - for item in self.philippines: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Philippines") - break - - # Poland - for item in self.poland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Poland") - break - - # Portugal - for item in self.portugal: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Portugal") - break - - # Romania - for item in self.romania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Romania") - break - - # Russia - for item in self.russia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Russia") - break - - # Serbia - for item in self.serbia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Serbia") - break - - # Singapore - for item in self.singapore: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Singapore") - break - - # Slovokia - for item in self.slovokia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Slovokia") - break - - # Spain - for item in self.spain: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Spain") - break - - # Sweden - for item in self.sweden: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Sweden") - break - - # Switzerland - for item in self.switzerland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Switzerland") - break - - # Turkey - for item in self.turkey: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Turkey") - break - - # Ukraine - for item in self.ukraine: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ukraine") - break - - # United Kingdom - for item in self.uk: - if (((self.corpname).lower()).endswith(item)): - country_set.append("United Kingdom") - break - - # United States - for item in self.usa: - if (((self.corpname).lower()).endswith(item)): - country_set.append("United States") - break - - # Uzbekistan - for item in self.uzbekistan: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Uzbekistan") - break - - if country_set == []: - return None + for key, suffix in a_set: + suffix = " " + suffix + if ((business_name.lower()).endswith(suffix)): + end_set.append(key) + + end_set = list(OrderedDict.fromkeys(end_set)) + + if end_set != []: + return end_set else: - return country_set + return None - def spacecleanup(self, corpname): + def industry(self): + + industry_set = [] - corpname = corpname.lstrip() - corpname = corpname.strip() - corpname = " ".join(corpname.split()) + business_name = self.business_name - return corpname + for industry, keyword in self.industry_list: + if keyword in business_name.lower(): + industry_set.append(industry) + industry_set = list(OrderedDict.fromkeys(industry_set)) + + if industry_set != []: + return industry_set + else: + return None # A clean version of the business name def cleanname(self): - corpname = self.corpname + business_name = self.business_name - # Get rid of country items: - for item in self.masterlist(): - if ((corpname.lower()).endswith(item)): - start = (corpname.lower()).find(item) - end = len(item) - end = end * -1 - corpname = corpname[0:end] - corpname = corpname.lstrip() - corpname = corpname.strip() - corpname = " ".join(corpname.split()) + # Get rid of everything in parenthesis + if " (" and ")" in business_name: + beginpar = business_name.find(" (") + endpar = business_name.find(")") + business_name = business_name.replace(business_name[beginpar:endpar+1],"") + + # Get rid of everything after hyphen and spaces + if " - " in business_name: + corplen = len(business_name) + hypenloc = business_name.find(" - ") + business_name = business_name.replace(business_name[hypenloc:corplen],"") # Abbrv. cleanup for abbv in self.abbv: - if abbv in corpname.lower(): - start = (corpname.lower()).find(abbv) - end = len(corpname) - corpname = corpname[0:start] + self.abbv[abbv] + corpname[end+1:len(corpname)] - - # Strip out commas - if "," in corpname: - corpname = corpname.replace(",", " ") - - # Strip spaces on the left - corpname = corpname.lstrip() + if abbv in business_name.lower(): + start = (business_name.lower()).find(abbv) + end = len(business_name) + business_name = business_name[0:start] + self.abbv[abbv] + business_name[end+1:len(business_name)] - #Strip spaces on the right - corpname = corpname.strip() - - # Get rid of misc spaces in between - corpname = " ".join(corpname.split()) + # Replace single hyphen with space + if "-" in business_name: + corplen = len(business_name) + hypenloc = business_name.find("-") + business_name = business_name.replace("-"," ") - return corpname + # Get rid of country items: + for item in self.suffix_sort: + if ((business_name.lower()).endswith(item)): + start = (business_name.lower()).find(item) + end = len(item) + end = end * -1 + business_name = business_name[0:end] + business_name = self.string_stripper(business_name) + business_name = self.string_stripper(business_name) - # A short version of the corporate name - def shortname(self): + return business_name - corpname = self.cleanname() - - # Get rid of everything in parenthesis - if " (" and ")" in corpname: - beginpar = corpname.find(" (") - endpar = corpname.find(")") - corpname = corpname.replace(corpname[beginpar:endpar+1],"") + def cleaner(self): - # Get rid of everything after hyphen and spaces - if " - " in corpname: - corplen = len(corpname) - hypenloc = corpname.find(" - ") - corpname = corpname.replace(corpname[hypenloc:corplen],"") - - # Replace single hyphen with space - if "-" in corpname: - corplen = len(corpname) - hypenloc = corpname.find("-") - corpname = corpname.replace("-"," ") + self.clean_name = self.cleanname() + self.industry = self.industry() + self.type = self.end_strip(self.sorted_types) + self.country = self.end_strip(self.sorted_countries) - # Get rid of misc spaces in between - corpname = " ".join(corpname.split()) - - # Strip spaces on the left - corpname = corpname.lstrip() - - #Strip spaces on the right - corpname = corpname.strip() - - return corpname \ No newline at end of file + return self \ No newline at end of file diff --git a/cleanco.py b/cleanco.py index 3970d8d..97a3df4 100644 --- a/cleanco.py +++ b/cleanco.py @@ -1,770 +1,256 @@ # Note that this script is geard towards identifying businesses in terms of the US/UK from collections import OrderedDict - class cleanco(): - def __init__(self, corpname): + def __init__(self, business_name): - # Throw this in there so that I know it's there - self.corpname = corpname - - ## Business Type ## + self.business_name = business_name + + # Business Types + type_dict = {} + type_dict['Professional Limited Liability Company'] = ["pllc", "p.l.l.c."] + type_dict['Limited Liability Limited Partnership'] = ["lllp", "l.l.l.p."] + type_dict['Limited Partnership'] = ["gmbh & co. kg", "gmbh & co. kg", "lp", "l.p.", "s.c.s.", "s.c.p.a", "comm.v", "k.d.", "k.d.a.", "s. en c.", "e.e.", "s.a.s.", "s. en c.", "c.v.", "s.k.a.", "sp.k.", "s.cra.", "ky", "scs", "kg", "kd", "k/s", "ee", "secs", "kda", "ks", "kb", "kt"] + type_dict['Corporation'] = ["company", "incorporated", "corporation", "corp", "inc", "& co.", "& co", "inc.", "s.p.a.", "n.v.", "a.g.", "ag", "nuf", "s.a.", "s.f.", "oao", "co."] + type_dict['General Partnership'] = ["soc.col.", "stg", "d.n.o.", "ltda.", "v.o.s.", "kgaa", "o.e.", "s.f.", "s.n.c.", "s.a.p.a.", "j.t.d.", "v.o.f.", "sp.j.", "og", "sd", "vos", " i/s", "ay", "snc", "oe", "bt.", "s.s.", "mb", "ans", "da", "o.d.", "hb", "pt"] + type_dict['Limited Liability Company'] = ["pllc", "llc", "l.l.c.", "plc.", "plc", "hf.", "oyj", "a.e.", "nyrt.", "p.l.c.", "sh.a.", "s.a.", "s.r.l.", "srl.", "aat", "3at", "d.d.", "akc. spol.", "a.s.", "s.r.o.", "s.m.b.a.", "smba", "sarl", "nv", "sa", "aps", "a/s", "p/s", "sae", "sasu", "eurl", "ae", "cpt", "as", "ab", "asa", "ooo", "dat", "vat", "zat", "mchj", "a.d."] + type_dict['Limited Liability Partnership'] = ["llp", "l.l.p.", "sp.p.", "s.c.a.", "s.c.s."] + type_dict['Limited'] = ["pty. ltd.", "pty ltd", "ltd", "l.t.d.", "bvba", "d.o.o.", "ltda", "gmbh", "g.m.b.h", "kft.", "kht.", "zrt.", "ehf.", "s.a.r.l.", "d.o.o.e.l.", "s. de r.l.", "b.v.", "tapui", "sp. z.o.o.", "s.r.l.", "s.l.", "s.l.n.e.", "ood", "oy", "rt.", "teo", "uab", "scs", "sprl", "limited", "bhd.", "sdn. bhd.", "sdn bhd", "as", "lda.", "tov", "pp"] + type_dict['Professional Corporation'] = ["p.c.", "vof", "snc"] + type_dict['No Liability'] = ["nl"] + type_dict['Sole Proprietorship'] = ["e.u.", "s.p.", "t:mi", "e.v.", "e.c.", "et", "obrt", "fie", "ij", "fop", "xt"] + type_dict['Joint Stock / Unlimited'] = ["unltd", "ultd", "sal", "unlimited", "saog", "saoc", "aj", "yoaj", "oaj"] + type_dict['Joint Venture'] = ["esv", "gie", "kv.", "qk"] + type_dict['Non-Profit'] = ["vzw", "ses.", "gte."] + type_dict['Mutual Fund'] = ["sicav"] + type_dict['Private Company'] = ["private", "pte", " xk"] - #Weird US ones - # Professional Limited Liability Company - PLLC - self.pllc = [" pllc", " p.l.l.c."] + # Countries that can be identified due to specific business types in the name -- thanks Wikipedia! + country_dict = {} + country_dict['Albania'] = ["sh.a.", "sh.p.k."] + country_dict['Argentina'] = ["s.a.", "s.r.l.", "s.c.p.a", "scpa", "s.c.e i.", "s.e.", "s.g.r", "soc.col."] + country_dict['Australia'] = ["nl", "pty. ltd.", "pty ltd"] + country_dict['Austria'] = ["e.u.", "stg", "gesbr", "a.g.", "ag", "og", "kg"] + country_dict['Belarus'] = ["aat", "3at"] + country_dict['Belgium'] = ["esv", "vzw", "vof", "snc", "comm.v", "scs", "bvba", "sprl", "cbva", "cvoa", "sca", "sep", "gie"] + country_dict['Bosnia / Herzegovina'] = ["d.d.", "a.d.", "d.n.o.", "d.o.o.", "k.v.", "s.p."] + country_dict['Bulgaria'] = ["ad", "adsitz", "ead", "et", "kd", "kda", "sd"] + country_dict['Brazil'] = ["ltda", "s.a.", "pllc", "ad", "adsitz", "ead", "et", "kd", "kda", "sd"] + country_dict['Cambodia'] = ["gp", "sm pte ltd.", "pte ltd.", "plc ltd.", "peec", "sp"] + country_dict['Canada'] = ["gp", "lp", "sp"] + country_dict['Chile'] = ["eirl", "s.a.", "sgr", "s.g.r.", "ltda", "s.p.a.", "sa", "s. en c.", "ltda."] + country_dict['Columbia'] = ["s.a.", "e.u.", "s.a.s.", "suc. de descendants", "sca"] + country_dict['Croatia'] = ["d.d.", "d.d.o.", "obrt"] + country_dict['Czech Republic'] = ["a.s.", "akc. spol.", "s.r.o.", "v.o.s.", "k.s.", "sro", "vos"] + country_dict['Denmark'] = ["i/s", "a/s", "k/s", "p/s", "amba", "a.m.b.a.", "fmba", "f.m.b.a.", "smba", "s.m.b.a.", "g/s"] + country_dict['Dominican Republic'] = ["c. por a.", "cxa", "s.a.", "s.a.s.", "srl.", "eirl.", "sa", "sas"] + country_dict['Ecuador'] = ["s.a.", "c.a.", "sa", "ep"] + country_dict['Egypt'] = ["sae"] + country_dict['Estonia'] = ["fie"] + country_dict['Finland'] = ["t:mi", "ay", "ky", "oy", "oyj", "ok"] + country_dict['France'] = ["sicav", "sarl", "sogepa", "ei", "eurl", "sasu", "fcp", "gie", "sep", "snc", "scs", "sca", "scop", "sem", "sas"] + country_dict['Germany'] = ["gmbh & co. kg", "gmbh & co. kg", "e.g.", "e.v.", "gbr", "ohg", "partg", "kgaa", "gmbh", "g.m.b.h.", "ag"] + country_dict['Greece'] = ["a.e.", "ae", "e.e.", "ee", "epe", "e.p.e.", "mepe", "m.e.p.e.", "o.e.", "oe", "ovee", "o.v.e.e."] + country_dict['Guatemala'] = ["s.a.", "sa"] + country_dict['Haiti'] = ["sa"] + country_dict['Hong Kong'] = ["ltd", "unltd", "ultd"] + country_dict['Hungary'] = ["e.v.", "e.c.", "bt.", "kft.", "kht.", "kkt.", "k.v.", "zrt.", "nyrt", "ev", "ec", "rt."] + country_dict['Iceland'] = ["ehf.", "hf.", "ohf.", "s.f.", "ses."] + country_dict['India'] = ["pvt. ltd.", "ltd.", "psu", "pse"] + country_dict['Indonesia'] = ["ud", "fa", "pt"] + country_dict['Ireland'] = ["cpt", "teo"] + country_dict['Israel'] = ["b.m.", "bm", "ltd"] + country_dict['Italy'] = ["s.n.c.", "s.a.s.", "s.p.a.", "s.a.p.a.", "s.r.l.", "s.c.r.l.", "s.s."] + country_dict['Latvia'] = ["as", "sia", "ik", "ps", "ks"] + country_dict['Lebanon'] = ["sal"] + country_dict['Lithuania'] = ["uab", "ab", "ij", "mb"] + country_dict['Luxemborg'] = ["s.a.", "s.a.r.l.", "secs"] + country_dict['Macedonia'] = ["d.o.o.", "d.o.o.e.l", "k.d.a.", "j.t.d.", "a.d.", "k.d."] + country_dict['Malaysia'] = ["bhd.", "sdn. bhd."] + country_dict['Mexico'] = ["s.a.", "s. de. r.l.", "s. en c.", "s.a.b.", "s.a.p.i."] + country_dict['Mongolia'] = ["xk", "xxk"] + country_dict['Netherlands'] = ["v.o.f.", "c.v.", "b.v.", "n.v."] + country_dict['New Zealand'] = ["tapui", "ltd"] + country_dict['Nigeria'] = ["gte.", "plc", "ltd.", "ultd."] + country_dict['Norway'] = ["asa", "as", "ans", "ba", "bl", "da", "etat", "fkf", "hf", "iks", "kf", "ks", "nuf", "rhf", "sf"] + country_dict['Oman'] = ["saog", "saoc"] + country_dict['Pakistan'] = ["ltd.", "pvt. ltd.", "ltd"] + country_dict['Peru'] = ["sa", "s.a.", "s.a.a."] + country_dict['Philippines'] = ["coop.", "corp.", "corp", "ent.", "inc.", "inc", "llc", "l.l.c.", "ltd."] + country_dict['Poland'] = ["p.p.", "s.k.a.", "sp.j.", "sp.k.", "sp.p.", "sp. z.o.o.", "s.c.", "s.a."] + country_dict['Portugal'] = ["lda.", "crl", "s.a.", "s.f.", "sgps"] + country_dict['Romania'] = ["s.c.a.", "s.c.s.", "s.n.c.", "s.r.l.", "o.n.g.", "s.a."] + country_dict['Russia'] = ["ooo", "oao", "zao", "3ao"] + country_dict['Serbia'] = ["d.o.o.", "a.d.", "k.d.", "o.d."] + country_dict['Singapore'] = ["bhd", "pte ltd", "sdn bhd", "llp", "l.l.p.", "ltd.", "pte"] + country_dict['Slovokia'] = ["a.s.", "s.r.o.", "k.s.", "v.o.s."] + country_dict['Slovenia'] = ["d.d.", "d.o.o.", "d.n.o.", "k.d.", "s.p."] + country_dict['Spain'] = ["s.a.", "s.a.d.", "s.l.", "s.l.l.", "s.l.n.e.", "s.c.", "s.cra", "s.coop", "sal", "sccl"] + country_dict['Sweden'] = ["ab", "hb", "kb"] + country_dict['Switzerland'] = ["ab", "sa", "gmbh", "g.m.b.h.", "sarl", "sagl"] + country_dict['Turkey'] = ["koop."] + country_dict['Ukraine'] = ["dat", "fop", "kt", "pt", "tdv", "tov", "pp", "vat", "zat", "at"] + country_dict['United Kingdon'] = ["plc.", "plc", "uk", "cic", "cio", "l.l.p.", "llp", "l.p.", "lp", "ltd.", "ltd"] + country_dict['United States of America'] = ["llc", "inc.", "corporation", "incorporated", "company", "limited", "corp.", "inc.", "inc", "llp", "l.l.p.", "pllc", "and company", "& company", "usa", "inc", "inc.", "corp.", "corp", "ltd.", "ltd", "& co.", "& co", "co.", "co", "lp", "us"] + country_dict['Uzbekistan'] = ["mchj", "qmj", "aj", "oaj", "yoaj", "xk", "xt", "ok", "uk", "qk"] - # Limited Liability Limited Partnership - self.lllp = [" lllp", " l.l.l.p."] + ## Abbreviations ## + self.abbv = {'intl.':'International', 'intl':'International', 'co.':'Company', 'mfg':'Manufacturing', ' med ':' Medical ', 'ctr':'Center'} - # LP - Limited Partnership - self.lp = [" gmbh & co. kg", " gmbh & co. kg", " lp", " l.p.", "s.c.s.", "s.c.p.a", "comm.v", "k.d.", "k.d.a.", "s. en c.", "e.e.", "s.a.s.", "s. en c.", "c.v.", "s.k.a.", "sp.k.", "s.cra.", " ky", " scs", " kg", " kd", " k/s", " ee", " secs", " kda", " ks", " kb", " kt"] + # Abbreviations when strings end with these + self.abbvend = {' co':'Company'} - # Corporation - self.corporation = [" company", "incorporated", "corporation", "corp", " inc", " & co.", " & co", "inc.", "s.p.a.", "n.v.", " a.g.", " ag", " nuf", " s.a.", " s.f.", " oao"] + # Industry + industry_dict = {} + industry_dict['Pharmaceutical'] = ["therapeutic", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "pharmaceuticals", "pharmaceutical", "pharma"] + industry_dict['Biotechnology'] = ["therapeutic", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "biotechnology", "biotechnologies", "bioventures", "biolabs", "biosciences", "bioscience", "biotech"] + industry_dict['Engineering'] = ["engineer"] + industry_dict['Education'] = ["education", "university", "school of", "academy"] - # GP - General Partnership - self.gp = ["soc.col.", "stg", "d.n.o.", "ltda.", "v.o.s.", "kgaa", "o.e.", "s.f.", "s.n.c.", "s.a.p.a.", "j.t.d.", "v.o.f.", "sp.j.", " og", " sd", " vos", " i/s", " ay", " snc", " oe", " bt.", " s.s.", " mb", " ans", " da", " o.d.", " hb", " pt"] + # Sorted business types / abbreviation by length of business type + sorted_types = [] + for business_type in type_dict: + for item in type_dict[business_type]: + temp_tuple = [business_type, item] + sorted_types.append(temp_tuple) - # LLC - Limited Liability Company (PLC - UK) - self.llc = [" pllc", " llc", " l.l.c.", "plc.", " plc", " hf.", " oyj", " a.e.", " nyrt.", " p.l.c.", " sh.a.", " s.a.", " s.r.l.", " srl.", " aat", " 3at", " d.d.", " akc. spol.", " a.s.", " s.r.o.", " s.m.b.a.", "smba", "sarl", " nv", " sa", " aps", " a/s", " p/s", " sae", " sasu", "eurl", " ae", " cpt", " as", " ab", " asa", " ooo", " dat", " vat", " zat", " mchj", " a.d."] - - # LLP - Limited Liability Partnership - self.llp = [" llp", " l.l.p.", " sp.p.", " s.c.a.", " s.c.s."] - - # Ltd - Private Company Limited By Shares - UK - self.ltd = [" pty. ltd.", " pty ltd", " ltd", " l.t.d.", " bvba", " d.o.o.", "ltda", "gmbh", "g.m.b.h", "kft.", "kht.", "zrt.", "ehf.", "s.a.r.l.", "d.o.o.e.l.", "s. de r.l.", "b.v.", "tapui", "sp. z.o.o.", "s.r.l.", "s.l.", "s.l.n.e.", " ood", " oy", " rt.", " teo", " uab", " scs", " sprl", " limited", " bhd.", " sdn. bhd.", " sdn bhd", " as", " lda.", " tov", " pp"] - - # PC - Professional Corporation - self.pc_comma = ["p.c.", ", pc", " vof", " snc"] + self.sorted_types = sorted(sorted_types, key=lambda part: len(part[1]), reverse=True) - # NL - No Liability - Australia - self.nl = [" nl"] + # Sorted business countries / type abbrviations by length of business type abbreviations + sorted_countries = [] + for country in country_dict: + for item in country_dict[country]: + temp_tuple = [country, item] + sorted_countries.append(temp_tuple) - # SP - Sole Proprietorship - self.sp = [" e.u.", " s.p.", " t:mi", "e.v.", "e.c.", " et", " obrt", " fie", " ij", " fop", " xt"] + self.sorted_countries = sorted(sorted_countries, key=lambda part: len(part[1]), reverse=True) - # Joint Stock - Unlimited - self.js = [" unltd", " ultd", " sal", " unlimited", " saog", " saoc", " aj", " yoaj", " oaj"] + # All of the suffixes sorted by length + all_sorted = sorted_types + sorted_countries + suffix_sort = [] + for item in all_sorted: + suffix_sort.append(item[1]) - # Joint Venture - self.jv = [" esv", " gie", " kv.", " qk"] + self.suffix_sort = sorted(suffix_sort, key=lambda part: len(part), reverse=True) - # Non-Profit - self.np = [" vzw", " ses.", " gte."] + # Industries put into a giant listing + industry_list = [] + for industry in industry_dict: + for item in industry_dict[industry]: + temp_tuple = [industry, item] + industry_list.append(temp_tuple) - # Mutual Fund - self.mf = [" sicav"] + self.industry_list = industry_list - # Countries that can be identified due to specific business types in the name -- thanks Wikipedia! - self.albania = ["sh.a.", "sh.p.k."] - self.argentina = ["s.a.", "s.r.l.", "s.c.p.a", " scpa", "s.c.e i.", "s.e.", "s.g.r", "soc.col."] - self.australia = ["nl", "pty. ltd.", "pty ltd"] - self.austria = ["e.u.", "stg", "gesbr", "a.g.", "ag", "og", "kg"] - self.belarus = ["aat", "3at"] - self.belgium = ["esv", "vzw", "vof", "snc", "comm.v", "scs", "bvba", "sprl", "cbva", "cvoa", " sca", "sep", "gie"] - self.bosherz = ["d.d.", "a.d.", "d.n.o.", "d.o.o.", "k.v.", "s.p."] - self.bulgaria = [" ad", "adsitz", "ead", "et", "kd", "kda", "sd"] - self.brazil = ["ltda", "s.a.", "pllc", "ad", "adsitz", "ead", "et", "kd", "kda", "sd"] - self.cambodia = ["gp", "sm pte ltd.", "pte ltd.", "plc ltd.", "peec", "sp"] - self.canada = ["gp", "lp", "sp"] - self.chile = ["eirl", "s.a.", "sgr", "s.g.r.", "ltda", "s.p.a.", "sa", "s. en c.", "ltda."] - self.columbia = ["s.a.", "e.u.", "s.a.s.", "suc. de descendants", "sca"] - self.croatia = ["d.d.", "d.d.o.", "obrt"] - self.czech = ["a.s.", "akc. spol.", "s.r.o.", "v.o.s.", "k.s.", "sro", "vos"] - self.denmark = ["i/s", "a/s", "k/s", "p/s", "amba", "a.m.b.a.", "fmba", "f.m.b.a.", "smba", "s.m.b.a.", "g/s"] - self.domrep = ["c. por a.", "cxa", "s.a.", "s.a.s.", "srl.", "eirl.", "sa", "sas"] - self.ecuador = ["s.a.", "c.a.", "sa", "ep"] - self.egypt = ["sae"] - self.estonia = ["fie"] - self.finland = ["t:mi", "ay", "ky", " oy", " oyj", " ok"] - self.france = ["sicav", "sarl", "sogepa", " ei", " eurl", "sasu", "fcp", "gie", "sep", "snc", "scs", "sca", "scop", "sem", "sas"] - self.germany = ["gmbh & co. kg", "gmbh & co. kg", "e.g.", "e.v.", "gbr", "ohg", "partg", "kgaa", "gmbh", "g.m.b.h.", "ag"] - self.greece = ["a.e.", "ae", "e.e.", "ee", "epe", "e.p.e.", "mepe", "m.e.p.e.", "o.e.", " oe", "ovee", "o.v.e.e."] - self.guatemala = ["s.a.", "sa"] - self.haiti = [" sa"] - self.hongkong = ["ltd", "unltd", "ultd"] - self.hungary = ["e.v.", "e.c.", "bt.", "kft.", "kht.", "kkt.", "k.v.", "zrt.", "nyrt", " ev", " ec", " rt."] - self.iceland = ["ehf.", "hf.", "ohf.", "s.f.", "ses."] - self.india = ["pvt. ltd.", "ltd.", "psu", "pse"] - self.indonesia = [" ud", "fa", "pt"] - self.ireland = [" cpt", " teo"] - self.israel = ["b.m.", "bm", "ltd"] - self.italy = ["s.n.c.", "s.a.s.", "s.p.a.", "s.a.p.a.", "s.r.l.", "s.c.r.l.", "s.s."] - self.latvia = ["as", "sia", "ik", "ps", "ks"] - self.lebanon = ["sal"] - self.lithuania = ["uab", "ab", "ij", "mb"] - self.luxemborg = ["s.a.", "s.a.r.l.", " secs"] - self.macedonia = ["d.o.o.", "d.o.o.e.l", "k.d.a.", "j.t.d.", " a.d.", " k.d."] - self.malaysia = [" bhd.", "sdn. bhd."] - self.mexico = ["s.a.", "s. de. r.l.", "s. en c.", "s.a.b.", "s.a.p.i."] - self.mongolia = [" xk", " xxk"] - self.netherlands = ["v.o.f.", "c.v.", "b.v.", "n.v."] - self.newzealand = ["tapui", "ltd"] - self.nigeria = ["gte.", "plc", " ltd.", "ultd."] - self.norway = ["asa", "as", "ans", "ba", "bl", "da", "etat", "fkf", "hf", "iks", "kf", "ks", "nuf", "rhf", "sf"] - self.oman = ["saog", "saoc"] - self.pakistan = ["ltd.", "pvt. ltd.", "ltd"] - self.peru = ["sa", "s.a.", "s.a.a."] - self.philippines = ["coop.", "corp.", "corp", "ent.", "inc.", "inc", "llc", "l.l.c.", "ltd."] - self.poland = ["p.p.", "s.k.a.", "sp.j.", "sp.k.", "sp.p.", "sp. z.o.o.", "s.c.", "s.a."] - self.portugal = ["lda.", " crl", " s.a.", " s.f.", " sgps"] - self.romania = ["s.c.a.", "s.c.s.", "s.n.c.", "s.r.l.", "o.n.g.", "s.a."] - self.russia = ["ooo", "oao", "zao", "3ao"] - self.serbia = ["d.o.o.", "a.d.", "k.d.", "o.d."] - self.singapore = ["bhd", "pte ltd", "sdn bhd", "llp", " l.l.p.", " ltd.", "pte"] - self.slovokia = [" a.s.", " s.r.o.", " k.s.", " v.o.s."] - self.slovenia = ["d.d.", "d.o.o.", "d.n.o.", "k.d.", "s.p."] - self.spain = ["s.a.", "s.a.d.", "s.l.", "s.l.l.", "s.l.n.e.", "s.c.", "s.cra", "s.coop", "sal", "sccl"] - self.sweden = [" ab", " hb", " kb"] - self.switzerland = ["ab", "sa", " gmbh", "g.m.b.h.", "sarl", "sagl"] - self.turkey = ["koop."] - self.ukraine = ["dat", "fop", "kt", "pt", "tdv", "tov", "pp", "vat", "zat", "at"] - self.uk = ["plc.", "plc", "uk", "cic", "cio", "l.l.p.", "llp", "l.p.", "lp", "ltd.", "ltd"] - self.usa = [" llc", " inc.", "corporation", "incorporated", "company", "limited", "corp.", "inc.", "inc", "llp", "l.l.p.", "pllc", "and company", "& company", "usa", "inc", "inc.", "corp.", "corp", "ltd.", "ltd", "& co.", "& co", "co.", "co", "lp", ", pc", "us"] - self.uzbekistan = ["mchj", "qmj", "aj", "oaj", "yoaj", "xk", "xt", "ok" "uk", "qk"] - - # Private company? - self.pte = ["private", "pte", " xk"] + def string_stripper(self, business_name): - ## Abbreviations ## - self.abbv = {'intl.':'International', 'intl':'International', 'co.':'Company', 'mfg':'Manufacturing', ' med ':' Medical ', 'ctr':'Center'} + # Get rid of everything in parenthesis + if " (" and ")" in business_name: + beginpar = business_name.find(" (") + endpar = business_name.find(")") + business_name = business_name.replace(business_name[beginpar:endpar+1],"") - # Abbreviations when strings end with these - self.abbvend = {' co':'Company'} + # Strip out commas + if "," in business_name: + business_name = business_name.replace(",", " ") + # Strip spaces on the left + business_name = business_name.lstrip() - ## Industry ## - self.busind = ["Pharmaceutical, Biotechnology", "Engineering"] - self.pharma = [" therap", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "pharmaceuticals", "pharmaceutical", "pharma"] - self.biotech = [" therap", "biopharmaceuticals", "biopharmaceutical", "biopharma", "biopharm", "biotechnology", "biotechnologies", "bioventures", "biolabs", "biosciences", "bioscience", "biotech"] - self.engineering = ["engineer"] - self.education = ["education", "university", "school of", "academy"] + #Strip spaces on the right + business_name = business_name.strip() - def masterlist(self): - mlist = self.albania + self.argentina + self.australia + self.austria + self.belarus + self.belgium + self.bosherz + self.bulgaria + self.brazil + self.cambodia + self.canada + self.chile + self.columbia + self.croatia + self.czech + self.denmark + self.domrep + self.ecuador + self.egypt + self.estonia + self.finland + self.france + self.germany + self.greece + self.guatemala + self.haiti + self.hongkong + self.hungary + self.iceland + self.india + self.indonesia + self.ireland + self.israel + self.italy + self.latvia + self.lebanon + self.lithuania + self.luxemborg + self.macedonia + self.malaysia + self.mexico + self.mongolia + self.netherlands + self.newzealand + self.nigeria + self.norway + self.oman + self.pakistan + self.peru + self.philippines + self.poland + self.portugal + self.romania + self.russia + self.serbia + self.singapore + self.slovokia + self.slovenia + self.spain + self.sweden + self.switzerland + self.turkey + self.ukraine + self.uk + self.usa + self.uzbekistan - mlist = list(OrderedDict.fromkeys(mlist)) - mlist.sort(key=len, reverse=True) - - mlist2 = [] - for item in mlist: - mlist2.append(" " + item) + # Get rid of misc spaces in between + business_name = " ".join(business_name.split()) - return mlist + return business_name - def type(self): - - type_set = [] - - # Limited Partnership - for item in self.lp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Partnership") - break - - # Professional Limited Liability Company - PLLC - for item in self.pllc: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Professional Limited Liability Company") - break - - # Limited Liability Limited Partnership - for item in self.lllp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Limited Partnership") - break - - # Corporation - for item in self.corporation: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Corporation") - break - - # General Partnership - for item in self.gp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("General Partnership") - break - - # Limited Liability Company - for item in self.llc: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Company") - break - - # Limited Liability Partnership - for item in self.llp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Liability Partnership") - break - - # Limited Company -- LTD - for item in self.ltd: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Limited Company") - break - - # Professional Corporation - for item in self.pc_comma: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Professional Corporation") - break - - # No Liability - Aus - for item in self.nl: - if (((self.corpname).lower()).endswith(item)): - type_set.append("No Liability") - break - - # Sole Proprietorship - for item in self.sp: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Sole Proprietorship") - break - - # Joint Venture - for item in self.jv: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Sole Proprietorship") - break - - # Non-Profit - for item in self.np: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Non-Profit") - break - - # Joint Stock - Unlimited - for item in self.js: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Joint Stock") - break - - # Mutual Fund - for item in self.mf: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Mutual Fund") - break - - if type_set == []: - return None - else: - return type_set + def end_strip(self, a_set): - - def industry(self): - - industry_set = [] + end_set = [] - # Pharmaceutical - for item in self.pharma: - if item in (self.corpname).lower(): - industry_set.append("Pharmaceutical") - break - - # Biotechnology - for item in self.biotech: - if item in (self.corpname).lower(): - industry_set.append("Biotechnology") - break - - # Education - for item in self.education: - if item in (self.corpname).lower(): - industry_set.append("Education") - break - - # Engineering - for item in self.engineering: - if item in (self.corpname).lower(): - industry_set.append("Engineering") - break - - if industry_set == []: - return None - else: - return industry_set + business_name = self.business_name + business_name = self.string_stripper(business_name) - def country(self): - - country_set = [] - - # Albania - for item in self.albania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Albania") - break - - # Argentina - for item in self.argentina: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Argentina") - break - - # Australia - for item in self.australia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Australia") - break - - # Austria - for item in self.austria: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Austria") - break - - # Belarus - for item in self.belarus: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Belarus") - break - - # Belgium - for item in self.belgium: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Belgium") - break - - # Bosnia / "Herzegovina" - for item in self.bosherz: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Bosnia") - country_set.append("Herzegovina") - break - - # Brazil - for item in self.brazil: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Brazil") - break - - # Bulgaria - for item in self.bulgaria: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Bulgaria") - break - - # Cambodia - for item in self.cambodia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Cambodia") - break - - # Canada - for item in self.canada: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Canada") - break - - # Chile - for item in self.chile: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Chile") - break - - # Columbia - for item in self.columbia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Columbia") - break - - # Croatia - for item in self.croatia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Croatia") - break - - # Czech Republic - for item in self.czech: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Czech Republic") - break - - # Denmark - for item in self.denmark: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Denmark") - break - - # Dominican Republic - for item in self.domrep: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Dominican Republic") - break - - # Ecuador - for item in self.ecuador: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ecuador") - break - - # Egypt - for item in self.egypt: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Egypt") - break - - # Estonia - for item in self.estonia: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Estonia") - break - - # Finland - for item in self.finland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Finland") - break - - # France - for item in self.france: - if (((self.corpname).lower()).endswith(item)): - country_set.append("France") - break - - # Germany - for item in self.germany: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Germany") - break - - # Greece - for item in self.greece: - if (((self.corpname).lower()).endswith(item)): - type_set.append("Greece") - break - - # Guatemala - for item in self.guatemala: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Guatemala") - break - - # Haiti - for item in self.haiti: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Haiti") - break - - # Hong Kong - for item in self.hongkong: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Hong Kong") - break - - # Hungary - for item in self.hungary: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Hungary") - break - - # Iceland - for item in self.iceland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Iceland") - break - - # India - for item in self.india: - if (((self.corpname).lower()).endswith(item)): - country_set.append("India") - break - - # Indonesia - for item in self.indonesia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Indonesia") - break - - # Ireland - for item in self.ireland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ireland") - break - - # Israel - for item in self.israel: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Israel") - break - - # Italy - for item in self.italy: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Italy") - break - - # Latvia - for item in self.latvia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Latvia") - break - - # Lebanon - for item in self.lebanon: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Lebanon") - break - - # Lithuania - for item in self.lithuania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Lithuania") - break - - # Luxemborg - for item in self.luxemborg: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Luxemborg") - break - - # Macedonia - for item in self.macedonia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Macedonia") - break - - # Malaysia - for item in self.malaysia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Malaysia") - break - - # Mexico - for item in self.mexico: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Mexico") - break - - # Mongolia - for item in self.mongolia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Mongolia") - break - - # Netherlands - for item in self.netherlands: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Netherlands") - break - - # New Zealand - for item in self.newzealand: - if (((self.corpname).lower()).endswith(item)): - country_set.append("New Zealand") - break - - # Nigeria - for item in self.nigeria: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Nigeria") - break - - # Norway - for item in self.norway: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Norway") - break - - # Oman - for item in self.oman: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Oman") - break - - # Pakistan - for item in self.pakistan: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Pakistan") - break - - # Peru - for item in self.peru: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Peru") - break - - # Philippines - for item in self.philippines: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Philippines") - break - - # Poland - for item in self.poland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Poland") - break - - # Portugal - for item in self.portugal: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Portugal") - break - - # Romania - for item in self.romania: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Romania") - break - - # Russia - for item in self.russia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Russia") - break - - # Serbia - for item in self.serbia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Serbia") - break - - # Singapore - for item in self.singapore: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Singapore") - break - - # Slovokia - for item in self.slovokia: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Slovokia") - break - - # Spain - for item in self.spain: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Spain") - break - - # Sweden - for item in self.sweden: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Sweden") - break - - # Switzerland - for item in self.switzerland: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Switzerland") - break - - # Turkey - for item in self.turkey: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Turkey") - break - - # Ukraine - for item in self.ukraine: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Ukraine") - break - - # United Kingdom - for item in self.uk: - if (((self.corpname).lower()).endswith(item)): - country_set.append("United Kingdom") - break - - # United States - for item in self.usa: - if (((self.corpname).lower()).endswith(item)): - country_set.append("United States") - break - - # Uzbekistan - for item in self.uzbekistan: - if (((self.corpname).lower()).endswith(item)): - country_set.append("Uzbekistan") - break - - if country_set == []: - return None + for key, suffix in a_set: + suffix = " " + suffix + if ((business_name.lower()).endswith(suffix)): + end_set.append(key) + + end_set = list(OrderedDict.fromkeys(end_set)) + + if end_set != []: + return end_set else: - return country_set + return None - def spacecleanup(self, corpname): + def industry(self): + + industry_set = [] - corpname = corpname.lstrip() - corpname = corpname.strip() - corpname = " ".join(corpname.split()) + business_name = self.business_name - return corpname + for industry, keyword in self.industry_list: + if keyword in business_name.lower(): + industry_set.append(industry) + industry_set = list(OrderedDict.fromkeys(industry_set)) + + if industry_set != []: + return industry_set + else: + return None # A clean version of the business name def cleanname(self): - corpname = self.corpname + business_name = self.business_name - # Get rid of country items: - for item in self.masterlist(): - if ((corpname.lower()).endswith(item)): - start = (corpname.lower()).find(item) - end = len(item) - end = end * -1 - corpname = corpname[0:end] - corpname = corpname.lstrip() - corpname = corpname.strip() - corpname = " ".join(corpname.split()) + # Get rid of everything in parenthesis + if " (" and ")" in business_name: + beginpar = business_name.find(" (") + endpar = business_name.find(")") + business_name = business_name.replace(business_name[beginpar:endpar+1],"") + + # Get rid of everything after hyphen and spaces + if " - " in business_name: + corplen = len(business_name) + hypenloc = business_name.find(" - ") + business_name = business_name.replace(business_name[hypenloc:corplen],"") # Abbrv. cleanup for abbv in self.abbv: - if abbv in corpname.lower(): - start = (corpname.lower()).find(abbv) - end = len(corpname) - corpname = corpname[0:start] + self.abbv[abbv] + corpname[end+1:len(corpname)] - - # Strip out commas - if "," in corpname: - corpname = corpname.replace(",", " ") - - # Strip spaces on the left - corpname = corpname.lstrip() + if abbv in business_name.lower(): + start = (business_name.lower()).find(abbv) + end = len(business_name) + business_name = business_name[0:start] + self.abbv[abbv] + business_name[end+1:len(business_name)] - #Strip spaces on the right - corpname = corpname.strip() - - # Get rid of misc spaces in between - corpname = " ".join(corpname.split()) + # Replace single hyphen with space + if "-" in business_name: + corplen = len(business_name) + hypenloc = business_name.find("-") + business_name = business_name.replace("-"," ") - return corpname + # Get rid of country items: + for item in self.suffix_sort: + if ((business_name.lower()).endswith(item)): + start = (business_name.lower()).find(item) + end = len(item) + end = end * -1 + business_name = business_name[0:end] + business_name = self.string_stripper(business_name) + business_name = self.string_stripper(business_name) - # A short version of the corporate name - def shortname(self): + return business_name - corpname = self.cleanname() - - # Get rid of everything in parenthesis - if " (" and ")" in corpname: - beginpar = corpname.find(" (") - endpar = corpname.find(")") - corpname = corpname.replace(corpname[beginpar:endpar+1],"") + def cleaner(self): - # Get rid of everything after hyphen and spaces - if " - " in corpname: - corplen = len(corpname) - hypenloc = corpname.find(" - ") - corpname = corpname.replace(corpname[hypenloc:corplen],"") - - # Replace single hyphen with space - if "-" in corpname: - corplen = len(corpname) - hypenloc = corpname.find("-") - corpname = corpname.replace("-"," ") + self.clean_name = self.cleanname() + self.industry = self.industry() + self.type = self.end_strip(self.sorted_types) + self.country = self.end_strip(self.sorted_countries) - # Get rid of misc spaces in between - corpname = " ".join(corpname.split()) - - # Strip spaces on the left - corpname = corpname.lstrip() - - #Strip spaces on the right - corpname = corpname.strip() - - return corpname \ No newline at end of file + return self \ No newline at end of file diff --git a/cleanco.pyc b/cleanco.pyc index 49cffae..894067e 100644 Binary files a/cleanco.pyc and b/cleanco.pyc differ diff --git a/cleancotest.py b/cleancotest.py index c614859..acd5bd6 100644 --- a/cleancotest.py +++ b/cleancotest.py @@ -1,20 +1,15 @@ from cleanco import cleanco -companyname = "GlaxoSmithKline plc." -cleanco = cleanco(companyname) +business_name = "Merck Pharmaceutials - Corporate (formerly Not Merck) LLC" -bustype = cleanco.type() -busind = cleanco.industry() -cleanname = cleanco.cleanname() -short = cleanco.shortname() -country = cleanco.country() +processing = cleanco(business_name) +x = processing.cleaner() print -print("String: %s") % companyname +print business_name print -print("Clean Name: %s") % cleanname -print("Short Name: %s") % short -print("Possible Business Types: %s") % bustype -print("Possible Industry: %s") % busind -print("Possible Country: %s") % country +print("Clean Name: %s") % (x.clean_name) +print("Possible Industries: %s") % (x.industry) +print("Possible Business Types: %s") % (x.type) +print("Possible Countries: %s") % (x.country) print \ No newline at end of file diff --git a/readme.md b/readme.md index f044574..dcaadf1 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,4 @@ -#cleanco +#cleanco - 1.0 ## What is it / what does it do? This is a Python module that processes company names. @@ -8,9 +8,6 @@ Download it from this site and unzip the directory. * Mac: `cd` into it, and enter `sudo python setup.py install` along with your system password. * Windows: Same thing but without `sudo`. -* Linux: ??? - -There may also be a way to do `pip install` but I can't guarantee this. ## How does the module work? Let's look at some sample code. First, initialize the module: @@ -19,37 +16,32 @@ Let's look at some sample code. First, initialize the module: Now, come up with a company name that you want to process: - >>> companyname = "Paul Pharmaceutical, Inc." + >>> business_name = "Some Big Pharma, LLC" Throw it into the module: - >>> processing = cleanco(companyname) + >>> processing = cleanco(business_name) + >>> x = processing.cleaner() You can now get the company types: - >>> cotype = processing.type() - >>> print cotype - ['Corporation'] + >>> x.type + ['Limited Liability Company'] ...the possible countries... - >>> country = processing.country() - >>> print country - ['Philippines', 'United States'] + >>> x.country + ['United States of America', 'Philippines'] ...the possible industries... - >>> industry = processing.industry() - >>> print industry + >>> x.industry ['Pharmaceutical'] ...and a clean version of the company name. - >>> clean = processing.cleanname() - >>> print clean - Paul Pharmaceutical - -There is also a short version of the company name for times when you want to remove things in parenthesis or everything after a hyphen. You can access this with `.shortname()`. + >>> x.clean_name + 'Some Big Pharma' ## Are there bugs? You better believe it. Please let me know or fork this project. I'm sure some of the company suffixes are way incorrect and I'm missing a lot more information. diff --git a/setup.py b/setup.py index 758e6a6..80424e2 100755 --- a/setup.py +++ b/setup.py @@ -2,6 +2,6 @@ from distutils.core import setup setup(name='cleanco', - version='0.4', + version='1.0', py_modules=['cleanco'], ) \ No newline at end of file