diff --git a/parse_domain.py b/parse_domain.py
index 405b72c..b60be4b 100644
--- a/parse_domain.py
+++ b/parse_domain.py
@@ -8,33 +8,35 @@ def parse_domain(url, levels=2):
     """
     if levels < 1 or not url:
         return None
-    
+
     # Parse the hostname from the url
     parsed = urlparse(url)
-    hostname = getattr(parsed,'netloc',url)
-    
+    hostname = getattr(parsed,'hostname',url)
+
     partial_domains = []
     partial_domain = ""
+    if hostname is None:
+        return None
     for section in reversed(hostname.split(".")):
         partial_domain = "." + section + partial_domain
         partial_domains.append(partial_domain)
-    
+
     # Find the longest matching TLD, recording its index
     tld_idx = 0
     for idx, item in enumerate(partial_domains):
         if item in tlds:
             tld_idx = idx
-    
+
     # Add the desired number of levels to the tld index,
     # counting the TLD itself as the first level
     try:
         domain = partial_domains[tld_idx + levels - 1]
     except IndexError:
         domain = partial_domains[-1]
-    
+
     # Remove the initial dot
     return domain[1:]
-    
+
 tlds = set((
     '.2000.hu',
     '.ab.ca',
@@ -1576,7 +1578,7 @@ def parse_domain(url, levels=2):
 
 
 if __name__=="__main__":
-    for item in ['http://something.unknown','http://a.b.c.something.unknown','http://something','http://google.com','http://a.b.c.d.e.google.com','http://something.uk','http://aa.bb.cc.dd.ee.ff.gg.guardian.co.uk','http://guardian.co.uk','http://www.guardian.co.uk','http://www.google.com','http://wikipedia.org','http://www.wikipedia.org','',None,'whatever']:
+    for item in ['http://something.unknown','http://a.b.c.something.unknown','http://something','http://google.com','http://a.b.c.d.e.google.com','http://something.uk','http://aa.bb.cc.dd.ee.ff.gg.guardian.co.uk','http://guardian.co.uk','http://www.guardian.co.uk','http://www.google.com','http://wikipedia.org','http://www.wikipedia.org','http://www.example.com:1234','',None,'whatever']:
         for level in range(0,5):
            print "url=%s, levels=%s => %s" % (item, level, parse_domain(item,level))
 