From 8e3725501bb32742e383765e37cdc89530517300 Mon Sep 17 00:00:00 2001 From: "mingang.he" Date: Mon, 2 Mar 2015 17:04:27 +0800 Subject: [PATCH 1/3] remove trailing spaces --- parse_domain.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parse_domain.py b/parse_domain.py index 405b72c..bcec43c 100644 --- a/parse_domain.py +++ b/parse_domain.py @@ -8,33 +8,33 @@ def parse_domain(url, levels=2): """ if levels < 1 or not url: return None - + # Parse the hostname from the url parsed = urlparse(url) hostname = getattr(parsed,'netloc',url) - + partial_domains = [] partial_domain = "" for section in reversed(hostname.split(".")): partial_domain = "." + section + partial_domain partial_domains.append(partial_domain) - + # Find the longest matching TLD, recording its index tld_idx = 0 for idx, item in enumerate(partial_domains): if item in tlds: tld_idx = idx - + # Add the desired number of levels to the tld index, # counting the TLD itself as the first level try: domain = partial_domains[tld_idx + levels - 1] except IndexError: domain = partial_domains[-1] - + # Remove the initial dot return domain[1:] - + tlds = set(( '.2000.hu', '.ab.ca', From 27eafb4ede616c285f71863353fd7cfc801b96b2 Mon Sep 17 00:00:00 2001 From: "mingang.he" Date: Mon, 2 Mar 2015 17:31:56 +0800 Subject: [PATCH 2/3] fix #2: replace 'netloc' to 'hostname' --- parse_domain.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parse_domain.py b/parse_domain.py index bcec43c..f86dae6 100644 --- a/parse_domain.py +++ b/parse_domain.py @@ -11,10 +11,12 @@ def parse_domain(url, levels=2): # Parse the hostname from the url parsed = urlparse(url) - hostname = getattr(parsed,'netloc',url) + hostname = getattr(parsed,'hostname',url) partial_domains = [] partial_domain = "" + if hostname is None: + return None for section in reversed(hostname.split(".")): partial_domain = "." + section + partial_domain partial_domains.append(partial_domain) From 81f00751c92dd647a0f3f2ee31e1f7a6e3416fc9 Mon Sep 17 00:00:00 2001 From: "mingang.he" Date: Mon, 2 Mar 2015 17:36:55 +0800 Subject: [PATCH 3/3] add example --- parse_domain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parse_domain.py b/parse_domain.py index f86dae6..b60be4b 100644 --- a/parse_domain.py +++ b/parse_domain.py @@ -1578,7 +1578,7 @@ def parse_domain(url, levels=2): if __name__=="__main__": - for item in ['http://something.unknown','http://a.b.c.something.unknown','http://something','http://google.com','http://a.b.c.d.e.google.com','http://something.uk','http://aa.bb.cc.dd.ee.ff.gg.guardian.co.uk','http://guardian.co.uk','http://www.guardian.co.uk','http://www.google.com','http://wikipedia.org','http://www.wikipedia.org','',None,'whatever']: + for item in ['http://something.unknown','http://a.b.c.something.unknown','http://something','http://google.com','http://a.b.c.d.e.google.com','http://something.uk','http://aa.bb.cc.dd.ee.ff.gg.guardian.co.uk','http://guardian.co.uk','http://www.guardian.co.uk','http://www.google.com','http://wikipedia.org','http://www.wikipedia.org','http://www.example.com:1234','',None,'whatever']: for level in range(0,5): print "url=%s, levels=%s => %s" % (item, level, parse_domain(item,level))