diff --git a/README.rst b/README.rst index f673e0c..a6c015f 100644 --- a/README.rst +++ b/README.rst @@ -147,6 +147,11 @@ an invalid TLD will return None.:: >>> psl.get_public_suffix('www.mine.local', strict=True) is None True +Keep in mind that 'valid' is determined by the list that the PSL object is created from, and is not necessarily the +Mozilla list. You can create a version with your own lists, for example, using only the ICANN public suffixes, or +ICANN TLDs, or some other custom list. In all cases, strict=True will verify that the rightmost label of the lookup +item is contained in the list. + **Return eTLD only.** The standard use case for publicsuffix2 is to return the registrable, or base, domain according to the public suffix list. In some cases, however, we only wish to find the eTLD @@ -162,6 +167,34 @@ All of the methods and functions include the wildcard and strict parameters. For convenience, the public method get_sld() is available. This is identical to the method get_public_suffix() and is intended to clarify the output for some users. +**Accelerated Functions** + +If your data is already normalized to lowercase with leading and trailing dots (e.g., 'com.' or '.com') removed, you can use +functions that will perform much faster by avoiding these element-wise operations on the input data.:: + + psl.get_sld_unsafe('www.google.com') + 'google.com' + psl.get_tld_unsafe('www.google.co.uk') + 'co.uk' + +**Edge Case Processing** + +Domain names occur in many sources and varieties, including use cases from URLs, DNS data, and emails. All of these, +particularly at large volume, may have improperly formatted data and noise. In order to provide backward compatibility +across these cases, the user is left to deal with some edge cases in their own data. The library chooses to lowercase +and remove trailing and leading dots from domains. This means that inputs that are technically invalid DNS domains +will generally return a value. 
For example, empty labels are invalid in DNS and thus '..google.com' is not a valid +domain; however, this library will return a value:: + + psl.get_sld('..google.com') + 'google.com' + +Users who wish to remove invalid DNS labels will need to clean their data prior to using the library functions. +However, domains that contain empty labels in the middle will return None.:: + + psl.get_sld('google..com') == None + True + To **update the bundled suffix list** use the provided setup.py command:: python setup.py update_psl diff --git a/UseCases.md b/UseCases.md index 9a67023..91d1258 100644 --- a/UseCases.md +++ b/UseCases.md @@ -10,14 +10,15 @@ For adhoc usage, the original functions are sufficient. 1. general difference of get_*() and get_*_unsafe() methods: get_*_unsafe() does not perform if the input string is None and does not -transforms it to the lower case. +transform it to the lower case or strip leading/trailing '.'. 2. The listed above methods works only with non-canonical FQDN strings - trailing dot must be removed before call the method. This restriction allows get rid of fuzzy logic in edge cases. 3. DNS does not support empty labels - if some label detected to be empty, -None will be returned. +None will be returned. Leading and trailing dots are stripped from +the string to avoid unintentional empty labels. 4. Every method processes provided FQDN in the reverse order, from the last label towards the start of the string. It stops when the specific task is @@ -49,7 +50,7 @@ The behavior of the library can be illustrated best on the small examples: | 'abc' | true | | None | 'abc' not in the list | | '.abc' | false | | 'abc' | non-strict mode, the last label is TLD | | '.abc' | true | | None | 'abc' not in the list | -| 'abc.' | | | None | empty labels are not allowed | +| 'abc.' 
| | | 'abc' | trailing dot is stripped | | '....abc' | false | | 'abc' | non-strict mode, string head is not processed| | '....abc' | true | | None | 'abc' not in the list | | 'example.abc' | false | | 'abc' | non-strict mode, the last label is TLD | @@ -69,13 +70,15 @@ The behavior of the library can be illustrated best on the small examples: | '.abc' | false | | 'abc' | non-strict mode | | '.abc' | true | | None | not in the list | | '.com' | | | 'com' | allowed TLD | -| 'abc.' | | | None | empty labels are not allowed | +| 'abc.' | | | 'abc' | trailing dots removed | | '....abc' | false | | 'abc' | non-strict mode, string head is not processed| | '....abc' | true | | None | not in the list | | '....com' | | | 'com' | allowed TLD, string head is not processed| | 'example.abc' | false | | 'abc' | non-strict mode, the last label is TLD | | 'example.abc' | true | | None | 'abc' not in the list | | 'example.com' | | | 'com' | allowed TDL | +| 'example..com' | | | None | empty labels are not allowed | + ### Simple case, negation, no wildcards (['com', '!org']) @@ -93,9 +96,6 @@ The behavior of the library can be illustrated best on the small examples: | '.abc' | true | | None | not in the list | | '.com' | | | 'com' | allowed TLD | | '.org' | | | None | not allowed TLD | -| 'abc.' | | | None | empty labels are not allowed | -| 'com.' | | | None | empty labels are not allowed | -| 'org.' 
| | | None | empty labels are not allowed | | '....abc' | false | | 'abc' | non-strict mode, string head is not processed| | '....abc' | true | | None | not in the list | | '....com' | | | 'com' | allowed TLD, string head is not processed| diff --git a/src/publicsuffix2/__init__.py b/src/publicsuffix2/__init__.py index 14dba6b..f1008a5 100644 --- a/src/publicsuffix2/__init__.py +++ b/src/publicsuffix2/__init__.py @@ -260,7 +260,8 @@ def get_sld(self, domain, wildcard=True, strict=False): """ if not domain or len(domain) == 0: return None - domain = domain.lower() + domain = domain.lower().strip('.') + return self.get_sld_unsafe(domain, wildcard, strict) def get_sld_unsafe(self, domain, wildcard=True, strict=False): @@ -322,7 +323,7 @@ def get_tld(self, domain, wildcard=True, strict=False): """ if domain is None: return None - domain = domain.lower() + domain = domain.lower().strip('.') return self.get_tld_unsafe(domain, wildcard, strict) @@ -387,7 +388,7 @@ def get_tld_unsafe(self, domain, wildcard=True, strict=False): tld = domain[tld_start + 1:] if tld_start is not None else None return tld or None # empty string -> None - def get_components(self, domain: str, wildcard=True, strict=False) -> (str, str, str): + def get_components(self, domain, wildcard=True, strict=False): """ Returns 3-tuple of components of the domain name: (prefix, SLL, TLD/eTLD) where @@ -432,10 +433,10 @@ class is crafted for use in bulk-processors (such as pandas), therefore it """ if not domain or len(domain) == 0: return None, None, None - domain = domain.lower() + domain = domain.lower().strip('.') return self.get_components_unsafe(domain, wildcard, strict) - def get_components_unsafe(self, domain: str, wildcard=True, strict=False) -> (str, str, str): + def get_components_unsafe(self, domain, wildcard=True, strict=False): """ This is unsafe method that does not checks if the domain is None. Also it does not perform conversion of the domain into lowercase. 
diff --git a/tests.py b/tests.py index f65b25a..d1167f4 100644 --- a/tests.py +++ b/tests.py @@ -49,8 +49,8 @@ def test_get_sld_from_empty_list(self): psl = publicsuffix.PublicSuffixList([]) assert 'com' == psl.get_sld('com') assert 'com' == psl.get_sld('COM') - # '.com' -> . -> None, empty labels are not allowed - assert None == psl.get_sld('.com') + # '.com' -> -> 'com' + assert 'com' == psl.get_sld('.com') # 'a.example.com', strict=False -> .. -> 'example.com' assert 'example.com' == psl.get_sld('a.example.com') @@ -81,8 +81,7 @@ def test_get_sld_from_list_with_exception_rule(self): def test_get_sld_from_list_with_fqdn(self): psl = publicsuffix.PublicSuffixList(['com']) - # 'example.com.' -> .. -> None, empty labels are not allowed - assert None == psl.get_sld('example.com.') + assert 'example.com' == psl.get_sld('example.com.') def test_get_sld_from_list_with_unicode(self): psl = publicsuffix.PublicSuffixList([u'\u0440\u0444'], idna=False) @@ -107,13 +106,11 @@ def test_get_sld_from_builtin_full_publicsuffix_org_list_with_mixed_case(self): def test_get_sld_from_builtin_full_publicsuffix_org_list_with_leading_dot(self): psl = publicsuffix.PublicSuffixList(None) - # '.com' -> . -> None, empty labels are not allowed - assert None == psl.get_sld('.com') - # '.example' -> . 
-> None, empty labels are not allowed - assert None == psl.get_sld('.example') assert 'example.com' == psl.get_sld('.example.com') # note: non-strict mode: TLD 'example' -> SLD example.example assert 'example.example' == psl.get_sld('.example.example') + # strict mode + assert None == psl.get_sld('.example.example', strict=True) def test_get_sld_from_builtin_full_publicsuffix_org_list_with_unlisted_tld(self): psl = publicsuffix.PublicSuffixList(None) @@ -252,8 +249,8 @@ def test_get_tld_returns_correct_tld_or_etld(self): def test_get_tld_returns_correct_tld_or_etld_for_fqdn(self): psl = publicsuffix.PublicSuffixList() - # note: empty label or dot on the right side is not allowed - assert None == psl.get_tld('www.foo.com.') + assert 'com' == psl.get_tld('www.foo.com.') + assert 'co.uk' == psl.get_tld('www.foo.co.uk') def test_get_tld_returns_correct_tld_or_etld_for_root_domain(self): psl = publicsuffix.PublicSuffixList() @@ -297,8 +294,7 @@ def test_get_sld_backward_compatibility_sld_for_empty_string(self): def test_get_sld_backward_compatibility_sld_for_fqdn(self): psl = publicsuffix.PublicSuffixList() - # 'www.foo.com.' -> ... -> None, empty labels are not allowed - assert None == psl.get_sld('www.foo.com.') + assert 'foo.com' == psl.get_sld('www.foo.com.') def test_get_sld_backward_compatibility_sld_for_root_domain(self): psl = publicsuffix.PublicSuffixList()