# coding=utf-8

import re

# One SGML/XML/HTML markup token: a comment, or an opening, closing,
# self-closing, declaration ("<!...") or processing-instruction ("<?...") tag.
# Plain raw string: the "ur" prefix is a syntax error on Python 3, and the
# pattern is pure ASCII, so this is equivalent on Python 2.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive (name may not start with a digit)
    (                           # Quoted attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    [^<>'"]*                    # Unquoted remainder: spaces, valueless or unquoted
                                #   attributes, directive text, trailing / or ?
    >                           # +End of tag/directive
"""
# DOTALL lets a comment span several lines; UNICODE makes \w cover non-ASCII
# letters in tag names on Python 2.
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)

# A run of one or more whitespace characters.
# Plain raw string ("ur" is a syntax error on Python 3). re.UNICODE makes \s
# cover non-ASCII whitespace such as U+00A0 on Python 2 as well, matching the
# whitespace definition that NUMBER's (?<!\S) lookbehind already uses.
WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(WHITESPACE, re.UNICODE)

# Dotted host name ending in an alphabetic top-level label of at least two
# letters. Shared by URL and EMAIL; contributes two capture groups.
# All literals here are plain raw strings: "ur" is a syntax error on
# Python 3 and the patterns are pure ASCII.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL with an explicit ftp(s)/http(s)/file scheme, or a bare "www." host.
# Every fragment is a raw string so that \. \w and \$ reach the regex engine
# unchanged instead of relying on unknown string escapes being preserved.
URL = r"""
    (
    # scheme://[user[:password]@]
    (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
    # or "www" without the scheme part
    |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
    | localhost |
    ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# Local part (letters, digits, "-", ".", "_", "'") followed by "@" and a DNS host.
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)

# HTML character reference: numeric ("&#65;", "&#x3c;", "&#X3C;") or named
# ("&amp;"). Hex digits and the "x" marker are accepted in either case.
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
HTMLENTITY = r"&(#[xX]?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)

# "#" followed by an ASCII letter and at least one more word character, not
# directly preceded by a word character.
# Plain raw string ("ur" is a syntax error on Python 3). re.UNICODE keeps a
# tag such as "#björn" in one piece on Python 2, where \w is otherwise
# ASCII-only.
HASHTAG = r"(?<!\w)#[A-Za-z]\w+"
HASHTAG_RE = re.compile(HASHTAG, re.UNICODE)

# Bare domain name: one or two dotted labels followed by one of a few known
# top-level domains, not embedded in a longer word.
# Plain raw string ("ur" is a syntax error on Python 3). re.UNICODE makes the
# \w lookarounds treat non-ASCII letters as word characters on Python 2 too,
# so a match cannot begin in the middle of a word after such a letter.
DOTCOM = r"""
(?<!\w)
    ([-a-z0-9]+\.){1,2}(com|org|eu|no|nu)
(?!\w)
"""
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE | re.UNICODE)

# Known abbreviations, written as plain text; regex escaping is applied when
# the pattern is assembled below. Entries are separated by whitespace.
_ABBREVIATIONS = (
    # General
    u"co. inc. ltd. dr. prof. jr. "
    # http://en.wiktionary.org/wiki/Category:Swedish_abbreviations
    u"ack. adj. adv. amer. anat. anv. Apg. arab. aram. arkeol. arkit. astr. "
    u"bankv. bet. betyd. bibl. bildl. biol. bl.a. bokf. boktr. bot. "
    u"d. d.v.s. d.y. d.ä. da. data. cont. dets. dial. dim. Dr. dvs. "
    u"e.d. e.dyl. e.Kr. e.m. eg. ekon. el. eng. etc. ev. ex. exkl. "
    u"f. f.d. f.Kr. f.m. f.v.t. fam. fem. fig. fil. filos. fonet. forneng. "
    u"fornfra. fornhögty. fr. fr.o.m. fra. fsv. fys. förk. "
    u"geogr. geol. geom. germ. got. grek. hand. hebr. hist. holl. "
    u"ibl. imperf. inf. ink. inkl. inst. interj. it. jap. jmf. jur. "
    u"kem. kl. komp. konst. l. lat. litt. log. "
    u"m.fl. m.m. mask. mat. med. medeleng. medelholl. medelhögty. "
    u"medellågty. medeltidslat. meteor. mil. miner. mus. myt. "
    u"N.N. neds. neutr. no. nr. o.d. o.dyl. o.s.v. oböjl. omkr. osv. "
    u"p.g.a. p.m.s. p.s.s. part. pedag. perf.part. pers. plur. polit. port. "
    u"prep. pres.part. pron. psykol. real. resp. runsv. ry. "
    u"s.a.s. s.k. s.ö. senlat. sing. sjö. skämts. sl. spa. sport. språkv. "
    u"subst. särsk. t. t.ex. t.o.m. tekn. teol. tex. tr. ty. "
    u"v.t. vanl. vard. vers. vulgärlat. y. zool. ä. äv. åld. "
    # https://en.wiktionary.org/wiki/Category:Norwegian_abbreviations
    u"adr. ans. bm. bto. d.d. d.m. d.y. div. dvs. e.Kr. eg. el. f.eks. m.m. "
    u"ma. mva. n.å. ndf. nkr nov. nr. nto. nyno. osv. o.s.v. pga. rv. "
).split()
# Swedish ordinals 1:a ... 31:a and 1:e ... 31:e.
_ABBREVIATIONS.extend(
    u"%d:%s" % (number, suffix) for suffix in u"ae" for number in range(1, 32)
)
# Contains a space, so it cannot be part of the whitespace-separated list.
_ABBREVIATIONS.append(u"t. ex.")

# One alternation of all abbreviations, not embedded in a longer word.
# Alternatives are ordered longest first so that "t. ex." is tried before
# its prefix "t."; re.escape keeps the inner space significant under
# re.VERBOSE. Built from u"" literals instead of a "ur" literal, which is a
# syntax error on Python 3.
ABBREVIATION = (
    u"(?<!\\w)(?:"
    + u"|".join(
        re.escape(abbreviation)
        for abbreviation in sorted(
            set(_ABBREVIATIONS), key=lambda text: (-len(text), text)
        )
    )
    + u")(?!\\w)"
)
ABBREVIATION_RE = re.compile(ABBREVIATION, re.UNICODE | re.VERBOSE | re.IGNORECASE)

# Number preceded by whitespace or the start of the text: optional sign,
# digits with embedded "," / "." separators, and an optional exponent whose
# sign is mandatory. Must not be followed by "-" or a word character.
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
NUMBER = r"""
(?<!\S)
    [-+]?
    (\d[\d,.]*\d | \d)
    ([eE][-+][0-9]+)?
(?![-\w])
"""
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)

# Dotted initialism such as "U.S.A.": two or more "<capital letter>." pairs,
# not preceded by a word character and not followed by a word character or
# another ".".
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
USA = r"""
(?<!\w)
    ([A-Z]\.){2,}
(?![\w.])
"""
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)

# Word: word characters with optional inner hyphens. It must start and end
# with a word character; a single word character also counts.
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
WORD = r"\w[\w-]*\w|\w"
WORD_RE = re.compile(WORD, re.UNICODE)

# A run of "?" / "!" characters, or two consecutive apostrophes.
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)

# Exactly one bracket-like or quotation-mark character.
# Written as a normal (non-raw) unicode literal so the \uXXXX escapes become
# the characters themselves on both Python 2 and Python 3. "[" and "]" are
# spelled with a regex escape: in the former "ur" literal, \u005d was decoded
# to a bare "]" that closed the character class after its fourth member.
SINGLECHAR_PUNCTUATION = (
    u"["
    u"(\\[{)\\]}"
    u"\u2985\u2989\u3008\u298d\u300c\u2991\u3010\u2995"
    u"\u3014\u2018\u169b\u201c\xab\u23b5\xbb\u0f3a"
    u"\ufd3e\u29d9\u27e9\u276a\u276e\u2772\u29fd\u2986"
    u"\u300b\u298a\u300f\u298e\u2992\u3017\u3018\u301b"
    u"\u169c\u301f\u0f3d\u29da\u27e6\u2769\u27ea\u276d"
    u"\u2771\u2775\u2983\u2987\u298b\u300a\u298f\u300e"
    u"\u2993\u2997\u3016\u301a\u301e\u203a\u0f3c\u2046"
    u"\u29db\u27e7\u2768\u27eb\u276c\u2770\u2774\u3019"
    u"\u2984\u3009\u2988\u2996\u300d\u298c\u3011\u2990"
    u"\u3015\u2994\u2019\u2998\u201d\u301d\u23b4\u2039"
    u"\u0f3b\ufd3f\u2045\u29d8\u27e8\u276b\u276f\u2773"
    u"\u29fc"
    u"]"
)
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)

# Any single character (except a newline, since DOTALL is not set) together
# with its immediate repetitions, e.g. "..." or "---".
# Plain raw string: "ur" is a syntax error on Python 3; the pattern is ASCII.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)

# (token class name, compiled regex) pairs for every pattern defined above.
# NOTE(review): the order looks significant. Presumably a consumer tries the
# entries first to last, so specific patterns precede general ones and
# ANY_SEQUENCE, which accepts any non-newline character, comes last.
# Confirm against the caller before reordering.
re_list = [
    ('SGML_TAG', SGML_TAG_RE),
    ('WHITESPACE', WHITESPACE_RE),
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('HASHTAG', HASHTAG_RE),
    ('DOTCOM', DOTCOM_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('NUMBER', NUMBER_RE),
    ('USA', USA_RE),
    ('WORD', WORD_RE),
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    # Catch-all for anything the patterns above did not claim.
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]
