From ea39b04dc5f91f6377d912395fa99b0466fc95af Mon Sep 17 00:00:00 2001 From: elebur Date: Wed, 21 May 2025 21:00:36 +0300 Subject: [PATCH 01/16] feat: add the `_fix_url` function for validating URLs --- ptbtest/entityparser.py | 266 +++++++++++++++++++++++++++ tests/test_EntityParser/test_misc.py | 74 +++++++- 2 files changed, 339 insertions(+), 1 deletion(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 031d1d3..6a91908 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -23,6 +23,7 @@ `Telegram Docs `_ """ import html +import ipaddress import re import string from collections.abc import Sequence @@ -63,6 +64,171 @@ "u", "ins", "tg-spoiler", "tg-emoji", "span", "pre", "code", "blockquote") +COMMON_TLDS = ("aaa", "aarp", "abb", "abbott", "abbvie", "abc", "able", "abogado", + "abudhabi", "ac", "academy", "accenture", "accountant", "accountants", + "aco", "actor", "ad", "ads", "adult", "ae", "aeg", "aero", "aetna", + "af", "afl", "africa", "ag", "agakhan", "agency", "ai", "aig", + "airbus", "airforce", "airtel", "akdn", "al", "alibaba", "alipay", + "allfinanz", "allstate", "ally", "alsace", "alstom", "am", "amazon", + "americanexpress", "americanfamily", "amex", "amfam", "amica", + "amsterdam", "analytics", "android", "anquan", "anz", "ao", "aol", + "apartments", "app", "apple", "aq", "aquarelle", "ar", "arab", + "aramco", "archi", "army", "arpa", "art", "arte", "as", "asda", + "asia", "associates", "at", "athleta", "attorney", "au", "auction", + "audi", "audible", "audio", "auspost", "author", "auto", "autos", "aw", + "aws", "ax", "axa", "az", "azure", "ba", "baby", "baidu", "banamex", + "band", "bank", "bar", "barcelona", "barclaycard", "barclays", "barefoot", + "bargains", "baseball", "basketball", "bauhaus", "bayern", "bb", "bbc", + "bbt", "bbva", "bcg", "bcn", "bd", "be", "beats", "beauty", "beer", + "bentley", "berlin", "best", "bestbuy", "bet", "bf", "bg", "bh", "bharti", + "bi", "bible", "bid", "bike", "bing", "bingo", 
"bio", "biz", "bj", + "black", "blackfriday", "blockbuster", "blog", "bloomberg", "blue", + "bm", "bms", "bmw", "bn", "bnpparibas", "bo", "boats", "boehringer", + "bofa", "bom", "bond", "boo", "book", "booking", "bosch", "bostik", + "boston", "bot", "boutique", "box", "br", "bradesco", "bridgestone", + "broadway", "broker", "brother", "brussels", "bs", "bt", "build", "builders", + "business", "buy", "buzz", "bv", "bw", "by", "bz", "bzh", "ca", "cab", + "cafe", "cal", "call", "calvinklein", "cam", "camera", "camp", "canon", + "capetown", "capital", "capitalone", "car", "caravan", "cards", "care", + "career", "careers", "cars", "casa", "case", "cash", "casino", "cat", + "catering", "catholic", "cba", "cbn", "cbre", "cc", "cd", "center", "ceo", + "cern", "cf", "cfa", "cfd", "cg", "ch", "chanel", "channel", "charity", + "chase", "chat", "cheap", "chintai", "christmas", "chrome", "church", "ci", + "cipriani", "circle", "cisco", "citadel", "citi", "citic", "city", "ck", + "cl", "claims", "cleaning", "click", "clinic", "clinique", "clothing", + "cloud", "club", "clubmed", "cm", "cn", "co", "coach", "codes", "coffee", + "college", "cologne", "com", "commbank", "community", "company", "compare", + "computer", "comsec", "condos", "construction", "consulting", "contact", + "contractors", "cooking", "cool", "coop", "corsica", "country", "coupon", + "coupons", "courses", "cpa", "cr", "credit", "creditcard", "creditunion", + "cricket", "crown", "crs", "cruise", "cruises", "cu", "cuisinella", "cv", + "cw", "cx", "cy", "cymru", "cyou", "cz", "dabur", "dad", "dance", "data", + "date", "dating", "datsun", "day", "dclk", "dds", "de", "deal", "dealer", + "deals", "degree", "delivery", "dell", "deloitte", "delta", "democrat", + "dental", "dentist", "desi", "design", "dev", "dhl", "diamonds", "diet", + "digital", "direct", "directory", "discount", "discover", "dish", "diy", + "dj", "dk", "dm", "dnp", "do", "docs", "doctor", "dog", "domains", "dot", + "download", "drive", "dtv", "dubai", 
"dunlop", "dupont", "durban", "dvag", + "dvr", "dz", "earth", "eat", "ec", "eco", "edeka", "edu", "education", "ee", + "eg", "email", "emerck", "energy", "engineer", "engineering", "enterprises", + "epson", "equipment", "er", "ericsson", "erni", "es", "esq", "estate", "et", + "eu", "eurovision", "eus", "events", "exchange", "expert", "exposed", "express", + "extraspace", "fage", "fail", "fairwinds", "faith", "family", "fan", "fans", + "farm", "farmers", "fashion", "fast", "fedex", "feedback", "ferrari", "ferrero", + "fi", "fidelity", "fido", "film", "final", "finance", "financial", "fire", + "firestone", "firmdale", "fish", "fishing", "fit", "fitness", "fj", "fk", + "flickr", "flights", "flir", "florist", "flowers", "fly", "fm", "fo", "foo", + "food", "football", "ford", "forex", "forsale", "forum", "foundation", "fox", + "fr", "free", "fresenius", "frl", "frogans", "frontier", "ftr", "fujitsu", "fun", + "fund", "furniture", "futbol", "fyi", "ga", "gal", "gallery", "gallo", "gallup", + "game", "games", "gap", "garden", "gay", "gb", "gbiz", "gd", "gdn", "ge", "gea", + "gent", "genting", "george", "gf", "gg", "ggee", "gh", "gi", "gift", "gifts", + "gives", "giving", "gl", "glass", "gle", "global", "globo", "gm", "gmail", "gmbh", + "gmo", "gmx", "gn", "godaddy", "gold", "goldpoint", "golf", "goo", "goodyear", + "goog", "google", "gop", "got", "gov", "gp", "gq", "gr", "grainger", "graphics", + "gratis", "green", "gripe", "grocery", "group", "gs", "gt", "gu", "gucci", "guge", + "guide", "guitars", "guru", "gw", "gy", "hair", "hamburg", "hangout", "haus", + "hbo", "hdfc", "hdfcbank", "health", "healthcare", "help", "helsinki", "here", + "hermes", "hiphop", "hisamitsu", "hitachi", "hiv", "hk", "hkt", "hm", "hn", + "hockey", "holdings", "holiday", "homedepot", "homegoods", "homes", "homesense", + "honda", "horse", "hospital", "host", "hosting", "hot", "hotels", "hotmail", + "house", "how", "hr", "hsbc", "ht", "hu", "hughes", "hyatt", "hyundai", "ibm", + "icbc", "ice", 
"icu", "id", "ie", "ieee", "ifm", "ikano", "il", "im", "imamat", + "imdb", "immo", "immobilien", "in", "inc", "industries", "infiniti", "info", + "ing", "ink", "institute", "insurance", "insure", "int", "international", "intuit", + "investments", "io", "ipiranga", "iq", "ir", "irish", "is", "ismaili", "ist", + "istanbul", "it", "itau", "itv", "jaguar", "java", "jcb", "je", "jeep", "jetzt", + "jewelry", "jio", "jll", "jm", "jmp", "jnj", "jo", "jobs", "joburg", "jot", "joy", + "jp", "jpmorgan", "jprs", "juegos", "juniper", "kaufen", "kddi", "ke", "kerryhotels", + "kerrylogistics", "kerryproperties", "kfh", "kg", "kh", "ki", "kia", "kids", "kim", + "kindle", "kitchen", "kiwi", "km", "kn", "koeln", "komatsu", "kosher", "kp", "kpmg", + "kpn", "kr", "krd", "kred", "kuokgroup", "kw", "ky", "kyoto", "kz", "la", "lacaixa", + "lamborghini", "lamer", "lancaster", "land", "landrover", "lanxess", "lasalle", + "lat", "latino", "latrobe", "law", "lawyer", "lb", "lc", "lds", "lease", "leclerc", + "lefrak", "legal", "lego", "lexus", "lgbt", "li", "lidl", "life", "lifeinsurance", + "lifestyle", "lighting", "like", "lilly", "limited", "limo", "lincoln", "link", + "lipsy", "live", "living", "lk", "llc", "llp", "loan", "loans", "locker", "locus", + "lol", "london", "lotte", "lotto", "love", "lpl", "lplfinancial", "lr", "ls", "lt", + "ltd", "ltda", "lu", "lundbeck", "luxe", "luxury", "lv", "ly", "ma", "madrid", + "maif", "maison", "makeup", "man", "management", "mango", "map", "market", + "marketing", "markets", "marriott", "marshalls", "mattel", "mba", "mc", "mckinsey", + "md", "me", "med", "media", "meet", "melbourne", "meme", "memorial", "men", "menu", + "merckmsd", "mg", "mh", "miami", "microsoft", "mil", "mini", "mint", "mit", + "mitsubishi", "mk", "ml", "mlb", "mls", "mm", "mma", "mn", "mo", "mobi", "mobile", + "moda", "moe", "moi", "mom", "monash", "money", "monster", "mormon", "mortgage", + "moscow", "moto", "motorcycles", "mov", "movie", "mp", "mq", "mr", "ms", "msd", + "mt", 
"mtn", "mtr", "mu", "museum", "music", "mv", "mw", "mx", "my", "mz", "na", + "nab", "nagoya", "name", "navy", "nba", "nc", "ne", "nec", "net", "netbank", + "netflix", "network", "neustar", "new", "news", "next", "nextdirect", "nexus", + "nf", "nfl", "ng", "ngo", "nhk", "ni", "nico", "nike", "nikon", "ninja", "nissan", + "nissay", "nl", "no", "nokia", "norton", "now", "nowruz", "nowtv", "np", "nr", + "nra", "nrw", "ntt", "nu", "nyc", "nz", "obi", "observer", "office", "okinawa", + "olayan", "olayangroup", "ollo", "om", "omega", "one", "ong", "onion", "onl", + "online", "ooo", "open", "oracle", "orange", "org", "organic", "origins", "osaka", + "otsuka", "ott", "ovh", "pa", "page", "panasonic", "paris", "pars", "partners", + "parts", "party", "pay", "pccw", "pe", "pet", "pf", "pfizer", "pg", "ph", "pharmacy", + "phd", "philips", "phone", "photo", "photography", "photos", "physio", "pics", + "pictet", "pictures", "pid", "pin", "ping", "pink", "pioneer", "pizza", "pk", "pl", + "place", "play", "playstation", "plumbing", "plus", "pm", "pn", "pnc", "pohl", + "poker", "politie", "porn", "post", "pr", "pramerica", "praxi", "press", "prime", + "pro", "prod", "productions", "prof", "progressive", "promo", "properties", + "property", "protection", "pru", "prudential", "ps", "pt", "pub", "pw", "pwc", + "py", "qa", "qpon", "quebec", "quest", "racing", "radio", "re", "read", + "realestate", "realtor", "realty", "recipes", "red", "redstone", "redumbrella", + "rehab", "reise", "reisen", "reit", "reliance", "ren", "rent", "rentals", "repair", + "report", "republican", "rest", "restaurant", "review", "reviews", "rexroth", + "rich", "richardli", "ricoh", "ril", "rio", "rip", "ro", "rocks", "rodeo", "rogers", + "room", "rs", "rsvp", "ru", "rugby", "ruhr", "run", "rw", "rwe", "ryukyu", "sa", + "saarland", "safe", "safety", "sakura", "sale", "salon", "samsclub", "samsung", + "sandvik", "sandvikcoromant", "sanofi", "sap", "sarl", "sas", "save", "saxo", "sb", + "sbi", "sbs", "sc", "scb", 
"schaeffler", "schmidt", "scholarships", "school", + "schule", "schwarz", "science", "scot", "sd", "se", "search", "seat", "secure", + "security", "seek", "select", "sener", "services", "seven", "sew", "sex", "sexy", + "sfr", "sg", "sh", "shangrila", "sharp", "shell", "shia", "shiksha", "shoes", + "shop", "shopping", "shouji", "show", "si", "silk", "sina", "singles", "site", + "sj", "sk", "ski", "skin", "sky", "skype", "sl", "sling", "sm", "smart", "smile", + "sn", "sncf", "so", "soccer", "social", "softbank", "software", "sohu", "solar", + "solutions", "song", "sony", "soy", "spa", "space", "sport", "spot", "sr", "srl", + "ss", "st", "stada", "staples", "star", "statebank", "statefarm", "stc", "stcgroup", + "stockholm", "storage", "store", "stream", "studio", "study", "style", "su", + "sucks", "supplies", "supply", "support", "surf", "surgery", "suzuki", "sv", + "swatch", "swiss", "sx", "sy", "sydney", "systems", "sz", "tab", "taipei", "talk", + "taobao", "target", "tatamotors", "tatar", "tattoo", "tax", "taxi", "tc", "tci", + "td", "tdk", "team", "tech", "technology", "tel", "temasek", "tennis", "teva", + "tf", "tg", "th", "thd", "theater", "theatre", "tiaa", "tickets", "tienda", "tips", + "tires", "tirol", "tj", "tjmaxx", "tjx", "tk", "tkmaxx", "tl", "tm", "tmall", + "tn", "to", "today", "tokyo", "ton", "tools", "top", "toray", "toshiba", "total", + "tours", "town", "toyota", "toys", "tr", "trade", "trading", "training", "travel", + "travelers", "travelersinsurance", "trust", "trv", "tt", "tube", "tui", "tunes", + "tushu", "tv", "tvs", "tw", "tz", "ua", "ubank", "ubs", "ug", "uk", "unicom", "university", + "uno", "uol", "ups", "us", "uy", "uz", "va", "vacations", "vana", "vanguard", + "vc", "ve", "vegas", "ventures", "verisign", "vermögensberater", "vermögensberatung", + "versicherung", "vet", "vg", "vi", "viajes", "video", "vig", "viking", "villas", + "vin", "vip", "virgin", "visa", "vision", "viva", "vivo", "vlaanderen", "vn", "vodka", + "volvo", "vote", 
"voting", "voto", "voyage", "vu", "wales", "walmart", "walter", + "wang", "wanggou", "watch", "watches", "weather", "weatherchannel", "webcam", + "weber", "website", "wed", "wedding", "weibo", "weir", "wf", "whoswho", "wien", + "wiki", "williamhill", "win", "windows", "wine", "winners", "wme", "wolterskluwer", + "woodside", "work", "works", "world", "wow", "ws", "wtc", "wtf", "xbox", "xerox", + "xihuan", "xin", "ελ", "ευ", "бг", "бел", "дети", "ею", "католик", "ком", "мкд", + "мон", "москва", "онлайн", "орг", "рус", "рф", "сайт", "срб", "укр", "қаз", "հայ", + "ישראל", "קום", "ابوظبي", "ارامكو", "الاردن", "البحرين", "الجزائر", "السعودية", + "العليان", "المغرب", "امارات", "ایران", "بارت", "بازار", "بيتك", "بھارت", "تونس", + "سودان", "سورية", "شبكة", "عراق", "عرب", "عمان", "فلسطين", "قطر", "كاثوليك", "كوم", + "مصر", "مليسيا", "موريتانيا", "موقع", "همراه", "پاکستان", "ڀارت", "कॉम", "नेट", "भारत", + "भारतम्", "भारोत", "संगठन", "বাংলা", "ভারত", "ভাৰত", "ਭਾਰਤ", "ભારત", "ଭାରତ", "இந்தியா", + "இலங்கை", "சிங்கப்பூர்", "భారత్", "ಭಾರತ", "ഭാരതം", "ලංකා", "คอม", "ไทย", "ລາວ", + "გე", "みんな", "アマゾン", "クラウド", "グーグル", "コム", "ストア", "セール", "ファッション", + "ポイント", "世界", "中信", "中国", "中國", "中文网", "亚马逊", "企业", "佛山", "信息", + "健康", "八卦", "公司", "公益", "台湾", "台灣", "商城", "商店", "商标", "嘉里", "嘉里大酒店", + "在线", "大拿", "天主教", "娱乐", "家電", "广东", "微博", "慈善", "我爱你", "手机", "招聘", + "政务", "政府", "新加坡", "新闻", "时尚", "書籍", "机构", "淡马锡", "游戏", "澳門", "点看", + "移动", "组织机构", "网址", "网店", "网站", "网络", "联通", "谷歌", "购物", "通販", "集团", + "電訊盈科", "飞利浦", "食品", "餐厅", "香格里拉", "香港", "닷넷", "닷컴", "삼성", "한국", + "xxx", "xyz", "yachts", "yahoo", "yamaxun", "yandex", "ye", "yodobashi", "yoga", + "yokohama", "you", "youtube", "yt", "yun", "za", "zappos", "zara", "zero", "zip", + "zm", "zone", "zuerich", "zw") + class _EntityPosition: """ @@ -442,6 +608,106 @@ def _is_hashtag_letter(letter: str) -> bool: return False +def _fix_url(full_url: str) -> str: + has_protocol = False + url = full_url + protocols_pattern = 
re.compile(r"^(https?|ftp|tonsite)://") + + if match := protocols_pattern.match(full_url): + has_protocol = True + url = url[match.end():] + + domain_end = len(url) + # Looking for the leftmost position of + # the one of the given chars (these chars divide + # the domain and the path). + for ch in "/?#": + pos = url.find(ch) + if pos > -1 and pos < domain_end: + domain_end = pos + domain, path = url[:domain_end], url[domain_end:] + + if (at_pos := domain.find("@")) > -1: + domain = domain[at_pos+1:] + + if (colon_pos := domain.rfind(":")) > -1: + domain = domain[:colon_pos] + + if domain.lower() == "teiegram.org": + return "" + + parentheses_cnt, square_br_cnt, curly_br_cnt = 0, 0, 0 + + path_pos = 0 + for ch in path: + if ch == "(": + parentheses_cnt += 1 + elif ch == ")": + parentheses_cnt -= 1 + elif ch == "[": + square_br_cnt += 1 + elif ch == "]": + square_br_cnt -= 1 + elif ch == "{": + curly_br_cnt += 1 + elif ch == "}": + curly_br_cnt -= 1 + + if parentheses_cnt < 0 or square_br_cnt < 0 or curly_br_cnt < 0: + break + + path_pos += 1 + + bad_path_end_chars = ".:;,('?!`" + + while path_pos > 0 and path[path_pos-1] in bad_path_end_chars: + path_pos -= 1 + + full_url = full_url[:len(full_url) - (len(path) - path_pos)] + + is_ipv4 = True + try: + ipaddress.ip_address(domain) + except ValueError: + is_ipv4 = False + + domain_parts = domain.split(".") + if len(domain_parts) <= 1: + return "" + + validator = lambda x: not x or len(x) >= 64 or x.endswith("-") + if any(map(validator, domain_parts)): + return "" + + if is_ipv4: + return full_url + + # .com, .net, .org, etc. + tld = domain_parts[-1] + if len(tld) <= 1: + return "" + + # The "google" part in "google.com". + second_level_domain = domain_parts[-2] + # Telegram considers the underscore as an invalid symbol + # only in the second level domain, while for all subdomains + # it is perfectly OK. 
+ if "_" in second_level_domain: + return "" + + if tld.startswith("xn--"): + if len(tld) <= 5 or re.search(r"[^0-9a-zA-Z]", tld[4:]): + return "" + else: + if tld.count("_") + tld.count("-") > 0: + return "" + + if not has_protocol and tld not in COMMON_TLDS: + return "" + + return full_url + + class EntityParser: @staticmethod def parse_markdown(text: str) -> tuple[str, tuple[MessageEntity, ...]]: diff --git a/tests/test_EntityParser/test_misc.py b/tests/test_EntityParser/test_misc.py index 0d2ffb9..25734b0 100644 --- a/tests/test_EntityParser/test_misc.py +++ b/tests/test_EntityParser/test_misc.py @@ -11,7 +11,8 @@ _get_id_from_telegram_url, EntityParser, get_hash, - _is_hashtag_letter) + _is_hashtag_letter, + _fix_url) def test_get_utf16_length(): @@ -238,6 +239,77 @@ def test_is_hashtag_letter(): assert not _is_hashtag_letter("\t") +class TestFixUrl: + def test_valid_urls_with_protocol(self): + assert _fix_url("http://example.com") == "http://example.com" + assert _fix_url("https://example.org/path") == "https://example.org/path" + assert _fix_url("ftp://sub.domain.co.uk?query=1") == "ftp://sub.domain.co.uk?query=1" + assert _fix_url("tonsite://example.ton") == "tonsite://example.ton" + assert _fix_url("https://example.com:8080") == "https://example.com:8080" + + def test_valid_urls_without_protocol(self): + assert _fix_url("example.com") == "example.com" + assert _fix_url("domain.org/path/page.html") == "domain.org/path/page.html" + assert _fix_url("sub_.example.com") == "sub_.example.com" + + def test_domain_path_dividers(self): + assert _fix_url("http://example.com/path") == "http://example.com/path" + assert _fix_url("http://example.com#path") == "http://example.com#path" + assert _fix_url("http://example.com?path") == "http://example.com?path" + + def test_url_with_basic_auth(self): + assert _fix_url("https://user:pass@example.com") == "https://user:pass@example.com" + + def test_url_with_port(self): + url = "https://example.com:8080" + assert 
_fix_url(url) == url + + def test_fake_domain_teiegram_org(self): + assert _fix_url("teiegram.org") == "" + assert _fix_url("https://teiegram.org") == "" + assert _fix_url("http://teiegram.org") == "" + assert _fix_url("ftp://teiegram.org") == "" + assert _fix_url("tonsite://teiegram.org") == "" + + def test_valid_brackets_balance(self): + assert _fix_url("http://site.com/path(sub[1]{2})") == "http://site.com/path(sub[1]{2})" + + def test_invalid_brackets_balance(self): + assert _fix_url("http://broken.com/test)") == "http://broken.com/test" + + def test_striping_invalid_symbols_at_the_end(self): + assert _fix_url("https://example.com/path);") == "https://example.com/path" + assert _fix_url("http://example.com/test!") == "http://example.com/test" + assert _fix_url("http://example.com/test.:;,('?!`") == "http://example.com/test" + + def test_valid_ipv4(self): + assert _fix_url("http://192.168.1.1") == "http://192.168.1.1" + assert _fix_url("http://192.168.1.1/path") == "http://192.168.1.1/path" + assert _fix_url("http://192.168.1.1/?param=value") == "http://192.168.1.1/?param=value" + assert _fix_url("192.168.1.1/?param=value") == "192.168.1.1/?param=value" + + def test_invalid_ip_addresses(self): + assert _fix_url("http://127.00.0.1") == "" + assert _fix_url("http://256.100.0.1") == "" + + def test_invalid_urls(self): + assert _fix_url("localhost") == "" + assert _fix_url("custom.domainzzz") == "" + assert _fix_url("bad_domain.com") == "" + assert _fix_url("https://bad-.com") == "" + assert _fix_url("https://example.c_m") == "" + + def test_valid_punycode(self): + assert _fix_url("https://xn--e1afmkfd.xn--80asehdb/") == "https://xn--e1afmkfd.xn--80asehdb/" + + def test_invalid_punycode(self): + assert _fix_url("https://xn--a.xn--8/") == "" + + def test_url_with_all_parts(self): + url = "https://user:pass@example.com:8080/path?param1=val¶m2=val2#anchor" + assert _fix_url(url) == url + + class TestEntityParserExtractEntities: ep = EntityParser() From 
947ef8a8e9e6a7d927f24904e07869905ee0e3a6 Mon Sep 17 00:00:00 2001 From: elebur Date: Tue, 27 May 2025 22:19:07 +0300 Subject: [PATCH 02/16] refactor(`_fix_url`): add additional checkers for validating URLs to match Telegram's rules --- ptbtest/entityparser.py | 35 +++++++++++++++++++++------- tests/test_EntityParser/test_misc.py | 6 +++++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 6a91908..daac0d4 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -611,7 +611,7 @@ def _is_hashtag_letter(letter: str) -> bool: def _fix_url(full_url: str) -> str: has_protocol = False url = full_url - protocols_pattern = re.compile(r"^(https?|ftp|tonsite)://") + protocols_pattern = re.compile(r"^(https?|ftp|tonsite)://", flags=re.IGNORECASE) if match := protocols_pattern.match(full_url): has_protocol = True @@ -682,19 +682,36 @@ def _fix_url(full_url: str) -> str: if is_ipv4: return full_url - # .com, .net, .org, etc. - tld = domain_parts[-1] - if len(tld) <= 1: - return "" - # The "google" part in "google.com". second_level_domain = domain_parts[-2] + # Skip the URL if there are no subdomains and domain starts with a underscore. + if len(domain_parts) == 2 and second_level_domain.startswith("_"): + return "" + + # If the 2nd level domain consists of whitespaces only. + if not second_level_domain.strip(): + return "" # Telegram considers the underscore as an invalid symbol # only in the second level domain, while for all subdomains # it is perfectly OK. - if "_" in second_level_domain: + elif "_" in second_level_domain: + return "" + + # .com, .net, .org, etc. 
+ tld = domain_parts[-1].rstrip("…") + if len(tld) <= 1: return "" + def is_common_tld(tld: str) -> bool: + if tld.islower(): + return tld in COMMON_TLDS + + lowered = tld.lower() + if lowered != tld and lowered[1:] == tld[1:]: + return False + + return lowered in COMMON_TLDS + if tld.startswith("xn--"): if len(tld) <= 5 or re.search(r"[^0-9a-zA-Z]", tld[4:]): return "" @@ -702,8 +719,8 @@ def _fix_url(full_url: str) -> str: if tld.count("_") + tld.count("-") > 0: return "" - if not has_protocol and tld not in COMMON_TLDS: - return "" + if not has_protocol and not is_common_tld(tld): + return "" return full_url diff --git a/tests/test_EntityParser/test_misc.py b/tests/test_EntityParser/test_misc.py index 25734b0..feffa20 100644 --- a/tests/test_EntityParser/test_misc.py +++ b/tests/test_EntityParser/test_misc.py @@ -301,10 +301,16 @@ def test_invalid_urls(self): def test_valid_punycode(self): assert _fix_url("https://xn--e1afmkfd.xn--80asehdb/") == "https://xn--e1afmkfd.xn--80asehdb/" + assert _fix_url("xn--80afpi2a3c.xn--p1ai") == "xn--80afpi2a3c.xn--p1ai" def test_invalid_punycode(self): assert _fix_url("https://xn--a.xn--8/") == "" + def test_is_common_tld(self): + """This is a test for the inner function.""" + assert _fix_url("example.Com") == "" + assert _fix_url("тест.Онлайн") == "" + def test_url_with_all_parts(self): url = "https://user:pass@example.com:8080/path?param1=val¶m2=val2#anchor" assert _fix_url(url) == url From 3947565078aec93e3ed3262cc1a0a480724b2ad7 Mon Sep 17 00:00:00 2001 From: elebur Date: Tue, 27 May 2025 22:24:56 +0300 Subject: [PATCH 03/16] refactor: replace `_EntityPosition` with `_EntityMatch` object --- ptbtest/entityparser.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index daac0d4..5bd83b1 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -230,19 +230,30 @@ "zm", "zone", "zuerich", "zw") -class 
_EntityPosition: +class _EntityMatch: """ Args start_pos (int): The start position of the entity. end_pos (int): The end position of the entity. text (str): The text entities are parsed from. It is used for calculating utf16 offset. + match (re.Match): The raw regex match object. """ - def __init__(self, start_pos:int, end_pos:int, text:str): - self.start = start_pos - self.end = end_pos - self._utf16_offset = _get_utf16_length(text[:start_pos]) - self._length = _get_utf16_length(text[self.start:self.end]) + def __init__(self, match: re.Match, text:str): + self._match = match + self._start = self._match.start() + self._end = self._match.end() + + self._utf16_offset = _get_utf16_length(text[:self._start]) + self._length = _get_utf16_length(text[self._start:self._end]) + + @property + def start(self): + return self._start + + @property + def end(self): + return self._end @property def offset(self): @@ -254,6 +265,9 @@ def length(self): """Return the UTF-16 length of the entity.""" return self._length + def group(self, value: Any): + return self._match.group(value) + def _get_utf16_length(text: str) -> int: """ @@ -1635,7 +1649,7 @@ def get_byte_offset(begin_pos): return result_text, tuple(sorted_entities) @staticmethod - def _extract_entities(text: str, pattern: Union[str, re.Pattern]) -> tuple[_EntityPosition, ...]: + def _extract_entities(text: str, pattern: Union[str, re.Pattern]) -> tuple[_EntityMatch, ...]: """ Parse entities from text with the given regular expression. @@ -1649,7 +1663,7 @@ def _extract_entities(text: str, pattern: Union[str, re.Pattern]) -> tuple[_Enti pattern (str | ~typing.Pattern): A regular expression. Returns: - tuple[_EntityPosition]: A tuple of ``_EntityPosition`` with the offset and + tuple[_EntityMatch]: A tuple of ``_EntityPosition`` with the offset and the length of the found entities. 
""" if isinstance(pattern, str): @@ -1657,7 +1671,7 @@ def _extract_entities(text: str, pattern: Union[str, re.Pattern]) -> tuple[_Enti result = list() for match in pattern.finditer(text): - result.append(_EntityPosition(match.start(), match.end(), text)) + result.append(_EntityMatch(match, text)) return tuple(result) From dce527cd7c0a1cf4d76d4272b16474e32bb4e00a Mon Sep 17 00:00:00 2001 From: elebur Date: Wed, 28 May 2025 23:36:35 +0300 Subject: [PATCH 04/16] feat(`EntityParser`): add the `parse_urls` method --- ptbtest/entityparser.py | 132 ++++++++++ tests/test_EntityParser/test_parse_urls.py | 267 +++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 tests/test_EntityParser/test_parse_urls.py diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 5bd83b1..0e29f3f 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -26,6 +26,7 @@ import ipaddress import re import string +import unicodedata from collections.abc import Sequence from dataclasses import dataclass from typing import Any, Literal, Optional, Union @@ -1899,6 +1900,137 @@ def parse_cashtags(text: str) -> tuple[MessageEntity, ...]: return tuple(entities) + @staticmethod + def parse_urls(text: str) -> tuple[MessageEntity, ...]: + """ + Extract :obj:`~telegram.MessageEntity` representing + URLs (``https://example.com``) from the given ``text``. + + Examples: + An input string: ``https://example.com`` + + Result: + + .. code:: python + + (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + + Args: + text (str): A message that must be parsed. + + Returns: + tuple[~telegram.MessageEntity]: Tuple of :obj:`~telegram.MessageEntity` with + type :obj:`~telegram.constants.MessageEntityType.URL`. + The tuple might be empty if no entities were found. + """ + # Allowed characters in the username and in the password in the basic auth. 
+ user_pass_chars = "a-zA-Z0-9._―‑!-" + # This pattern is based on this one https://gist.github.com/dperini/729294 + pattern = re.compile( + # Optional protocol. + r"(?:[a-zA-Z]+://)?" + # 'user:pass' basic auth (optional) + fr"(?:[:{user_pass_chars}]+(?::[{user_pass_chars}]+)?@)?" + r"(?:" + # IP address + r"(?:(?:\d{1,3})\.){3}(?:\d{1,3})\b" + r"|" + # host & domain names + r"(?:" + r"(?:" + r"[a-z0-9\u00a1-\uffff―_‑-]" + r"[a-z0-9\u00a1-\uffff_―‑-]{0,62}" + r")?" + r"[a-z0-9\u00a1-\uffff_―‑-]\." + r")+" + # TLD identifier name + r"(?:[a-z0-9\u00a1-\uffff`‑―-]{2,})" + r")" + # port number (optional) + r"(?P:[0-9]+)?" + # resource path (optional) + r"(?P[/?#]\S*)?", flags=re.IGNORECASE) + + def is_url_path_symbol(ch): + """ + Check if the given symbol is a valid symbol for the path. + """ + if ch in "\n<>\"«»": + return False + + int_ch = ord(ch) + if 0x206f >= int_ch >= 0x2000: # General Punctuation. + # Zero Width Non-Joiner/Joiner and various dashes + return int_ch == 0x200c or int_ch == 0x200d or (0x2015 >= int_ch >= 0x2010) + + # The char is not a Separator. + return not unicodedata.category(ch).startswith("Z") + + entities = list() + matches = EntityParser._extract_entities(text, pattern) + for match in matches: + url = text[match.start:match.end] + protocol = urlparse(url).scheme if "://" in url else None + prev_ch: str = get_item(text, match.start - 1, "", allow_negative_indexing=False) + + # Skip if there is a dot or a latin letter right before the url or ... + if (prev_ch and prev_ch in string.ascii_letters + "." or + # ... there is '@' symbol without user:pass or ... + "://@" in url or + # ... there is no protocol, but '://' at the beginning or the URL startswith '@'. + url.startswith("@") or url.startswith("://")): + continue + # if there is a dot(s) followed by a non-whitespace symbol right after the + # TLD, then ignore such an URL. 
+ elif re.search("^\.+[^.\s]", text[match.end:]): + continue + elif protocol and protocol.lower() not in ("http", "https", "ftp", "tonsite"): + continue + + path = match.group("path") + + # Checking for invalid symbols in the path. + valid_symbols_in_path_counter = 1 # Skip the leading slash in the path. + while (path and + valid_symbols_in_path_counter < len(path) and + is_url_path_symbol(path[valid_symbols_in_path_counter])): + valid_symbols_in_path_counter+=1 + + length_subtraction = 0 + if path and valid_symbols_in_path_counter != len(path): + invalid_symbols_counter = len(path) - valid_symbols_in_path_counter + url = url[:len(url) - invalid_symbols_counter] + path = path[:valid_symbols_in_path_counter] + length_subtraction += invalid_symbols_counter + + fixed_url = _fix_url(url) + if not fixed_url: + continue + elif (url_length_diff := len(url) - len(fixed_url)) > 0: + length_subtraction += url_length_diff + + # The 'raw_port' will contain the colon symbol. + # E.g., ':8080'. + if raw_port := match.group("port"): + # If the port is bigger than 65535, than ignore everything + # in the url after the tld. + port = int(raw_port[1:]) + if port == 0 or port > 65535: + length_subtraction += len(raw_port + (path or "")) + + # Ignore trailing '#' symbol if there are no preceding '#', '?' or '/' symbols. + if re.search(r"(?65535) + assert self.ep.parse_urls("google.com:000000065536/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("google.com:65536") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("google.com:100000") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + # The port number overflow and invalid symbold in the path. + assert self.ep.parse_urls("google.com:0000000655353/abs>>>>") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + # Zero port is not acceptable. 
+ assert self.ep.parse_urls("google.com:0000000/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("google.com:0/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + # Empty port. + assert self.ep.parse_urls("google.com:/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + + def test_localhost_ip_address(self): + assert self.ep.parse_urls("127.001") == () + assert self.ep.parse_urls("127.0.0.1") == (MessageEntity(length=9, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("127.0.0.01") == () + assert self.ep.parse_urls("127.0.0.256") == () + assert self.ep.parse_urls("127.0.0.300") == () + assert self.ep.parse_urls("127.0.0.260") == () + assert self.ep.parse_urls("1.0") == () + assert self.ep.parse_urls("127.0.0.1000") == () + + def test_fake_domain_teiegram(self): + assert self.ep.parse_urls("teiegram.org/test") == () + assert self.ep.parse_urls("TeiegraM.org/test") == () + assert self.ep.parse_urls("TeiegraM.org") == () + assert self.ep.parse_urls("teiegram.org") == () + + def test_parentheses_and_brackets(self): + assert self.ep.parse_urls("http://test.google.com/?q=abc()}[]def") == (MessageEntity(length=31, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.google.com/?q=abc([{)]}def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.google.com/?q=abc(){}]def") == (MessageEntity(length=33, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.google.com/?q=abc){}[]def") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.google.com/?q=abc(){}[]def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) + + def test_underscores(self): + assert self.ep.parse_urls("http://google_.com") == () + assert self.ep.parse_urls("http://google._com_") == () + assert 
self.ep.parse_urls("http://test_.google.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + + def test_hyphen_at_end_of_domain_and_subdomain(self): + assert self.ep.parse_urls("http://test-.google.com") == () + assert self.ep.parse_urls("http://test.google-.com") == () + + def test_ipv6_address(self): + assert self.ep.parse_urls("http://[2001:4860:0:2001::68]/") == () + + def test_tg_domains(self): + assert self.ep.parse_urls("tg://resolve") == () + + def test_different_url_endings(self): + assert self.ep.parse_urls("http://google.com/") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://google.com?") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://google.com#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://google.com##") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://google.com/?") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://www.google.com/ab,") == (MessageEntity(length=25, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.com#a") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.com#") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.com?#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test.com/?#") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + + def test_at_symbol(self): + assert self.ep.parse_urls("https://a.bc@c.com") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://a.de/bc@c.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL),) + assert 
self.ep.parse_urls("https://a.debc@c.com") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://a.de`bc@c.com") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL), + MessageEntity(length=5, offset=16, type=MessageEntityType.URL)) + + assert self.ep.parse_urls("https://a.bcde.fg@c.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://abc@c.com") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://a.bc@test.com:cd.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL), + MessageEntity(length=6, offset=22, type=MessageEntityType.URL)) + + def test_filenames_like_urls(self): + assert self.ep.parse_urls("File '/usr/views.py'") == (MessageEntity(length=8, offset=11, type=MessageEntityType.URL),) + assert self.ep.parse_urls(".views.py") == () + assert self.ep.parse_urls("'views.py'") == (MessageEntity(length=8, offset=1, type=MessageEntityType.URL),) + + def test_misc(self): + assert self.ep.parse_urls("telegram. org. www. com... telegram.org... ...google.com...") == (MessageEntity(length=12, offset=27, type=MessageEntityType.URL),) + assert self.ep.parse_urls("Такой сайт: http://www.google.com или такой telegram.org") == (MessageEntity(length=21, offset=12, type=MessageEntityType.URL), + MessageEntity(length=12, offset=44, type=MessageEntityType.URL)) + assert self.ep.parse_urls("[http://google.com](test)") == (MessageEntity(length=17, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls("google.com:᪀᪀") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("/.b/..a @.....@/. 
a.ba") == (MessageEntity(length=4, offset=21, type=MessageEntityType.URL),) + assert self.ep.parse_urls("('http://telegram.org/a-b/?br=ie&lang=en',)") == (MessageEntity(length=38, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://ai.telegram.org/bot%20bot/test-...") == (MessageEntity(length=39, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("bbbbbbbbbbbbbb.@.@") == () + assert self.ep.parse_urls("@.") == () + assert self.ep.parse_urls("") == (MessageEntity(length=59, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://t.me/abcdef…") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://t.me…") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://t.m…") == () + assert self.ep.parse_urls("https://t.…") == () + assert self.ep.parse_urls("https://t…") == () + assert self.ep.parse_urls(".?") == () + assert self.ep.parse_urls("👉http://ab.com/cdefgh-1IJ") == (MessageEntity(length=24, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――") == (MessageEntity(length=48, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("a!:b@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("a:b!@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("_sip._udp.apnic.net") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls("https://as_sip._udp.apnic.net") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) From f5863c3b3c492c66f497259b96ce80da94546139 Mon Sep 17 00:00:00 2001 From: elebur Date: Fri, 30 May 2025 21:08:09 +0300 Subject: [PATCH 05/16] refactor(`entityparser`): renamed `_EntityMatch`'s properties. 
--- ptbtest/entityparser.py | 18 ++++++++---------- tests/test_EntityParser/test_misc.py | 8 ++++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 0e29f3f..c0c0763 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -257,13 +257,11 @@ def end(self): return self._end @property - def offset(self): - """Return the UTF-16 offset of the entity in the text.""" + def utf16_offset(self): return self._utf16_offset @property - def length(self): - """Return the UTF-16 length of the entity.""" + def utf16_length(self): return self._length def group(self, value: Any): @@ -1708,15 +1706,15 @@ def parse_mentions(text: str) -> tuple[MessageEntity, ...]: allowed_3_char_mentions = ("@gif", "@vid", "@pic") entities: list[MessageEntity] = list() for entity_position in points: - if entity_position.length < 4 or entity_position.length > 33: + if entity_position.utf16_length < 4 or entity_position.utf16_length > 33: continue - elif (entity_position.length == 4 and + elif (entity_position.utf16_length == 4 and text[entity_position.start:entity_position.end] not in allowed_3_char_mentions): continue entities.append(MessageEntity(MessageEntityType.MENTION, - offset=entity_position.offset, - length=entity_position.length)) + offset=entity_position.utf16_offset, + length=entity_position.utf16_length)) return tuple(entities) @@ -1749,8 +1747,8 @@ def parse_bot_commands(text: str) -> tuple[MessageEntity, ...]: entities = list() for entity_position in EntityParser._extract_entities(text, pattern): entities.append(MessageEntity(MessageEntityType.BOT_COMMAND, - offset=entity_position.offset, - length=entity_position.length)) + offset=entity_position.utf16_offset, + length=entity_position.utf16_length)) return tuple(entities) diff --git a/tests/test_EntityParser/test_misc.py b/tests/test_EntityParser/test_misc.py index feffa20..4631682 100644 --- a/tests/test_EntityParser/test_misc.py +++ 
b/tests/test_EntityParser/test_misc.py @@ -325,8 +325,8 @@ def test_str_pattern(self): assert result[0].start == 0 assert result[0].end == 8 - assert result[0].length == 8 - assert result[0].offset == 0 + assert result[0].utf16_length == 8 + assert result[0].utf16_offset == 0 def test_compiled_pattern(self): pattern = re.compile(r"(?<=\B)@([a-zA-Z0-9_]{2,32})(?=\b)") @@ -334,8 +334,8 @@ def test_compiled_pattern(self): assert result[0].start == 0 assert result[0].end == 8 - assert result[0].length == 8 - assert result[0].offset == 0 + assert result[0].utf16_length == 8 + assert result[0].utf16_offset == 0 def test_empty_string(self): pattern = re.compile(r"(?<=\B)@([a-zA-Z0-9_]{2,32})(?=\b)") From 799905f56e268bcabfcd67e7060738dde9fa3984 Mon Sep 17 00:00:00 2001 From: elebur Date: Fri, 30 May 2025 21:12:29 +0300 Subject: [PATCH 06/16] feat(`EntityParser`): add the email parser --- ptbtest/entityparser.py | 41 ++++++-- tests/test_EntityParser/test_misc.py | 117 ++++++++++++++++++++- tests/test_EntityParser/test_parse_urls.py | 35 ++++++ 3 files changed, 183 insertions(+), 10 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index c0c0763..d3a6db3 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -738,6 +738,18 @@ def is_common_tld(tld: str) -> bool: return full_url +def _is_email_address(text: str) -> bool: + """ + Check if the given ``text`` is a valid email address. 
+ """ + pattern = re.compile(r"^([a-z0-9_-]{0,26}[.+:]){0,10}" + r"[a-z0-9_-]{1,35}" + r"@(([a-z0-9][a-z0-9_-]{0,28})?[a-z0-9][.]){1,6}" + r"[a-z]{2,8}$", flags=re.IGNORECASE) + + return bool(pattern.search(text)) + + class EntityParser: @staticmethod def parse_markdown(text: str) -> tuple[str, tuple[MessageEntity, ...]]: @@ -1967,6 +1979,7 @@ def is_url_path_symbol(ch): entities = list() matches = EntityParser._extract_entities(text, pattern) for match in matches: + entity_length = match.utf16_length url = text[match.start:match.end] protocol = urlparse(url).scheme if "://" in url else None prev_ch: str = get_item(text, match.start - 1, "", allow_negative_indexing=False) @@ -1994,18 +2007,17 @@ def is_url_path_symbol(ch): is_url_path_symbol(path[valid_symbols_in_path_counter])): valid_symbols_in_path_counter+=1 - length_subtraction = 0 if path and valid_symbols_in_path_counter != len(path): invalid_symbols_counter = len(path) - valid_symbols_in_path_counter url = url[:len(url) - invalid_symbols_counter] path = path[:valid_symbols_in_path_counter] - length_subtraction += invalid_symbols_counter + entity_length -= invalid_symbols_counter fixed_url = _fix_url(url) if not fixed_url: continue elif (url_length_diff := len(url) - len(fixed_url)) > 0: - length_subtraction += url_length_diff + entity_length -= url_length_diff # The 'raw_port' will contain the colon symbol. # E.g., ':8080'. @@ -2014,18 +2026,29 @@ def is_url_path_symbol(ch): # in the url after the tld. port = int(raw_port[1:]) if port == 0 or port > 65535: - length_subtraction += len(raw_port + (path or "")) + entity_length -= len(raw_port + (path or "")) # Ignore trailing '#' symbol if there are no preceding '#', '?' or '/' symbols. if re.search(r"(? 
Date: Fri, 30 May 2025 21:14:35 +0300 Subject: [PATCH 07/16] refactor: rename the test's name --- tests/test_EntityParser/test_parse_hashtags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_EntityParser/test_parse_hashtags.py b/tests/test_EntityParser/test_parse_hashtags.py index 48f3877..8b58b2f 100644 --- a/tests/test_EntityParser/test_parse_hashtags.py +++ b/tests/test_EntityParser/test_parse_hashtags.py @@ -95,7 +95,7 @@ def test_string_consists_of_hash_sign_only(self): assert self.ep.parse_hashtags("##") == () assert self.ep.parse_hashtags("##############") == () - def test_hash_tag_with_hash_sign_at_the_end(self): + def test_hash_sign_at_the_end(self): assert self.ep.parse_hashtags("hashtag#") == () assert self.ep.parse_hashtags("hashtag # ") == () From 80136a108103c6b8274440497dfdefd56cafa2f3 Mon Sep 17 00:00:00 2001 From: elebur Date: Fri, 30 May 2025 21:16:47 +0300 Subject: [PATCH 08/16] refactor(`EntityParser`): rename `parse_urls` to `parse_urls_and_emails` --- ptbtest/entityparser.py | 2 +- tests/test_EntityParser/test_parse_urls.py | 396 ++++++++++----------- 2 files changed, 199 insertions(+), 199 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index d3a6db3..13f6566 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -1911,7 +1911,7 @@ def parse_cashtags(text: str) -> tuple[MessageEntity, ...]: return tuple(entities) @staticmethod - def parse_urls(text: str) -> tuple[MessageEntity, ...]: + def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: """ Extract :obj:`~telegram.MessageEntity` representing URLs (``https://example.com``) from the given ``text``. 
diff --git a/tests/test_EntityParser/test_parse_urls.py b/tests/test_EntityParser/test_parse_urls.py index 2843a69..f06cbf8 100644 --- a/tests/test_EntityParser/test_parse_urls.py +++ b/tests/test_EntityParser/test_parse_urls.py @@ -7,90 +7,90 @@ class TestParseUrls: ep = EntityParser() def test_empty_string(self): - assert self.ep.parse_urls("") == () + assert self.ep.parse_urls_and_emails("") == () def test_no_urls_in_string(self): - assert self.ep.parse_urls(".") == () - assert self.ep.parse_urls("Hello world.") == () + assert self.ep.parse_urls_and_emails(".") == () + assert self.ep.parse_urls_and_emails("Hello world.") == () def test_invalid_urls(self): - assert self.ep.parse_urls("http://  .com") == () - assert self.ep.parse_urls("URL:     .com") == () - assert self.ep.parse_urls("URL: .com") == () - assert self.ep.parse_urls(".com") == () - assert self.ep.parse_urls("http://  .") == () - assert self.ep.parse_urls("http://.") == () - assert self.ep.parse_urls("http://.com") == () - assert self.ep.parse_urls("http:// .") == () - assert self.ep.parse_urls("http://1.0") == () - assert self.ep.parse_urls("http://a.0") == () - assert self.ep.parse_urls("http://a.a") == () - assert self.ep.parse_urls("https://t.…") == () + assert self.ep.parse_urls_and_emails("http://  .com") == () + assert self.ep.parse_urls_and_emails("URL:     .com") == () + assert self.ep.parse_urls_and_emails("URL: .com") == () + assert self.ep.parse_urls_and_emails(".com") == () + assert self.ep.parse_urls_and_emails("http://  .") == () + assert self.ep.parse_urls_and_emails("http://.") == () + assert self.ep.parse_urls_and_emails("http://.com") == () + assert self.ep.parse_urls_and_emails("http:// .") == () + assert self.ep.parse_urls_and_emails("http://1.0") == () + assert self.ep.parse_urls_and_emails("http://a.0") == () + assert self.ep.parse_urls_and_emails("http://a.a") == () + assert self.ep.parse_urls_and_emails("https://t.…") == () def test_valid_domains(self): - assert 
self.ep.parse_urls("telegram.org") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("(telegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("\ntelegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls(" telegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("\"telegram.org\"") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls(" telegram.org ") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls(" telegram.org. ") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:᪉᪉᪉᪉᪉") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("telegram.ton") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("telegram.onion") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("ТеСт.ОнЛайН") == (MessageEntity(length=11, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("ÀÁ.com. 
ÀÁ.com.") == (MessageEntity(length=6, offset=0, type=MessageEntityType.URL), - MessageEntity(length=6, offset=8, type=MessageEntityType.URL)) - assert self.ep.parse_urls("ÀÁ.com,ÀÁ.com.") == (MessageEntity(length=6, offset=0, type=MessageEntityType.URL), - MessageEntity(length=6, offset=7, type=MessageEntityType.URL)) - assert self.ep.parse_urls("https://a.de`bc") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("telegram.ORG") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("_.test.com") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram.org") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("(telegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("\ntelegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails(" telegram.org)") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("\"telegram.org\"") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails(" telegram.org ") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails(" telegram.org. 
") == (MessageEntity(length=12, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:᪉᪉᪉᪉᪉") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram.ton") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram.onion") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("ТеСт.ОнЛайН") == (MessageEntity(length=11, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("ÀÁ.com. ÀÁ.com.") == (MessageEntity(length=6, offset=0, type=MessageEntityType.URL), + MessageEntity(length=6, offset=8, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("ÀÁ.com,ÀÁ.com.") == (MessageEntity(length=6, offset=0, type=MessageEntityType.URL), + MessageEntity(length=6, offset=7, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("https://a.de`bc") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram.ORG") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("_.test.com") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) def test_invalid_domains(self): - assert self.ep.parse_urls(".telegram.org)") == () - assert self.ep.parse_urls("telegram.tonsite") == () - assert self.ep.parse_urls("a.ab") == () - assert self.ep.parse_urls("test.abd") == () - assert self.ep.parse_urls("ТеСт.Онлайн") == () + assert self.ep.parse_urls_and_emails(".telegram.org)") == () + assert self.ep.parse_urls_and_emails("telegram.tonsite") == () + assert self.ep.parse_urls_and_emails("a.ab") == () + assert self.ep.parse_urls_and_emails("test.abd") == () + assert self.ep.parse_urls_and_emails("ТеСт.Онлайн") == () # The upper greek letter alpha. 
- assert self.ep.parse_urls("ТеСт.ОнлΑЙН") == () - assert self.ep.parse_urls("ТеСт.Онлайнн") == () - assert self.ep.parse_urls("test.abd") == () - assert self.ep.parse_urls("telegram.Org") == () - assert self.ep.parse_urls("telegram.Org") == () - assert self.ep.parse_urls("a.b.c.com.a.b.c") == () - assert self.ep.parse_urls("http://test_.com") == () - assert self.ep.parse_urls("test_.com") == () - assert self.ep.parse_urls("_test.com") == () - assert self.ep.parse_urls("bad_domain.com") == () + assert self.ep.parse_urls_and_emails("ТеСт.ОнлΑЙН") == () + assert self.ep.parse_urls_and_emails("ТеСт.Онлайнн") == () + assert self.ep.parse_urls_and_emails("test.abd") == () + assert self.ep.parse_urls_and_emails("telegram.Org") == () + assert self.ep.parse_urls_and_emails("telegram.Org") == () + assert self.ep.parse_urls_and_emails("a.b.c.com.a.b.c") == () + assert self.ep.parse_urls_and_emails("http://test_.com") == () + assert self.ep.parse_urls_and_emails("test_.com") == () + assert self.ep.parse_urls_and_emails("_test.com") == () + assert self.ep.parse_urls_and_emails("bad_domain.com") == () def test_valid_protocols(self): - assert self.ep.parse_urls("https://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://telegram.org") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("ftp://telegram.org") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("tonsite://telegram.ton") == (MessageEntity(length=22, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://ÀТеСт.ОнЛайНн") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://telegram.org") == (MessageEntity(length=19, offset=0, 
type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("ftp://telegram.org") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("tonsite://telegram.ton") == (MessageEntity(length=22, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://ÀТеСт.ОнЛайНн") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) def test_invalid_protocols(self): - assert self.ep.parse_urls("sftp://telegram.org") == () - assert self.ep.parse_urls("ftps://telegram.org") == () - assert self.ep.parse_urls("invalid://telegram.org") == () - assert self.ep.parse_urls("sftp://telegram.org") == () + assert self.ep.parse_urls_and_emails("sftp://telegram.org") == () + assert self.ep.parse_urls_and_emails("ftps://telegram.org") == () + assert self.ep.parse_urls_and_emails("invalid://telegram.org") == () + assert self.ep.parse_urls_and_emails("sftp://telegram.org") == () def test_without_protocol(self): - assert self.ep.parse_urls("://telegram.org") == (MessageEntity(length=12, offset=3, type=MessageEntityType.URL),) - assert self.ep.parse_urls("telegram.org") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("://telegram.org") == (MessageEntity(length=12, offset=3, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram.org") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) def test_slashes_without_protocol(self): - assert self.ep.parse_urls("//telegram.org)") == (MessageEntity(length=12, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("//telegram.org)") == (MessageEntity(length=12, offset=2, type=MessageEntityType.URL),) def test_comma_inside_url(self): - assert self.ep.parse_urls("http://google,.com") == () + assert self.ep.parse_urls_and_emails("http://google,.com") == () def test_with_params(self): - assert self.ep.parse_urls("()telegram.org/?q=()") 
== (MessageEntity(length=18, offset=2, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://telegram.org/?asd=123#123.") == (MessageEntity(length=32, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("()telegram.org/?q=()") == (MessageEntity(length=18, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://telegram.org/?asd=123#123.") == (MessageEntity(length=32, offset=0, type=MessageEntityType.URL),) def test_basic_auth_ignoring_mentions(self): """ @@ -98,40 +98,40 @@ def test_basic_auth_ignoring_mentions(self): because mentions extracted before URLs and in this string http://@google.com "@google" as a mention will be found, and no URLs. """ - assert self.ep.parse_urls("http://@google.com") == (MessageEntity(length=10, offset=8, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://@goog.com") == (MessageEntity(length=8, offset=8, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://@@google.com") == (MessageEntity(length=10, offset=9, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://@google.com") == (MessageEntity(length=10, offset=8, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://@goog.com") == (MessageEntity(length=8, offset=8, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://@@google.com") == (MessageEntity(length=10, offset=9, type=MessageEntityType.URL),) def test_basic_auth(self): - assert self.ep.parse_urls("http://a@google.com") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test@google.com") == (MessageEntity(length=22, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://user:pass@google.com") == (MessageEntity(length=27, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a:h.bcde.fg@c.com") == (MessageEntity(length=25, offset=0, type=MessageEntityType.URL),) - 
assert self.ep.parse_urls("https://bc:defg@c.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a:hbc:defg@c.com") == (MessageEntity(length=24, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://a@google.com") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test@google.com") == (MessageEntity(length=22, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://user:pass@google.com") == (MessageEntity(length=27, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a:h.bcde.fg@c.com") == (MessageEntity(length=25, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://bc:defg@c.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a:hbc:defg@c.com") == (MessageEntity(length=24, offset=0, type=MessageEntityType.URL),) def test_uncommon_tld(self): # WITHOUT the protocol. - assert self.ep.parse_urls("telegram.tonsite") == () + assert self.ep.parse_urls_and_emails("telegram.tonsite") == () # WITH the protocol. 
- assert self.ep.parse_urls("http://telegram.tonsite") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://telegram.tonsite") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) def test_mix_cased_protocol(self): - assert self.ep.parse_urls("hTtPs://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("HTTP://telegram.org") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("hTtPs://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("HTTP://telegram.org") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://telegram.org") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) def test_protocols_with_leading_latin_char(self): - assert self.ep.parse_urls("sHTTP://telegram.org") == () - assert self.ep.parse_urls(".ahttp://google.com") == () + assert self.ep.parse_urls_and_emails("sHTTP://telegram.org") == () + assert self.ep.parse_urls_and_emails(".ahttp://google.com") == () def test_protocols_with_leading_non_latin_char(self): # The leading cyrillic letter 'a'. 
- assert self.ep.parse_urls("аHTTP://telegram.org") == (MessageEntity(length=19, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("൹HTTP://telegram.org") == (MessageEntity(length=19, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("аHTTP://telegram.org") == (MessageEntity(length=19, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("൹HTTP://telegram.org") == (MessageEntity(length=19, offset=1, type=MessageEntityType.URL),) def test_very_long_url(self): - assert self.ep.parse_urls("http://abcdefghijkabcdefghijkabcdefghijkabcdefg" + assert self.ep.parse_urls_and_emails("http://abcdefghijkabcdefghijkabcdefghijkabcdefg" "hijkabcdefghijkabcdefghijkabcdefghijkabcdefghij" "kabcdefghijkabcdefghijkabcdefghijkabcdefghijkab" "cdefghijkabcdefghijkabcdefghijkabcdefghijkabcde" @@ -152,151 +152,151 @@ def test_very_long_url(self): "defghijkabcdefghijkabcdefghijk.com") == () def test_valid_ports(self): - assert self.ep.parse_urls("google.com:1#ab c") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:1#") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:1#1") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:1#ab c") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:1#") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:1#1") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) # Leading zeros are acceptable (according to the Telegram rules). 
- assert self.ep.parse_urls("google.com:00000001/abs") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:000000065535/abs") == (MessageEntity(length=27, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:000000080/abs") == (MessageEntity(length=24, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:65535") == (MessageEntity(length=16, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:00000001/abs") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:000000065535/abs") == (MessageEntity(length=27, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:000000080/abs") == (MessageEntity(length=24, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:65535") == (MessageEntity(length=16, offset=0, type=MessageEntityType.URL),) def test_invalid_ports(self): # Too big port number (>65535) - assert self.ep.parse_urls("google.com:000000065536/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:65536") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:100000") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:000000065536/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:65536") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:100000") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) # The port number overflow and invalid symbold in the path. 
- assert self.ep.parse_urls("google.com:0000000655353/abs>>>>") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:0000000655353/abs>>>>") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) # Zero port is not acceptable. - assert self.ep.parse_urls("google.com:0000000/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:0/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:0000000/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:0/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) # Empty port. - assert self.ep.parse_urls("google.com:/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:/abs") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) def test_localhost_ip_address(self): - assert self.ep.parse_urls("127.001") == () - assert self.ep.parse_urls("127.0.0.1") == (MessageEntity(length=9, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("127.0.0.01") == () - assert self.ep.parse_urls("127.0.0.256") == () - assert self.ep.parse_urls("127.0.0.300") == () - assert self.ep.parse_urls("127.0.0.260") == () - assert self.ep.parse_urls("1.0") == () - assert self.ep.parse_urls("127.0.0.1000") == () + assert self.ep.parse_urls_and_emails("127.001") == () + assert self.ep.parse_urls_and_emails("127.0.0.1") == (MessageEntity(length=9, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("127.0.0.01") == () + assert self.ep.parse_urls_and_emails("127.0.0.256") == () + assert self.ep.parse_urls_and_emails("127.0.0.300") == () + assert self.ep.parse_urls_and_emails("127.0.0.260") == () + assert 
self.ep.parse_urls_and_emails("1.0") == () + assert self.ep.parse_urls_and_emails("127.0.0.1000") == () def test_fake_domain_teiegram(self): - assert self.ep.parse_urls("teiegram.org/test") == () - assert self.ep.parse_urls("TeiegraM.org/test") == () - assert self.ep.parse_urls("TeiegraM.org") == () - assert self.ep.parse_urls("teiegram.org") == () + assert self.ep.parse_urls_and_emails("teiegram.org/test") == () + assert self.ep.parse_urls_and_emails("TeiegraM.org/test") == () + assert self.ep.parse_urls_and_emails("TeiegraM.org") == () + assert self.ep.parse_urls_and_emails("teiegram.org") == () def test_parentheses_and_brackets(self): - assert self.ep.parse_urls("http://test.google.com/?q=abc()}[]def") == (MessageEntity(length=31, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.google.com/?q=abc([{)]}def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.google.com/?q=abc(){}]def") == (MessageEntity(length=33, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.google.com/?q=abc){}[]def") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.google.com/?q=abc(){}[]def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.google.com/?q=abc()}[]def") == (MessageEntity(length=31, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.google.com/?q=abc([{)]}def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.google.com/?q=abc(){}]def") == (MessageEntity(length=33, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.google.com/?q=abc){}[]def") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) + assert 
self.ep.parse_urls_and_emails("http://test.google.com/?q=abc(){}[]def") == (MessageEntity(length=38, offset=0, type=MessageEntityType.URL),) def test_underscores(self): - assert self.ep.parse_urls("http://google_.com") == () - assert self.ep.parse_urls("http://google._com_") == () - assert self.ep.parse_urls("http://test_.google.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google_.com") == () + assert self.ep.parse_urls_and_emails("http://google._com_") == () + assert self.ep.parse_urls_and_emails("http://test_.google.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) def test_hyphen_at_end_of_domain_and_subdomain(self): - assert self.ep.parse_urls("http://test-.google.com") == () - assert self.ep.parse_urls("http://test.google-.com") == () + assert self.ep.parse_urls_and_emails("http://test-.google.com") == () + assert self.ep.parse_urls_and_emails("http://test.google-.com") == () def test_ipv6_address(self): - assert self.ep.parse_urls("http://[2001:4860:0:2001::68]/") == () + assert self.ep.parse_urls_and_emails("http://[2001:4860:0:2001::68]/") == () def test_tg_domains(self): - assert self.ep.parse_urls("tg://resolve") == () + assert self.ep.parse_urls_and_emails("tg://resolve") == () def test_different_url_endings(self): - assert self.ep.parse_urls("http://google.com/") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://google.com?") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://google.com#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://google.com##") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://google.com/?") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) - assert 
self.ep.parse_urls("https://www.google.com/ab,") == (MessageEntity(length=25, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.com#a") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.com#") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.com?#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test.com/?#") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google.com/") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google.com?") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google.com#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google.com##") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://google.com/?") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://www.google.com/ab,") == (MessageEntity(length=25, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.com#a") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.com#") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.com?#") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test.com/?#") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) def test_at_symbol(self): - assert self.ep.parse_urls("https://a.bc@c.com") == 
(MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a.de/bc@c.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a.debc@c.com") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a.de`bc@c.com") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL), - MessageEntity(length=5, offset=16, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("https://a.bc@c.com") == (MessageEntity(length=18, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a.de/bc@c.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a.debc@c.com") == (MessageEntity(length=20, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a.de`bc@c.com") == (MessageEntity(length=15, offset=0, type=MessageEntityType.URL), + MessageEntity(length=5, offset=16, type=MessageEntityType.URL)) - assert self.ep.parse_urls("https://a.bcde.fg@c.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://abc@c.com") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://a.bc@test.com:cd.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL), - MessageEntity(length=6, offset=22, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("https://a.bcde.fg@c.com") == (MessageEntity(length=23, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://abc@c.com") == (MessageEntity(length=17, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://a.bc@test.com:cd.com") == (MessageEntity(length=21, offset=0, type=MessageEntityType.URL), + MessageEntity(length=6, offset=22, type=MessageEntityType.URL)) def 
test_filenames_like_urls(self): - assert self.ep.parse_urls("File '/usr/views.py'") == (MessageEntity(length=8, offset=11, type=MessageEntityType.URL),) - assert self.ep.parse_urls(".views.py") == () - assert self.ep.parse_urls("'views.py'") == (MessageEntity(length=8, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("File '/usr/views.py'") == (MessageEntity(length=8, offset=11, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails(".views.py") == () + assert self.ep.parse_urls_and_emails("'views.py'") == (MessageEntity(length=8, offset=1, type=MessageEntityType.URL),) def test_misc(self): - assert self.ep.parse_urls("telegram. org. www. com... telegram.org... ...google.com...") == (MessageEntity(length=12, offset=27, type=MessageEntityType.URL),) - assert self.ep.parse_urls("Такой сайт: http://www.google.com или такой telegram.org") == (MessageEntity(length=21, offset=12, type=MessageEntityType.URL), - MessageEntity(length=12, offset=44, type=MessageEntityType.URL)) - assert self.ep.parse_urls("[http://google.com](test)") == (MessageEntity(length=17, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("google.com:᪀᪀") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("/.b/..a @.....@/. 
a.ba") == (MessageEntity(length=4, offset=21, type=MessageEntityType.URL),) - assert self.ep.parse_urls("('http://telegram.org/a-b/?br=ie&lang=en',)") == (MessageEntity(length=38, offset=2, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://ai.telegram.org/bot%20bot/test-...") == (MessageEntity(length=39, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("bbbbbbbbbbbbbb.@.@") == () - assert self.ep.parse_urls("@.") == () - assert self.ep.parse_urls("") == (MessageEntity(length=59, offset=1, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://t.me/abcdef…") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://t.me…") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://t.m…") == () - assert self.ep.parse_urls("https://t.…") == () - assert self.ep.parse_urls("https://t…") == () - assert self.ep.parse_urls(".?") == () - assert self.ep.parse_urls("👉http://ab.com/cdefgh-1IJ") == (MessageEntity(length=24, offset=2, type=MessageEntityType.URL),) - assert self.ep.parse_urls("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――") == (MessageEntity(length=48, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("a!:b@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("a:b!@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("_sip._udp.apnic.net") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) - assert self.ep.parse_urls("https://as_sip._udp.apnic.net") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("telegram. org. www. com... telegram.org... 
...google.com...") == (MessageEntity(length=12, offset=27, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("Такой сайт: http://www.google.com или такой telegram.org") == (MessageEntity(length=21, offset=12, type=MessageEntityType.URL), + MessageEntity(length=12, offset=44, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("[http://google.com](test)") == (MessageEntity(length=17, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("google.com:᪀᪀") == (MessageEntity(length=10, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("/.b/..a @.....@/. a.ba") == (MessageEntity(length=4, offset=21, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("('http://telegram.org/a-b/?br=ie&lang=en',)") == (MessageEntity(length=38, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://ai.telegram.org/bot%20bot/test-...") == (MessageEntity(length=39, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("bbbbbbbbbbbbbb.@.@") == () + assert self.ep.parse_urls_and_emails("@.") == () + assert self.ep.parse_urls_and_emails("") == (MessageEntity(length=59, offset=1, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://t.me/abcdef…") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://t.me…") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://t.m…") == () + assert self.ep.parse_urls_and_emails("https://t.…") == () + assert self.ep.parse_urls_and_emails("https://t…") == () + assert self.ep.parse_urls_and_emails(".?") == () + assert self.ep.parse_urls_and_emails("👉http://ab.com/cdefgh-1IJ") == (MessageEntity(length=24, offset=2, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――") == 
(MessageEntity(length=48, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("a!:b@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("a:b!@gmail.com") == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("_sip._udp.apnic.net") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("https://as_sip._udp.apnic.net") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) def test_emails(self): - assert self.ep.parse_urls("a.bc@c.com") == (MessageEntity(length=10, offset=0, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://a.de[bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de]bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de{bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de}bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de(bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de)bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://a.de'bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), - 
MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) - assert self.ep.parse_urls("https://de[bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de/bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de[bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de{bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de}bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de(bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de)bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de\\bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de'bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("https://de`bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("a@b@c.com") == (MessageEntity(length=7, offset=2, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("a@b.com:c@1") == (MessageEntity(length=7, offset=0, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("test@test.software") == (MessageEntity(length=18, offset=0, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("abc@c.com@d.com") == (MessageEntity(length=9, offset=0, type=MessageEntityType.EMAIL), - MessageEntity(length=5, offset=10, type=MessageEntityType.URL)) - assert self.ep.parse_urls("Look :test@example.com") == (MessageEntity(length=16, offset=6, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("a#:b@gmail.com") == 
(MessageEntity(length=11, offset=3, type=MessageEntityType.EMAIL),) - assert self.ep.parse_urls("Look mailto:test@example.com") == (MessageEntity(length=16, offset=12, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("a.bc@c.com") == (MessageEntity(length=10, offset=0, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://a.de[bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de]bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de{bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de}bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de(bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de)bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://a.de'bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), + MessageEntity(length=8, offset=13, type=MessageEntityType.EMAIL)) + assert self.ep.parse_urls_and_emails("https://de[bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de/bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert 
self.ep.parse_urls_and_emails("https://de[bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de{bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de}bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de(bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de)bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de\\bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de'bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("https://de`bc@c.com") == (MessageEntity(length=8, offset=11, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("a@b@c.com") == (MessageEntity(length=7, offset=2, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("a@b.com:c@1") == (MessageEntity(length=7, offset=0, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("test@test.software") == (MessageEntity(length=18, offset=0, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("abc@c.com@d.com") == (MessageEntity(length=9, offset=0, type=MessageEntityType.EMAIL), + MessageEntity(length=5, offset=10, type=MessageEntityType.URL)) + assert self.ep.parse_urls_and_emails("Look :test@example.com") == (MessageEntity(length=16, offset=6, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("a#:b@gmail.com") == (MessageEntity(length=11, offset=3, type=MessageEntityType.EMAIL),) + assert self.ep.parse_urls_and_emails("Look mailto:test@example.com") == (MessageEntity(length=16, 
offset=12, type=MessageEntityType.EMAIL),) From 4673795c15c32132d7b2b3ee0354d906b5ca1aa4 Mon Sep 17 00:00:00 2001 From: elebur Date: Fri, 30 May 2025 21:18:22 +0300 Subject: [PATCH 09/16] refactor: rename `test_parse_urls` to `test_parse_urls_and_emails` --- .../{test_parse_urls.py => test_parse_urls_and_emails.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/test_EntityParser/{test_parse_urls.py => test_parse_urls_and_emails.py} (100%) diff --git a/tests/test_EntityParser/test_parse_urls.py b/tests/test_EntityParser/test_parse_urls_and_emails.py similarity index 100% rename from tests/test_EntityParser/test_parse_urls.py rename to tests/test_EntityParser/test_parse_urls_and_emails.py From b8b93e81ec19bd0ccf2cc77f9610b85fbd3ea87f Mon Sep 17 00:00:00 2001 From: elebur Date: Fri, 30 May 2025 21:20:56 +0300 Subject: [PATCH 10/16] test: mark failing test with `xfail` When all the entity parsers are ready, this test must be rewritten --- tests/test_MessageGenerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_MessageGenerator.py b/tests/test_MessageGenerator.py index 56643ee..69bd3c4 100644 --- a/tests/test_MessageGenerator.py +++ b/tests/test_MessageGenerator.py @@ -158,6 +158,7 @@ def test_text_with_markdown(self): MessageGenerator().get_message( text="bad *_double_* markdown", parse_mode="Markdown") + @pytest.mark.xfail(reason="Waiting for entity parsers.") def test_with_html(self): teststr = ("we have bold code " "google @username " From 5617b8c53ea7746c5c555effdc2e755367fc6ac5 Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 31 May 2025 13:13:16 +0300 Subject: [PATCH 11/16] refactor(`parse_urls_and_emails`): allow the "%" sign in the basic auth part --- ptbtest/entityparser.py | 2 +- .../test_parse_urls_and_emails.py | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 13f6566..d7764c1 --- a/ptbtest/entityparser.py +++ 
b/ptbtest/entityparser.py @@ -1934,7 +1934,7 @@ def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: The tuple might be empty if no entities were found. """ # Allowed characters in the username and in the password in the basic auth. - user_pass_chars = "a-zA-Z0-9._―‑!-" + user_pass_chars = "a-z0-9._―‑!%-" # This pattern is based on this one https://gist.github.com/dperini/729294 pattern = re.compile( # Optional protocol. diff --git a/tests/test_EntityParser/test_parse_urls_and_emails.py b/tests/test_EntityParser/test_parse_urls_and_emails.py index f06cbf8..c81a663 100644 --- a/tests/test_EntityParser/test_parse_urls_and_emails.py +++ b/tests/test_EntityParser/test_parse_urls_and_emails.py @@ -266,6 +266,85 @@ def test_misc(self): assert self.ep.parse_urls_and_emails("_sip._udp.apnic.net") == (MessageEntity(length=19, offset=0, type=MessageEntityType.URL),) assert self.ep.parse_urls_and_emails("https://as_sip._udp.apnic.net") == (MessageEntity(length=29, offset=0, type=MessageEntityType.URL),) + def test_complex(self): + text = ("a.b.google.com dfsknnfs gsdfgsg http://códuia.de/ dffdg,\" 12)(cpia.de/())(\" http://гришка.рф/ sdufhdf " + "http://xn--80afpi2a3c.xn--p1ai/ I have a good time.Thanks, guys!\n\n(hdfughidufhgdis) go#ogle.com гришка.рф " + "hsighsdf gi почта.рф\n\n✪df.ws/123 " + "xn--80afpi2a3c.xn--p1ai\n\nhttp://foo.com/blah_blah\nhttp://foo.com/blah_blah/\n(Something like " + "http://foo.com/blah_blah)\nhttp://foo.com/blah_blah_(wikipedi8989a_Вася)\n(Something like " + "http://foo.com/blah_blah_(Стакан_007))\nhttp://foo.com/blah_blah.\nhttp://foo.com/blah_blah/.\n\n\nhttp://foo.com/blah_blah,\nhttp://" + "www.example.com/wpstyle/?p=364.\nhttp://✪df.ws/123\nrdar://1234\nhttp://" + "userid:password@example.com:8080\nhttp://userid@example.com\nhttp://userid@example.com:8080\nhttp://" + "userid:password@example.com\nhttp://example.com:8080 " + "x-yojimbo-item://6303E4C1-xxxx-45A6-AB9D-3A908F59AE0E\nmessage://" + 
"%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e\n" + "http://example.com\nJust a www.example.com " + "link.\n\n➡️.ws/" + "䨹\n\nabcdefghijklmnopqrstuvwxyz0123456789qwe_sdfsdf.aweawe-sdfs.com\ngoogle.com:" + "᪉᪉᪉᪉\ngoogle." + "com:᪀᪀\nhttp://  .com\nURL:     .com\nURL: " + ".com\n\ngoogle.com?qwe\ngoogle.com#qwe\ngoogle.com/?\ngoogle.com/#\ngoogle.com?\ngoogle.com#\n") + + assert self.ep.parse_urls_and_emails(text) == (MessageEntity(length=14, offset=0, type=MessageEntityType.URL), + MessageEntity(length=17, offset=32, type=MessageEntityType.URL), + MessageEntity(length=10, offset=62, type=MessageEntityType.URL), + MessageEntity(length=17, offset=76, type=MessageEntityType.URL), + MessageEntity(length=31, offset=102, type=MessageEntityType.URL), + MessageEntity(length=8, offset=189, type=MessageEntityType.URL), + MessageEntity(length=9, offset=198, type=MessageEntityType.URL), + MessageEntity(length=8, offset=220, type=MessageEntityType.URL), + MessageEntity(length=10, offset=230, type=MessageEntityType.URL), + MessageEntity(length=23, offset=246, type=MessageEntityType.URL), + MessageEntity(length=24, offset=271, type=MessageEntityType.URL), + MessageEntity(length=25, offset=296, type=MessageEntityType.URL), + MessageEntity(length=24, offset=338, type=MessageEntityType.URL), + MessageEntity(length=45, offset=364, type=MessageEntityType.URL), + MessageEntity(length=37, offset=426, type=MessageEntityType.URL), + MessageEntity(length=24, offset=465, type=MessageEntityType.URL), + MessageEntity(length=25, offset=491, type=MessageEntityType.URL), + MessageEntity(length=24, offset=519, type=MessageEntityType.URL), + MessageEntity(length=16, offset=583, type=MessageEntityType.URL), + MessageEntity(length=24, offset=601, type=MessageEntityType.URL), + MessageEntity(length=37, offset=627, type=MessageEntityType.URL), + MessageEntity(length=17, offset=666, type=MessageEntityType.URL), + MessageEntity(length=39, offset=696, type=MessageEntityType.URL), + 
MessageEntity(length=25, offset=736, type=MessageEntityType.URL), + MessageEntity(length=30, offset=762, type=MessageEntityType.URL), + MessageEntity(length=34, offset=793, type=MessageEntityType.URL), + MessageEntity(length=23, offset=828, type=MessageEntityType.URL), + MessageEntity(length=18, offset=982, type=MessageEntityType.URL), + MessageEntity(length=15, offset=1014, type=MessageEntityType.URL), + MessageEntity(length=7, offset=1037, type=MessageEntityType.URL), + MessageEntity(length=62, offset=1046, type=MessageEntityType.URL), + MessageEntity(length=10, offset=1109, type=MessageEntityType.URL), + MessageEntity(length=10, offset=1125, type=MessageEntityType.URL), + MessageEntity(length=14, offset=1178, type=MessageEntityType.URL), + MessageEntity(length=14, offset=1193, type=MessageEntityType.URL), + MessageEntity(length=11, offset=1208, type=MessageEntityType.URL), + MessageEntity(length=12, offset=1221, type=MessageEntityType.URL), + MessageEntity(length=10, offset=1234, type=MessageEntityType.URL), + MessageEntity(length=10, offset=1246, type=MessageEntityType.URL)) + + + def test_percentage_symbol(self): + assert self.ep.parse_urls_and_emails("http://%3c330e7f8409726r@mail.gmail.com") == (MessageEntity(length=39, + offset=0, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://%3c330e7f8409726rmail.gmail.com") == (MessageEntity(length=30, + offset=8, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("%3c330e7f8409726rmail.gmail.com") == (MessageEntity(length=30, + offset=1, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("%3c330e7f8409726r@mail.gmail.com") == (MessageEntity(length=32, + offset=0, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("http://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com") == (MessageEntity(length=64, + offset=0, + type=MessageEntityType.URL),) + assert 
self.ep.parse_urls_and_emails("message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e") == () + def test_emails(self): assert self.ep.parse_urls_and_emails("a.bc@c.com") == (MessageEntity(length=10, offset=0, type=MessageEntityType.EMAIL),) assert self.ep.parse_urls_and_emails("https://a.de[bc@c.com") == (MessageEntity(length=12, offset=0, type=MessageEntityType.URL), From 7dae32605e4040310bd172b2e1e01cc9fb22d178 Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 31 May 2025 13:15:12 +0300 Subject: [PATCH 12/16] refactor(`parse_urls_and_emails`): remove the redundant range in the regex pattern --- ptbtest/entityparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index d7764c1..cadfb4b 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -1938,7 +1938,7 @@ def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: # This pattern is based on this one https://gist.github.com/dperini/729294 pattern = re.compile( # Optional protocol. - r"(?:[a-zA-Z]+://)?" + r"(?:[a-z]+://)?" # 'user:pass' basic auth (optional) fr"(?:[:{user_pass_chars}]+(?::[{user_pass_chars}]+)?@)?" r"(?:" From 8b18cec5c171b234d717b87ce8b2ca3fd2f998af Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 31 May 2025 13:16:54 +0300 Subject: [PATCH 13/16] refactor(`parse_urls_and_emails`): move the characters set for the domain to a separate variable --- ptbtest/entityparser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index cadfb4b..722bf67 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -1935,6 +1935,8 @@ def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: """ # Allowed characters in the username and in the password in the basic auth. 
user_pass_chars = "a-z0-9._―‑!%-" + + host_domain_symbols = "a-z0-9\u00a1-\uffff―_‑-" # This pattern is based on this one https://gist.github.com/dperini/729294 pattern = re.compile( # Optional protocol. @@ -1948,10 +1950,10 @@ def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: # host & domain names r"(?:" r"(?:" - r"[a-z0-9\u00a1-\uffff―_‑-]" - r"[a-z0-9\u00a1-\uffff_―‑-]{0,62}" + rf"[{host_domain_symbols}]" + rf"[{host_domain_symbols}]{{0,62}}" r")?" - r"[a-z0-9\u00a1-\uffff_―‑-]\." + rf"[{host_domain_symbols}]\." r")+" # TLD identifier name r"(?:[a-z0-9\u00a1-\uffff`‑―-]{2,})" From 41c4905c3f71d534cf8c49393f329d59f2af0ed3 Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 31 May 2025 13:17:52 +0300 Subject: [PATCH 14/16] test(`parse_bot_commands`): add tests for commands that consist of numbers only --- tests/test_EntityParser/test_parse_bot_commands.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_EntityParser/test_parse_bot_commands.py b/tests/test_EntityParser/test_parse_bot_commands.py index eb1ec17..4665504 100644 --- a/tests/test_EntityParser/test_parse_bot_commands.py +++ b/tests/test_EntityParser/test_parse_bot_commands.py @@ -149,3 +149,7 @@ def test_utf16_offset(self): result = self.ep.parse_bot_commands(text) assert result == (MessageEntity(length=8, offset=6, type=MessageEntityType.BOT_COMMAND),) + + def test_command_from_numbers_only(self): + assert self.ep.parse_bot_commands("/1234") == (MessageEntity(length=5, offset=0, type=MessageEntityType.BOT_COMMAND),) + assert self.ep.parse_bot_commands("rdar:/1234") == (MessageEntity(length=5, offset=5, type=MessageEntityType.BOT_COMMAND),) From 54793876e27dff3fe70b2e29c46a71a50914d1c2 Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 31 May 2025 13:53:21 +0300 Subject: [PATCH 15/16] fix(`parse_urls_and_emails`): now the method respects the UTF16 length of the strings --- ptbtest/entityparser.py | 4 ++-- tests/test_EntityParser/test_parse_urls_and_emails.py | 7 +++++++ 2 
files changed, 9 insertions(+), 2 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 722bf67..8f18f05 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -2012,14 +2012,14 @@ def is_url_path_symbol(ch): if path and valid_symbols_in_path_counter != len(path): invalid_symbols_counter = len(path) - valid_symbols_in_path_counter url = url[:len(url) - invalid_symbols_counter] + entity_length -= _get_utf16_length(path[valid_symbols_in_path_counter:]) path = path[:valid_symbols_in_path_counter] - entity_length -= invalid_symbols_counter fixed_url = _fix_url(url) if not fixed_url: continue elif (url_length_diff := len(url) - len(fixed_url)) > 0: - entity_length -= url_length_diff + entity_length -= _get_utf16_length(url[-url_length_diff:]) # The 'raw_port' will contain the colon symbol. # E.g., ':8080'. diff --git a/tests/test_EntityParser/test_parse_urls_and_emails.py b/tests/test_EntityParser/test_parse_urls_and_emails.py index c81a663..4e67e41 100644 --- a/tests/test_EntityParser/test_parse_urls_and_emails.py +++ b/tests/test_EntityParser/test_parse_urls_and_emails.py @@ -326,6 +326,13 @@ def test_complex(self): MessageEntity(length=10, offset=1234, type=MessageEntityType.URL), MessageEntity(length=10, offset=1246, type=MessageEntityType.URL)) + def test_utf16_length(self): + assert self.ep.parse_urls_and_emails("example.com/hello=𐐷&hhh=2𐍈") == (MessageEntity(length=28, offset=0, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("example.com/hello=𐐷&«hhh=2𐍈") == (MessageEntity(length=21, offset=0, + type=MessageEntityType.URL),) + assert self.ep.parse_urls_and_emails("example.com/hello=𐐷&hhh)=2𐍈") == (MessageEntity(length=24, offset=0, + type=MessageEntityType.URL),) def test_percentage_symbol(self): assert self.ep.parse_urls_and_emails("http://%3c330e7f8409726r@mail.gmail.com") == (MessageEntity(length=39, From 03fef63d32e8c0eed216fb0a17d650efbcc562af Mon Sep 17 00:00:00 2001 From: elebur Date: Sat, 
31 May 2025 21:56:24 +0300 Subject: [PATCH 16/16] refactor(`entityparser`): apply hatch's warnings --- ptbtest/entityparser.py | 15 +++++++++------ .../test_parse_urls_and_emails.py | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ptbtest/entityparser.py b/ptbtest/entityparser.py index 8f18f05..0eb7211 100644 --- a/ptbtest/entityparser.py +++ b/ptbtest/entityparser.py @@ -1,3 +1,4 @@ +# ruff: noqa: C901, RUF001 # A library that provides a testing suite fot python-telegram-bot # which can be found on https://github.com/python-telegram-bot/python-telegram-bot # Copyright (C) 2017 @@ -688,7 +689,9 @@ def _fix_url(full_url: str) -> str: if len(domain_parts) <= 1: return "" - validator = lambda x: not x or len(x) >= 64 or x.endswith("-") + def validator(text): + return not text or len(text) >= 64 or text.endswith("-") + if any(map(validator, domain_parts)): return "" @@ -1876,9 +1879,9 @@ def parse_cashtags(text: str) -> tuple[MessageEntity, ...]: for match in matches: # If the input string is "$ABC@mention", then - # match.group(0) is '$ABC@mention' - # match.group(1) is 'ABC' - # match.group(2) is 'mention' (optional) + # group 0 is '$ABC@mention' + # group 1 is 'ABC' + # group 2 is 'mention' (optional) cashtag = match.group(1) mention = match.group(2) @@ -1944,7 +1947,7 @@ def parse_urls_and_emails(text: str) -> tuple[MessageEntity, ...]: # 'user:pass' basic auth (optional) fr"(?:[:{user_pass_chars}]+(?::[{user_pass_chars}]+)?@)?" r"(?:" - # IP address + # IP address r"(?:(?:\d{1,3})\.){3}(?:\d{1,3})\b" r"|" # host & domain names @@ -1995,7 +1998,7 @@ def is_url_path_symbol(ch): continue # if there is a dot(s) followed by a non-whitespace symbol right after the # TLD, then ignore such an URL. 
- elif re.search("^\.+[^.\s]", text[match.end:]): + elif re.search(r"^\.+[^.\s]", text[match.end:]): continue elif protocol and protocol.lower() not in ("http", "https", "ftp", "tonsite"): continue diff --git a/tests/test_EntityParser/test_parse_urls_and_emails.py b/tests/test_EntityParser/test_parse_urls_and_emails.py index 4e67e41..5028d20 100644 --- a/tests/test_EntityParser/test_parse_urls_and_emails.py +++ b/tests/test_EntityParser/test_parse_urls_and_emails.py @@ -1,3 +1,4 @@ +# ruff: noqa: RUF001 from telegram import MessageEntity from telegram.constants import MessageEntityType