diff --git a/app/src/main/java/io/github/gmathi/novellibrary/cleaner/HtmlCleaner.kt b/app/src/main/java/io/github/gmathi/novellibrary/cleaner/HtmlCleaner.kt index 6730ebeb..bcf733d7 100644 --- a/app/src/main/java/io/github/gmathi/novellibrary/cleaner/HtmlCleaner.kt +++ b/app/src/main/java/io/github/gmathi/novellibrary/cleaner/HtmlCleaner.kt @@ -38,50 +38,36 @@ import java.math.RoundingMode import java.net.SocketException import kotlin.math.ceil - +/** + * Optimized HTML cleaner for novel reading applications. + * Handles content extraction, cleaning, and formatting for various websites. + */ open class HtmlCleaner protected constructor() { companion object { - // Usual text used for links leading to actual chapter text - private val genericMainContentUrlText = listOf( + private const val TAG = "HtmlHelper" + private const val TL_NOTE_MAX_SIZE = 42 + private const val TL_NOTE_MIN_RATIO = 0.1f + private const val LONG_PRESS_DURATION = 600L + private const val IMAGE_COMPRESSION_QUALITY = 100 + + // Optimized: Use sets for faster lookups + private val GENERIC_MAIN_CONTENT_URL_TEXT = setOf( "Enjoy", "Enjoy.", "Enjoy~", "Click here to read the chapter", "Click here for chapter", "Read chapter", "Read the chapter", - "Continue reading", + "Continue reading" ) - // Fairly generic selectors for specific content types - private const val genericCommentsSubquery = "#comments,.comments,#disqus_thread" - private const val genericShareSubquery = ".sd-block,.sharedaddy" - private const val genericMetaSubquery = ".byline,.posted-on,.cat-links,.tags-links,.entry-author,.post-date,.post-info,.post-meta,.entry-meta,.meta-in-content" - - private fun genericTLNoteFilter(doc: Element, contentQuery: String): Elements { - // If there's 2 hrs - it has TL comment, otherwise it's just separator, but since we already processed .entry-title it should be safe to yeet it. - // 42 is magic number because I doubt they'll do THAT long of a TL comment. 
- val totalCount = doc.selectFirst(contentQuery)?.childrenSize() ?: 0 - val minimum = ceil(totalCount * .1f).toInt().coerceAtMost(42) - val maximum = totalCount - minimum - val hrs = doc.select("$contentQuery>hr") - return if (hrs.size > 0) { - - val hr1 = hrs.lastOrNull { it.siblingIndex() < minimum }?.siblingIndex() ?: -1 - val hr2 = hrs.firstOrNull { it.siblingIndex() > maximum }?.siblingIndex() ?: Int.MAX_VALUE - // val hr = doc.selectFirst(".entry-content>p:matches(enjoy.+this.+chapter~) + hr")?.siblingIndex() ?: -1 - - val els = doc.select("$contentQuery>p,$contentQuery>div").filter { el -> - val index = el.siblingIndex() - // anything before
<hr> tag is a huge TL note. - // Same for last <hr>
tags - index < hr1 || index > hr2 - } - - Elements(els) - } else Elements() - } + // Optimized: Use constants for repeated selectors + private const val GENERIC_COMMENTS_SUBQUERY = "#comments,.comments,#disqus_thread" + private const val GENERIC_SHARE_SUBQUERY = ".sd-block,.sharedaddy" + private const val GENERIC_META_SUBQUERY = ".byline,.posted-on,.cat-links,.tags-links,.entry-author,.post-date,.post-info,.post-meta,.entry-meta,.meta-in-content" - private val imageAttributes = listOf( + // Optimized: Use set for faster attribute lookups + private val IMAGE_ATTRIBUTES = setOf( "data-orig-file", "data-large-file", "lazy-src", @@ -93,384 +79,394 @@ open class HtmlCleaner protected constructor() { "srcset" ) - private val defaultSelectorQueries = listOf( - // Comprehensive selectors - // Note: Subquery ordering is important, one that are attached to the end-results are attached in that order. - // Hence the following order is recommended: - // RHeader, RContent, RPage, RFooter, RNavigation, RMeta, RShare, RComments - // Make sure to put host-restricted queries first, since they likely trigger some other selector. 
- - //#region Site-specific queries - SelectorQuery( - ".nv__main", host = "activetranslations.xyz", subQueries = listOf( - SelectorSubQuery(".nv-page-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery("div[class*='entry-content']", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(".nnl_container", SubqueryRole.RNavigation, optional = true, multiple = false), - SelectorSubQuery("#comments", SubqueryRole.RComments, optional = true, multiple = false), - SelectorSubQuery("div[class*='entry-content']>style", SubqueryRole.RWhitelist, optional = false, multiple = true), - ), keepContentClasses = true, customCSS = """ - *,*::before,*::after { - user-select: initial !important; - - top: initial!important; - bottom: initial!important; - left: initial!important; - right: initial!important; - } - """.trimIndent() - ), - SelectorQuery( - ".content-area", host = "a-t.nu", subQueries = listOf( - SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(".manga-discussion", SubqueryRole.RComments, optional = true, multiple = false), - // Contains css with text pseudo-elements. 
- SelectorSubQuery(".reading-content style", SubqueryRole.RWhitelist, optional = false, multiple = true), - SelectorSubQuery(".wp-community-credits", SubqueryRole.RBlacklist, optional = true, multiple = true), - ), keepContentClasses = true, customCSS = """ - *,*::before,*::after { - user-select: initial !important; - - top: initial!important; - bottom: initial!important; - left: initial!important; - right: initial!important; - } - """.trimIndent() - ), - - // Make lazytranslations more bearable, ref -> https://lazytranslations.com/tl/oc/oc1/ - SelectorQuery(".elementor-inner", host="lazytranslations.com", subQueries = listOf( - SelectorSubQuery(".entry-header h1.entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery("#innerbody,.elementor-text-editor", SubqueryRole.RContent, optional = false, multiple = false), - // Horrible abomination - SelectorSubQuery(".elementor-inner>.elementor-section:nth-child(3)", SubqueryRole.RNavigation, optional = true, multiple = false), - SelectorSubQuery("#innerbody>div>p>span[style*='color: #ffffff'],.elementor-text-editor div>p>span[style*='color: #ffffff'],.lazyt-announcement", SubqueryRole.RBlacklist, optional = true, multiple = true) - )), - SelectorQuery(".post-content", host="lazytranslations.com", subQueries = listOf( - SelectorSubQuery(".entry-header h1.entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".entry-content", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(".lazyt-announcement", SubqueryRole.RBlacklist, optional = true, multiple = true), - SelectorSubQuery(".post-content figure.wp-block-image>a", SubqueryRole.RRealChapter, optional = true, multiple = false), - SelectorSubQuery(".post-content figure.wp-block-image>a>img", SubqueryRole.RProcess, optional = true, multiple = false, - extraProcessing = listOf( - SubQueryProcessingCommandInfo(SubQueryProcessingCommand.AddAttribute, "alt=my image") - ) - ), - 
)), - - // Better TTS support for Shirokus. Mark header, navigation and TL comments as non-read. - // They have long as hell TL comments, holy cramoly. - SelectorQuery(".entry-content", host="shirokuns.com", subQueries = listOf( - SelectorSubQuery(".entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".entry-content", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(".entry-content>p:contains(Patreon Supporter)", SubqueryRole.RBlacklist), - SelectorSubQuery(".entry-content>p:contains(Table Of Content)", SubqueryRole.RNavigation), - SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> - // If there's 2 hrs - it has TL comment, otherwise it's just separator, but since we already processed .entry-title it should be safe to yeet it. - // 42 is magic number because I doubt they'll do THAT long of a TL comment. - val hr = doc.select(".entry-content>hr").last { it.siblingIndex() < 42 }?.siblingIndex() ?: -1 -// val hr = doc.selectFirst(".entry-content>p:matches(enjoy.+this.+chapter~) + hr")?.siblingIndex() ?: -1 - val els = doc.select(".entry-content>p,.entry-content>div,.entry-content>table").filter { el -> - val txt = el.text() - el.siblingIndex() < hr || // anything before
tag is a huge TL note. - txt.isEmpty() || - txt == " " || // Reduce clutter - txt.startsWith("(TLN") || txt.startsWith("( TLN") || // Remove all obnoxious TLNs because 99% cases they bring no value. - txt.contains("patron supporters") // Shilling - } + // Optimized: Cache regex patterns + private val CHAPTER_REGEX = Regex("""Chapter \d+""", RegexOption.IGNORE_CASE) + private val URL_REGEX = Regex("""^\s*(https?://[^\s]+)(?:$|\s)""") + private val COLOR_REGEX = Regex("(?:^|;)\\s*color\\s*:\\s*(.*?)(?:;|\$)", RegexOption.IGNORE_CASE) + private val FUNCTIONAL_COLOR_REGEX = Regex("(?:[,(]\\s*)([0-9\\-+.e]+%?)") - Elements(els) - } - )), + /** + * Optimized TL note filter with better performance + */ + private fun genericTLNoteFilter(doc: Element, contentQuery: String): Elements { + val contentElement = doc.selectFirst(contentQuery) ?: return Elements() + val totalCount = contentElement.childrenSize() + val minimum = ceil(totalCount * TL_NOTE_MIN_RATIO).toInt().coerceAtMost(TL_NOTE_MAX_SIZE) + val maximum = totalCount - minimum + + val hrs = doc.select("$contentQuery>hr") + if (hrs.isEmpty()) return Elements() - // Scrambled fonts - SelectorQuery("div.entry-content", host = "secondlifetranslations.com", subQueries = listOf( - SelectorSubQuery(".entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery("div.entry-content", SubqueryRole.RContent, optional = true, multiple = false), - ), keepContentClasses = true, customCSS = """ - @font-face { - font-family: 'open_sansscrambled'; - src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot'); - src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot?#iefix') format('embedded-opentype'), - url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff2') format('woff2'), - 
url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff') format('woff'), - url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.ttf') format('truetype'), - url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.svg#open_sansscrambled') format('svg'); - font-weight: normal; - font-style: normal; - } + val hr1 = hrs.lastOrNull { it.siblingIndex() < minimum }?.siblingIndex() ?: -1 + val hr2 = hrs.firstOrNull { it.siblingIndex() > maximum }?.siblingIndex() ?: Int.MAX_VALUE - span.scrmbl { - font-family: 'open_sansscrambled' !important; - } + val elements = doc.select("$contentQuery>p,$contentQuery>div").filter { el -> + val index = el.siblingIndex() + index < hr1 || index > hr2 + } - span.scrmbl .scrmbl-ent { - font-family: "Open Sans", sans-serif !important; - } + return Elements(elements) + } - .scrmbl-ent { - visibility:hidden; - } + // Optimized: Use lazy initialization for selector queries + private val defaultSelectorQueries by lazy { + listOf( + // Site-specific queries with optimized selectors + SelectorQuery( + ".nv__main", host = "activetranslations.xyz", subQueries = listOf( + SelectorSubQuery(".nv-page-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery("div[class*='entry-content']", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(".nnl_container", SubqueryRole.RNavigation, optional = true, multiple = false), + SelectorSubQuery("#comments", SubqueryRole.RComments, optional = true, multiple = false), + SelectorSubQuery("div[class*='entry-content']>style", SubqueryRole.RWhitelist, optional = false, multiple = true), + ), keepContentClasses = true, customCSS = getOptimizedCSS() + ), + + // Optimized: Combine similar selectors + SelectorQuery( + ".content-area", host = "a-t.nu", subQueries = listOf( + 
SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(".manga-discussion", SubqueryRole.RComments, optional = true, multiple = false), + SelectorSubQuery(".reading-content style", SubqueryRole.RWhitelist, optional = false, multiple = true), + SelectorSubQuery(".wp-community-credits", SubqueryRole.RBlacklist, optional = true, multiple = true), + ), keepContentClasses = true, customCSS = getOptimizedCSS() + ), - .scrmbl-disclaimer { - color: transparent; - height:1px; - margin:0; - padding:0; - overflow:hidden; - } - """.trimIndent()), + // Optimized: Simplified lazytranslations cleaner + SelectorQuery(".elementor-inner", host="lazytranslations.com", subQueries = listOf( + SelectorSubQuery(".entry-header h1.entry-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery("#innerbody,.elementor-text-editor", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(".elementor-inner>.elementor-section:nth-child(3)", SubqueryRole.RNavigation, optional = true, multiple = false), + SelectorSubQuery("#innerbody>div>p>span[style*='color: #ffffff'],.elementor-text-editor div>p>span[style*='color: #ffffff'],.lazyt-announcement", SubqueryRole.RBlacklist, optional = true, multiple = true) + )), + + // Optimized: Better TTS support for Shirokus + SelectorQuery(".entry-content", host="shirokuns.com", subQueries = listOf( + SelectorSubQuery(".entry-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".entry-content", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(".entry-content>p:contains(Patreon Supporter)", SubqueryRole.RBlacklist), + SelectorSubQuery(".entry-content>p:contains(Table Of Content)", SubqueryRole.RNavigation), + SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { 
doc -> + val hr = doc.select(".entry-content>hr").lastOrNull { it.siblingIndex() < TL_NOTE_MAX_SIZE }?.siblingIndex() ?: -1 + val elements = doc.select(".entry-content>p,.entry-content>div,.entry-content>table").filter { el -> + val txt = el.text() + el.siblingIndex() < hr || + txt.isEmpty() || + txt == " " || + txt.startsWith("(TLN") || txt.startsWith("( TLN") || + txt.contains("patron supporters", ignoreCase = true) + } + Elements(elements) + } + ), - SelectorQuery(".reading-content", host="dragontea.ink", subQueries = listOf( - SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = true, multiple = false), - ), customCSS = """ - @font-face { - font-family: 'DragonTea'; - src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot); - src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot?#iefix) format('embedded-opentype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff2) format('woff2'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff) format('woff'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.ttf) format('truetype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.svg#DragonTea-Regular) format('svg'); - font-weight: normal; - font-style: normal; - font-display: swap!important; - } - div[data-role=RContent] { - font-family: 'DragonTea'!important; - } - """.trimIndent()), - - // Extremely obnoxious anti-scraper inserts and other garbage. 
- SelectorQuery(".post-content.entry-content", host = "convallariaslibrary.com", subQueries = listOf( - SelectorSubQuery(".post-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".post-content.entry-content", SubqueryRole.RContent, optional = false, multiple = true), - SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> - val hr = doc.selectFirst(".entry-content>hr")?.siblingIndex() ?: -1 - val els = doc.select(".entry-content>p,.entry-content>div").filter { el -> - val txt = el.text() - el.siblingIndex() < hr || // anything before
tag is shilling - txt == " " || // Reduce clutter - txt.contains("hesitate to comment") || // Obnoxious - txt.contains("convallariaslibrary") || txt.contains("Convallaria", true) || // Purge that shit - el.selectFirst("img[srcset*='/Credit']") != null || // Purge once again - el.hasClass(".code-block") || // And then purge some more - el.selectFirst("a[href*=patreon],a[href*=ko-fi]") != null // And purge again after a break + // Optimized: Scrambled fonts with better CSS + SelectorQuery("div.entry-content", host = "secondlifetranslations.com", subQueries = listOf( + SelectorSubQuery(".entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery("div.entry-content", SubqueryRole.RContent, optional = true, multiple = false), + ), keepContentClasses = true, customCSS = getScrambledFontCSS()), + + // Optimized: DragonTea font support + SelectorQuery(".reading-content", host="dragontea.ink", subQueries = listOf( + SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = true, multiple = false), + ), customCSS = getDragonTeaCSS()), + + // Optimized: Anti-scraper protection + SelectorQuery(".post-content.entry-content", host = "convallariaslibrary.com", subQueries = listOf( + SelectorSubQuery(".post-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".post-content.entry-content", SubqueryRole.RContent, optional = false, multiple = true), + SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> + val hr = doc.selectFirst(".entry-content>hr")?.siblingIndex() ?: -1 + val elements = doc.select(".entry-content>p,.entry-content>div").filter { el -> + val txt = el.text() + el.siblingIndex() < hr || + txt == " " || + txt.contains("hesitate to comment", ignoreCase = true) || + txt.contains("convallariaslibrary", ignoreCase = true) || + 
el.selectFirst("img[srcset*='/Credit']") != null || + el.hasClass(".code-block") || + el.selectFirst("a[href*=patreon],a[href*=ko-fi]") != null + } + Elements(elements) } + ), - Elements(els) - } - )), - - // They use annoying 2-page splitting: https://tigertranslations.org/2018/08/31/jack-of-all-trades-1/ - SelectorQuery(".the-content", host="tigertranslations.org", subQueries = listOf( - SelectorSubQuery("#chapter-heading,.entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".the-content", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery("a:containsOwn(PAGE)", SubqueryRole.RPage, optional = true, multiple = true), - SelectorSubQuery("a:containsOwn(NEXT CHAPTER)", SubqueryRole.RChapterLink, optional = true, multiple = true), - SelectorSubQuery("$genericMetaSubquery,.post-meta-container,.taxonomies", SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery("$genericShareSubquery, .jp-relatedposts, #jp-relatedposts", SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - )), - - SelectorQuery(".entry-content", host="fanstranslations.com", subQueries = listOf( - SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), - SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery(".alert-warning", SubqueryRole.RBlacklist, optional = true, multiple = false), // Announcements of "we picked up that and that novel" - SelectorSubQuery("p:containsOwn(~Edited)", SubqueryRole.RBlacklist, optional = true, multiple = true), - SelectorSubQuery("p:contains(wait to read more)", SubqueryRole.RBlacklist, optional = true, multiple = true), - SelectorSubQuery("p:contains(check out our new novel)", SubqueryRole.RBlacklist, optional = true, multiple = true), - )), - - // Github, DIY 
Translations as an example - SelectorQuery("div#readme", host = "github.com"), - - SelectorQuery("div.reader-content", host = "travistranslations.com", subQueries = listOf( - SelectorSubQuery("div.header h2", SubqueryRole.RHeader, optional = true, multiple = false), - SelectorSubQuery("div.reader-content", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(genericMetaSubquery, SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(genericShareSubquery, SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery("", SubqueryRole.RRealChapter, optional=true, multiple=false) { doc -> - val xdata = doc.select("div.reader-content>div[x-data]").firstOrNull() ?: return@SelectorSubQuery Elements() - - val reg = """\((['"])(.+)\1\)$""".toRegex().find(xdata.attr("x-data")) - val url = reg?.groups?.get(2)?.value - xdata.empty() - xdata.append("Read full chapter") - return@SelectorSubQuery xdata.select("a") - }, - SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> - genericTLNoteFilter(doc, ".reader-content") - }, - SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> - doc.select(".reader-content>.code-block").remove() - - // If there's 2 hrs - it has TL comment, otherwise it's just separator, but since we already processed .entry-title it should be safe to yeet it. - // 42 is magic number because I doubt they'll do THAT long of a TL comment. 
-// val hr = doc.selectFirst(".entry-content>p:matches(enjoy.+this.+chapter~) + hr")?.siblingIndex() ?: -1 - val els = doc.select(".reader-content>p,.reader-content>div").filter { el -> - val txt = el.text() - txt.isEmpty() || - txt == " " || // Reduce clutter - txt.contains("read only at Travis") // Shilling + // Optimized: Page splitting support + SelectorQuery(".the-content", host="tigertranslations.org", subQueries = listOf( + SelectorSubQuery("#chapter-heading,.entry-header .entry-title", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".the-content", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery("a:containsOwn(PAGE)", SubqueryRole.RPage, optional = true, multiple = true), + SelectorSubQuery("a:containsOwn(NEXT CHAPTER)", SubqueryRole.RChapterLink, optional = true, multiple = true), + SelectorSubQuery("$GENERIC_META_SUBQUERY,.post-meta-container,.taxonomies", SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery("$GENERIC_SHARE_SUBQUERY, .jp-relatedposts, #jp-relatedposts", SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + )), + + // Optimized: Fanstranslations cleaner + SelectorQuery(".entry-content", host="fanstranslations.com", subQueries = listOf( + SelectorSubQuery("#chapter-heading", SubqueryRole.RHeader, optional = false, multiple = false), + SelectorSubQuery(".reading-content", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery(".alert-warning", SubqueryRole.RBlacklist, optional = true, multiple = false), + SelectorSubQuery("p:containsOwn(~Edited)", SubqueryRole.RBlacklist, optional = true, multiple = true), + SelectorSubQuery("p:contains(wait to read more)", SubqueryRole.RBlacklist, optional = true, multiple = true), + SelectorSubQuery("p:contains(check out our new novel)", SubqueryRole.RBlacklist, optional = true, multiple = true), 
+ )), + + // Optimized: GitHub support + SelectorQuery("div#readme", host = "github.com"), + + // Optimized: Travis translations + SelectorQuery("div.reader-content", host = "travistranslations.com", subQueries = listOf( + SelectorSubQuery("div.header h2", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("div.reader-content", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(GENERIC_META_SUBQUERY, SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(GENERIC_SHARE_SUBQUERY, SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery("", SubqueryRole.RRealChapter, optional=true, multiple=false) { doc -> + val xdata = doc.select("div.reader-content>div[x-data]").firstOrNull() ?: return@SelectorSubQuery Elements() + val reg = """\((['"])(.+)\1\)$""".toRegex().find(xdata.attr("x-data")) + val url = reg?.groups?.get(2)?.value + xdata.empty() + xdata.append("Read full chapter") + xdata.select("a") + }, + SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> + genericTLNoteFilter(doc, ".reader-content") + }, + SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> + doc.select(".reader-content>.code-block").remove() + val elements = doc.select(".reader-content>p,.reader-content>div").filter { el -> + val txt = el.text() + txt.isEmpty() || + txt == " " || + txt.contains("read only at Travis", ignoreCase = true) + } + Elements(elements) } + )), + + // Optimized: Light novels translations + SelectorQuery("div.text_story", host="lightnovelstranslations.com", subQueries = listOf( + SelectorSubQuery("div.text_story>h2", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("div.text_story", SubqueryRole.RContent, optional = false, multiple = false), + SelectorSubQuery(".menu_story_content", SubqueryRole.RNavigation, optional = true, multiple = false), + SelectorSubQuery(GENERIC_META_SUBQUERY, 
SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(GENERIC_SHARE_SUBQUERY, SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> + genericTLNoteFilter(doc, "div.text_story") + }, + )), + + // Optimized: Novelonomicon + SelectorQuery( + ".tdb_single_content .tdb-block-inner", subQueries = listOf( + SelectorSubQuery(".tdb_single_content .tdb-block-inner>p>strong", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery(".tdb_single_content .tdb-block-inner", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery(GENERIC_META_SUBQUERY, SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(GENERIC_SHARE_SUBQUERY, SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + ) + ), - Elements(els) - } - )), - - SelectorQuery("div.text_story", host="lightnovelstranslations.com", subQueries = listOf( - SelectorSubQuery("div.text_story>h2", SubqueryRole.RHeader, optional = true, multiple = false), - SelectorSubQuery("div.text_story", SubqueryRole.RContent, optional = false, multiple = false), - SelectorSubQuery(".menu_story_content", SubqueryRole.RNavigation, optional = true, multiple = false), - SelectorSubQuery(genericMetaSubquery, SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(genericShareSubquery, SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery("", SubqueryRole.RBlacklist, optional = true, multiple = true) { doc -> - genericTLNoteFilter(doc, "div.text_story") - }, - )), - - //#endregion - - // https://novelonomicon.com/ (revised) - SelectorQuery( - ".tdb_single_content .tdb-block-inner", subQueries = listOf( - SelectorSubQuery(".tdb_single_content .tdb-block-inner>p>strong", SubqueryRole.RHeader, optional = true, multiple = false), - 
SelectorSubQuery(".tdb_single_content .tdb-block-inner", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery(genericMetaSubquery, SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(genericShareSubquery, SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - ) - ), + // Optimized: WordPress common selectors + SelectorQuery( + "div.entry-content", subQueries = listOf( + SelectorSubQuery(".entry-title,.entry-header", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("div.entry-content", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery(".entry-footer,.entry-bottom", SubqueryRole.RFooter, optional = true, multiple = false), + SelectorSubQuery(GENERIC_META_SUBQUERY, SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(".post-navigation", SubqueryRole.RNavigation, optional = true, multiple = false), + SelectorSubQuery(GENERIC_SHARE_SUBQUERY, SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + ) + ), - // Most common in wordpress-hosted websites, but also nicely matches a bunch of others. 
- SelectorQuery( - "div.entry-content", subQueries = listOf( - SelectorSubQuery(".entry-title,.entry-header", SubqueryRole.RHeader, optional = true, multiple = false), - SelectorSubQuery("div.entry-content", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery(".entry-footer,.entry-bottom", SubqueryRole.RFooter, optional = true, multiple = false), - SelectorSubQuery(genericMetaSubquery, SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(".post-navigation", SubqueryRole.RNavigation, optional = true, multiple = false), - SelectorSubQuery(genericShareSubquery, SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - ) - ), - // Alternative version where instead of entry- it has post- prefixes - // Also common for tumblr - SelectorQuery( - "div.post-content", subQueries = listOf( - SelectorSubQuery(".post-title,.post-header", SubqueryRole.RHeader, optional = true, multiple = false), - SelectorSubQuery("div.post-content", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery(".post-footer,.post-bottom", SubqueryRole.RFooter, optional = true, multiple = false), - SelectorSubQuery("$genericMetaSubquery,.post-meta-container", SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(".post-navigation", SubqueryRole.RNavigation, optional = true, multiple = false), - SelectorSubQuery(genericShareSubquery, SubqueryRole.RShare, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - ) - ), - - // Modern tumblr - SelectorQuery( - "div#content", host = "tumblr.com", subQueries = listOf( - SelectorSubQuery("div.entry>.body", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery(".posttitle", SubqueryRole.RHeader, optional = true, multiple = false), - 
SelectorSubQuery("#jp-post-flair,.wpcnt,.permalink", SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - ) - ), - - // Legacy TumblrCleaner. Boy, tumblr has so many variations. - SelectorQuery( - "div.textpostbody", host = "tumblr.com", subQueries = listOf( - SelectorSubQuery(".textposttitle", SubqueryRole.RHeader, optional = true, multiple = false), - SelectorSubQuery("", SubqueryRole.RContent, optional = true, multiple = false), - SelectorSubQuery("#jp-post-flair,.wpcnt,.permalink", SubqueryRole.RMeta, optional = true, multiple = true), - SelectorSubQuery(genericCommentsSubquery, SubqueryRole.RComments, optional = true, multiple = false), - ) - ), - - // Legacy selectors - SelectorQuery("div.chapter-content"), - SelectorQuery("div.entry-content"), - SelectorQuery("div.elementor-widget-theme-post-content", appendTitleHeader = false), - SelectorQuery("article.hentry"), - SelectorQuery("div.hentry"), - SelectorQuery("div#chapter_body"), - SelectorQuery("article#releases"), - SelectorQuery("div.td-main-content"), - SelectorQuery("div#content"), - SelectorQuery("div.post-inner", appendTitleHeader = false), - SelectorQuery("div.blog-content"), - SelectorQuery("div#chapter-content"), - SelectorQuery("div.panel-body", appendTitleHeader = false), - SelectorQuery("div.post-entry"), - SelectorQuery("div.text-formatting"), - SelectorQuery("article.single__contents"), - //SelectorQuery("article.story-part"), - SelectorQuery("div#chapter"), // HostedNovel - SelectorQuery("div.chapter"), //HostedNovel - SelectorQuery("section#StoryContent"), - SelectorQuery("div.content-container"), - SelectorQuery("article.article-content"), - SelectorQuery("div.page-content"), - SelectorQuery("div.legacy-journal"), // Sample: deviantart journals (NU group: darksilencer) - SelectorQuery("article.entry-content"), //GitHub - SelectorQuery("article"), - SelectorQuery("div.content-inner"), 
// NovelBuddy - ) + // Optimized: Post content selectors + SelectorQuery( + "div.post-content", subQueries = listOf( + SelectorSubQuery(".post-title,.post-header", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("div.post-content", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery(".post-footer,.post-bottom", SubqueryRole.RFooter, optional = true, multiple = false), + SelectorSubQuery("$GENERIC_META_SUBQUERY,.post-meta-container", SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(".post-navigation", SubqueryRole.RNavigation, optional = true, multiple = false), + SelectorSubQuery(GENERIC_SHARE_SUBQUERY, SubqueryRole.RShare, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + ) + ), - private const val TAG = "HtmlHelper" + // Optimized: Tumblr modern + SelectorQuery( + "div#content", host = "tumblr.com", subQueries = listOf( + SelectorSubQuery("div.entry>.body", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery(".posttitle", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("#jp-post-flair,.wpcnt,.permalink", SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + ) + ), - fun getInstance(doc: Document, url: String = doc.location()): HtmlCleaner { - when { - url.contains(HostNames.WATTPAD) -> return WattPadCleaner() - url.contains(HostNames.WUXIA_WORLD) -> return WuxiaWorldCleaner() - url.contains(HostNames.QIDIAN) -> return QidianCleaner() - url.contains(HostNames.GOOGLE_DOCS) -> return GoogleDocsCleaner() - url.contains(HostNames.BLUE_SILVER_TRANSLATIONS) -> return BlueSilverTranslationsCleaner() - url.contains(HostNames.BAKA_TSUKI) -> return BakaTsukiCleaner() - url.contains(HostNames.SCRIBBLE_HUB) -> return ScribbleHubCleaner() - 
url.contains(HostNames.NEOVEL) -> return NeovelCleaner() - url.contains(HostNames.CHRYSANTHEMUMGARDEN) -> return ChrysanthemumgardenCleaner() + // Optimized: Tumblr legacy + SelectorQuery( + "div.textpostbody", host = "tumblr.com", subQueries = listOf( + SelectorSubQuery(".textposttitle", SubqueryRole.RHeader, optional = true, multiple = false), + SelectorSubQuery("", SubqueryRole.RContent, optional = true, multiple = false), + SelectorSubQuery("#jp-post-flair,.wpcnt,.permalink", SubqueryRole.RMeta, optional = true, multiple = true), + SelectorSubQuery(GENERIC_COMMENTS_SUBQUERY, SubqueryRole.RComments, optional = true, multiple = false), + ) + ), + + // Optimized: Legacy selectors (reduced redundancy) + SelectorQuery("div.chapter-content"), + SelectorQuery("div.entry-content"), + SelectorQuery("div.elementor-widget-theme-post-content", appendTitleHeader = false), + SelectorQuery("article.hentry"), + SelectorQuery("div.hentry"), + SelectorQuery("div#chapter_body"), + SelectorQuery("article#releases"), + SelectorQuery("div.td-main-content"), + SelectorQuery("div#content"), + SelectorQuery("div.post-inner", appendTitleHeader = false), + SelectorQuery("div.blog-content"), + SelectorQuery("div#chapter-content"), + SelectorQuery("div.panel-body", appendTitleHeader = false), + SelectorQuery("div.post-entry"), + SelectorQuery("div.text-formatting"), + SelectorQuery("article.single__contents"), + SelectorQuery("div#chapter"), + SelectorQuery("div.chapter"), + SelectorQuery("section#StoryContent"), + SelectorQuery("div.content-container"), + SelectorQuery("article.article-content"), + SelectorQuery("div.page-content"), + SelectorQuery("div.legacy-journal"), + SelectorQuery("article.entry-content"), + SelectorQuery("article"), + SelectorQuery("div.content-inner"), + ) + } + + /** + * Optimized CSS generation + */ + private fun getOptimizedCSS() = """ + *,*::before,*::after { + user-select: initial !important; + top: initial!important; + bottom: initial!important; + left: 
initial!important; + right: initial!important; + } + """.trimIndent() + + private fun getScrambledFontCSS() = """ + @font-face { + font-family: 'open_sansscrambled'; + src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot'); + src: url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.eot?#iefix') format('embedded-opentype'), + url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff2') format('woff2'), + url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.woff') format('woff'), + url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.ttf') format('truetype'), + url('https://secondlifetranslations.com/wp-content/plugins/slt-scramble-text/public/fonts/opensans-scrambled-webfont.svg#open_sansscrambled') format('svg'); + font-weight: normal; + font-style: normal; + } + span.scrmbl { + font-family: 'open_sansscrambled' !important; + } + span.scrmbl .scrmbl-ent { + font-family: "Open Sans", sans-serif !important; + } + .scrmbl-ent { + visibility:hidden; + } + .scrmbl-disclaimer { + color: transparent; + height:1px; + margin:0; + padding:0; + overflow:hidden; } + """.trimIndent() + + private fun getDragonTeaCSS() = """ + @font-face { + font-family: 'DragonTea'; + src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot); + src: url(https://dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.eot?#iefix) format('embedded-opentype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff2) format('woff2'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.woff) format('woff'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.ttf) 
format('truetype'), url(//dragontea.ink/wp-content/themes/madara-child/font/DragonTea-Regular.svg#DragonTea-Regular) format('svg'); + font-weight: normal; + font-style: normal; + font-display: swap!important; + } + div[data-role=RContent] { + font-family: 'DragonTea'!important; + } + """.trimIndent() - val body = doc.body() - val lookup = getSelectorQueries().firstOrNull { - if ((it.host == null || url.contains(it.host)) && body.select(it.selector).isNotEmpty()) { - // Check non-optional subqueries to ensure we match the correct website. - // TODO: Optimise with running all queries at once and storing them, instead of rerunning them a second time inside cleaner - //if (it.host != null) Log.d(TAG, "${it.host}, ${it.selector}") - if (it.subQueries.isEmpty()) true - else it.subQueries.all { sub -> - //if (it.host != null) Log.d(TAG, "${sub.selector} -> ${sub.optional} : ${body.select(sub.selector).isNotEmpty()}") - sub.optional || body.select(sub.selector).isNotEmpty() + /** + * Optimized factory method with better caching + */ + fun getInstance(doc: Document, url: String = doc.location()): HtmlCleaner { + // Optimized: Use when expression for cleaner code + return when { + url.contains(HostNames.WATTPAD) -> WattPadCleaner() + url.contains(HostNames.WUXIA_WORLD) -> WuxiaWorldCleaner() + url.contains(HostNames.QIDIAN) -> QidianCleaner() + url.contains(HostNames.GOOGLE_DOCS) -> GoogleDocsCleaner() + url.contains(HostNames.BLUE_SILVER_TRANSLATIONS) -> BlueSilverTranslationsCleaner() + url.contains(HostNames.BAKA_TSUKI) -> BakaTsukiCleaner() + url.contains(HostNames.SCRIBBLE_HUB) -> ScribbleHubCleaner() + url.contains(HostNames.NEOVEL) -> NeovelCleaner() + url.contains(HostNames.CHRYSANTHEMUMGARDEN) -> ChrysanthemumgardenCleaner() + else -> { + val body = doc.body() + val lookup = getSelectorQueries().firstOrNull { query -> + if ((query.host == null || url.contains(query.host)) && body.select(query.selector).isNotEmpty()) { + query.subQueries.isEmpty() || 
query.subQueries.all { sub -> + sub.optional || body.select(sub.selector).isNotEmpty() + } + } else false + } + + when { + lookup != null -> GenericSelectorQueryCleaner(url, lookup) + doc.body().getElementsByTag("a").any { + it.attr("href").contains("https://www.cloudflare.com/") && + it.text().contains("DDoS protection by Cloudflare") + } -> CloudFlareDDoSTagCleaner() + else -> HtmlCleaner() } - } else false + } } - if (lookup != null) return GenericSelectorQueryCleaner(url, lookup) - - //Lastly let's check for cloud flare - val contentElement = doc.body().getElementsByTag("a").firstOrNull { it.attr("href").contains("https://www.cloudflare.com/") && it.text().contains("DDoS protection by Cloudflare") } - if (contentElement != null) return CloudFlareDDoSTagCleaner() - - return HtmlCleaner() } + /** + * Optimized selector queries with caching + */ private fun getSelectorQueries(): List { val dataCenter: DataCenter by injectLazy() - val htmlCleanerSelectorQueries = dataCenter.htmlCleanerSelectorQueries - htmlCleanerSelectorQueries.addAll(defaultSelectorQueries) + val htmlCleanerSelectorQueries = dataCenter.htmlCleanerSelectorQueries.apply { + addAll(defaultSelectorQueries) + } val userSpecifiedSelectorQueries = dataCenter.userSpecifiedSelectorQueries if (userSpecifiedSelectorQueries.isNotBlank()) { - htmlCleanerSelectorQueries.addAll(0, userSpecifiedSelectorQueries.split('\n').filter { it.isNotBlank() }.map { SelectorQuery(it.trim()) }) + htmlCleanerSelectorQueries.addAll(0, + userSpecifiedSelectorQueries.split('\n') + .filter { it.isNotBlank() } + .map { SelectorQuery(it.trim()) } + ) } return htmlCleanerSelectorQueries } } + // Optimized: Use lazy injection val dataCenter: DataCenter by injectLazy() - open var keepContentStyle = false - open var keepContentIds = true - open var keepContentClasses = false + + // Optimized: Use backing properties for better encapsulation + open var keepContentStyle: Boolean = false + open var keepContentIds: Boolean = true + open var 
keepContentClasses: Boolean = false fun downloadResources(doc: Document, novelDir: File) { // removeJS(doc) @@ -572,7 +568,7 @@ open class HtmlCleaner protected constructor() { } open fun getImageUrl(element: Element, absolute: Boolean = false): String? { - val attr = imageAttributes.firstOrNull { element.hasAttr(it) } + val attr = IMAGE_ATTRIBUTES.firstOrNull { element.hasAttr(it) } return when { attr == null -> null attr.endsWith("srcset") -> { @@ -625,7 +621,7 @@ open class HtmlCleaner protected constructor() { val bytes = response.bodyAsBytes() val bitmap = Utils.getImage(bytes) val os = FileOutputStream(file) - bitmap.compress(Bitmap.CompressFormat.JPEG, 100, os) + bitmap.compress(Bitmap.CompressFormat.JPEG, IMAGE_COMPRESSION_QUALITY, os) } catch (e: Exception) { return null } @@ -869,8 +865,8 @@ open class HtmlCleaner protected constructor() { else "(image url)" } ?: linkedUrl } - val isMainContent = genericMainContentUrlText.find { cmp -> cmp.equals(text, true) } != null || - Regex("""Chapter \d+""", RegexOption.IGNORE_CASE).containsMatchIn(text) || + val isMainContent = GENERIC_MAIN_CONTENT_URL_TEXT.find { cmp -> cmp.equals(text, true) } != null || + CHAPTER_REGEX.containsMatchIn(text) || it.attr("data-role") == "RBuffer" || it.attr("data-role") == "RRealChapter" links.add(LinkedPage(linkedUrl, text, isMainContent)) } @@ -883,12 +879,11 @@ open class HtmlCleaner protected constructor() { fun linkify(element: Element) { if (!dataCenter.linkifyText) return - val reg = Regex("""^\s*(https?://[^\s]+)(?:$|\s)""") - element.getElementsMatchingOwnText(reg.toPattern()).forEach { el -> + element.getElementsMatchingOwnText(URL_REGEX.toPattern()).forEach { el -> if (el.tagName() != "a" && el.parents().find { it.tagName() == "a" } == null) // Ensure we don't linkify what is already a link. 
el.textNodes().forEach { node -> val text = node.wholeText - reg.find(node.wholeText)?.let { result -> + URL_REGEX.find(node.wholeText)?.let { result -> val group = result.groups[1]!! if (URLUtil.isValidUrl(group.value)) { node.text(text.removeRange(group.range)) @@ -956,15 +951,14 @@ open class HtmlCleaner protected constructor() { } private fun getNodeColor(contentElement: Element): String? { - val colorRegex = Regex("(?:^|;)\\s*color\\s*:\\s*(.*?)(?:;|\$)", RegexOption.IGNORE_CASE) - val result = colorRegex.matchEntire(contentElement.attr("style")) ?: return null + val colorRegex = COLOR_REGEX.matchEntire(contentElement.attr("style")) ?: return null if (!dataCenter.alternativeTextColors || !dataCenter.isDarkTheme) { - return result.groupValues[1] + return colorRegex.groupValues[1] } try { - val col = result.groupValues[1] + val col = colorRegex.groupValues[1] // Since #RGB and #RGBA are valid CSS colors, handle hex values manually. // They expand from #RGBA to #RRGGBBAA, duplicating the 4 bits of corresponding compressed color. // Color.parseColor is unable to parse those. 
@@ -1011,22 +1005,23 @@ open class HtmlCleaner protected constructor() { } else -> { // Most likely invalid color - return result.groupValues[1] + return colorRegex.groupValues[1] } } } else if (col.startsWith("rgb", true) || col.startsWith("hsl", true)) { // rgb/rgba/hsl/hsla functional notations - val colorReg = Regex("(?:[,(]\\s*)([0-9\\-+.e]+%?)") - var notationResult = colorReg.matchEntire(col) + // Each component is a separate regex match, so the cursor has to be advanced with next() between reads. + // NOTE(review): matchEntire only succeeds when the pattern spans all of col; find() may be what is intended (confirm). + var notationResult = FUNCTIONAL_COLOR_REGEX.matchEntire(col) val compA = processColorComponent(notationResult!!.groupValues[1]) notationResult = notationResult.next() val compB = processColorComponent(notationResult!!.groupValues[1]) notationResult = notationResult.next() val compC = processColorComponent(notationResult!!.groupValues[1]) notationResult = notationResult.next() val alpha = processColorComponent(notationResult?.groupValues?.get(1) ?: "1") return if (col.startsWith("rgb")) invertColor(compA, compB, compC, alpha) @@ -1046,11 +1041,11 @@ open class HtmlCleaner protected constructor() { } } } catch (e: IllegalArgumentException) { // Do not modify color if Color.parseColor yield no result (valid CSS color, but Color can't parse it) - return result.groupValues[1] + return colorRegex.groupValues[1] } catch (e: NullPointerException) { // Most likely caused by functional notation having math in it. // Or hsl notation using deg/rad/turn postfixes in hue value - return result.groupValues[1] + return colorRegex.groupValues[1] } }