From 566abfa99120652fb1e9190d791fdbbba64d2e0d Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Fri, 16 Nov 2018 16:26:22 +0100 Subject: Add adblockrule_parse --- lib/urlfilter/filterleaf.cpp | 22 +-- lib/urlfilter/filterleaf.h | 12 +- lib/urlfilter/formats/adblockrule.cpp | 200 +++++----------------------- lib/urlfilter/formats/adblockrule.h | 17 ++- lib/urlfilter/formats/adblockrule_parse.cpp | 180 +++++++++++++++++++++++++ lib/urlfilter/formats/adblockrule_parse.h | 17 +++ lib/urlfilter/formats/hostlistrule.cpp | 6 +- lib/urlfilter/formats/hostlistrule.h | 5 +- lib/urlfilter/meson.build | 2 +- 9 files changed, 261 insertions(+), 200 deletions(-) create mode 100644 lib/urlfilter/formats/adblockrule_parse.cpp create mode 100644 lib/urlfilter/formats/adblockrule_parse.h (limited to 'lib/urlfilter') diff --git a/lib/urlfilter/filterleaf.cpp b/lib/urlfilter/filterleaf.cpp index 87cd91d..5797718 100644 --- a/lib/urlfilter/filterleaf.cpp +++ b/lib/urlfilter/filterleaf.cpp @@ -1,26 +1,14 @@ #include "filterleaf.h" -FilterLeaf::FilterLeaf(FilterLeaf &&other) -{ - m_isBlocking = other.m_isBlocking; - m_request = std::move(other.m_request); - m_redirect = std::move(other.m_redirect); -} - -FilterLeaf &FilterLeaf::operator=(FilterLeaf &&other) -{ - m_isBlocking = other.m_isBlocking; - m_request = std::move(other.m_request); - m_redirect = std::move(other.m_redirect); - return *this; -} - const QString FilterLeaf::request() const { return m_request; } -const QString FilterLeaf::redirect() const +std::optional FilterLeaf::option(QWebEngineUrlRequestInfo::ResourceType opt) const { - return m_redirect; + if(resourceTypeOptions.contains(opt)) + return resourceTypeOptions.value(opt); + else + return std::nullopt; } diff --git a/lib/urlfilter/filterleaf.h b/lib/urlfilter/filterleaf.h index 44330aa..64f465d 100644 --- a/lib/urlfilter/filterleaf.h +++ b/lib/urlfilter/filterleaf.h @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include class FilterLeaf { @@ -34,19 +37,15 @@ public: DomainMatch }; - FilterLeaf(FilterLeaf &&other); - FilterLeaf &operator=(FilterLeaf &&other); virtual ~FilterLeaf() = default; virtual bool match(const QUrl &requestUrl) const = 0; - virtual Action action() const = 0; + virtual std::pair action() const = 0; const QString request() const; - const QString redirect() const; + std::optional option(QWebEngineUrlRequestInfo::ResourceType opt) const; protected: - explicit FilterLeaf() = default; - // rule matching UrlMatchType matchType = InvalidMatch; QHash resourceTypeOptions; @@ -54,7 +53,6 @@ protected: // rule action bool m_isBlocking; - QString m_redirect; }; Q_DECLARE_METATYPE(FilterLeaf::Action) diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp index 6b97d5d..db1c3c5 100644 --- a/lib/urlfilter/formats/adblockrule.cpp +++ b/lib/urlfilter/formats/adblockrule.cpp @@ -5,174 +5,21 @@ * * SPDX-License-Identifier: GPL-3.0 */ -// Based on Falkon's AdBlockRule class #include "adblockrule.h" +#include +#include -// adblock format documentation -// https://adblockplus.org/filters - -// QString::mid(pos, len) - Returns a string starting at the specified position index. -// QString::chop(len) - Removes n characters from the end of the string. -// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. - -AdBlockRule *loadRule(const QString &filter) -{ - QString parsedLine = filter.trimmed(); - - // there is no rule, or it's a comment - if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { - return nullptr; - } - - // css rule -> filterleaves cannot do element blocking - if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { - return nullptr; - } - - // exception rules - FilterLeaf::Action action = FilterLeaf::Block; - if(parsedLine.startsWith(QLatin1Literal("@@"))) { - action = FilterLeaf::Allow; - parsedLine.remove(0, 2); - } - - // parse options - QStringList enabledOn, disabledOn; - QHash optionsHash; - { - const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); - if(sepPos != -1) { - const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); - parsedLine = parsedLine.mid(0, sepPos); - - for(const QString &option : options) { - if(option.startsWith(QLatin1Literal("domain"))) { - const auto domainList = option.mid(7).split(QLatin1Literal("|")); - - for(const QString &domain : domainList) { - if(domain.startsWith(QLatin1Literal("~"))) { - disabledOn.append(domain.mid(1)); - } else { - enabledOn.append(domain); - } - } - } else { - const auto pair = parseOption(option); - if(pair) - optionsHash.insert(pair.value().first, pair.value().second); - } - } - } - } - - FilterLeaf::UrlMatchType matchType; - QString pattern; - - if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { - // regular expression rule - matchType = FilterLeaf::RegularExpressionMatch; - pattern = parsedLine.mid(1, parsedLine.length() - 2); - - } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { - // string equals rule - matchType = FilterLeaf::StringEquals; - pattern = parsedLine.mid(1, parsedLine.length() - 2); - - } else { - - // Basic filter rules can use wildcards, which were supported by QRegExp, - // but were deprecated in QRegularExpression. - - // remove beginning and ending wildcards - if(parsedLine.startsWith(QLatin1Literal("*"))) - parsedLine = parsedLine.mid(1); - - if(parsedLine.endsWith(QLatin1Literal("*"))) - parsedLine.chop(1); - - if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { - matchType = FilterLeaf::DomainMatch; - pattern = parsedLine.mid(2, parsedLine.length() - 3); - - } else if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { - // check for wildcards and translate to regexp - // wildcard "*" - any number of characters - // separator "^" - end, ? or / - matchType = FilterLeaf::RegularExpressionMatch; - parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); - parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); - parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); - parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); - pattern = parsedLine; - } - } - - auto *rule = new AdBlockRule(matchType, pattern, action); - rule->mergeOptions(optionsHash); - return rule; -} - -std::optional> parseOption(const QString &option) -{ - const bool exception = !option.startsWith(QLatin1Literal("~")); - - if(option.endsWith(QLatin1Literal("script"))) { - // external scripts loaded via HTML script tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); - - } else if(option.endsWith(QLatin1Literal("image"))) { - // regular images, typically loaded via HTML img tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); - - } else if(option.endsWith(QLatin1Literal("stylesheet"))) { - // external CSS stylesheet files - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); - - } else if(option.endsWith(QLatin1Literal("object"))) { - // content handled by browser plugins, e.g. Flash or Java - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); - - } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { - // requests started using the XMLHttpRequest object or fetch() API - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); - - } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { - // requests started by plugins like Flash - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); - - } else if(option.endsWith(QLatin1Literal("subdocument"))) { - // embedded pages, usually included via HTML frames - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); - - } else if(option.endsWith(QLatin1Literal("ping"))) { - // requests started by or navigator.sendBeacon() - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); - - } else if(option.endsWith(QLatin1Literal("websocket"))) { - // requests initiated via WebSocket object - qDebug("Resource type 'websocket' not available"); - - } else if(option.endsWith(QLatin1Literal("webrtc"))) { - // connections opened via RTCPeerConnection instances to ICE servers - qDebug("Resource type 'webrtc' not available"); - - } else if(option.endsWith(QLatin1Literal("document"))) { - // the page itself - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); - - } else if(option.endsWith(QLatin1Literal("other"))) { - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); - } - - return std::nullopt; -} - -AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString& filter, FilterLeaf::Action action) +AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action) { this->matchType = matchType; this->m_request = filter; - this->m_isBlocking = (action == FilterLeaf::Block) ? true : false; + this->m_isBlocking = (action == FilterLeaf::Block); + //matcher.setPattern(filter); + if(matchType == FilterLeaf::RegularExpressionMatch) + regExp = new QRegExp(filter); + else + stringMatcher = new QStringMatcher(filter); } void AdBlockRule::mergeOptions(const QHash &options) @@ -180,17 +27,36 @@ void AdBlockRule::mergeOptions(const QHashresourceTypeOptions.unite(options); } -bool AdBlockRule::match(const QUrl& requestUrl) const +bool AdBlockRule::match(const QUrl &requestUrl) const { switch(matchType) { - case FilterLeaf::StringContains: - return requestUrl.toString().contains(m_request); + case FilterLeaf::RegularExpressionMatch: + return (regExp->indexIn(requestUrl.toString()) != -1); + default: + return false; + } +} + +bool AdBlockRule::match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + // if request is of the required type, or there are no types set (== apply to all requests) + if(this->resourceTypeOptions.contains(type) || this->resourceTypeOptions.isEmpty()) { + switch(matchType) { + case FilterLeaf::RegularExpressionMatch: + return (regExp->indexIn(requestUrl.toString()) != -1); default: return false; + } } + + // request type is not matched + return false; } -FilterLeaf::Action AdBlockRule::action() const +std::pair AdBlockRule::action() const { - return m_isBlocking ? FilterLeaf::Block : FilterLeaf::Allow; + if(m_isBlocking) + return std::make_pair(FilterLeaf::Block, QVariant()); + else + return std::make_pair(FilterLeaf::Allow, QVariant()); } diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h index da7e4fc..9c89dac 100644 --- a/lib/urlfilter/formats/adblockrule.h +++ b/lib/urlfilter/formats/adblockrule.h @@ -16,13 +16,22 @@ class AdBlockRule : public FilterLeaf { public: explicit AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action); + ~AdBlockRule() + { + delete stringMatcher; + delete regExp; + }; + void mergeOptions(const QHash &options); bool match(const QUrl &requestUrl) const override; - FilterLeaf::Action action() const override; -}; + bool match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const; + std::pair action() const override; -std::optional> parseOption(const QString &option); -AdBlockRule *loadRule(const QString &filter); +private: + /* Once C++20 comes out, perhaps this can be replaced with a concept template */ + QStringMatcher *stringMatcher = nullptr; + QRegExp *regExp = nullptr; +}; #endif // SMOLBOTE_ADBLOCKRULE_H diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp new file mode 100644 index 0000000..0e5bf05 --- /dev/null +++ b/lib/urlfilter/formats/adblockrule_parse.cpp @@ -0,0 +1,180 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "adblockrule.h" +#include "adblockrule_parse.h" + +// adblock format documentation +// https://adblockplus.org/filters + +// QString::mid(pos, len) const - Returns a string starting at the specified position index. +// QString::chop(len) - Removes n characters from the end of the string. +// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. + +AdBlockRule *loadRule(const QString &filter) +{ + QString parsedLine = filter.trimmed(); + + // there is no rule, or it's a comment + if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { + return nullptr; + } + + // css rule -> filterleaves cannot do element blocking + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + return nullptr; + } + + // exception rules + FilterLeaf::Action action = FilterLeaf::Block; + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + action = FilterLeaf::Allow; + parsedLine.remove(0, 2); + } + + // parse options + QStringList enabledOn, disabledOn; + QHash optionsHash; + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + + for(const QString &domain : domainList) { + if(domain.startsWith(QLatin1Literal("~"))) { + disabledOn.append(domain.mid(1)); + } else { + enabledOn.append(domain); + } + } + } else { + const auto pair = parseOption(option); + if(pair) + optionsHash.insert(pair.value().first, pair.value().second); + } + } + } + } + + FilterLeaf::UrlMatchType matchType = FilterLeaf::InvalidMatch; + + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + // regular expression rule + matchType = FilterLeaf::RegularExpressionMatch; + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + matchType = FilterLeaf::DomainMatch; + parsedLine = parsedLine.mid(2, parsedLine.length() - 3); + + } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + // string equals rule + matchType = FilterLeaf::StringEquals; + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + } else if(parsedLine.startsWith(QLatin1Literal("||"))) { + // string starts with rule + matchType = FilterLeaf::StringStartsWith; + parsedLine = parsedLine.mid(2); + + } else if(parsedLine.endsWith(QLatin1Literal("|"))) { + // string ends with rule + matchType = FilterLeaf::StringEndsWith; + parsedLine.chop(1); + + } else { + // generic contains rule + matchType = FilterLeaf::StringContains; + + // Basic filter rules can use wildcards, which were supported by QRegExp, + // but were deprecated in QRegularExpression. + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + + matchType = FilterLeaf::RegularExpressionMatch; + } + } + + AdBlockRule *rule = new AdBlockRule(matchType, parsedLine, action); + rule->mergeOptions(optionsHash); + return rule; +} + +std::optional> parseOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by or navigator.sendBeacon() + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } + + return std::nullopt; +} diff --git a/lib/urlfilter/formats/adblockrule_parse.h b/lib/urlfilter/formats/adblockrule_parse.h new file mode 100644 index 0000000..7d380a8 --- /dev/null +++ b/lib/urlfilter/formats/adblockrule_parse.h @@ -0,0 +1,17 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef ADBLOCKRULE_PARSE_H +#define ADBLOCKRULE_PARSE_H + +class AdBlockRule; + +AdBlockRule *loadRule(const QString &filter); +std::optional> parseOption(const QString &option); + +#endif // ADBLOCKRULE_PARSE_H \ No newline at end of file diff --git a/lib/urlfilter/formats/hostlistrule.cpp b/lib/urlfilter/formats/hostlistrule.cpp index e4561f0..ad2c2a6 100644 --- a/lib/urlfilter/formats/hostlistrule.cpp +++ b/lib/urlfilter/formats/hostlistrule.cpp @@ -21,9 +21,9 @@ bool HostlistRule::match(const QUrl &requestUrl) const return (m_request == requestUrl.host()); } -FilterLeaf::Action HostlistRule::action() const +std::pair HostlistRule::action() const { if(m_isBlocking) - return FilterLeaf::Block; - return FilterLeaf::Redirect; + return std::make_pair(FilterLeaf::Block, QVariant()); + return std::make_pair(FilterLeaf::Redirect, QVariant(m_redirect)); } diff --git a/lib/urlfilter/formats/hostlistrule.h b/lib/urlfilter/formats/hostlistrule.h index c65a98f..58ec690 100644 --- a/lib/urlfilter/formats/hostlistrule.h +++ b/lib/urlfilter/formats/hostlistrule.h @@ -18,7 +18,10 @@ public: explicit HostlistRule(const QString &domain, const QString &redirect); bool match(const QUrl &requestUrl) const override; - FilterLeaf::Action action() const override; + std::pair action() const override; + +private: + QString m_redirect; }; #endif // SMOLBOTE_HOSTLIST_RULE_H diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build index 082320c..5d0a970 100644 --- a/lib/urlfilter/meson.build +++ b/lib/urlfilter/meson.build @@ -8,7 +8,7 @@ urlfilter_moc = qt5.preprocess( urlfilter_lib = static_library('urlfilter', ['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc, 'domain.cpp', 'domain.h', - 'formats/adblockrule.cpp', 'formats/hostlistrule.cpp', ], + 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', ], dependencies: dep_qt5 ) -- cgit v1.2.1