diff options
author | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-11-16 16:26:22 +0100 |
---|---|---|
committer | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-11-16 16:26:22 +0100 |
commit | 566abfa99120652fb1e9190d791fdbbba64d2e0d (patch) | |
tree | 86a6f71b926794298d922a9319b55909cf5a07b4 /lib/urlfilter/formats/adblockrule.cpp | |
parent | Add more regex benchmarks (diff) | |
download | smolbote-566abfa99120652fb1e9190d791fdbbba64d2e0d.tar.xz |
Add adblockrule_parse
Diffstat (limited to 'lib/urlfilter/formats/adblockrule.cpp')
-rw-r--r-- | lib/urlfilter/formats/adblockrule.cpp | 200 |
1 files changed, 33 insertions, 167 deletions
diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp index 6b97d5d..db1c3c5 100644 --- a/lib/urlfilter/formats/adblockrule.cpp +++ b/lib/urlfilter/formats/adblockrule.cpp @@ -5,174 +5,21 @@ * * SPDX-License-Identifier: GPL-3.0 */ -// Based on Falkon's AdBlockRule class #include "adblockrule.h" +#include <QRegExp> +#include <QStringMatcher> -// adblock format documentation -// https://adblockplus.org/filters - -// QString::mid(pos, len) - Returns a string starting at the specified position index. -// QString::chop(len) - Removes n characters from the end of the string. -// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. - -AdBlockRule *loadRule(const QString &filter) -{ - QString parsedLine = filter.trimmed(); - - // there is no rule, or it's a comment - if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { - return nullptr; - } - - // css rule -> filterleaves cannot do element blocking - if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { - return nullptr; - } - - // exception rules - FilterLeaf::Action action = FilterLeaf::Block; - if(parsedLine.startsWith(QLatin1Literal("@@"))) { - action = FilterLeaf::Allow; - parsedLine.remove(0, 2); - } - - // parse options - QStringList enabledOn, disabledOn; - QHash<QWebEngineUrlRequestInfo::ResourceType, bool> optionsHash; - { - const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); - if(sepPos != -1) { - const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); - parsedLine = parsedLine.mid(0, sepPos); - - for(const QString &option : options) { - if(option.startsWith(QLatin1Literal("domain"))) { - const auto domainList = option.mid(7).split(QLatin1Literal("|")); - - for(const QString &domain : domainList) { - if(domain.startsWith(QLatin1Literal("~"))) { - disabledOn.append(domain.mid(1)); - } else { - enabledOn.append(domain); - } - } - } else { - const auto pair = parseOption(option); - if(pair) - optionsHash.insert(pair.value().first, pair.value().second); - } - } - } - } - - FilterLeaf::UrlMatchType matchType; - QString pattern; - - if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { - // regular expression rule - matchType = FilterLeaf::RegularExpressionMatch; - pattern = parsedLine.mid(1, parsedLine.length() - 2); - - } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { - // string equals rule - matchType = FilterLeaf::StringEquals; - pattern = parsedLine.mid(1, parsedLine.length() - 2); - - } else { - - // Basic filter rules can use wildcards, which were supported by QRegExp, - // but were deprecated in QRegularExpression. - - // remove beginning and ending wildcards - if(parsedLine.startsWith(QLatin1Literal("*"))) - parsedLine = parsedLine.mid(1); - - if(parsedLine.endsWith(QLatin1Literal("*"))) - parsedLine.chop(1); - - if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { - matchType = FilterLeaf::DomainMatch; - pattern = parsedLine.mid(2, parsedLine.length() - 3); - - } else if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { - // check for wildcards and translate to regexp - // wildcard "*" - any number of characters - // separator "^" - end, ? or / - matchType = FilterLeaf::RegularExpressionMatch; - parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); - parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); - parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); - parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); - pattern = parsedLine; - } - } - - auto *rule = new AdBlockRule(matchType, pattern, action); - rule->mergeOptions(optionsHash); - return rule; -} - -std::optional<QPair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseOption(const QString &option) -{ - const bool exception = !option.startsWith(QLatin1Literal("~")); - - if(option.endsWith(QLatin1Literal("script"))) { - // external scripts loaded via HTML script tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); - - } else if(option.endsWith(QLatin1Literal("image"))) { - // regular images, typically loaded via HTML img tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); - - } else if(option.endsWith(QLatin1Literal("stylesheet"))) { - // external CSS stylesheet files - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); - - } else if(option.endsWith(QLatin1Literal("object"))) { - // content handled by browser plugins, e.g. Flash or Java - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); - - } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { - // requests started using the XMLHttpRequest object or fetch() API - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); - - } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { - // requests started by plugins like Flash - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); - - } else if(option.endsWith(QLatin1Literal("subdocument"))) { - // embedded pages, usually included via HTML frames - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); - - } else if(option.endsWith(QLatin1Literal("ping"))) { - // requests started by <a ping> or navigator.sendBeacon() - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); - - } else if(option.endsWith(QLatin1Literal("websocket"))) { - // requests initiated via WebSocket object - qDebug("Resource type 'websocket' not available"); - - } else if(option.endsWith(QLatin1Literal("webrtc"))) { - // connections opened via RTCPeerConnection instances to ICE servers - qDebug("Resource type 'webrtc' not available"); - - } else if(option.endsWith(QLatin1Literal("document"))) { - // the page itself - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); - - } else if(option.endsWith(QLatin1Literal("other"))) { - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); - } - - return std::nullopt; -} - -AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString& filter, FilterLeaf::Action action) +AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action) { this->matchType = matchType; this->m_request = filter; - this->m_isBlocking = (action == FilterLeaf::Block) ? true : false; + this->m_isBlocking = (action == FilterLeaf::Block); + //matcher.setPattern(filter); + if(matchType == FilterLeaf::RegularExpressionMatch) + regExp = new QRegExp(filter); + else + stringMatcher = new QStringMatcher(filter); } void AdBlockRule::mergeOptions(const QHash<QWebEngineUrlRequestInfo::ResourceType, bool> &options) @@ -180,17 +27,36 @@ void AdBlockRule::mergeOptions(const QHash<QWebEngineUrlRequestInfo::ResourceTyp this->resourceTypeOptions.unite(options); } -bool AdBlockRule::match(const QUrl& requestUrl) const +bool AdBlockRule::match(const QUrl &requestUrl) const { switch(matchType) { - case FilterLeaf::StringContains: - return requestUrl.toString().contains(m_request); + case FilterLeaf::RegularExpressionMatch: + return (regExp->indexIn(requestUrl.toString()) != -1); + default: + return false; + } +} + +bool AdBlockRule::match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + // if request is of the required type, or there are no types set (== apply to all requests) + if(this->resourceTypeOptions.contains(type) || this->resourceTypeOptions.isEmpty()) { + switch(matchType) { + case FilterLeaf::RegularExpressionMatch: + return (regExp->indexIn(requestUrl.toString()) != -1); default: return false; + } } + + // request type is not matched + return false; } -FilterLeaf::Action AdBlockRule::action() const +std::pair<FilterLeaf::Action, QVariant> AdBlockRule::action() const { - return m_isBlocking ? FilterLeaf::Block : FilterLeaf::Allow; + if(m_isBlocking) + return std::make_pair(FilterLeaf::Block, QVariant()); + else + return std::make_pair(FilterLeaf::Allow, QVariant()); } |