diff options
author | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-11-16 16:26:22 +0100 |
---|---|---|
committer | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-11-16 16:26:22 +0100 |
commit | 566abfa99120652fb1e9190d791fdbbba64d2e0d (patch) | |
tree | 86a6f71b926794298d922a9319b55909cf5a07b4 /lib/urlfilter/formats/adblockrule_parse.cpp | |
parent | Add more regex benchmarks (diff) | |
download | smolbote-566abfa99120652fb1e9190d791fdbbba64d2e0d.tar.xz |
Add adblockrule_parse
Diffstat (limited to 'lib/urlfilter/formats/adblockrule_parse.cpp')
-rw-r--r-- | lib/urlfilter/formats/adblockrule_parse.cpp | 180 |
1 files changed, 180 insertions, 0 deletions
diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp new file mode 100644 index 0000000..0e5bf05 --- /dev/null +++ b/lib/urlfilter/formats/adblockrule_parse.cpp @@ -0,0 +1,180 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "adblockrule.h" +#include "adblockrule_parse.h" + +// adblock format documentation +// https://adblockplus.org/filters + +// QString::mid(pos, len) const - Returns a string starting at the specified position index. +// QString::chop(len) - Removes n characters from the end of the string. +// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. + +AdBlockRule *loadRule(const QString &filter) +{ + QString parsedLine = filter.trimmed(); + + // there is no rule, or it's a comment + if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { + return nullptr; + } + + // css rule -> filterleaves cannot do element blocking + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + return nullptr; + } + + // exception rules + FilterLeaf::Action action = FilterLeaf::Block; + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + action = FilterLeaf::Allow; + parsedLine.remove(0, 2); + } + + // parse options + QStringList enabledOn, disabledOn; + QHash<QWebEngineUrlRequestInfo::ResourceType, bool> optionsHash; + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + + for(const QString &domain : domainList) { + if(domain.startsWith(QLatin1Literal("~"))) { + disabledOn.append(domain.mid(1)); + } else { + enabledOn.append(domain); + } + } + } else { + const auto pair = parseOption(option); + if(pair) + optionsHash.insert(pair.value().first, pair.value().second); + } + } + } + } + + FilterLeaf::UrlMatchType matchType = FilterLeaf::InvalidMatch; + + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + // regular expression rule + matchType = FilterLeaf::RegularExpressionMatch; + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + matchType = FilterLeaf::DomainMatch; + parsedLine = parsedLine.mid(2, parsedLine.length() - 3); + + } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + // string equals rule + matchType = FilterLeaf::StringEquals; + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + } else if(parsedLine.startsWith(QLatin1Literal("||"))) { + // string starts with rule + matchType = FilterLeaf::StringStartsWith; + parsedLine = parsedLine.mid(2); + + } else if(parsedLine.endsWith(QLatin1Literal("|"))) { + // string ends with rule + matchType = FilterLeaf::StringEndsWith; + parsedLine.chop(1); + + } else { + // generic contains rule + matchType = FilterLeaf::StringContains; + + // Basic filter rules can use wildcards, which were supported by QRegExp, + // but were deprecated in QRegularExpression. + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + + matchType = FilterLeaf::RegularExpressionMatch; + } + } + + AdBlockRule *rule = new AdBlockRule(matchType, parsedLine, action); + rule->mergeOptions(optionsHash); + return rule; +} + +std::optional<QPair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by <a ping> or navigator.sendBeacon() + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } + + return std::nullopt; +} |