From 3d2ae07c455c0e423c64f19e445518427a5684fa Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Wed, 9 Jan 2019 19:38:58 +0100 Subject: Rewrite lib/urlfilter - Make HostList and AdBlockList implementations independent from each other - Move urlfilter tests to lib/urlfilter --- lib/urlfilter/adblock/adblocklist.cpp | 188 ++++++++++++++++++++++++++++++++++ lib/urlfilter/adblock/adblocklist.h | 42 ++++++++ lib/urlfilter/adblock/parser.cpp | 75 ++++++++++++++ lib/urlfilter/adblock/parser.h | 14 +++ 4 files changed, 319 insertions(+) create mode 100644 lib/urlfilter/adblock/adblocklist.cpp create mode 100644 lib/urlfilter/adblock/adblocklist.h create mode 100644 lib/urlfilter/adblock/parser.cpp create mode 100644 lib/urlfilter/adblock/parser.h (limited to 'lib/urlfilter/adblock') diff --git a/lib/urlfilter/adblock/adblocklist.cpp b/lib/urlfilter/adblock/adblocklist.cpp new file mode 100644 index 0000000..c749e9e --- /dev/null +++ b/lib/urlfilter/adblock/adblocklist.cpp @@ -0,0 +1,188 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "adblocklist.h" +#include "parser.h" +#include +#include +#include + +AdBlockList::AdBlockList(QIODevice *device) +{ + Q_ASSERT(device->isOpen()); + + QTextStream list(device); + while (!list.atEnd()) { + parseLine(list.readLine()); + } + + qDebug() << m_metadata; +} + +AdBlockList::~AdBlockList() +{ + for(Rule &r : rules) { + delete r.matcher; + } +} + +QString AdBlockList::metadata(const QString& key) const +{ + return m_metadata.value(key); +} + +int AdBlockList::ruleCount() const +{ + return rules.size(); +} + +std::pair AdBlockList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + const QString domain = firstParty.host(); + const QString request = requestUrl.toString(); + + for(const Rule &r : rules) { + // if there are options specified, but not the one we need + if(!r.options.isEmpty() && !r.options.contains(type)) + continue; + + if(r.disabledOn.contains(domain)) + continue; + + if(!r.enabledOn.isEmpty() && !r.enabledOn.contains(domain)) + continue; + + if(r.matcher->hasMatch(request)) + return std::make_pair(r.action, QString()); + } + + return std::make_pair(UrlFilter::NotMatched, QString()); +} + +void AdBlockList::parseLine(const QString& line) +{ + QString parsedLine = line.trimmed(); + + if(parsedLine.isEmpty()) + return; + + if(parsedLine.startsWith(QLatin1Literal("!"))) { + const auto comment = parseComment(parsedLine); + + if(comment) { + const auto key = comment.value().first; + if(keys.contains(key)) + m_metadata[key] = comment.value().second; + } + + return; + } + + // css rule -> filterleaves cannot do element blocking + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + qDebug("TODO: %s", qUtf8Printable(parsedLine)); + return; + } + + Rule r; + r.action = UrlFilter::Block; + + // exception rules + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + r.action = UrlFilter::Allow; + parsedLine.remove(0, 2); + } + + bool matchCase = false; + + // parse options + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + + for(const QString &domain : domainList) { + if(domain.startsWith(QLatin1Literal("~"))) { + r.disabledOn.append(domain.mid(1)); + } else { + r.enabledOn.append(domain); + } + } + } else if(option.endsWith(QLatin1Literal("match-case"))) { + matchCase = !option.startsWith(QLatin1Literal("~")); + + } else { + const auto pair = parseResourceOption(option); + if(pair) + r.options.insert(pair.value().first, pair.value().second); + } + } + } + } + + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + // regular expression rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::RegularExpressionMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + parsedLine = parsedLine.mid(2, parsedLine.length() - 3); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::DomainMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + // string equals rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringEquals); + + } else if(parsedLine.startsWith(QLatin1Literal("||"))) { + // string starts with rule + parsedLine = parsedLine.mid(2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringStartsWith); + + } else if(parsedLine.endsWith(QLatin1Literal("|"))) { + // string ends with rule + parsedLine.chop(1); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringEndsWith); + + } else { + // generic contains rule + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::RegularExpressionMatch); + + } else { + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringContains); + } + } + + r.matcher->setCaseSensitive(matchCase); + + Q_CHECK_PTR(r.matcher); + rules.emplace_back(std::move(r)); +} + diff --git a/lib/urlfilter/adblock/adblocklist.h b/lib/urlfilter/adblock/adblocklist.h new file mode 100644 index 0000000..ee41e11 --- /dev/null +++ b/lib/urlfilter/adblock/adblocklist.h @@ -0,0 +1,42 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "urlfilter.h" +#include "matcher.h" +#include +#include + +class QIODevice; +class AdBlockList : public UrlFilter +{ +public: + // TODO: check if all keys are listed + const QStringList keys = { "Version", "Title", "Last modified", "Expires", "Homepage", "Licence", "Redirect" }; + + AdBlockList(QIODevice *device); + ~AdBlockList(); + + QString metadata(const QString &key) const override; + int ruleCount() const; + std::pair match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override; + +protected: + void parseLine(const QString &line); + +private: + QHash m_metadata; + + struct Rule { + UrlFilter::MatchResult action = UrlFilter::NotMatched; + Matcher *matcher; + QStringList enabledOn, disabledOn; + QHash options; + }; + + std::vector rules; +}; diff --git a/lib/urlfilter/adblock/parser.cpp b/lib/urlfilter/adblock/parser.cpp new file mode 100644 index 0000000..1e7f0bc --- /dev/null +++ b/lib/urlfilter/adblock/parser.cpp @@ -0,0 +1,75 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "parser.h" + +std::optional> parseComment(QString &line) +{ + const QLatin1Literal separator(": "); + if(line.contains(separator)) { + const QStringList comment = line.mid(1).split(QLatin1Literal(": ")); + return std::make_pair(comment.at(0).trimmed(), comment.at(1).trimmed()); + } else + return std::nullopt; +} + +std::optional> parseResourceOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by or navigator.sendBeacon() + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } + + qDebug("TODO: %s", qUtf8Printable(option)); + return std::nullopt; +} diff --git a/lib/urlfilter/adblock/parser.h b/lib/urlfilter/adblock/parser.h new file mode 100644 index 0000000..c73a9cf --- /dev/null +++ b/lib/urlfilter/adblock/parser.h @@ -0,0 +1,14 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include +#include +#include + +std::optional> parseComment(QString &line); +std::optional> parseResourceOption(const QString &option); -- cgit v1.2.1