From 3d2ae07c455c0e423c64f19e445518427a5684fa Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Wed, 9 Jan 2019 19:38:58 +0100 Subject: Rewrite lib/urlfilter - Make HostList and AdBlockList implementations independent from each other - Move urlfilter tests to lib/urlfilter --- lib/urlfilter/adblock/adblocklist.cpp | 188 ++++++++++++++++++++++++++++ lib/urlfilter/adblock/adblocklist.h | 42 +++++++ lib/urlfilter/adblock/parser.cpp | 75 +++++++++++ lib/urlfilter/adblock/parser.h | 14 +++ lib/urlfilter/domain.cpp | 65 ---------- lib/urlfilter/domain.h | 33 ----- lib/urlfilter/filterleaf.cpp | 14 --- lib/urlfilter/filterleaf.h | 60 --------- lib/urlfilter/filtertree.cpp | 94 -------------- lib/urlfilter/filtertree.h | 61 --------- lib/urlfilter/formats/adblocklist.cpp | 95 -------------- lib/urlfilter/formats/adblocklist.h | 32 ----- lib/urlfilter/formats/adblockrule.cpp | 63 ---------- lib/urlfilter/formats/adblockrule.h | 113 ----------------- lib/urlfilter/formats/adblockrule_parse.cpp | 181 -------------------------- lib/urlfilter/formats/adblockrule_parse.h | 17 --- lib/urlfilter/formats/hostlistrule.cpp | 29 ----- lib/urlfilter/formats/hostlistrule.h | 27 ---- lib/urlfilter/hostlist/hostlist.cpp | 79 ++++++++++++ lib/urlfilter/hostlist/hostlist.h | 44 +++++++ lib/urlfilter/matcher.h | 109 ++++++++++++++++ lib/urlfilter/meson.build | 31 +++-- lib/urlfilter/test/adblock.cpp | 88 +++++++++++++ lib/urlfilter/test/adblock.txt | 26 ++++ lib/urlfilter/test/hostlist.cpp | 34 +++++ lib/urlfilter/test/hostlist.txt | 6 + lib/urlfilter/test/matcher.cpp | 42 +++++++ lib/urlfilter/urlfilter.h | 43 +++++++ 28 files changed, 809 insertions(+), 896 deletions(-) create mode 100644 lib/urlfilter/adblock/adblocklist.cpp create mode 100644 lib/urlfilter/adblock/adblocklist.h create mode 100644 lib/urlfilter/adblock/parser.cpp create mode 100644 lib/urlfilter/adblock/parser.h delete mode 100644 lib/urlfilter/domain.cpp delete mode 100644 lib/urlfilter/domain.h delete mode 100644 lib/urlfilter/filterleaf.cpp delete mode 100644 lib/urlfilter/filterleaf.h delete mode 100644 lib/urlfilter/filtertree.cpp delete mode 100644 lib/urlfilter/filtertree.h delete mode 100644 lib/urlfilter/formats/adblocklist.cpp delete mode 100644 lib/urlfilter/formats/adblocklist.h delete mode 100644 lib/urlfilter/formats/adblockrule.cpp delete mode 100644 lib/urlfilter/formats/adblockrule.h delete mode 100644 lib/urlfilter/formats/adblockrule_parse.cpp delete mode 100644 lib/urlfilter/formats/adblockrule_parse.h delete mode 100644 lib/urlfilter/formats/hostlistrule.cpp delete mode 100644 lib/urlfilter/formats/hostlistrule.h create mode 100644 lib/urlfilter/hostlist/hostlist.cpp create mode 100644 lib/urlfilter/hostlist/hostlist.h create mode 100644 lib/urlfilter/matcher.h create mode 100644 lib/urlfilter/test/adblock.cpp create mode 100644 lib/urlfilter/test/adblock.txt create mode 100644 lib/urlfilter/test/hostlist.cpp create mode 100644 lib/urlfilter/test/hostlist.txt create mode 100644 lib/urlfilter/test/matcher.cpp create mode 100644 lib/urlfilter/urlfilter.h (limited to 'lib/urlfilter') diff --git a/lib/urlfilter/adblock/adblocklist.cpp b/lib/urlfilter/adblock/adblocklist.cpp new file mode 100644 index 0000000..c749e9e --- /dev/null +++ b/lib/urlfilter/adblock/adblocklist.cpp @@ -0,0 +1,188 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "adblocklist.h" +#include "parser.h" +#include +#include +#include + +AdBlockList::AdBlockList(QIODevice *device) +{ + Q_ASSERT(device->isOpen()); + + QTextStream list(device); + while (!list.atEnd()) { + parseLine(list.readLine()); + } + + qDebug() << m_metadata; +} + +AdBlockList::~AdBlockList() +{ + for(Rule &r : rules) { + delete r.matcher; + } +} + +QString AdBlockList::metadata(const QString& key) const +{ + return m_metadata.value(key); +} + +int AdBlockList::ruleCount() const +{ + return rules.size(); +} + +std::pair AdBlockList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + const QString domain = firstParty.host(); + const QString request = requestUrl.toString(); + + for(const Rule &r : rules) { + // if there are options specified, but not the one we need + if(!r.options.isEmpty() && !r.options.contains(type)) + continue; + + if(r.disabledOn.contains(domain)) + continue; + + if(!r.enabledOn.isEmpty() && !r.enabledOn.contains(domain)) + continue; + + if(r.matcher->hasMatch(request)) + return std::make_pair(r.action, QString()); + } + + return std::make_pair(UrlFilter::NotMatched, QString()); +} + +void AdBlockList::parseLine(const QString& line) +{ + QString parsedLine = line.trimmed(); + + if(parsedLine.isEmpty()) + return; + + if(parsedLine.startsWith(QLatin1Literal("!"))) { + const auto comment = parseComment(parsedLine); + + if(comment) { + const auto key = comment.value().first; + if(keys.contains(key)) + m_metadata[key] = comment.value().second; + } + + return; + } + + // css rule -> filterleaves cannot do element blocking + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + qDebug("TODO: %s", qUtf8Printable(parsedLine)); + return; + } + + Rule r; + r.action = UrlFilter::Block; + + // exception rules + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + r.action = UrlFilter::Allow; + parsedLine.remove(0, 2); + } + + bool matchCase = false; + + // parse options + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + + for(const QString &domain : domainList) { + if(domain.startsWith(QLatin1Literal("~"))) { + r.disabledOn.append(domain.mid(1)); + } else { + r.enabledOn.append(domain); + } + } + } else if(option.endsWith(QLatin1Literal("match-case"))) { + matchCase = !option.startsWith(QLatin1Literal("~")); + + } else { + const auto pair = parseResourceOption(option); + if(pair) + r.options.insert(pair.value().first, pair.value().second); + } + } + } + } + + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + // regular expression rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::RegularExpressionMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + parsedLine = parsedLine.mid(2, parsedLine.length() - 3); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::DomainMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + // string equals rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringEquals); + + } else if(parsedLine.startsWith(QLatin1Literal("||"))) { + // string starts with rule + parsedLine = parsedLine.mid(2); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringStartsWith); + + } else if(parsedLine.endsWith(QLatin1Literal("|"))) { + // string ends with rule + parsedLine.chop(1); + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringEndsWith); + + } else { + // generic contains rule + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::RegularExpressionMatch); + + } else { + r.matcher = new ContentsMatcher(parsedLine, UrlFilter::StringContains); + } + } + + r.matcher->setCaseSensitive(matchCase); + + Q_CHECK_PTR(r.matcher); + rules.emplace_back(std::move(r)); +} + diff --git a/lib/urlfilter/adblock/adblocklist.h b/lib/urlfilter/adblock/adblocklist.h new file mode 100644 index 0000000..ee41e11 --- /dev/null +++ b/lib/urlfilter/adblock/adblocklist.h @@ -0,0 +1,42 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "urlfilter.h" +#include "matcher.h" +#include +#include + +class QIODevice; +class AdBlockList : public UrlFilter +{ +public: + // TODO: check if all keys are listed + const QStringList keys = { "Version", "Title", "Last modified", "Expires", "Homepage", "Licence", "Redirect" }; + + AdBlockList(QIODevice *device); + ~AdBlockList(); + + QString metadata(const QString &key) const override; + int ruleCount() const; + std::pair match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override; + +protected: + void parseLine(const QString &line); + +private: + QHash m_metadata; + + struct Rule { + UrlFilter::MatchResult action = UrlFilter::NotMatched; + Matcher *matcher; + QStringList enabledOn, disabledOn; + QHash options; + }; + + std::vector rules; +}; diff --git a/lib/urlfilter/adblock/parser.cpp b/lib/urlfilter/adblock/parser.cpp new file mode 100644 index 0000000..1e7f0bc --- /dev/null +++ b/lib/urlfilter/adblock/parser.cpp @@ -0,0 +1,75 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "parser.h" + +std::optional> parseComment(QString &line) +{ + const QLatin1Literal separator(": "); + if(line.contains(separator)) { + const QStringList comment = line.mid(1).split(QLatin1Literal(": ")); + return std::make_pair(comment.at(0).trimmed(), comment.at(1).trimmed()); + } else + return std::nullopt; +} + +std::optional> parseResourceOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by or navigator.sendBeacon() + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + return std::make_pair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } + + qDebug("TODO: %s", qUtf8Printable(option)); + return std::nullopt; +} diff --git a/lib/urlfilter/adblock/parser.h b/lib/urlfilter/adblock/parser.h new file mode 100644 index 0000000..c73a9cf --- /dev/null +++ b/lib/urlfilter/adblock/parser.h @@ -0,0 +1,14 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include +#include +#include + +std::optional> parseComment(QString &line); +std::optional> parseResourceOption(const QString &option); diff --git a/lib/urlfilter/domain.cpp b/lib/urlfilter/domain.cpp deleted file mode 100644 index 2bfd524..0000000 --- a/lib/urlfilter/domain.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#include "domain.h" - -Domain::Domain(const QString &domain) - : m_domain(domain) - , m_hash(qHash(domain, 0)) -{ -} - -Domain::Domain(Domain &&other) - : m_domain(std::move(other.m_domain)) - , m_hash(std::move(other.m_hash)) -{ -} - -Domain &Domain::operator=(Domain &&other) -{ - m_domain = std::move(other.m_domain); - m_hash = other.m_hash; - return *this; -} - -bool Domain::matches(const QUrl &url) const -{ - // empty domain matches all - if(m_domain.isEmpty() || url.isEmpty()) - return true; - - const QString domain = url.host(); - - // domain and filter are the same - if(domain == m_domain) { - return true; - } - - // domain cannot be matched if it doesn't end with filter - // ex. example2.com isn't matched by example.com - if(!domain.endsWith(m_domain)) { - return false; - } - - // match with subdomains - // ex. subdomain.example.com is matched by example.com - int index = domain.indexOf(m_domain); - - // match if (domain ends with filter) && (filter has been found) and (character before filter is '.') - return index > 0 && domain[index - 1] == QLatin1Char('.'); -} - -bool Domain::matchesExactly(uint hash) const -{ - return (m_hash == hash); -} - -QString Domain::host() const -{ - return m_domain; -} diff --git a/lib/urlfilter/domain.h b/lib/urlfilter/domain.h deleted file mode 100644 index 0406f0f..0000000 --- a/lib/urlfilter/domain.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef SMOLBOTE_DOMAIN_H -#define SMOLBOTE_DOMAIN_H - -#include -#include - -class Domain -{ -public: - explicit Domain(const QString &domain); - explicit Domain(Domain &&other); - Domain &operator=(Domain &&other); - - // match domain and subdomains of domain - bool matches(const QUrl &url) const; - // exact match of domain - bool matchesExactly(uint hash) const; - QString host() const; - -private: - QString m_domain; - uint m_hash; -}; - -#endif // SMOLBOTE_DOMAIN_H diff --git a/lib/urlfilter/filterleaf.cpp b/lib/urlfilter/filterleaf.cpp deleted file mode 100644 index 5797718..0000000 --- a/lib/urlfilter/filterleaf.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "filterleaf.h" - -const QString FilterLeaf::request() const -{ - return m_request; -} - -std::optional FilterLeaf::option(QWebEngineUrlRequestInfo::ResourceType opt) const -{ - if(resourceTypeOptions.contains(opt)) - return resourceTypeOptions.value(opt); - else - return std::nullopt; -} diff --git a/lib/urlfilter/filterleaf.h b/lib/urlfilter/filterleaf.h deleted file mode 100644 index 64f465d..0000000 --- a/lib/urlfilter/filterleaf.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef SMOLBOTE_FILTERLEAF_H -#define SMOLBOTE_FILTERLEAF_H - -#include -#include -#include -#include -#include -#include -#include - -class FilterLeaf -{ -public: - enum Action { - NotMatched, - Allow, - Block, - Redirect - }; - - enum UrlMatchType { - InvalidMatch, - RegularExpressionMatch, - StringContains, - StringStartsWith, - StringEndsWith, - StringEquals, - DomainMatch - }; - - virtual ~FilterLeaf() = default; - - virtual bool match(const QUrl &requestUrl) const = 0; - virtual std::pair action() const = 0; - - const QString request() const; - std::optional option(QWebEngineUrlRequestInfo::ResourceType opt) const; - -protected: - // rule matching - UrlMatchType matchType = InvalidMatch; - QHash resourceTypeOptions; - QString m_request; - - // rule action - bool m_isBlocking; -}; - -Q_DECLARE_METATYPE(FilterLeaf::Action) - -#endif // SMOLBOTE_FILTERLEAF_H diff --git a/lib/urlfilter/filtertree.cpp b/lib/urlfilter/filtertree.cpp deleted file mode 100644 index 2cdd6d0..0000000 --- a/lib/urlfilter/filtertree.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#include "filtertree.h" -#include "filterleaf.h" -#include "formats/hostlistrule.h" -#include - -bool loadHostlist(QIODevice &from, FilterTree *tree) -{ - Q_ASSERT(from.isReadable()); - QTextStream stream(&from); - while(!stream.atEnd()) { - const QString line = stream.readLine().trimmed(); - if(line.isEmpty() || line.startsWith(QLatin1Literal("#"))) - continue; - - const QStringList &parts = line.split(QLatin1Literal(" ")); - if(parts.length() < 2) { -#ifdef QT_DEBUG - qDebug("Cannot parse: %s", qUtf8Printable(line)); -#endif - return false; - } - - for(int i = 1; i < parts.length(); ++i) { - // HostlistRule(domain, redirect) - auto *rule = new HostlistRule(parts.at(i), parts.constFirst()); - // addRule(rule, enable_on_domain) - const bool added = tree->addRule(rule, QString()); - if(!added) - return false; - } - } - return true; -} - -FilterTree::~FilterTree() -{ - for(auto &branch : m_branches) { - qDeleteAll(branch.leaves); - branch.leaves.clear(); - } -} - -const QStringList FilterTree::branches() const -{ - QStringList branches; - for(auto &branch : m_branches) { - branches.append(branch.domain.host()); - } - return branches; -} - -QVector FilterTree::match(const QUrl &domain, const QUrl &requestUrl) const -{ - QVector leaves; - for(const auto &branch : m_branches) { - if(branch.domain.matches(domain)) { - - for(const auto leaf : branch.leaves) { - if(leaf->match(requestUrl)) { - leaves.append(leaf); - } - } - } - } - return leaves; -} - -bool FilterTree::addRule(FilterLeaf *rule, const QString &domain) -{ - branchLock.lock(); - this->branch(domain).leaves.emplace_back(rule); - branchLock.unlock(); - return true; -} - -FilterTree::Branch & FilterTree::branch(const QString& domain) -{ - for(auto &branch : m_branches) { - if(branch.domain.matches(QUrl(domain))) - return branch; - } - - // no branch was found - Branch branch(domain); - return m_branches.emplace_back(std::move(branch)); -} diff --git a/lib/urlfilter/filtertree.h b/lib/urlfilter/filtertree.h deleted file mode 100644 index f453a3d..0000000 --- a/lib/urlfilter/filtertree.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef SMOLBOTE_FILTERTREE_H -#define SMOLBOTE_FILTERTREE_H - -#include "domain.h" -#include "filterleaf.h" -#include -#include -#include -#include -#include - -/** FilterTree: B+ tree of filter rules - * The tree contains branches that represent domains - * Each domain-branch contains leaves (rules) that are to be applied to it. - * Rules may be applied to multiple branches. - */ -class FilterTree : public QObject -{ - Q_OBJECT - -public: - ~FilterTree(); - - const QStringList branches() const; - QVector match(const QUrl &domain, const QUrl &requestUrl) const; - - bool addRule(FilterLeaf *rule, const QString &domain); - -private: - struct Branch { - explicit Branch(const QString &host) - : domain(host) - { - } - explicit Branch(Branch &&other) - : domain(std::move(other.domain)) - , leaves(std::move(other.leaves)) - { - } - - Domain domain; - std::vector leaves; - }; - - Branch& branch(const QString &domain); - - QMutex branchLock; - std::vector m_branches; -}; - -bool loadHostlist(QIODevice &from, FilterTree *tree); - -#endif // SMOLBOTE_FILTERTREE_H diff --git a/lib/urlfilter/formats/adblocklist.cpp b/lib/urlfilter/formats/adblocklist.cpp deleted file mode 100644 index 772c252..0000000 --- a/lib/urlfilter/formats/adblocklist.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "adblocklist.h" - -AdBlockList::AdBlockList() -{ -} - -QString AdBlockList::metadata(const QString &key) const -{ - return m_metadata.value(key, QString()); -} - -FilterLeaf::Action AdBlockList::match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const -{ - const QString request = requestUrl.toString(); - - for(auto &filter : m_rules) { - if(filter.matcher->hasMatch(request)) - return filter.action; - } - return FilterLeaf::NotMatched; -} - -bool AdBlockList::parseLine(const QString &line) -{ - // remove whitespace from start/end of the line - QString parsedLine = line.trimmed(); - - // check if the line is empty - if(parsedLine.isEmpty()) - return false; - - // parse comment - if(parsedLine.startsWith(QLatin1Literal("!"))) - return parseComment(parsedLine); - - Filter filter; - - // exception rules - if(parsedLine.startsWith(QLatin1Literal("@@"))) { - filter.action = FilterLeaf::Allow; - parsedLine.remove(0, 2); - } - - // remove '*' at the beginning and the end - if(parsedLine.startsWith(QLatin1Literal("*"))) - parsedLine = parsedLine.mid(1); - if(parsedLine.endsWith(QLatin1Literal("*"))) - parsedLine.chop(1); - - if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { - // regular expression rule - parsedLine = parsedLine.mid(1, parsedLine.length() - 2); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::RegularExpressionMatch); - - } else if(parsedLine.contains(QLatin1Literal("*"))) { - parsedLine = QRegularExpression::wildcardToRegularExpression(parsedLine); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::RegularExpressionMatch); - - } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { -// matchType = FilterLeaf::DomainMatch; - parsedLine = parsedLine.mid(2, parsedLine.length() - 3); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::DomainMatch); - - } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { - // string equals rule - parsedLine = parsedLine.mid(1, parsedLine.length() - 2); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringEquals); - - } else if(parsedLine.startsWith(QLatin1Literal("||"))) { - // string starts with rule - parsedLine = parsedLine.mid(2); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringStartsWith); - - } else if(parsedLine.endsWith(QLatin1Literal("|"))) { - // string ends with rule - parsedLine.chop(1); - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringEndsWith); - - } else { - // generic contains rule - filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringContains); - } - - - Q_CHECK_PTR(filter.matcher); - m_rules.emplace_back(std::move(filter)); - return true; -} - -bool AdBlockList::parseComment(const QString &commentLine) -{ - const QStringList comment = commentLine.mid(1).split(QLatin1Literal(": ")); - m_metadata[comment.at(0).trimmed()] = comment.at(1).trimmed(); - return true; -} diff --git a/lib/urlfilter/formats/adblocklist.h b/lib/urlfilter/formats/adblocklist.h deleted file mode 100644 index 34a2120..0000000 --- a/lib/urlfilter/formats/adblocklist.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ADBLOCKLIST_H -#define ADBLOCKLIST_H - -#include -#include "adblockrule.h" - -class AdBlockList -{ -public: - AdBlockList(); - - QString metadata(const QString &key) const; - FilterLeaf::Action match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type = QWebEngineUrlRequestInfo::ResourceTypeUnknown) const; - - bool parseLine(const QString &line); - -protected: - bool parseComment(const QString &commentLine); - -private: - struct Filter - { - FilterLeaf::Action action = FilterLeaf::Block; - Matcher *matcher; - }; - - QHash m_metadata; - //QMap m_rules; - std::vector m_rules; -}; - -#endif // ADBLOCKLIST_H diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp deleted file mode 100644 index 60e817f..0000000 --- a/lib/urlfilter/formats/adblockrule.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#include "adblockrule.h" -#include -#include - -AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action) -{ - this->matchType = matchType; - this->m_request = filter; - this->m_isBlocking = (action == FilterLeaf::Block); - //matcher.setPattern(filter); - if(matchType == FilterLeaf::RegularExpressionMatch) - regExp = new QRegExp(filter); - else - stringMatcher = new QStringMatcher(filter); -} - -void AdBlockRule::mergeOptions(const QHash &options) -{ - this->resourceTypeOptions.unite(options); -} - -bool AdBlockRule::match(const QUrl &requestUrl) const -{ - switch(matchType) { - case FilterLeaf::RegularExpressionMatch: - return (regExp->indexIn(requestUrl.toString()) != -1); - default: - return false; - } -} - -bool AdBlockRule::match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const -{ - // if request is of the required type, or there are no types set (== apply to all requests) - if(this->resourceTypeOptions.contains(type) || this->resourceTypeOptions.isEmpty()) { - switch(matchType) { - case FilterLeaf::RegularExpressionMatch: - return (regExp->indexIn(requestUrl.toString()) != -1); - default: - qWarning("Match type not implemented, returning false!"); - return false; - } - } - - // request type is not matched - return false; -} - -std::pair AdBlockRule::action() const -{ - if(m_isBlocking) - return std::make_pair(FilterLeaf::Block, QVariant()); - else - return std::make_pair(FilterLeaf::Allow, QVariant()); -} diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h deleted file mode 100644 index 6be3cdf..0000000 --- a/lib/urlfilter/formats/adblockrule.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef SMOLBOTE_ADBLOCKRULE_H -#define SMOLBOTE_ADBLOCKRULE_H - -#include "../filterleaf.h" -#include -#include -#include - -class Matcher -{ -public: - virtual bool hasMatch(const QString &where) const = 0; -}; - -template -class ContentsMatcher : public Matcher -{ -public: - ContentsMatcher(const QString &pattern, FilterLeaf::UrlMatchType matchType) - { - this->matchType = matchType; - patternLength = pattern.length(); - - - if constexpr(std::is_same_v) { - matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption); - matcher.setPattern(pattern); - } else if constexpr(std::is_same_v) { - matcher.setCaseSensitivity(Qt::CaseInsensitive); - matcher.setPattern(pattern); - } else if constexpr(std::is_same_v) { - matcher = QUrl::fromUserInput(pattern).host(); -// qDebug("matcher: %s", qUtf8Printable(matcher)); - } - } - - bool hasMatch(const QString &where) const override - { - if constexpr(std::is_same_v) { - switch (matchType) { - case FilterLeaf::InvalidMatch: - case FilterLeaf::RegularExpressionMatch: - case FilterLeaf::DomainMatch: - qWarning("ContentsMatcher is a String Matcher, but not doing string matching!"); - return false; - - case FilterLeaf::StringContains: - return (matcher.indexIn(where) != -1); - - case FilterLeaf::StringStartsWith: - return (matcher.indexIn(where) == 0); - - case FilterLeaf::StringEndsWith: - return (matcher.indexIn(where) == where.length() - patternLength); - - case FilterLeaf::StringEquals: - return (matcher.indexIn(where) == 0) && (patternLength == where.length()); - } - - } else if constexpr(std::is_same_v) { - if(matchType != FilterLeaf::RegularExpressionMatch) - qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!"); - return matcher.match(where).hasMatch(); - } else if constexpr(std::is_same_v) { - // TODO: fix - if(matchType == FilterLeaf::DomainMatch) { -// qDebug("matching %s", qUtf8Printable(QUrl(where).host())); - return QUrl(where).host().endsWith(matcher); - } else - return matcher == where; - } else { - qWarning("Matcher has no backend, returning false"); - return false; - } - } - -private: - int patternLength; - T matcher; - FilterLeaf::UrlMatchType matchType; -}; - -class AdBlockRule : public FilterLeaf -{ -public: - explicit AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action); - ~AdBlockRule() - { - delete stringMatcher; - delete regExp; - }; - - void mergeOptions(const QHash &options); - - bool match(const QUrl &requestUrl) const override; - bool match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const; - std::pair action() const override; - -private: - /* Once C++20 comes out, perhaps this can be replaced with a concept template */ - QStringMatcher *stringMatcher = nullptr; - QRegExp *regExp = nullptr; -}; - -#endif // SMOLBOTE_ADBLOCKRULE_H diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp deleted file mode 100644 index c01ddfd..0000000 --- a/lib/urlfilter/formats/adblockrule_parse.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#include "adblockrule.h" -#include "adblockrule_parse.h" - -// adblock format documentation -// https://adblockplus.org/filters - -// QString::mid(pos, len) const - Returns a string starting at the specified position index. -// QString::chop(len) - Removes n characters from the end of the string. -// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. -// QString::trimmed() const - Remove whitespace from start and end - -AdBlockRule *parseRule_adblock(const QString &filter) -{ - QString parsedLine = filter.trimmed(); - - // there is no rule, or it's a comment - if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { - return nullptr; - } - - // css rule -> filterleaves cannot do element blocking - if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { - return nullptr; - } - - // exception rules - FilterLeaf::Action action = FilterLeaf::Block; - if(parsedLine.startsWith(QLatin1Literal("@@"))) { - action = FilterLeaf::Allow; - parsedLine.remove(0, 2); - } - - // parse options - QStringList enabledOn, disabledOn; - QHash optionsHash; - { - const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); - if(sepPos != -1) { - const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); - parsedLine = parsedLine.mid(0, sepPos); - - for(const QString &option : options) { - if(option.startsWith(QLatin1Literal("domain"))) { - const auto domainList = option.mid(7).split(QLatin1Literal("|")); - - for(const QString &domain : domainList) { - if(domain.startsWith(QLatin1Literal("~"))) { - disabledOn.append(domain.mid(1)); - } else { - enabledOn.append(domain); - } - } - } else { - const auto pair = parseOption(option); - if(pair) - optionsHash.insert(pair.value().first, pair.value().second); - } - } - } - } - - FilterLeaf::UrlMatchType matchType = FilterLeaf::InvalidMatch; - - if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { - // regular expression rule - matchType = FilterLeaf::RegularExpressionMatch; - parsedLine = parsedLine.mid(1, parsedLine.length() - 2); - - } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { - matchType = FilterLeaf::DomainMatch; - parsedLine = parsedLine.mid(2, parsedLine.length() - 3); - - } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { - // string equals rule - matchType = FilterLeaf::StringEquals; - parsedLine = parsedLine.mid(1, parsedLine.length() - 2); - - } else if(parsedLine.startsWith(QLatin1Literal("||"))) { - // string starts with rule - matchType = FilterLeaf::StringStartsWith; - parsedLine = parsedLine.mid(2); - - } else if(parsedLine.endsWith(QLatin1Literal("|"))) { - // string ends with rule - matchType = FilterLeaf::StringEndsWith; - parsedLine.chop(1); - - } else { - // generic contains rule - matchType = FilterLeaf::StringContains; - - // Basic filter rules can use wildcards, which were supported by QRegExp, - // but were deprecated in QRegularExpression. - - // remove beginning and ending wildcards - if(parsedLine.startsWith(QLatin1Literal("*"))) - parsedLine = parsedLine.mid(1); - - if(parsedLine.endsWith(QLatin1Literal("*"))) - parsedLine.chop(1); - - if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { - // check for wildcards and translate to regexp - // wildcard "*" - any number of characters - // separator "^" - end, ? or / - parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); - parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); - parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); - parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); - - matchType = FilterLeaf::RegularExpressionMatch; - } - } - - AdBlockRule *rule = new AdBlockRule(matchType, parsedLine, action); - rule->mergeOptions(optionsHash); - return rule; -} - -std::optional> parseOption(const QString &option) -{ - const bool exception = !option.startsWith(QLatin1Literal("~")); - - if(option.endsWith(QLatin1Literal("script"))) { - // external scripts loaded via HTML script tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); - - } else if(option.endsWith(QLatin1Literal("image"))) { - // regular images, typically loaded via HTML img tag - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); - - } else if(option.endsWith(QLatin1Literal("stylesheet"))) { - // external CSS stylesheet files - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); - - } else if(option.endsWith(QLatin1Literal("object"))) { - // content handled by browser plugins, e.g. Flash or Java - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); - - } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { - // requests started using the XMLHttpRequest object or fetch() API - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); - - } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { - // requests started by plugins like Flash - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); - - } else if(option.endsWith(QLatin1Literal("subdocument"))) { - // embedded pages, usually included via HTML frames - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); - - } else if(option.endsWith(QLatin1Literal("ping"))) { - // requests started by or navigator.sendBeacon() - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception); - - } else if(option.endsWith(QLatin1Literal("websocket"))) { - // requests initiated via WebSocket object - qDebug("Resource type 'websocket' not available"); - - } else if(option.endsWith(QLatin1Literal("webrtc"))) { - // connections opened via RTCPeerConnection instances to ICE servers - qDebug("Resource type 'webrtc' not available"); - - } else if(option.endsWith(QLatin1Literal("document"))) { - // the page itself - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); - - } else if(option.endsWith(QLatin1Literal("other"))) { - return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); - } - - return std::nullopt; -} diff --git a/lib/urlfilter/formats/adblockrule_parse.h b/lib/urlfilter/formats/adblockrule_parse.h deleted file mode 100644 index 01255ca..0000000 --- a/lib/urlfilter/formats/adblockrule_parse.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef ADBLOCKRULE_PARSE_H -#define ADBLOCKRULE_PARSE_H - -class AdBlockRule; - -AdBlockRule *parseRule_adblock(const QString &filter); -std::optional> parseOption(const QString &option); - -#endif // ADBLOCKRULE_PARSE_H diff --git a/lib/urlfilter/formats/hostlistrule.cpp b/lib/urlfilter/formats/hostlistrule.cpp deleted file mode 100644 index ad2c2a6..0000000 --- a/lib/urlfilter/formats/hostlistrule.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#include "hostlistrule.h" - -HostlistRule::HostlistRule(const QString &domain, const QString &redirect) -{ - this->m_isBlocking = (redirect == QLatin1Literal("0.0.0.0")); - this->m_request = domain; - this->m_redirect = redirect; -} - -bool HostlistRule::match(const QUrl &requestUrl) const -{ - //qDebug("checking [%s] against [%s]", qUtf8Printable(requestUrl.host()), qUtf8Printable(m_request)); - return (m_request == requestUrl.host()); -} - -std::pair HostlistRule::action() const -{ - if(m_isBlocking) - return std::make_pair(FilterLeaf::Block, QVariant()); - return std::make_pair(FilterLeaf::Redirect, QVariant(m_redirect)); -} diff --git a/lib/urlfilter/formats/hostlistrule.h b/lib/urlfilter/formats/hostlistrule.h deleted file mode 100644 index 58ec690..0000000 --- a/lib/urlfilter/formats/hostlistrule.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * This file is part of smolbote. It's copyrighted by the contributors recorded - * in the version control history of the file, available from its original - * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote - * - * SPDX-License-Identifier: GPL-3.0 - */ - -#ifndef SMOLBOTE_HOSTLIST_RULE_H -#define SMOLBOTE_HOSTLIST_RULE_H - -#include "../filterleaf.h" -#include - -class HostlistRule : public FilterLeaf -{ -public: - explicit HostlistRule(const QString &domain, const QString &redirect); - - bool match(const QUrl &requestUrl) const override; - std::pair action() const override; - -private: - QString m_redirect; -}; - -#endif // SMOLBOTE_HOSTLIST_RULE_H diff --git a/lib/urlfilter/hostlist/hostlist.cpp b/lib/urlfilter/hostlist/hostlist.cpp new file mode 100644 index 0000000..ec0b214 --- /dev/null +++ b/lib/urlfilter/hostlist/hostlist.cpp @@ -0,0 +1,79 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "hostlist.h" +#include +#include +#include + +HostList::HostList(QIODevice *device) +{ + Q_ASSERT(device->isOpen()); + + QTextStream list(device); + while (!list.atEnd()) { + parseLine(list.readLine()); + } + + qDebug() << m_metadata; +} + +QString HostList::metadata(const QString& key) const +{ + return m_metadata.value(key); +} + +int HostList::ruleCount() const +{ + return rules.size(); +} + +std::pair HostList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + Q_UNUSED(firstParty); + Q_UNUSED(type); + + const QString domain = requestUrl.host(); + const uint domainHash = qHash(domain); + + for(const Rule &r : rules) { + if(r.domainHash == domainHash) + return std::make_pair(r.action, r.redirect); + } + + return std::make_pair(UrlFilter::NotMatched, QString()); +} + +void HostList::parseLine(const QString& line) +{ + // check comment + if(line.startsWith(QLatin1Literal("#"))) + return; + + QString parsedLine = line.trimmed(); + + // malformed rule + if(!parsedLine.contains(QLatin1Literal(" "))) + return; + + const QStringList parts = parsedLine.split(QLatin1Literal(" ")); + const QString redirect = parts.at(0); + const auto action = (redirect == QLatin1Literal("0.0.0.0")) ? UrlFilter::Block : UrlFilter::Redirect; + + for(int i = 1; i < parts.size(); i++) { + const QString domain = parts.at(i); + Rule r; + r.action = action; + r.domainHash = qHash(domain); + if(action == UrlFilter::Redirect) + r.redirect = redirect; + + rules.emplace_back(std::move(r)); + } +} + diff --git a/lib/urlfilter/hostlist/hostlist.h b/lib/urlfilter/hostlist/hostlist.h new file mode 100644 index 0000000..d4a8d87 --- /dev/null +++ b/lib/urlfilter/hostlist/hostlist.h @@ -0,0 +1,44 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_URLFILTER_HOSTLIST +#define SMOLBOTE_URLFILTER_HOSTLIST + +#include "urlfilter.h" +#include +#include +#include + +class QIODevice; +class HostList : public UrlFilter +{ +public: + + HostList(QIODevice *device); + ~HostList() = default; + + QString metadata(const QString &key) const override; + int ruleCount() const; + std::pair match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override; + +protected: + void parseLine(const QString &line); + +private: + QHash m_metadata; + + struct Rule { + UrlFilter::MatchResult action = UrlFilter::NotMatched; + uint domainHash; + QString redirect; + }; + + std::vector rules; +}; + +#endif // SMOLBOTE_URLFILTER_HOSTLIST diff --git a/lib/urlfilter/matcher.h b/lib/urlfilter/matcher.h new file mode 100644 index 0000000..6696958 --- /dev/null +++ b/lib/urlfilter/matcher.h @@ -0,0 +1,109 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_URLFILTER_MATCHER +#define SMOLBOTE_URLFILTER_MATCHER + +#include +#include +#include +#include +#include +#include + +/** An interface class so we can use templated ContentsMatcher interchangeably + */ +class Matcher +{ +public: + virtual ~Matcher() = default; + + virtual void setCaseSensitive(bool matchCase) = 0; + virtual bool hasMatch(const QString &where) const = 0; +}; + +template +class ContentsMatcher : public Matcher +{ +public: + ContentsMatcher(const QString &pattern, UrlFilter::MatchType type) + : patternLength(pattern.length()) + , matchType(type) + { + if constexpr(std::is_same_v) { + matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption); + matcher.setPattern(pattern); + } else if constexpr(std::is_same_v) { + matcher.setCaseSensitivity(Qt::CaseInsensitive); + matcher.setPattern(pattern); + } else if constexpr(std::is_same_v) { + matcher = QUrl::fromUserInput(pattern).host(); + } + } + ~ContentsMatcher() = default; + + void setCaseSensitive(bool matchCase) override + { + if constexpr(std::is_same_v) { + auto options = matcher.patternOptions(); + options.setFlag(QRegularExpression::CaseInsensitiveOption, !matchCase); + matcher.setPatternOptions(options); + + } else if constexpr(std::is_same_v) { + matcher.setCaseSensitivity(matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive); + } + } + + bool hasMatch(const QString &where) const override + { + if constexpr(std::is_same_v) { + switch (matchType) { + case UrlFilter::InvalidMatch: + case UrlFilter::RegularExpressionMatch: + case UrlFilter::DomainMatch: + qWarning("ContentsMatcher is a String Matcher, but not doing string matching!"); + return false; + + case UrlFilter::StringContains: + return (matcher.indexIn(where) != -1); + + case UrlFilter::StringStartsWith: + return (matcher.indexIn(where) == 0); + + case UrlFilter::StringEndsWith: + return (matcher.indexIn(where) == where.length() - patternLength); + + case UrlFilter::StringEquals: + return (matcher.indexIn(where) == 0) && (patternLength == where.length()); + } + + } else if constexpr(std::is_same_v) { + if(matchType != UrlFilter::RegularExpressionMatch) + qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!"); + return matcher.match(where).hasMatch(); + } else if constexpr(std::is_same_v) { + // TODO: fix + if(matchType == UrlFilter::DomainMatch) { +// qDebug("matching %s", qUtf8Printable(QUrl(where).host())); + return QUrl(where).host().endsWith(matcher); + } else + return matcher == where; + } + + qWarning("Matcher has no backend, returning false"); + return false; + } + +private: + const int patternLength; + const UrlFilter::MatchType matchType; + T matcher; +}; + +#endif // SMOLBOTE_URLFILTER_MATCHER + diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build index 1f4f47c..b017eb5 100644 --- a/lib/urlfilter/meson.build +++ b/lib/urlfilter/meson.build @@ -1,19 +1,26 @@ -urlfilter_inc = include_directories('.') - -urlfilter_moc = qt5.preprocess( - moc_headers: 'filtertree.h', - dependencies: dep_qt5 -) - urlfilter_lib = static_library('urlfilter', - ['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc, - 'domain.cpp', 'domain.h', - 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', - 'formats/adblocklist.cpp'], + ['urlfilter.h', 'matcher.h', + 'hostlist/hostlist.cpp', 'hostlist/hostlist.h', + 'adblock/adblocklist.cpp', 'adblock/adblocklist.h', 'adblock/parser.cpp', 'adblock/parser.h'], dependencies: dep_qt5 ) dep_urlfilter = declare_dependency( - include_directories: urlfilter_inc, + include_directories: include_directories('.'), link_with: urlfilter_lib ) + +if get_option('testing').enabled() + test('urlfilter: matcher', + executable('urlfilter-matcher', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/matcher.cpp']), + workdir: meson.current_source_dir() / 'test' + ) + test('urlfilter: host list', + executable('urlfilter-hostlist', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/hostlist.cpp']), + workdir: meson.current_source_dir() / 'test' + ) + test('urlfilter: adblock list', + executable('urlfilter-adblocklist', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/adblock.cpp']), + workdir: meson.current_source_dir() / 'test' + ) +endif diff --git a/lib/urlfilter/test/adblock.cpp b/lib/urlfilter/test/adblock.cpp new file mode 100644 index 0000000..ecb94ee --- /dev/null +++ b/lib/urlfilter/test/adblock.cpp @@ -0,0 +1,88 @@ +#include "urlfilter.h" +#include "adblock/adblocklist.h" +#include +#include + +AdBlockList *list = nullptr; + +TEST(AdBlockList, MetaData) { + EXPECT_STREQ(qUtf8Printable(list->metadata("Homepage")), "http://example.com/"); + EXPECT_STREQ(qUtf8Printable(list->metadata("Title")), "FooList"); + EXPECT_STREQ(qUtf8Printable(list->metadata("Expires")), "5 days"); + EXPECT_STREQ(qUtf8Printable(list->metadata("Redirect")), "http://example.com/list.txt"); + EXPECT_STREQ(qUtf8Printable(list->metadata("Version")), "1234"); +} + +TEST(AdBlockList, BasicFilter) { + // Rule: /banner/*/img^ + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/img"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/bar/img?param"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner//img/foo"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block); + + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo.png"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/img"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/imgraph"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/img.gif"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched); + + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/ads/img.png"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +TEST(AdBlockList, MatchBeginningEnd) { + // Rule: |http://beginning-pattern.com + EXPECT_EQ(list->match(QUrl(), QUrl("http://beginning-pattern.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("https://beginning-pattern.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); + // Rule: end-pattern| + EXPECT_EQ(list->match(QUrl(), QUrl("https://endpattern.com/end-pattern"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("https://endpattern.com/end-pattern/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +TEST(AdBlockList, Domain) { + // Rule: ||ads.example.com^ + EXPECT_EQ(list->match(QUrl(), QUrl("http://ads.example.com/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://server1.ads.example.com/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("https://ads.example.com:8000/"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + + EXPECT_EQ(list->match(QUrl(), QUrl("http://ads.example.com.ua/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/redirect/http://ads.example.com/"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +TEST(AdBlockList, RegularExpression) { + // Rule: /banner\d+/ + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner123"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner321"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banners"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +TEST(AdBlockList, MatchCase) { + // Rule: matchThisCase$match-case + EXPECT_EQ(list->match(QUrl(), QUrl("http://matchcase.com/matchThisCase"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl("http://matchcase.com/MatchThisCase"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +TEST(AdBlockList, DomainOption) { + // Rule: domain-limited-string$domain=example.com + EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/domain-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/another-domain-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); + EXPECT_EQ(list->match(QUrl("https://another.com"), QUrl("https://example.com/domain-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); + + //Rule: exception-limited-string$domain=~example.com + EXPECT_EQ(list->match(QUrl("https://another.com"), QUrl("https://example.com/exception-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/exception-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +int main(int argc, char **argv) { + QFile f("adblock.txt"); + if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) { + qDebug("Could not open list"); + return -1; + } + + list = new AdBlockList(&f); + f.close(); + + qDebug("Parsed %i rules", list->ruleCount()); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/urlfilter/test/adblock.txt b/lib/urlfilter/test/adblock.txt new file mode 100644 index 0000000..635ce09 --- /dev/null +++ b/lib/urlfilter/test/adblock.txt @@ -0,0 +1,26 @@ +! Homepage: http://example.com/ +! Title: FooList +! Expires: 5 days +! Redirect: http://example.com/list.txt +! Version: 1234 + +/banner/*/img^ +||ads.example.com^ +|http://example.com/| +/banner\d+/ + +! match beginning +||http://beginning-pattern.com +! match end +end-pattern| + +! options +! match-case +matchThisCase$match-case + +! domain limiting +! only apply this filter on this domain +domain-limited-string$domain=example.com +! apply this filter to all domains but the listed one +exception-limited-string$domain=~example.com + diff --git a/lib/urlfilter/test/hostlist.cpp b/lib/urlfilter/test/hostlist.cpp new file mode 100644 index 0000000..041cd5f --- /dev/null +++ b/lib/urlfilter/test/hostlist.cpp @@ -0,0 +1,34 @@ +#include +#include "hostlist/hostlist.h" +#include + +HostList *list = nullptr; + +TEST(AdBlockList, Block) { + EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.first"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.second"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block); + + const auto r = list->match(QUrl(), QUrl::fromUserInput("localhost.localdomain"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame); + EXPECT_EQ(r.first, UrlFilter::Redirect); + EXPECT_EQ(r.second, QString("127.0.0.1")); + + EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("other.domain"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched); +} + +int main(int argc, char **argv) { + QFile f("hostlist.txt"); + if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) { + qDebug("Could not open list"); + return -1; + } + + list = new HostList(&f); + f.close(); + + qDebug("Parsed %i rules", list->ruleCount()); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/urlfilter/test/hostlist.txt b/lib/urlfilter/test/hostlist.txt new file mode 100644 index 0000000..a0b4e5c --- /dev/null +++ b/lib/urlfilter/test/hostlist.txt @@ -0,0 +1,6 @@ +# This is a comment, and after it comes a blank line + +127.0.0.1 localhost.localdomain + +0.0.0.0 blockeddomain.com +0.0.0.0 blockeddomain.first blockeddomain.second diff --git a/lib/urlfilter/test/matcher.cpp b/lib/urlfilter/test/matcher.cpp new file mode 100644 index 0000000..1c1efbf --- /dev/null +++ b/lib/urlfilter/test/matcher.cpp @@ -0,0 +1,42 @@ +#include "urlfilter.h" +#include "matcher.h" +#include + +TEST(Matcher, StringContains) { + ContentsMatcher matcher("spam-pattern", UrlFilter::StringContains); + EXPECT_TRUE(matcher.hasMatch("this string contains a spam-pattern")); + EXPECT_FALSE(matcher.hasMatch("this string does not contain the pattern")); +} + +TEST(Matcher, StringStartsWith) { + ContentsMatcher matcher("beginning", UrlFilter::StringStartsWith); + EXPECT_TRUE(matcher.hasMatch("beginning this string is the pattern")); + EXPECT_FALSE(matcher.hasMatch("ending this string is the pattern, the word beginning")); + EXPECT_FALSE(matcher.hasMatch("this would be a string where the pattern cannot be found")); +} + +TEST(Matcher, StringEndsWith) { + ContentsMatcher matcher("ending", UrlFilter::StringEndsWith); + EXPECT_TRUE(matcher.hasMatch("this string has the proper ending")); + EXPECT_FALSE(matcher.hasMatch("and this string doesn't")); +} + +TEST(Matcher, StringEquals) { + ContentsMatcher matcher("string-to-match", UrlFilter::StringEquals); + EXPECT_TRUE(matcher.hasMatch("string-to-match")); + EXPECT_FALSE(matcher.hasMatch("same-len-string")); + EXPECT_FALSE(matcher.hasMatch("not the string-to-match")); +} + +TEST(Matcher, RegularExpression) { + ContentsMatcher matcher("banner\\d+", UrlFilter::RegularExpressionMatch); + EXPECT_TRUE(matcher.hasMatch("http://another.com/banner123")); + EXPECT_TRUE(matcher.hasMatch("http://another.com/banner321")); + EXPECT_FALSE(matcher.hasMatch("http://another.com/banners")); + +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/lib/urlfilter/urlfilter.h b/lib/urlfilter/urlfilter.h new file mode 100644 index 0000000..e15122a --- /dev/null +++ b/lib/urlfilter/urlfilter.h @@ -0,0 +1,43 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include +#include +#include +#include + +#ifndef SMOLBOTE_URLFILTER_FILTER +#define SMOLBOTE_URLFILTER_FILTER + +class UrlFilter +{ +public: + enum MatchResult { + NotMatched, + Allow, + Block, + Redirect + }; + + enum MatchType { + InvalidMatch, + RegularExpressionMatch, + StringContains, + StringStartsWith, + StringEndsWith, + StringEquals, + DomainMatch + }; + + virtual ~UrlFilter() = default; + + virtual QString metadata(const QString &key) const = 0; + virtual std::pair match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const = 0; +}; + +#endif // SMOLBOTE_URLFILTER_FILTER -- cgit v1.2.1