aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorAqua-sama <aqua@iserlohn-fortress.net>2019-01-09 19:38:58 +0100
committerAqua-sama <aqua@iserlohn-fortress.net>2019-01-09 19:38:58 +0100
commit3d2ae07c455c0e423c64f19e445518427a5684fa (patch)
tree58f6b47c3db33658a6f2e605fd021f08d1fa9964 /lib
parentAdd assorted unfished doc files to repo (diff)
downloadsmolbote-3d2ae07c455c0e423c64f19e445518427a5684fa.tar.xz
Rewrite lib/urlfilter
- Make HostList and AdBlockList implementations independent from each other - Move urlfilter tests to lib/urlfilter
Diffstat (limited to 'lib')
-rw-r--r--lib/urlfilter/adblock/adblocklist.cpp188
-rw-r--r--lib/urlfilter/adblock/adblocklist.h42
-rw-r--r--lib/urlfilter/adblock/parser.cpp75
-rw-r--r--lib/urlfilter/adblock/parser.h (renamed from lib/urlfilter/formats/adblockrule_parse.h)13
-rw-r--r--lib/urlfilter/domain.cpp65
-rw-r--r--lib/urlfilter/domain.h33
-rw-r--r--lib/urlfilter/filterleaf.cpp14
-rw-r--r--lib/urlfilter/filterleaf.h60
-rw-r--r--lib/urlfilter/filtertree.cpp94
-rw-r--r--lib/urlfilter/filtertree.h61
-rw-r--r--lib/urlfilter/formats/adblocklist.cpp95
-rw-r--r--lib/urlfilter/formats/adblocklist.h32
-rw-r--r--lib/urlfilter/formats/adblockrule.cpp63
-rw-r--r--lib/urlfilter/formats/adblockrule_parse.cpp181
-rw-r--r--lib/urlfilter/formats/hostlistrule.cpp29
-rw-r--r--lib/urlfilter/formats/hostlistrule.h27
-rw-r--r--lib/urlfilter/hostlist/hostlist.cpp79
-rw-r--r--lib/urlfilter/hostlist/hostlist.h44
-rw-r--r--lib/urlfilter/matcher.h (renamed from lib/urlfilter/formats/adblockrule.h)88
-rw-r--r--lib/urlfilter/meson.build31
-rw-r--r--lib/urlfilter/test/adblock.cpp88
-rw-r--r--lib/urlfilter/test/adblock.txt26
-rw-r--r--lib/urlfilter/test/hostlist.cpp34
-rw-r--r--lib/urlfilter/test/hostlist.txt6
-rw-r--r--lib/urlfilter/test/matcher.cpp42
-rw-r--r--lib/urlfilter/urlfilter.h43
26 files changed, 733 insertions, 820 deletions
diff --git a/lib/urlfilter/adblock/adblocklist.cpp b/lib/urlfilter/adblock/adblocklist.cpp
new file mode 100644
index 0000000..c749e9e
--- /dev/null
+++ b/lib/urlfilter/adblock/adblocklist.cpp
@@ -0,0 +1,188 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "adblocklist.h"
+#include "parser.h"
+#include <QIODevice>
+#include <QTextStream>
+#include <QDebug>
+
+// Builds the rule list from an already opened device: the stream is consumed
+// line by line until its end and every line is handed to parseLine().
+AdBlockList::AdBlockList(QIODevice *device)
+{
+    Q_ASSERT(device->isOpen());
+
+    for(QTextStream stream(device); !stream.atEnd();) {
+        const QString currentLine = stream.readLine();
+        parseLine(currentLine);
+    }
+
+    // print the metadata collected from the comment lines to the debug log
+    qDebug() << m_metadata;
+}
+
+// Releases the matcher that parseLine() allocated for each stored rule.
+AdBlockList::~AdBlockList()
+{
+    for(auto it = rules.begin(); it != rules.end(); ++it) {
+        delete it->matcher;
+    }
+}
+
+// Returns the metadata value stored under key, or a default-constructed
+// QString when the list did not provide that key.
+QString AdBlockList::metadata(const QString& key) const
+{
+    const auto entry = m_metadata.constFind(key);
+    if(entry == m_metadata.constEnd())
+        return QString();
+
+    return entry.value();
+}
+
+// Number of rules that were parsed into this list.
+int AdBlockList::ruleCount() const
+{
+    // the container reports an unsigned size; the narrowing is spelled out
+    const auto count = rules.size();
+    return static_cast<int>(count);
+}
+
+// Decides whether a rule's resource-type options allow it to be applied to a
+// request of the given type. The hash maps a type to true when the option
+// selects that type and to false when the option was negated with '~'.
+static bool appliesToResourceType(const QHash<QWebEngineUrlRequestInfo::ResourceType, bool> &options, QWebEngineUrlRequestInfo::ResourceType type)
+{
+    // no type options: the rule applies to every kind of request
+    if(options.isEmpty())
+        return true;
+
+    // the type is listed explicitly: honour the stored flag instead of
+    // treating a negated option as if it selected the type
+    const auto listed = options.constFind(type);
+    if(listed != options.constEnd())
+        return listed.value();
+
+    // the type is not listed: the rule is restricted to other types as soon
+    // as any option positively selects one; a list made up only of negated
+    // options leaves every unlisted type enabled
+    for(auto it = options.constBegin(); it != options.constEnd(); ++it) {
+        if(it.value())
+            return false;
+    }
+
+    return true;
+}
+
+// Checks a request against the rules in the order they were parsed and returns
+// the action of the first rule that matches.
+//
+// firstParty - url of the page issuing the request; only its host is used, and
+//              it is compared for equality with the rule's domain lists
+// requestUrl - url being requested; matched in its string form
+// type       - resource type of the request, checked against the rule options
+//
+// Returns the matching rule's action paired with an empty string, or
+// UrlFilter::NotMatched when no rule applies.
+std::pair<UrlFilter::MatchResult, QString> AdBlockList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    const QString domain = firstParty.host();
+    const QString request = requestUrl.toString();
+
+    for(const Rule &r : rules) {
+        // the rule's type options exclude this kind of request
+        if(!appliesToResourceType(r.options, type))
+            continue;
+
+        // the rule is explicitly disabled on this first-party host
+        if(r.disabledOn.contains(domain))
+            continue;
+
+        // the rule is limited to a set of hosts and this is not one of them
+        if(!r.enabledOn.isEmpty() && !r.enabledOn.contains(domain))
+            continue;
+
+        if(r.matcher->hasMatch(request))
+            return std::make_pair(r.action, QString());
+    }
+
+    return std::make_pair(UrlFilter::NotMatched, QString());
+}
+
+// Parses a single line of the filter list.
+//
+// Empty lines are ignored. Comment lines (starting with '!') are only used to
+// fill m_metadata, and only for the keys listed in `keys`. Element hiding
+// rules ("##" / "#@#") are logged and skipped. Every other line becomes a Rule
+// that is appended to `rules`.
+void AdBlockList::parseLine(const QString& line)
+{
+    QString parsedLine = line.trimmed();
+
+    if(parsedLine.isEmpty())
+        return;
+
+    // comment line: keep it only when it carries one of the known metadata keys
+    if(parsedLine.startsWith(QLatin1Literal("!"))) {
+        const auto comment = parseComment(parsedLine);
+
+        if(comment) {
+            const auto key = comment.value().first;
+            if(keys.contains(key))
+                m_metadata[key] = comment.value().second;
+        }
+
+        return;
+    }
+
+    // css rule -> element blocking is not implemented by this list
+    if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) {
+        qDebug("TODO: %s", qUtf8Printable(parsedLine));
+        return;
+    }
+
+    Rule r;
+    r.action = UrlFilter::Block;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
+        r.action = UrlFilter::Allow;
+        parsedLine.remove(0, 2);
+    }
+
+    bool matchCase = false;
+
+    // parse options: everything after the first '$' is a comma separated list
+    {
+        const int sepPos = parsedLine.indexOf(QLatin1Literal("$"));
+        if(sepPos != -1) {
+            const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(","));
+            parsedLine = parsedLine.mid(0, sepPos);
+
+            for(const QString &option : options) {
+                if(option.startsWith(QLatin1Literal("domain"))) {
+                    // skip the 7 characters of "domain=", the rest is a '|' separated host list
+                    const auto domainList = option.mid(7).split(QLatin1Literal("|"));
+
+                    for(const QString &domain : domainList) {
+                        if(domain.startsWith(QLatin1Literal("~"))) {
+                            r.disabledOn.append(domain.mid(1));
+                        } else {
+                            r.enabledOn.append(domain);
+                        }
+                    }
+                } else if(option.endsWith(QLatin1Literal("match-case"))) {
+                    matchCase = !option.startsWith(QLatin1Literal("~"));
+
+                } else {
+                    const auto pair = parseResourceOption(option);
+                    if(pair)
+                        r.options.insert(pair.value().first, pair.value().second);
+                }
+            }
+        }
+    }
+
+    if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
+        // regular expression rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        r.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, UrlFilter::RegularExpressionMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
+        // domain rule: strip the leading "||" and the trailing "^"
+        parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
+        r.matcher = new ContentsMatcher<QString>(parsedLine, UrlFilter::DomainMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string equals rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringEquals);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
+        // string starts with rule
+        parsedLine = parsedLine.mid(2);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringStartsWith);
+
+    } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string ends with rule
+        parsedLine.chop(1);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringEndsWith);
+
+    } else {
+        // generic contains rule
+
+        // remove beginning and ending wildcards
+        if(parsedLine.startsWith(QLatin1Literal("*")))
+            parsedLine = parsedLine.mid(1);
+
+        if(parsedLine.endsWith(QLatin1Literal("*")))
+            parsedLine.chop(1);
+
+        if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) {
+            // check for wildcards and translate to regexp
+            // wildcard "*" - any number of characters
+            // separator "^" - end, ? or /
+            //
+            // The "||" anchor is written as \A rather than ^, because the
+            // separator replacement below rewrites every '^' in the pattern
+            // and would otherwise destroy the anchor.
+            // NOTE(review): other regexp metacharacters in the rule (such as
+            // '.' or '?') are passed through unescaped.
+            parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("\\A\\w+://"));
+            parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|"));
+            parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*"));
+            parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)"));
+
+            r.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, UrlFilter::RegularExpressionMatch);
+
+        } else {
+            r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringContains);
+        }
+    }
+
+    // verify the allocation before the matcher is used for the first time
+    Q_CHECK_PTR(r.matcher);
+    r.matcher->setCaseSensitive(matchCase);
+
+    rules.emplace_back(std::move(r));
+}
+
diff --git a/lib/urlfilter/adblock/adblocklist.h b/lib/urlfilter/adblock/adblocklist.h
new file mode 100644
index 0000000..ee41e11
--- /dev/null
+++ b/lib/urlfilter/adblock/adblocklist.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "urlfilter.h"
+#include "matcher.h"
+#include <QHash>
+#include <QWebEngineUrlRequestInfo>
+
+class QIODevice;
+
+// UrlFilter implementation that is filled from a filter list read from a
+// QIODevice. Each parsed rule owns a Matcher, which is deleted in the
+// destructor.
+// NOTE(review): the class holds raw owning pointers but does not disable
+// copying; copying an instance would leave two owners of the same matchers.
+class AdBlockList : public UrlFilter
+{
+public:
+    // comment keys that are stored as metadata while parsing
+    // TODO: check if all keys are listed
+    const QStringList keys = { "Version", "Title", "Last modified", "Expires", "Homepage", "Licence", "Redirect" };
+
+    // reads the whole list from device
+    AdBlockList(QIODevice *device);
+    ~AdBlockList();
+
+    // value stored for a metadata key
+    QString metadata(const QString &key) const override;
+    // number of parsed rules
+    int ruleCount() const;
+    // action of the first rule that matches the request
+    std::pair<MatchResult, QString> match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override;
+
+protected:
+    // parses one line of the list into metadata or a rule
+    void parseLine(const QString &line);
+
+private:
+    QHash<QString, QString> m_metadata;
+
+    struct Rule {
+        // result reported when the rule matches
+        UrlFilter::MatchResult action = UrlFilter::NotMatched;
+        // owned by the list; starts out null so that a rule without a matcher
+        // never carries an indeterminate pointer
+        Matcher *matcher = nullptr;
+        // first-party hosts the rule is limited to / excluded from
+        QStringList enabledOn, disabledOn;
+        // resource type options; the value is false for options negated with '~'
+        QHash<QWebEngineUrlRequestInfo::ResourceType, bool> options;
+    };
+
+    std::vector<Rule> rules;
+};
diff --git a/lib/urlfilter/adblock/parser.cpp b/lib/urlfilter/adblock/parser.cpp
new file mode 100644
index 0000000..1e7f0bc
--- /dev/null
+++ b/lib/urlfilter/adblock/parser.cpp
@@ -0,0 +1,75 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "parser.h"
+
+// Splits a comment line of the form "!<key>: <value>" into its key and value.
+//
+// The first character of the line (the comment marker) is dropped. The key is
+// the text up to the first ": " separator and the value is everything after
+// it, so values that themselves contain ": " are kept whole. Both parts are
+// trimmed.
+//
+// Returns std::nullopt when there is no separator after the first character.
+std::optional<std::pair<QString, QString>> parseComment(QString &line)
+{
+    const QLatin1Literal separator(": ");
+
+    // start looking behind the comment marker; a separator at index 0 cannot
+    // delimit a key
+    const int sepPos = line.indexOf(separator, 1);
+    if(sepPos == -1)
+        return std::nullopt;
+
+    const QString key = line.mid(1, sepPos - 1).trimmed();
+    const QString value = line.mid(sepPos + separator.size()).trimmed();
+    return std::make_pair(key, value);
+}
+
+// Translates a resource type option of a filter rule into the matching
+// QWebEngineUrlRequestInfo::ResourceType.
+//
+// The option may be prefixed with '~' to negate it. The name is compared as a
+// whole, so an option that merely ends in a known name is not mistaken for it.
+//
+// Returns the resource type paired with true for a plain option and false for
+// a negated one, or std::nullopt when the option has no resource type here.
+std::optional<std::pair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseResourceOption(const QString &option)
+{
+    const QString trimmed = option.trimmed();
+    const bool negated = trimmed.startsWith(QLatin1Literal("~"));
+    const QString name = negated ? trimmed.mid(1) : trimmed;
+
+    struct KnownOption {
+        const char *name;
+        QWebEngineUrlRequestInfo::ResourceType type;
+    };
+
+    static const KnownOption knownOptions[] = {
+        // external scripts loaded via HTML script tag
+        { "script", QWebEngineUrlRequestInfo::ResourceTypeScript },
+        // regular images, typically loaded via HTML img tag
+        { "image", QWebEngineUrlRequestInfo::ResourceTypeImage },
+        // external CSS stylesheet files
+        { "stylesheet", QWebEngineUrlRequestInfo::ResourceTypeStylesheet },
+        // content handled by browser plugins, e.g. Flash or Java
+        { "object", QWebEngineUrlRequestInfo::ResourceTypeObject },
+        // requests started using the XMLHttpRequest object or fetch() API
+        { "xmlhttprequest", QWebEngineUrlRequestInfo::ResourceTypeXhr },
+        // requests started by plugins like Flash
+        { "object-subrequest", QWebEngineUrlRequestInfo::ResourceTypePluginResource },
+        // embedded pages, usually included via HTML frames
+        { "subdocument", QWebEngineUrlRequestInfo::ResourceTypeSubFrame },
+        // requests started by <a ping> or navigator.sendBeacon()
+        { "ping", QWebEngineUrlRequestInfo::ResourceTypePing },
+        // the page itself
+        { "document", QWebEngineUrlRequestInfo::ResourceTypeMainFrame },
+        { "other", QWebEngineUrlRequestInfo::ResourceTypeUnknown },
+    };
+
+    for(const KnownOption &known : knownOptions) {
+        if(name == QLatin1Literal(known.name))
+            return std::make_pair(known.type, !negated);
+    }
+
+    if(name == QLatin1Literal("websocket")) {
+        // requests initiated via WebSocket object
+        qDebug("Resource type 'websocket' not available");
+        return std::nullopt;
+    }
+
+    if(name == QLatin1Literal("webrtc")) {
+        // connections opened via RTCPeerConnection instances to ICE servers
+        qDebug("Resource type 'webrtc' not available");
+        return std::nullopt;
+    }
+
+    qDebug("TODO: %s", qUtf8Printable(option));
+    return std::nullopt;
+}
diff --git a/lib/urlfilter/formats/adblockrule_parse.h b/lib/urlfilter/adblock/parser.h
index 01255ca..c73a9cf 100644
--- a/lib/urlfilter/formats/adblockrule_parse.h
+++ b/lib/urlfilter/adblock/parser.h
@@ -6,12 +6,9 @@
* SPDX-License-Identifier: GPL-3.0
*/
-#ifndef ADBLOCKRULE_PARSE_H
-#define ADBLOCKRULE_PARSE_H
+#include <QWebEngineUrlRequestInfo>
+#include <optional>
+#include <utility>
-class AdBlockRule;
-
-AdBlockRule *parseRule_adblock(const QString &filter);
-std::optional<QPair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseOption(const QString &option);
-
-#endif // ADBLOCKRULE_PARSE_H
+std::optional<std::pair<QString, QString>> parseComment(QString &line);
+std::optional<std::pair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseResourceOption(const QString &option);
diff --git a/lib/urlfilter/domain.cpp b/lib/urlfilter/domain.cpp
deleted file mode 100644
index 2bfd524..0000000
--- a/lib/urlfilter/domain.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#include "domain.h"
-
-Domain::Domain(const QString &domain)
- : m_domain(domain)
- , m_hash(qHash(domain, 0))
-{
-}
-
-Domain::Domain(Domain &&other)
- : m_domain(std::move(other.m_domain))
- , m_hash(std::move(other.m_hash))
-{
-}
-
-Domain &Domain::operator=(Domain &&other)
-{
- m_domain = std::move(other.m_domain);
- m_hash = other.m_hash;
- return *this;
-}
-
-bool Domain::matches(const QUrl &url) const
-{
- // empty domain matches all
- if(m_domain.isEmpty() || url.isEmpty())
- return true;
-
- const QString domain = url.host();
-
- // domain and filter are the same
- if(domain == m_domain) {
- return true;
- }
-
- // domain cannot be matched if it doesn't end with filter
- // ex. example2.com isn't matched by example.com
- if(!domain.endsWith(m_domain)) {
- return false;
- }
-
- // match with subdomains
- // ex. subdomain.example.com is matched by example.com
- int index = domain.indexOf(m_domain);
-
- // match if (domain ends with filter) && (filter has been found) and (character before filter is '.')
- return index > 0 && domain[index - 1] == QLatin1Char('.');
-}
-
-bool Domain::matchesExactly(uint hash) const
-{
- return (m_hash == hash);
-}
-
-QString Domain::host() const
-{
- return m_domain;
-}
diff --git a/lib/urlfilter/domain.h b/lib/urlfilter/domain.h
deleted file mode 100644
index 0406f0f..0000000
--- a/lib/urlfilter/domain.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#ifndef SMOLBOTE_DOMAIN_H
-#define SMOLBOTE_DOMAIN_H
-
-#include <QString>
-#include <QUrl>
-
-class Domain
-{
-public:
- explicit Domain(const QString &domain);
- explicit Domain(Domain &&other);
- Domain &operator=(Domain &&other);
-
- // match domain and subdomains of domain
- bool matches(const QUrl &url) const;
- // exact match of domain
- bool matchesExactly(uint hash) const;
- QString host() const;
-
-private:
- QString m_domain;
- uint m_hash;
-};
-
-#endif // SMOLBOTE_DOMAIN_H
diff --git a/lib/urlfilter/filterleaf.cpp b/lib/urlfilter/filterleaf.cpp
deleted file mode 100644
index 5797718..0000000
--- a/lib/urlfilter/filterleaf.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "filterleaf.h"
-
-const QString FilterLeaf::request() const
-{
- return m_request;
-}
-
-std::optional<bool> FilterLeaf::option(QWebEngineUrlRequestInfo::ResourceType opt) const
-{
- if(resourceTypeOptions.contains(opt))
- return resourceTypeOptions.value(opt);
- else
- return std::nullopt;
-}
diff --git a/lib/urlfilter/filterleaf.h b/lib/urlfilter/filterleaf.h
deleted file mode 100644
index 64f465d..0000000
--- a/lib/urlfilter/filterleaf.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#ifndef SMOLBOTE_FILTERLEAF_H
-#define SMOLBOTE_FILTERLEAF_H
-
-#include <QHash>
-#include <QObject>
-#include <QString>
-#include <QWebEngineUrlRequestInfo>
-#include <optional>
-#include <utility>
-#include <QVariant>
-
-class FilterLeaf
-{
-public:
- enum Action {
- NotMatched,
- Allow,
- Block,
- Redirect
- };
-
- enum UrlMatchType {
- InvalidMatch,
- RegularExpressionMatch,
- StringContains,
- StringStartsWith,
- StringEndsWith,
- StringEquals,
- DomainMatch
- };
-
- virtual ~FilterLeaf() = default;
-
- virtual bool match(const QUrl &requestUrl) const = 0;
- virtual std::pair<Action, QVariant> action() const = 0;
-
- const QString request() const;
- std::optional<bool> option(QWebEngineUrlRequestInfo::ResourceType opt) const;
-
-protected:
- // rule matching
- UrlMatchType matchType = InvalidMatch;
- QHash<QWebEngineUrlRequestInfo::ResourceType, bool> resourceTypeOptions;
- QString m_request;
-
- // rule action
- bool m_isBlocking;
-};
-
-Q_DECLARE_METATYPE(FilterLeaf::Action)
-
-#endif // SMOLBOTE_FILTERLEAF_H
diff --git a/lib/urlfilter/filtertree.cpp b/lib/urlfilter/filtertree.cpp
deleted file mode 100644
index 2cdd6d0..0000000
--- a/lib/urlfilter/filtertree.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#include "filtertree.h"
-#include "filterleaf.h"
-#include "formats/hostlistrule.h"
-#include <QTextStream>
-
-bool loadHostlist(QIODevice &from, FilterTree *tree)
-{
- Q_ASSERT(from.isReadable());
- QTextStream stream(&from);
- while(!stream.atEnd()) {
- const QString line = stream.readLine().trimmed();
- if(line.isEmpty() || line.startsWith(QLatin1Literal("#")))
- continue;
-
- const QStringList &parts = line.split(QLatin1Literal(" "));
- if(parts.length() < 2) {
-#ifdef QT_DEBUG
- qDebug("Cannot parse: %s", qUtf8Printable(line));
-#endif
- return false;
- }
-
- for(int i = 1; i < parts.length(); ++i) {
- // HostlistRule(domain, redirect)
- auto *rule = new HostlistRule(parts.at(i), parts.constFirst());
- // addRule(rule, enable_on_domain)
- const bool added = tree->addRule(rule, QString());
- if(!added)
- return false;
- }
- }
- return true;
-}
-
-FilterTree::~FilterTree()
-{
- for(auto &branch : m_branches) {
- qDeleteAll(branch.leaves);
- branch.leaves.clear();
- }
-}
-
-const QStringList FilterTree::branches() const
-{
- QStringList branches;
- for(auto &branch : m_branches) {
- branches.append(branch.domain.host());
- }
- return branches;
-}
-
-QVector<const FilterLeaf *> FilterTree::match(const QUrl &domain, const QUrl &requestUrl) const
-{
- QVector<const FilterLeaf *> leaves;
- for(const auto &branch : m_branches) {
- if(branch.domain.matches(domain)) {
-
- for(const auto leaf : branch.leaves) {
- if(leaf->match(requestUrl)) {
- leaves.append(leaf);
- }
- }
- }
- }
- return leaves;
-}
-
-bool FilterTree::addRule(FilterLeaf *rule, const QString &domain)
-{
- branchLock.lock();
- this->branch(domain).leaves.emplace_back(rule);
- branchLock.unlock();
- return true;
-}
-
-FilterTree::Branch & FilterTree::branch(const QString& domain)
-{
- for(auto &branch : m_branches) {
- if(branch.domain.matches(QUrl(domain)))
- return branch;
- }
-
- // no branch was found
- Branch branch(domain);
- return m_branches.emplace_back(std::move(branch));
-}
diff --git a/lib/urlfilter/filtertree.h b/lib/urlfilter/filtertree.h
deleted file mode 100644
index f453a3d..0000000
--- a/lib/urlfilter/filtertree.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#ifndef SMOLBOTE_FILTERTREE_H
-#define SMOLBOTE_FILTERTREE_H
-
-#include "domain.h"
-#include "filterleaf.h"
-#include <QIODevice>
-#include <QObject>
-#include <QVector>
-#include <vector>
-#include <QMutex>
-
-/** FilterTree: B+ tree of filter rules
- * The tree contains branches that represent domains
- * Each domain-branch contains leaves (rules) that are to be applied to it.
- * Rules may be applied to multiple branches.
- */
-class FilterTree : public QObject
-{
- Q_OBJECT
-
-public:
- ~FilterTree();
-
- const QStringList branches() const;
- QVector<const FilterLeaf *> match(const QUrl &domain, const QUrl &requestUrl) const;
-
- bool addRule(FilterLeaf *rule, const QString &domain);
-
-private:
- struct Branch {
- explicit Branch(const QString &host)
- : domain(host)
- {
- }
- explicit Branch(Branch &&other)
- : domain(std::move(other.domain))
- , leaves(std::move(other.leaves))
- {
- }
-
- Domain domain;
- std::vector<FilterLeaf *> leaves;
- };
-
- Branch& branch(const QString &domain);
-
- QMutex branchLock;
- std::vector<Branch> m_branches;
-};
-
-bool loadHostlist(QIODevice &from, FilterTree *tree);
-
-#endif // SMOLBOTE_FILTERTREE_H
diff --git a/lib/urlfilter/formats/adblocklist.cpp b/lib/urlfilter/formats/adblocklist.cpp
deleted file mode 100644
index 772c252..0000000
--- a/lib/urlfilter/formats/adblocklist.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "adblocklist.h"
-
-AdBlockList::AdBlockList()
-{
-}
-
-QString AdBlockList::metadata(const QString &key) const
-{
- return m_metadata.value(key, QString());
-}
-
-FilterLeaf::Action AdBlockList::match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
-{
- const QString request = requestUrl.toString();
-
- for(auto &filter : m_rules) {
- if(filter.matcher->hasMatch(request))
- return filter.action;
- }
- return FilterLeaf::NotMatched;
-}
-
-bool AdBlockList::parseLine(const QString &line)
-{
- // remove whitespace from start/end of the line
- QString parsedLine = line.trimmed();
-
- // check if the line is empty
- if(parsedLine.isEmpty())
- return false;
-
- // parse comment
- if(parsedLine.startsWith(QLatin1Literal("!")))
- return parseComment(parsedLine);
-
- Filter filter;
-
- // exception rules
- if(parsedLine.startsWith(QLatin1Literal("@@"))) {
- filter.action = FilterLeaf::Allow;
- parsedLine.remove(0, 2);
- }
-
- // remove '*' at the beginning and the end
- if(parsedLine.startsWith(QLatin1Literal("*")))
- parsedLine = parsedLine.mid(1);
- if(parsedLine.endsWith(QLatin1Literal("*")))
- parsedLine.chop(1);
-
- if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
- // regular expression rule
- parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
- filter.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, FilterLeaf::RegularExpressionMatch);
-
- } else if(parsedLine.contains(QLatin1Literal("*"))) {
- parsedLine = QRegularExpression::wildcardToRegularExpression(parsedLine);
- filter.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, FilterLeaf::RegularExpressionMatch);
-
- } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
-// matchType = FilterLeaf::DomainMatch;
- parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
- filter.matcher = new ContentsMatcher<QString>(parsedLine, FilterLeaf::DomainMatch);
-
- } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
- // string equals rule
- parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
- filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringEquals);
-
- } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
- // string starts with rule
- parsedLine = parsedLine.mid(2);
- filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringStartsWith);
-
- } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
- // string ends with rule
- parsedLine.chop(1);
- filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringEndsWith);
-
- } else {
- // generic contains rule
- filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringContains);
- }
-
-
- Q_CHECK_PTR(filter.matcher);
- m_rules.emplace_back(std::move(filter));
- return true;
-}
-
-bool AdBlockList::parseComment(const QString &commentLine)
-{
- const QStringList comment = commentLine.mid(1).split(QLatin1Literal(": "));
- m_metadata[comment.at(0).trimmed()] = comment.at(1).trimmed();
- return true;
-}
diff --git a/lib/urlfilter/formats/adblocklist.h b/lib/urlfilter/formats/adblocklist.h
deleted file mode 100644
index 34a2120..0000000
--- a/lib/urlfilter/formats/adblocklist.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef ADBLOCKLIST_H
-#define ADBLOCKLIST_H
-
-#include <QHash>
-#include "adblockrule.h"
-
-class AdBlockList
-{
-public:
- AdBlockList();
-
- QString metadata(const QString &key) const;
- FilterLeaf::Action match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type = QWebEngineUrlRequestInfo::ResourceTypeUnknown) const;
-
- bool parseLine(const QString &line);
-
-protected:
- bool parseComment(const QString &commentLine);
-
-private:
- struct Filter
- {
- FilterLeaf::Action action = FilterLeaf::Block;
- Matcher *matcher;
- };
-
- QHash<QString, QString> m_metadata;
- //QMap<QString, Filter> m_rules;
- std::vector<Filter> m_rules;
-};
-
-#endif // ADBLOCKLIST_H
diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp
deleted file mode 100644
index 60e817f..0000000
--- a/lib/urlfilter/formats/adblockrule.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#include "adblockrule.h"
-#include <QRegExp>
-#include <QStringMatcher>
-
-AdBlockRule::AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action)
-{
- this->matchType = matchType;
- this->m_request = filter;
- this->m_isBlocking = (action == FilterLeaf::Block);
- //matcher.setPattern(filter);
- if(matchType == FilterLeaf::RegularExpressionMatch)
- regExp = new QRegExp(filter);
- else
- stringMatcher = new QStringMatcher(filter);
-}
-
-void AdBlockRule::mergeOptions(const QHash<QWebEngineUrlRequestInfo::ResourceType, bool> &options)
-{
- this->resourceTypeOptions.unite(options);
-}
-
-bool AdBlockRule::match(const QUrl &requestUrl) const
-{
- switch(matchType) {
- case FilterLeaf::RegularExpressionMatch:
- return (regExp->indexIn(requestUrl.toString()) != -1);
- default:
- return false;
- }
-}
-
-bool AdBlockRule::match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
-{
- // if request is of the required type, or there are no types set (== apply to all requests)
- if(this->resourceTypeOptions.contains(type) || this->resourceTypeOptions.isEmpty()) {
- switch(matchType) {
- case FilterLeaf::RegularExpressionMatch:
- return (regExp->indexIn(requestUrl.toString()) != -1);
- default:
- qWarning("Match type not implemented, returning false!");
- return false;
- }
- }
-
- // request type is not matched
- return false;
-}
-
-std::pair<FilterLeaf::Action, QVariant> AdBlockRule::action() const
-{
- if(m_isBlocking)
- return std::make_pair(FilterLeaf::Block, QVariant());
- else
- return std::make_pair(FilterLeaf::Allow, QVariant());
-}
diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp
deleted file mode 100644
index c01ddfd..0000000
--- a/lib/urlfilter/formats/adblockrule_parse.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#include "adblockrule.h"
-#include "adblockrule_parse.h"
-
-// adblock format documentation
-// https://adblockplus.org/filters
-
-// QString::mid(pos, len) const - Returns a string starting at the specified position index.
-// QString::chop(len) - Removes n characters from the end of the string.
-// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
-// QString::trimmed() const - Remove whitespace from start and end
-
-AdBlockRule *parseRule_adblock(const QString &filter)
-{
- QString parsedLine = filter.trimmed();
-
- // there is no rule, or it's a comment
- if(parsedLine.isEmpty() || parsedLine.startsWith("!")) {
- return nullptr;
- }
-
- // css rule -> filterleaves cannot do element blocking
- if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) {
- return nullptr;
- }
-
- // exception rules
- FilterLeaf::Action action = FilterLeaf::Block;
- if(parsedLine.startsWith(QLatin1Literal("@@"))) {
- action = FilterLeaf::Allow;
- parsedLine.remove(0, 2);
- }
-
- // parse options
- QStringList enabledOn, disabledOn;
- QHash<QWebEngineUrlRequestInfo::ResourceType, bool> optionsHash;
- {
- const int sepPos = parsedLine.indexOf(QLatin1Literal("$"));
- if(sepPos != -1) {
- const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(","));
- parsedLine = parsedLine.mid(0, sepPos);
-
- for(const QString &option : options) {
- if(option.startsWith(QLatin1Literal("domain"))) {
- const auto domainList = option.mid(7).split(QLatin1Literal("|"));
-
- for(const QString &domain : domainList) {
- if(domain.startsWith(QLatin1Literal("~"))) {
- disabledOn.append(domain.mid(1));
- } else {
- enabledOn.append(domain);
- }
- }
- } else {
- const auto pair = parseOption(option);
- if(pair)
- optionsHash.insert(pair.value().first, pair.value().second);
- }
- }
- }
- }
-
- FilterLeaf::UrlMatchType matchType = FilterLeaf::InvalidMatch;
-
- if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
- // regular expression rule
- matchType = FilterLeaf::RegularExpressionMatch;
- parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
-
- } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
- matchType = FilterLeaf::DomainMatch;
- parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
-
- } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
- // string equals rule
- matchType = FilterLeaf::StringEquals;
- parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
-
- } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
- // string starts with rule
- matchType = FilterLeaf::StringStartsWith;
- parsedLine = parsedLine.mid(2);
-
- } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
- // string ends with rule
- matchType = FilterLeaf::StringEndsWith;
- parsedLine.chop(1);
-
- } else {
- // generic contains rule
- matchType = FilterLeaf::StringContains;
-
- // Basic filter rules can use wildcards, which were supported by QRegExp,
- // but were deprecated in QRegularExpression.
-
- // remove beginning and ending wildcards
- if(parsedLine.startsWith(QLatin1Literal("*")))
- parsedLine = parsedLine.mid(1);
-
- if(parsedLine.endsWith(QLatin1Literal("*")))
- parsedLine.chop(1);
-
- if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) {
- // check for wildcards and translate to regexp
- // wildcard "*" - any number of characters
- // separator "^" - end, ? or /
- parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://"));
- parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|"));
- parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*"));
- parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)"));
-
- matchType = FilterLeaf::RegularExpressionMatch;
- }
- }
-
- AdBlockRule *rule = new AdBlockRule(matchType, parsedLine, action);
- rule->mergeOptions(optionsHash);
- return rule;
-}
-
-std::optional<QPair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseOption(const QString &option)
-{
- const bool exception = !option.startsWith(QLatin1Literal("~"));
-
- if(option.endsWith(QLatin1Literal("script"))) {
- // external scripts loaded via HTML script tag
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception);
-
- } else if(option.endsWith(QLatin1Literal("image"))) {
- // regular images, typically loaded via HTML img tag
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception);
-
- } else if(option.endsWith(QLatin1Literal("stylesheet"))) {
- // external CSS stylesheet files
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception);
-
- } else if(option.endsWith(QLatin1Literal("object"))) {
- // content handled by browser plugins, e.g. Flash or Java
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception);
-
- } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) {
- // requests started using the XMLHttpRequest object or fetch() API
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception);
-
- } else if(option.endsWith(QLatin1Literal("object-subrequest"))) {
- // requests started by plugins like Flash
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception);
-
- } else if(option.endsWith(QLatin1Literal("subdocument"))) {
- // embedded pages, usually included via HTML frames
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception);
-
- } else if(option.endsWith(QLatin1Literal("ping"))) {
- // requests started by <a ping> or navigator.sendBeacon()
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception);
-
- } else if(option.endsWith(QLatin1Literal("websocket"))) {
- // requests initiated via WebSocket object
- qDebug("Resource type 'websocket' not available");
-
- } else if(option.endsWith(QLatin1Literal("webrtc"))) {
- // connections opened via RTCPeerConnection instances to ICE servers
- qDebug("Resource type 'webrtc' not available");
-
- } else if(option.endsWith(QLatin1Literal("document"))) {
- // the page itself
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception);
-
- } else if(option.endsWith(QLatin1Literal("other"))) {
- return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception);
- }
-
- return std::nullopt;
-}
diff --git a/lib/urlfilter/formats/hostlistrule.cpp b/lib/urlfilter/formats/hostlistrule.cpp
deleted file mode 100644
index ad2c2a6..0000000
--- a/lib/urlfilter/formats/hostlistrule.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#include "hostlistrule.h"
-
-HostlistRule::HostlistRule(const QString &domain, const QString &redirect)
-{
- this->m_isBlocking = (redirect == QLatin1Literal("0.0.0.0"));
- this->m_request = domain;
- this->m_redirect = redirect;
-}
-
-bool HostlistRule::match(const QUrl &requestUrl) const
-{
- //qDebug("checking [%s] against [%s]", qUtf8Printable(requestUrl.host()), qUtf8Printable(m_request));
- return (m_request == requestUrl.host());
-}
-
-std::pair<FilterLeaf::Action, QVariant> HostlistRule::action() const
-{
- if(m_isBlocking)
- return std::make_pair(FilterLeaf::Block, QVariant());
- return std::make_pair(FilterLeaf::Redirect, QVariant(m_redirect));
-}
diff --git a/lib/urlfilter/formats/hostlistrule.h b/lib/urlfilter/formats/hostlistrule.h
deleted file mode 100644
index 58ec690..0000000
--- a/lib/urlfilter/formats/hostlistrule.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * This file is part of smolbote. It's copyrighted by the contributors recorded
- * in the version control history of the file, available from its original
- * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
- *
- * SPDX-License-Identifier: GPL-3.0
- */
-
-#ifndef SMOLBOTE_HOSTLIST_RULE_H
-#define SMOLBOTE_HOSTLIST_RULE_H
-
-#include "../filterleaf.h"
-#include <QString>
-
-class HostlistRule : public FilterLeaf
-{
-public:
- explicit HostlistRule(const QString &domain, const QString &redirect);
-
- bool match(const QUrl &requestUrl) const override;
- std::pair<FilterLeaf::Action, QVariant> action() const override;
-
-private:
- QString m_redirect;
-};
-
-#endif // SMOLBOTE_HOSTLIST_RULE_H
diff --git a/lib/urlfilter/hostlist/hostlist.cpp b/lib/urlfilter/hostlist/hostlist.cpp
new file mode 100644
index 0000000..ec0b214
--- /dev/null
+++ b/lib/urlfilter/hostlist/hostlist.cpp
@@ -0,0 +1,79 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "hostlist.h"
+#include <QIODevice>
+#include <QTextStream>
+#include <QDebug>
+
+HostList::HostList(QIODevice *device)
+{
+    // the caller has to hand over a device that is already open
+    Q_ASSERT(device->isOpen());
+
+    // feed the list to the parser one line at a time
+    for(QTextStream stream(device); !stream.atEnd();) {
+        parseLine(stream.readLine());
+    }
+
+    qDebug() << m_metadata;
+}
+
+QString HostList::metadata(const QString& key) const
+{
+    // unknown keys yield a default-constructed QString
+    const auto entry = m_metadata.constFind(key);
+    if(entry == m_metadata.constEnd())
+        return QString();
+    return entry.value();
+}
+
+int HostList::ruleCount() const
+{
+    // std::vector reports its size as an unsigned type; narrow it explicitly for the int return value
+    const auto count = rules.size();
+    return static_cast<int>(count);
+}
+
+std::pair<UrlFilter::MatchResult, QString> HostList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    // a host list only looks at the host of the requested URL
+    Q_UNUSED(firstParty);
+    Q_UNUSED(type);
+
+    const uint requestHash = qHash(requestUrl.host());
+
+    // rules keep only the hash of their domain, so the comparison is hash against hash;
+    // the first rule with an equal hash decides the result
+    for(auto it = rules.cbegin(); it != rules.cend(); ++it) {
+        if(it->domainHash != requestHash)
+            continue;
+        return std::make_pair(it->action, it->redirect);
+    }
+
+    return std::make_pair(UrlFilter::NotMatched, QString());
+}
+
+/** Parse one line of a hosts-format list and append the rules it contains.
+ *
+ *  A rule line has the form "<address> <host> [<host> ...]". The address
+ *  "0.0.0.0" turns every listed host into a Block rule; any other address
+ *  creates Redirect rules that carry the address as redirect target.
+ *  Blank lines, comment lines and lines without a host are ignored.
+ *
+ *  @param line raw line as read from the list, without the line terminator
+ */
+void HostList::parseLine(const QString& line)
+{
+    // simplified() strips leading/trailing whitespace and collapses every run of
+    // whitespace (tabs included) into a single space, so the split below can never
+    // produce empty entries
+    const QString parsedLine = line.simplified();
+
+    // blank line, or comment (checked after trimming so indented comments are skipped too)
+    if(parsedLine.isEmpty() || parsedLine.startsWith(QLatin1Literal("#")))
+        return;
+
+    const QStringList parts = parsedLine.split(QLatin1Literal(" "));
+
+    // malformed rule: an address without any host
+    if(parts.size() < 2)
+        return;
+
+    const QString redirect = parts.at(0);
+    const auto action = (redirect == QLatin1Literal("0.0.0.0")) ? UrlFilter::Block : UrlFilter::Redirect;
+
+    for(int i = 1; i < parts.size(); i++) {
+        const QString &domain = parts.at(i);
+
+        // everything from a '#' onwards is a trailing comment, not a host
+        if(domain.startsWith(QLatin1Literal("#")))
+            break;
+
+        Rule r;
+        r.action = action;
+        r.domainHash = qHash(domain);
+        if(action == UrlFilter::Redirect)
+            r.redirect = redirect;
+
+        rules.emplace_back(std::move(r));
+    }
+}
+
diff --git a/lib/urlfilter/hostlist/hostlist.h b/lib/urlfilter/hostlist/hostlist.h
new file mode 100644
index 0000000..d4a8d87
--- /dev/null
+++ b/lib/urlfilter/hostlist/hostlist.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_URLFILTER_HOSTLIST
+#define SMOLBOTE_URLFILTER_HOSTLIST
+
+#include "urlfilter.h"
+#include <QHash>
+#include <vector>
+#include <QWebEngineUrlRequestInfo>
+
+class QIODevice;
+/** UrlFilter implementation backed by a hosts-format list
+ *  ("<address> <host> [<host> ...]" per line).
+ */
+class HostList : public UrlFilter
+{
+public:
+
+    /** Parse all rules from an already opened device.
+     *  The device is read inside the constructor only; no pointer to it is kept.
+     */
+    HostList(QIODevice *device);
+    ~HostList() override = default;
+
+    /// Value stored for a metadata key, or an empty string when the key is unknown.
+    QString metadata(const QString &key) const override;
+    /// Number of rules that were parsed from the list.
+    int ruleCount() const;
+    /// Look up the host of requestUrl; firstParty and type are not used by this filter.
+    std::pair<MatchResult, QString> match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override;
+
+protected:
+    /// Parse a single line of the list and append the resulting rules.
+    void parseLine(const QString &line);
+
+private:
+    QHash<QString, QString> m_metadata;
+
+    struct Rule {
+        UrlFilter::MatchResult action = UrlFilter::NotMatched;
+        // qHash() of the host name; the host string itself is not stored
+        uint domainHash = 0;
+        // redirect target, only filled in for Redirect rules
+        QString redirect;
+    };
+
+    std::vector<Rule> rules;
+};
+
+#endif // SMOLBOTE_URLFILTER_HOSTLIST
diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/matcher.h
index 6be3cdf..6696958 100644
--- a/lib/urlfilter/formats/adblockrule.h
+++ b/lib/urlfilter/matcher.h
@@ -6,17 +6,24 @@
* SPDX-License-Identifier: GPL-3.0
*/
-#ifndef SMOLBOTE_ADBLOCKRULE_H
-#define SMOLBOTE_ADBLOCKRULE_H
+#ifndef SMOLBOTE_URLFILTER_MATCHER
+#define SMOLBOTE_URLFILTER_MATCHER
-#include "../filterleaf.h"
-#include <optional>
+#include <QUrl>
+#include <QString>
+#include <utility>
#include <QRegularExpression>
#include <QStringMatcher>
+#include <QWebEngineUrlRequestInfo>
+/** An interface class so we can use templated ContentsMatcher interchangeably
+ */
class Matcher
{
public:
+ virtual ~Matcher() = default;
+
+ virtual void setCaseSensitive(bool matchCase) = 0;
virtual bool hasMatch(const QString &where) const = 0;
};
@@ -24,12 +31,10 @@ template <typename T>
class ContentsMatcher : public Matcher
{
public:
- ContentsMatcher(const QString &pattern, FilterLeaf::UrlMatchType matchType)
+ ContentsMatcher(const QString &pattern, UrlFilter::MatchType type)
+ : patternLength(pattern.length())
+ , matchType(type)
{
- this->matchType = matchType;
- patternLength = pattern.length();
-
-
if constexpr(std::is_same_v<T, QRegularExpression>) {
matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption);
matcher.setPattern(pattern);
@@ -38,7 +43,19 @@ public:
matcher.setPattern(pattern);
} else if constexpr(std::is_same_v<T, QString>) {
matcher = QUrl::fromUserInput(pattern).host();
-// qDebug("matcher: %s", qUtf8Printable(matcher));
+ }
+ }
+ ~ContentsMatcher() = default;
+
+ void setCaseSensitive(bool matchCase) override
+ {
+ if constexpr(std::is_same_v<T, QRegularExpression>) {
+ auto options = matcher.patternOptions();
+ options.setFlag(QRegularExpression::CaseInsensitiveOption, !matchCase);
+ matcher.setPatternOptions(options);
+
+ } else if constexpr(std::is_same_v<T, QStringMatcher>) {
+ matcher.setCaseSensitivity(matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive);
}
}
@@ -46,68 +63,47 @@ public:
{
if constexpr(std::is_same_v<T, QStringMatcher>) {
switch (matchType) {
- case FilterLeaf::InvalidMatch:
- case FilterLeaf::RegularExpressionMatch:
- case FilterLeaf::DomainMatch:
+ case UrlFilter::InvalidMatch:
+ case UrlFilter::RegularExpressionMatch:
+ case UrlFilter::DomainMatch:
qWarning("ContentsMatcher is a String Matcher, but not doing string matching!");
return false;
- case FilterLeaf::StringContains:
+ case UrlFilter::StringContains:
return (matcher.indexIn(where) != -1);
- case FilterLeaf::StringStartsWith:
+ case UrlFilter::StringStartsWith:
return (matcher.indexIn(where) == 0);
- case FilterLeaf::StringEndsWith:
+ case UrlFilter::StringEndsWith:
return (matcher.indexIn(where) == where.length() - patternLength);
- case FilterLeaf::StringEquals:
+ case UrlFilter::StringEquals:
return (matcher.indexIn(where) == 0) && (patternLength == where.length());
}
} else if constexpr(std::is_same_v<T, QRegularExpression>) {
- if(matchType != FilterLeaf::RegularExpressionMatch)
+ if(matchType != UrlFilter::RegularExpressionMatch)
qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!");
return matcher.match(where).hasMatch();
} else if constexpr(std::is_same_v<T, QString>) {
// TODO: fix
- if(matchType == FilterLeaf::DomainMatch) {
+ if(matchType == UrlFilter::DomainMatch) {
// qDebug("matching %s", qUtf8Printable(QUrl(where).host()));
return QUrl(where).host().endsWith(matcher);
} else
return matcher == where;
- } else {
- qWarning("Matcher has no backend, returning false");
- return false;
}
+
+ qWarning("Matcher has no backend, returning false");
+ return false;
}
private:
- int patternLength;
+ const int patternLength;
+ const UrlFilter::MatchType matchType;
T matcher;
- FilterLeaf::UrlMatchType matchType;
};
-class AdBlockRule : public FilterLeaf
-{
-public:
- explicit AdBlockRule(FilterLeaf::UrlMatchType matchType, const QString &filter, FilterLeaf::Action action);
- ~AdBlockRule()
- {
- delete stringMatcher;
- delete regExp;
- };
-
- void mergeOptions(const QHash<QWebEngineUrlRequestInfo::ResourceType, bool> &options);
-
- bool match(const QUrl &requestUrl) const override;
- bool match(const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const;
- std::pair<FilterLeaf::Action, QVariant> action() const override;
-
-private:
- /* Once C++20 comes out, perhaps this can be replaced with a concept template */
- QStringMatcher *stringMatcher = nullptr;
- QRegExp *regExp = nullptr;
-};
+#endif // SMOLBOTE_URLFILTER_MATCHER
-#endif // SMOLBOTE_ADBLOCKRULE_H
diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build
index 1f4f47c..b017eb5 100644
--- a/lib/urlfilter/meson.build
+++ b/lib/urlfilter/meson.build
@@ -1,19 +1,26 @@
-urlfilter_inc = include_directories('.')
-
-urlfilter_moc = qt5.preprocess(
- moc_headers: 'filtertree.h',
- dependencies: dep_qt5
-)
-
urlfilter_lib = static_library('urlfilter',
- ['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc,
- 'domain.cpp', 'domain.h',
- 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp',
- 'formats/adblocklist.cpp'],
+ ['urlfilter.h', 'matcher.h',
+ 'hostlist/hostlist.cpp', 'hostlist/hostlist.h',
+ 'adblock/adblocklist.cpp', 'adblock/adblocklist.h', 'adblock/parser.cpp', 'adblock/parser.h'],
dependencies: dep_qt5
)
dep_urlfilter = declare_dependency(
- include_directories: urlfilter_inc,
+ include_directories: include_directories('.'),
link_with: urlfilter_lib
)
+
+if get_option('testing').enabled()
+ test('urlfilter: matcher',
+ executable('urlfilter-matcher', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/matcher.cpp']),
+ workdir: meson.current_source_dir() / 'test'
+ )
+ test('urlfilter: host list',
+ executable('urlfilter-hostlist', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/hostlist.cpp']),
+ workdir: meson.current_source_dir() / 'test'
+ )
+ test('urlfilter: adblock list',
+ executable('urlfilter-adblocklist', dependencies: [dep_qt5, dep_gtest, dep_urlfilter], sources: ['test/adblock.cpp']),
+ workdir: meson.current_source_dir() / 'test'
+ )
+endif
diff --git a/lib/urlfilter/test/adblock.cpp b/lib/urlfilter/test/adblock.cpp
new file mode 100644
index 0000000..ecb94ee
--- /dev/null
+++ b/lib/urlfilter/test/adblock.cpp
@@ -0,0 +1,88 @@
+#include "urlfilter.h"
+#include "adblock/adblocklist.h"
+#include <gtest/gtest.h>
+#include <QFile>
+
+AdBlockList *list = nullptr;
+
+TEST(AdBlockList, MetaData) { // expected values mirror the "! Key: value" header lines at the top of adblock.txt
+ EXPECT_STREQ(qUtf8Printable(list->metadata("Homepage")), "http://example.com/");
+ EXPECT_STREQ(qUtf8Printable(list->metadata("Title")), "FooList");
+ EXPECT_STREQ(qUtf8Printable(list->metadata("Expires")), "5 days");
+ EXPECT_STREQ(qUtf8Printable(list->metadata("Redirect")), "http://example.com/list.txt");
+ EXPECT_STREQ(qUtf8Printable(list->metadata("Version")), "1234");
+}
+
+TEST(AdBlockList, BasicFilter) {
+ // Rule: /banner/*/img^ -- first group: image requests that must be blocked
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/img"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/bar/img?param"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner//img/foo"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::Block);
+ // second group: near misses of the same rule that must not match
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo.png"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/img"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/imgraph"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/foo/img.gif"), QWebEngineUrlRequestInfo::ResourceTypeImage).first, UrlFilter::NotMatched);
+ // this request is issued with ResourceTypeMainFrame instead of ResourceTypeImage
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner/ads/img.png"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+TEST(AdBlockList, MatchBeginningEnd) {
+ // Rule: |http://beginning-pattern.com -- NOTE(review): test/adblock.txt spells this rule "||http://beginning-pattern.com" (two bars); confirm which spelling is intended
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://beginning-pattern.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("https://beginning-pattern.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+ // Rule: end-pattern| -- the pattern must be the very end of the URL; a longer path must not match
+ EXPECT_EQ(list->match(QUrl(), QUrl("https://endpattern.com/end-pattern"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("https://endpattern.com/end-pattern/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+TEST(AdBlockList, Domain) {
+ // Rule: ||ads.example.com^ -- the host itself, a subdomain of it and the host with a port are expected to be blocked
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://ads.example.com/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://server1.ads.example.com/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("https://ads.example.com:8000/"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ // a longer host that merely starts with the pattern, and the pattern appearing only in the path, must not match
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://ads.example.com.ua/foo.gif"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/redirect/http://ads.example.com/"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+TEST(AdBlockList, RegularExpression) {
+ // Rule: /banner\d+/ -- "banner" followed by digits is expected to be blocked, "banners" is not
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner123"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banner321"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://example.com/banners"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+TEST(AdBlockList, MatchCase) {
+ // Rule: matchThisCase$match-case -- only the exact capitalisation is expected to be blocked
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://matchcase.com/matchThisCase"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl(), QUrl("http://matchcase.com/MatchThisCase"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+TEST(AdBlockList, DomainOption) {
+ // Rule: domain-limited-string$domain=example.com -- expected to block only when the first-party URL is on example.com
+ EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/domain-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/another-domain-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+ EXPECT_EQ(list->match(QUrl("https://another.com"), QUrl("https://example.com/domain-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+
+ // Rule: exception-limited-string$domain=~example.com -- expected to block for a first party other than example.com
+ EXPECT_EQ(list->match(QUrl("https://another.com"), QUrl("https://example.com/exception-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+ EXPECT_EQ(list->match(QUrl("https://example.com"), QUrl("https://example.com/exception-limited-string/foo"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+// Test driver: parses adblock.txt from the working directory into the shared
+// list fixture, runs every registered test and releases the fixture again.
+int main(int argc, char **argv) {
+    QFile f("adblock.txt");
+    if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) {
+        qDebug("Could not open list");
+        return -1;
+    }
+
+    list = new AdBlockList(&f);
+    f.close();
+
+    qDebug("Parsed %i rules", list->ruleCount());
+
+    testing::InitGoogleTest(&argc, argv);
+    const int result = RUN_ALL_TESTS();
+
+    // the fixture was allocated with new above; free it so leak checkers stay quiet
+    delete list;
+    list = nullptr;
+
+    return result;
+}
+
diff --git a/lib/urlfilter/test/adblock.txt b/lib/urlfilter/test/adblock.txt
new file mode 100644
index 0000000..635ce09
--- /dev/null
+++ b/lib/urlfilter/test/adblock.txt
@@ -0,0 +1,26 @@
+! Homepage: http://example.com/
+! Title: FooList
+! Expires: 5 days
+! Redirect: http://example.com/list.txt
+! Version: 1234
+
+/banner/*/img^
+||ads.example.com^
+|http://example.com/|
+/banner\d+/
+
+! match beginning
+||http://beginning-pattern.com
+! match end
+end-pattern|
+
+! options
+! match-case
+matchThisCase$match-case
+
+! domain limiting
+! only apply this filter on this domain
+domain-limited-string$domain=example.com
+! apply this filter to all domains but the listed one
+exception-limited-string$domain=~example.com
+
diff --git a/lib/urlfilter/test/hostlist.cpp b/lib/urlfilter/test/hostlist.cpp
new file mode 100644
index 0000000..041cd5f
--- /dev/null
+++ b/lib/urlfilter/test/hostlist.cpp
@@ -0,0 +1,34 @@
+#include <gtest/gtest.h>
+#include "hostlist/hostlist.h"
+#include <QFile>
+
+HostList *list = nullptr;
+
+// Exercises the rules of hostlist.txt: hosts listed under 0.0.0.0 are blocked,
+// the host listed under 127.0.0.1 is redirected to that address, and a host
+// that is not listed is not matched.
+// The suite is named HostList so the results are not reported under the AdBlockList suite.
+TEST(HostList, Block) {
+    EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.com"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+    EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.first"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+    EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("blockeddomain.second"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::Block);
+
+    const auto r = list->match(QUrl(), QUrl::fromUserInput("localhost.localdomain"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame);
+    EXPECT_EQ(r.first, UrlFilter::Redirect);
+    EXPECT_EQ(r.second, QString("127.0.0.1"));
+
+    EXPECT_EQ(list->match(QUrl(), QUrl::fromUserInput("other.domain"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame).first, UrlFilter::NotMatched);
+}
+
+// Test driver: parses hostlist.txt from the working directory into the shared
+// list fixture, runs every registered test and releases the fixture again.
+int main(int argc, char **argv) {
+    QFile f("hostlist.txt");
+    if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) {
+        qDebug("Could not open list");
+        return -1;
+    }
+
+    list = new HostList(&f);
+    f.close();
+
+    qDebug("Parsed %i rules", list->ruleCount());
+
+    testing::InitGoogleTest(&argc, argv);
+    const int result = RUN_ALL_TESTS();
+
+    // the fixture was allocated with new above; free it so leak checkers stay quiet
+    delete list;
+    list = nullptr;
+
+    return result;
+}
+
diff --git a/lib/urlfilter/test/hostlist.txt b/lib/urlfilter/test/hostlist.txt
new file mode 100644
index 0000000..a0b4e5c
--- /dev/null
+++ b/lib/urlfilter/test/hostlist.txt
@@ -0,0 +1,6 @@
+# This is a comment, and after it comes a blank line
+
+127.0.0.1 localhost.localdomain
+
+0.0.0.0 blockeddomain.com
+0.0.0.0 blockeddomain.first blockeddomain.second
diff --git a/lib/urlfilter/test/matcher.cpp b/lib/urlfilter/test/matcher.cpp
new file mode 100644
index 0000000..1c1efbf
--- /dev/null
+++ b/lib/urlfilter/test/matcher.cpp
@@ -0,0 +1,42 @@
+#include "urlfilter.h"
+#include "matcher.h"
+#include <gtest/gtest.h>
+
+TEST(Matcher, StringContains) { // substring search: one subject that contains the pattern, one that does not
+ ContentsMatcher<QStringMatcher> matcher("spam-pattern", UrlFilter::StringContains);
+ EXPECT_TRUE(matcher.hasMatch("this string contains a spam-pattern"));
+ EXPECT_FALSE(matcher.hasMatch("this string does not contain the pattern"));
+}
+
+TEST(Matcher, StringStartsWith) { // the pattern must sit at index 0; a later occurrence or no occurrence at all must fail
+ ContentsMatcher<QStringMatcher> matcher("beginning", UrlFilter::StringStartsWith);
+ EXPECT_TRUE(matcher.hasMatch("beginning this string is the pattern"));
+ EXPECT_FALSE(matcher.hasMatch("ending this string is the pattern, the word beginning"));
+ EXPECT_FALSE(matcher.hasMatch("this would be a string where the pattern cannot be found"));
+}
+
+TEST(Matcher, StringEndsWith) { // NOTE(review): neither subject contains the pattern more than once, and none is shorter than the pattern; both cases are worth adding because StringEndsWith compares the index returned by indexIn against length - patternLength
+ ContentsMatcher<QStringMatcher> matcher("ending", UrlFilter::StringEndsWith);
+ EXPECT_TRUE(matcher.hasMatch("this string has the proper ending"));
+ EXPECT_FALSE(matcher.hasMatch("and this string doesn't"));
+}
+
+TEST(Matcher, StringEquals) { // covers the exact string, a different string of the same length, and a longer string that ends with the pattern
+ ContentsMatcher<QStringMatcher> matcher("string-to-match", UrlFilter::StringEquals);
+ EXPECT_TRUE(matcher.hasMatch("string-to-match"));
+ EXPECT_FALSE(matcher.hasMatch("same-len-string"));
+ EXPECT_FALSE(matcher.hasMatch("not the string-to-match"));
+}
+
+TEST(Matcher, RegularExpression) { // "banner" followed by at least one digit must match; "banners" has no digit and must not
+ ContentsMatcher<QRegularExpression> matcher("banner\\d+", UrlFilter::RegularExpressionMatch);
+ EXPECT_TRUE(matcher.hasMatch("http://another.com/banner123"));
+ EXPECT_TRUE(matcher.hasMatch("http://another.com/banner321"));
+ EXPECT_FALSE(matcher.hasMatch("http://another.com/banners"));
+
+}
+
+int main(int argc, char **argv) {
+    // hand the command line to googletest first, then run every registered test
+    testing::InitGoogleTest(&argc, argv);
+    const int exitCode = RUN_ALL_TESTS();
+    return exitCode;
+}
diff --git a/lib/urlfilter/urlfilter.h b/lib/urlfilter/urlfilter.h
new file mode 100644
index 0000000..e15122a
--- /dev/null
+++ b/lib/urlfilter/urlfilter.h
@@ -0,0 +1,43 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_URLFILTER_FILTER
+#define SMOLBOTE_URLFILTER_FILTER
+
+#include <QUrl>
+#include <QString>
+#include <utility>
+#include <QWebEngineUrlRequestInfo>
+
+/** Interface shared by the URL filter list implementations (e.g. HostList).
+ */
+class UrlFilter
+{
+public:
+    /// Outcome of matching a request against a filter list.
+    enum MatchResult {
+        NotMatched, ///< no rule applied to the request
+        Allow,      ///< NOTE(review): presumably an explicit exception rule; not produced by HostList -- confirm
+        Block,      ///< the request should be blocked
+        Redirect    ///< the request should be redirected; HostList returns the redirect address as the pair's second member
+    };
+
+    /// How a rule pattern is compared against a URL (consumed by ContentsMatcher in matcher.h).
+    enum MatchType {
+        InvalidMatch,
+        RegularExpressionMatch,
+        StringContains,
+        StringStartsWith,
+        StringEndsWith,
+        StringEquals,
+        DomainMatch
+    };
+
+    virtual ~UrlFilter() = default;
+
+    /// Look up a metadata entry of the list by its key.
+    virtual QString metadata(const QString &key) const = 0;
+
+    /** Match a request against the rules of the list.
+     *  @param firstParty first-party URL the request belongs to (implementations may ignore it)
+     *  @param requestUrl URL that is being requested
+     *  @param type       resource type of the request (implementations may ignore it)
+     *  @return the match result, plus a string whose meaning depends on the result
+     */
+    virtual std::pair<MatchResult, QString> match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const = 0;
+};
+
+#endif // SMOLBOTE_URLFILTER_FILTER