aboutsummaryrefslogtreecommitdiff
path: root/lib/urlfilter/adblock
diff options
context:
space:
mode:
Diffstat (limited to 'lib/urlfilter/adblock')
-rw-r--r--lib/urlfilter/adblock/adblocklist.cpp188
-rw-r--r--lib/urlfilter/adblock/adblocklist.h42
-rw-r--r--lib/urlfilter/adblock/parser.cpp75
-rw-r--r--lib/urlfilter/adblock/parser.h14
4 files changed, 319 insertions, 0 deletions
diff --git a/lib/urlfilter/adblock/adblocklist.cpp b/lib/urlfilter/adblock/adblocklist.cpp
new file mode 100644
index 0000000..c749e9e
--- /dev/null
+++ b/lib/urlfilter/adblock/adblocklist.cpp
@@ -0,0 +1,188 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "adblocklist.h"
+#include "parser.h"
+#include <QIODevice>
+#include <QTextStream>
+#include <QDebug>
+
+/*
+ * Build the list from an already opened device: every line read from it is
+ * handed to parseLine(), then the collected metadata is written to the debug log.
+ */
+AdBlockList::AdBlockList(QIODevice *device)
+{
+    Q_ASSERT(device->isOpen());
+
+    QTextStream stream(device);
+    for(; !stream.atEnd(); ) {
+        const QString rawLine = stream.readLine();
+        parseLine(rawLine);
+    }
+
+    qDebug() << m_metadata;
+}
+
+// Frees the matcher object held by each stored rule.
+AdBlockList::~AdBlockList()
+{
+    for(auto it = rules.begin(); it != rules.end(); ++it) {
+        delete it->matcher;
+    }
+}
+
+// Returns the metadata value stored under key, or an empty QString when the key is absent.
+QString AdBlockList::metadata(const QString& key) const
+{
+    const auto found = m_metadata.constFind(key);
+    if(found == m_metadata.constEnd())
+        return QString();
+
+    return found.value();
+}
+
+// Number of rules currently stored in this list.
+int AdBlockList::ruleCount() const
+{
+    const auto count = rules.size();
+    return static_cast<int>(count);
+}
+
+/*
+ * Check a request against the stored rules, in the order they were parsed.
+ *
+ * firstParty  - url of the page issuing the request; its host is compared
+ *               against the rule's enabledOn/disabledOn domain lists
+ * requestUrl  - url being requested; its string form is given to the matcher
+ * type        - resource type of the request
+ *
+ * Returns the action of the first rule that applies and matches (with an empty
+ * string), or UrlFilter::NotMatched when no rule does.
+ */
+std::pair<UrlFilter::MatchResult, QString> AdBlockList::match(const QUrl& firstParty, const QUrl& requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    const QString domain = firstParty.host();
+    const QString request = requestUrl.toString();
+
+    for(const Rule &r : rules) {
+        // Resource type options: the stored flag is true for "type" and false
+        // for the inverse form "~type", so it has to be honoured, not just the key.
+        if(!r.options.isEmpty()) {
+            const auto listed = r.options.constFind(type);
+
+            if(listed != r.options.constEnd()) {
+                // explicitly excluded with "~type"
+                if(!listed.value())
+                    continue;
+            } else {
+                // type is not listed: the rule is restricted to other types as
+                // soon as one non-inverse option is present; a rule carrying
+                // only inverse options still applies to every unlisted type
+                bool restrictedToOtherTypes = false;
+                for(auto it = r.options.constBegin(); it != r.options.constEnd(); ++it) {
+                    if(it.value()) {
+                        restrictedToOtherTypes = true;
+                        break;
+                    }
+                }
+                if(restrictedToOtherTypes)
+                    continue;
+            }
+        }
+
+        if(r.disabledOn.contains(domain))
+            continue;
+
+        if(!r.enabledOn.isEmpty() && !r.enabledOn.contains(domain))
+            continue;
+
+        if(r.matcher->hasMatch(request))
+            return std::make_pair(r.action, QString());
+    }
+
+    return std::make_pair(UrlFilter::NotMatched, QString());
+}
+
+/*
+ * Parse a single line of the list and store what it describes.
+ *
+ *  - empty lines are ignored
+ *  - "!" comment lines are inspected for "key: value" metadata; the value is
+ *    kept in m_metadata when the key is one of `keys`
+ *  - element hiding rules ("##", "#@#") are logged and skipped
+ *  - anything else becomes a Rule appended to `rules`: "@@" marks an Allow
+ *    rule, the part after "$" is parsed as options, and the remaining pattern
+ *    selects the kind of matcher
+ */
+void AdBlockList::parseLine(const QString& line)
+{
+    QString parsedLine = line.trimmed();
+
+    if(parsedLine.isEmpty())
+        return;
+
+    if(parsedLine.startsWith(QLatin1Literal("!"))) {
+        const auto comment = parseComment(parsedLine);
+
+        if(comment) {
+            const auto key = comment.value().first;
+            if(keys.contains(key))
+                m_metadata[key] = comment.value().second;
+        }
+
+        return;
+    }
+
+    // css rule -> filterleaves cannot do element blocking
+    if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) {
+        qDebug("TODO: %s", qUtf8Printable(parsedLine));
+        return;
+    }
+
+    Rule r;
+    r.action = UrlFilter::Block;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
+        r.action = UrlFilter::Allow;
+        parsedLine.remove(0, 2);
+    }
+
+    bool matchCase = false;
+
+    // parse options
+    {
+        const int sepPos = parsedLine.indexOf(QLatin1Literal("$"));
+        if(sepPos != -1) {
+            const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(","));
+            parsedLine = parsedLine.mid(0, sepPos);
+
+            for(const QString &option : options) {
+                // match the "=" as well: the list is sliced right after it
+                const QLatin1Literal domainPrefix("domain=");
+
+                if(option.startsWith(domainPrefix)) {
+                    const auto domainList = option.mid(domainPrefix.size()).split(QLatin1Literal("|"));
+
+                    for(const QString &domain : domainList) {
+                        // "domain=" or a trailing "|" yields empty entries; storing one in
+                        // enabledOn would restrict the rule to requests with an empty host
+                        if(domain.isEmpty())
+                            continue;
+
+                        if(domain.startsWith(QLatin1Literal("~"))) {
+                            r.disabledOn.append(domain.mid(1));
+                        } else {
+                            r.enabledOn.append(domain);
+                        }
+                    }
+                } else if(option.endsWith(QLatin1Literal("match-case"))) {
+                    matchCase = !option.startsWith(QLatin1Literal("~"));
+
+                } else {
+                    const auto pair = parseResourceOption(option);
+                    if(pair)
+                        r.options.insert(pair.value().first, pair.value().second);
+                }
+            }
+        }
+    }
+
+    if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
+        // regular expression rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        r.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, UrlFilter::RegularExpressionMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
+        // domain rule
+        parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
+        r.matcher = new ContentsMatcher<QString>(parsedLine, UrlFilter::DomainMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string equals rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringEquals);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
+        // string starts with rule
+        parsedLine = parsedLine.mid(2);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringStartsWith);
+
+    } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string ends with rule
+        parsedLine.chop(1);
+        r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringEndsWith);
+
+    } else {
+        // generic contains rule
+
+        // remove beginning and ending wildcards
+        if(parsedLine.startsWith(QLatin1Literal("*")))
+            parsedLine = parsedLine.mid(1);
+
+        if(parsedLine.endsWith(QLatin1Literal("*")))
+            parsedLine.chop(1);
+
+        if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) {
+            // check for wildcards and translate to regexp
+            // wildcard "*" - any number of characters
+            // separator "^" - end, ? or /
+            //
+            // Escape the whole pattern first so that regexp metacharacters in
+            // the filter text ('.', '?', '+', ...) are matched literally, then
+            // translate the filter syntax from its escaped form. The "^" anchor
+            // is inserted last, so the separator replacement cannot clobber it.
+            // A single "|" stays as the escaped literal produced by escape().
+            parsedLine = QRegularExpression::escape(parsedLine);
+            parsedLine.replace(QLatin1Literal("\\*"), QLatin1Literal(".*"));
+            parsedLine.replace(QLatin1Literal("\\^"), QLatin1Literal("($|\\?|\\/)"));
+            parsedLine.replace(QLatin1Literal("\\|\\|"), QLatin1Literal("^\\w+://"));
+
+            r.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, UrlFilter::RegularExpressionMatch);
+
+        } else {
+            r.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, UrlFilter::StringContains);
+        }
+    }
+
+    // verify the allocation before the pointer is used
+    Q_CHECK_PTR(r.matcher);
+    r.matcher->setCaseSensitive(matchCase);
+
+    rules.emplace_back(std::move(r));
+}
+
diff --git a/lib/urlfilter/adblock/adblocklist.h b/lib/urlfilter/adblock/adblocklist.h
new file mode 100644
index 0000000..ee41e11
--- /dev/null
+++ b/lib/urlfilter/adblock/adblocklist.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "urlfilter.h"
+#include "matcher.h"
+#include <QHash>
+#include <QWebEngineUrlRequestInfo>
+
+class QIODevice;
+/*
+ * UrlFilter implementation that reads an AdBlock-format list from a QIODevice
+ * and matches requests against the rules parsed from it.
+ */
+class AdBlockList : public UrlFilter
+{
+public:
+    // metadata keys that are kept when found in a comment line
+    // TODO: check if all keys are listed
+    const QStringList keys = { "Version", "Title", "Last modified", "Expires", "Homepage", "Licence", "Redirect" };
+
+    // device must already be open; it is read line by line until its end
+    AdBlockList(QIODevice *device);
+    ~AdBlockList();
+
+    // value stored for a metadata key, empty if the key was not found in the list
+    QString metadata(const QString &key) const override;
+    // number of parsed rules
+    int ruleCount() const;
+    // action of the first rule matching the request, or NotMatched
+    std::pair<MatchResult, QString> match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const override;
+
+protected:
+    void parseLine(const QString &line);
+
+private:
+    // The rules hold raw Matcher pointers that the destructor deletes; a copy
+    // of the list would delete the same matchers a second time.
+    Q_DISABLE_COPY(AdBlockList)
+
+    QHash<QString, QString> m_metadata;
+
+    struct Rule {
+        UrlFilter::MatchResult action = UrlFilter::NotMatched;
+        // deleted by ~AdBlockList(); never left indeterminate
+        Matcher *matcher = nullptr;
+        // hosts the rule is limited to / excluded from ("domain=" option)
+        QStringList enabledOn, disabledOn;
+        // resource type options; the value is false for the inverse form ("~type")
+        QHash<QWebEngineUrlRequestInfo::ResourceType, bool> options;
+    };
+
+    std::vector<Rule> rules;
+};
diff --git a/lib/urlfilter/adblock/parser.cpp b/lib/urlfilter/adblock/parser.cpp
new file mode 100644
index 0000000..1e7f0bc
--- /dev/null
+++ b/lib/urlfilter/adblock/parser.cpp
@@ -0,0 +1,75 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "parser.h"
+
+/*
+ * Extract a "key: value" pair from a comment line.
+ *
+ * line - the comment line including its leading marker character, which is
+ *        dropped before parsing
+ *
+ * Returns the trimmed key and value, split at the FIRST ": " so that a value
+ * which itself contains ": " is kept whole. Returns std::nullopt when the text
+ * after the marker contains no ": " separator.
+ */
+std::optional<std::pair<QString, QString>> parseComment(QString &line)
+{
+    const QLatin1Literal separator(": ");
+
+    // search after the marker has been removed, so a separator that overlaps
+    // the marker position is not mistaken for a real one
+    const QString body = line.mid(1);
+    const int sepPos = body.indexOf(separator);
+    if(sepPos == -1)
+        return std::nullopt;
+
+    return std::make_pair(body.left(sepPos).trimmed(), body.mid(sepPos + separator.size()).trimmed());
+}
+
+/*
+ * Translate a resource type option of a filter rule into the matching
+ * QWebEngineUrlRequestInfo::ResourceType.
+ *
+ * The bool of the returned pair is false when the option starts with "~" and
+ * true otherwise. Options without an available resource type are logged and
+ * yield std::nullopt.
+ */
+std::optional<std::pair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseResourceOption(const QString &option)
+{
+    const bool exception = !option.startsWith(QLatin1Literal("~"));
+
+    struct Mapping {
+        const char *suffix;
+        QWebEngineUrlRequestInfo::ResourceType type;
+    };
+
+    // Checked top to bottom; "subdocument" has to stay ahead of "document"
+    // because the option is compared by its ending.
+    static const Mapping known[] = {
+        // external scripts loaded via HTML script tag
+        { "script", QWebEngineUrlRequestInfo::ResourceTypeScript },
+        // regular images, typically loaded via HTML img tag
+        { "image", QWebEngineUrlRequestInfo::ResourceTypeImage },
+        // external CSS stylesheet files
+        { "stylesheet", QWebEngineUrlRequestInfo::ResourceTypeStylesheet },
+        // content handled by browser plugins, e.g. Flash or Java
+        { "object", QWebEngineUrlRequestInfo::ResourceTypeObject },
+        // requests started using the XMLHttpRequest object or fetch() API
+        { "xmlhttprequest", QWebEngineUrlRequestInfo::ResourceTypeXhr },
+        // requests started by plugins like Flash
+        { "object-subrequest", QWebEngineUrlRequestInfo::ResourceTypePluginResource },
+        // embedded pages, usually included via HTML frames
+        { "subdocument", QWebEngineUrlRequestInfo::ResourceTypeSubFrame },
+        // requests started by <a ping> or navigator.sendBeacon()
+        { "ping", QWebEngineUrlRequestInfo::ResourceTypePing },
+        // the page itself
+        { "document", QWebEngineUrlRequestInfo::ResourceTypeMainFrame },
+        { "other", QWebEngineUrlRequestInfo::ResourceTypeUnknown },
+    };
+
+    for(const Mapping &entry : known) {
+        if(option.endsWith(QLatin1Literal(entry.suffix)))
+            return std::make_pair(entry.type, exception);
+    }
+
+    if(option.endsWith(QLatin1Literal("websocket"))) {
+        // requests initiated via WebSocket object
+        qDebug("Resource type 'websocket' not available");
+
+    } else if(option.endsWith(QLatin1Literal("webrtc"))) {
+        // connections opened via RTCPeerConnection instances to ICE servers
+        qDebug("Resource type 'webrtc' not available");
+    }
+
+    qDebug("TODO: %s", qUtf8Printable(option));
+    return std::nullopt;
+}
diff --git a/lib/urlfilter/adblock/parser.h b/lib/urlfilter/adblock/parser.h
new file mode 100644
index 0000000..c73a9cf
--- /dev/null
+++ b/lib/urlfilter/adblock/parser.h
@@ -0,0 +1,14 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include <QWebEngineUrlRequestInfo>
+#include <optional>
+#include <utility>
+
+// Parses a comment line of the form "<marker>key: value" (the first character
+// is dropped). Returns the trimmed key and value, or std::nullopt when the
+// line contains no ": " separator.
+std::optional<std::pair<QString, QString>> parseComment(QString &line);
+// Maps a resource type option (e.g. "script", "~image") to its
+// QWebEngineUrlRequestInfo::ResourceType; the bool is false when the option
+// starts with "~". Returns std::nullopt for options that have no mapping.
+std::optional<std::pair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseResourceOption(const QString &option);