aboutsummaryrefslogtreecommitdiff
path: root/lib/urlfilter
diff options
context:
space:
mode:
authorAqua-sama <aqua@iserlohn-fortress.net>2018-10-01 16:43:18 +0200
committerAqua-sama <aqua@iserlohn-fortress.net>2018-10-02 11:47:49 +0200
commit7d8cbdb9941532cd5bf560b21395f6ed371d1ab5 (patch)
tree9c5a2d72a3882050f2c3c95ec2d15ad21ff98a93 /lib/urlfilter
parentupdater: windows fixes (diff)
downloadsmolbote-7d8cbdb9941532cd5bf560b21395f6ed371d1ab5.tar.xz
Split off UrlFilter into library
- add more adblock filter options
Diffstat (limited to 'lib/urlfilter')
-rw-r--r--lib/urlfilter/CMakeLists.txt15
-rw-r--r--lib/urlfilter/filterrule.cpp97
-rw-r--r--lib/urlfilter/filterrule.h55
-rw-r--r--lib/urlfilter/formats/adblockrule.cpp164
-rw-r--r--lib/urlfilter/formats/adblockrule.h23
5 files changed, 354 insertions, 0 deletions
diff --git a/lib/urlfilter/CMakeLists.txt b/lib/urlfilter/CMakeLists.txt
new file mode 100644
index 0000000..842f18f
--- /dev/null
+++ b/lib/urlfilter/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Automatically add the current source and build directories to the include path
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+# Instruct CMake to run moc automatically when needed.
+set(CMAKE_AUTOMOC ON)
+
+# URL filter library: the generic FilterRule and the adblock-format rule parser
+add_library(urlfilter
+ filterrule.cpp
+ filterrule.h
+
+ formats/adblockrule.cpp
+ formats/adblockrule.h
+)
+
+# filterrule.h includes QWebEngineUrlRequestInfo, hence the Qt WebEngine dependency
+target_link_libraries(urlfilter Qt5::WebEngineWidgets)
diff --git a/lib/urlfilter/filterrule.cpp b/lib/urlfilter/filterrule.cpp
new file mode 100644
index 0000000..22a2f06
--- /dev/null
+++ b/lib/urlfilter/filterrule.cpp
@@ -0,0 +1,97 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/smolbote.hg
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "filterrule.h"
+
+// Returns true if domain is the filter domain itself or one of its subdomains.
+// ex. example.com and subdomain.example.com are matched by example.com,
+//     example2.com and notexample.com are not
+inline bool isMatchingDomain(const QString &domain, const QString &filter)
+{
+    // domain and filter are the same
+    if(domain == filter) {
+        return true;
+    }
+
+    // an empty filter names no domain, and a domain can't be matched by a
+    // filter it doesn't end with
+    // ex. example2.com isn't matched by example.com
+    if(filter.isEmpty() || !domain.endsWith(filter)) {
+        return false;
+    }
+
+    // The filter is a proper suffix of the domain at this point, so this is
+    // where it starts (always > 0). Searching for the filter instead would
+    // find its first occurrence, which is the wrong one for
+    // ex. notexample.com.example.com
+    const int index = domain.length() - filter.length();
+
+    // match with subdomains: the character before the filter has to be '.'
+    // ex. subdomain.example.com is matched by example.com
+    return domain[index - 1] == QLatin1Char('.');
+}
+
+// Returns the m_isEnabled flag. It defaults to false and is set by the
+// rule parser once a line has been recognized as a rule.
+bool FilterRule::isEnabled() const
+{
+ return m_isEnabled;
+}
+
+// Returns the m_isBlocking flag: true for a blocking rule (the default),
+// false for an exception rule.
+bool FilterRule::isBlocking() const
+{
+ return m_isBlocking;
+}
+
+// Returns true if the rule applies to the domain with the given hash.
+// domainHash is compared against the hashes in blockedDomains_hashes and
+// allowedDomains_hashes:
+//  - a blocked domain never matches
+//  - if there are allowed domains, only those match
+//  - otherwise every domain that isn't blocked matches
+bool FilterRule::matchesDomain(uint domainHash) const
+{
+    // blockedDomains prevents the rule from being matched on those domains
+    if(blockedDomains_hashes.contains(domainHash)) {
+        return false;
+    }
+
+    // no domains have been allowed -> match on every domain that isn't blocked;
+    // a rule that only lists blocked domains applies everywhere else
+    if(allowedDomains_hashes.isEmpty()) {
+        return true;
+    }
+
+    // allowedDomains means the rule should only be matched on those domains
+    return allowedDomains_hashes.contains(domainHash);
+}
+
+// Returns true if the rule applies to requests of the given resource type.
+// m_resourceTypeOptions maps a resource type to true (the rule matches this
+// type) or false (the type is excepted from the rule).
+bool FilterRule::matchesType(QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    // no options have been specified -> match all resource types
+    if(m_resourceTypeOptions.isEmpty()) {
+        return true;
+    }
+
+    // resource type has been specified; true to match, false for an exception
+    const auto option = m_resourceTypeOptions.constFind(type);
+    if(option != m_resourceTypeOptions.constEnd()) {
+        return option.value();
+    }
+
+    // This resource type has not been specified. If any type has been
+    // explicitly included, the rule is limited to the included types.
+    for(const bool included : m_resourceTypeOptions) {
+        if(included) {
+            return false;
+        }
+    }
+
+    // only exceptions have been specified (ex. ~script) -> match every
+    // resource type that has not been excepted
+    return true;
+}
+
+// Returns true if the url is matched by the pattern of this rule, as selected
+// by urlMatchType. String and regular expression patterns are tested against
+// url.toString(), domain patterns against url.host().
+bool FilterRule::matchesUrl(const QUrl &url) const
+{
+    switch(urlMatchType) {
+    case InvalidMatch:
+        return false;
+
+    case RegularExpressionMatch:
+        return regexp.match(url.toString()).hasMatch();
+
+    case StringContains:
+        return url.toString().contains(match);
+
+    case StringStartsWith:
+        return url.toString().startsWith(match);
+
+    case StringEndsWith:
+        return url.toString().endsWith(match);
+
+    case StringEquals:
+        return url.toString() == match;
+
+    case DomainMatch:
+        return isMatchingDomain(url.host(), match);
+    }
+
+    // Only reached if urlMatchType holds a value that is not a UrlMatchType
+    // enumerator. Flowing off the end of a non-void function is undefined
+    // behavior, so treat it like an invalid rule instead.
+    return false;
+}
diff --git a/lib/urlfilter/filterrule.h b/lib/urlfilter/filterrule.h
new file mode 100644
index 0000000..95fff6a
--- /dev/null
+++ b/lib/urlfilter/filterrule.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/smolbote.hg
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_FILTERRULE_H
+#define SMOLBOTE_FILTERRULE_H
+
+#include <QObject>
+#include <QRegularExpression>
+#include <QStringList>
+#include <QStringMatcher>
+#include <QUrl>
+#include <QWebEngineUrlRequestInfo>
+#include <memory>
+#include <QVector>
+
+// Base class for url filter rules. It holds the parsed form of a rule and
+// answers whether a request (domain hash, resource type, url) is matched by it.
+// The members are filled in by format-specific subclasses.
+//
+// NOTE(review): the destructor is not virtual; deleting a subclass through a
+// FilterRule pointer would be undefined behavior - confirm rules are never
+// owned through the base type.
+// NOTE(review): QHash is used below, but <QHash> is not included directly by
+// this header.
+class FilterRule
+{
+public:
+ // How the pattern of the rule is compared to an url in matchesUrl()
+ enum UrlMatchType {
+ InvalidMatch,           // no usable pattern; never matches
+ RegularExpressionMatch, // regexp is matched against the url
+ StringContains,         // the url contains match
+ StringStartsWith,       // the url starts with match
+ StringEndsWith,         // the url ends with match
+ StringEquals,           // the url is equal to match
+ DomainMatch             // the host of the url is match or a subdomain of it
+ };
+
+ bool isEnabled() const;
+ bool isBlocking() const;
+
+ // true if the rule applies to the domain with this hash
+ bool matchesDomain(uint domainHash) const;
+ // true if the rule applies to this resource type
+ bool matchesType(QWebEngineUrlRequestInfo::ResourceType type) const;
+ // true if the url is matched by the pattern of the rule
+ bool matchesUrl(const QUrl &url) const;
+
+protected:
+ bool m_isEnabled = false;
+ bool m_isBlocking = true;
+
+ UrlMatchType urlMatchType = InvalidMatch;
+ // resource type -> true to match, false to exception
+ QHash<QWebEngineUrlRequestInfo::ResourceType, bool> m_resourceTypeOptions;
+
+ // hashes of the domains the rule is limited to / must not be matched on
+ QVector<uint> allowedDomains_hashes, blockedDomains_hashes;
+
+ // pattern used by the String* and DomainMatch match types
+ QString match;
+ // pattern used by RegularExpressionMatch
+ QRegularExpression regexp;
+
+};
+
+#endif // SMOLBOTE_FILTERRULE_H
diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp
new file mode 100644
index 0000000..ef7bec1
--- /dev/null
+++ b/lib/urlfilter/formats/adblockrule.cpp
@@ -0,0 +1,164 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/smolbote.hg
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+// Based on Falkon's AdBlockRule class
+
+#include "adblockrule.h"
+
+// adblock format documentation
+// https://adblockplus.org/filters
+
+// QString::mid(pos, len) - Returns a string starting at the specified position index.
+// QString::chop(len) - Removes n characters from the end of the string.
+// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
+
+// Returns true if text can be compared against the host of an url as is:
+// it is not empty and has no path, port, query, wildcard, separator or anchor
+// characters in it.
+static bool isPlainDomain(const QString &text)
+{
+    if(text.isEmpty()) {
+        return false;
+    }
+
+    for(const QChar c : text) {
+        switch(c.unicode()) {
+        case '*':
+        case '^':
+        case '|':
+        case '/':
+        case ':':
+        case '?':
+            return false;
+        default:
+            break;
+        }
+    }
+    return true;
+}
+
+// Translates an adblock pattern into a regular expression pattern:
+//  anchor "||" at the beginning - the url scheme
+//  anchor "|" at the beginning / at the end - beginning / end of the url
+//  wildcard "*" - any number of characters
+//  separator "^" - end, ? or /
+// Every other character is matched literally.
+static QString patternToRegExp(QString pattern)
+{
+    // Anchors only have a meaning at the ends of the pattern. Take them off
+    // before translating the rest and add their translations last, so that
+    // the '^' of an anchor is not mistaken for a separator.
+    QString prefix;
+    QString suffix;
+    if(pattern.startsWith(QLatin1Literal("||"))) {
+        prefix = QLatin1Literal("^\\w+://");
+        pattern.remove(0, 2);
+    } else if(pattern.startsWith(QLatin1Literal("|"))) {
+        prefix = QLatin1Literal("^");
+        pattern.remove(0, 1);
+    }
+    if(pattern.endsWith(QLatin1Literal("|"))) {
+        suffix = QLatin1Literal("$");
+        pattern.chop(1);
+    }
+
+    // escape regexp metacharacters (ex. the '.' in example.com); this also
+    // escapes '*' and '^', which are then replaced by their translations
+    pattern = QRegularExpression::escape(pattern);
+    pattern.replace(QLatin1Literal("\\*"), QLatin1Literal(".*"));
+    pattern.replace(QLatin1Literal("\\^"), QLatin1Literal("($|\\?|\\/)"));
+
+    return prefix + pattern + suffix;
+}
+
+// Parses one line of an adblock filter list.
+// Empty lines, comments ("!") and css rules ("##", "#@#") leave the rule
+// disabled. Any other line enables the rule, and sets whether it is blocking
+// or an exception ("@@"), its domain and resource type options (after "$") and
+// the url pattern with its match type.
+AdBlockRule::AdBlockRule(const QString &filter)
+{
+    QString parsedLine = filter.trimmed();
+
+    // there is no rule, or it's a comment
+    if(parsedLine.isEmpty() || parsedLine.startsWith("!")) {
+        return;
+    }
+
+    // css rule - ignore for now
+    if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) {
+        return;
+    }
+
+    m_isEnabled = true;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
+        m_isBlocking = false;
+        parsedLine.remove(0, 2);
+    } else {
+        m_isBlocking = true;
+    }
+
+    // parse options
+    {
+        const int sepPos = parsedLine.indexOf(QLatin1Literal("$"));
+        if(sepPos != -1) {
+            const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(","));
+            parsedLine = parsedLine.mid(0, sepPos);
+
+            const QLatin1String domainOption("domain=");
+            for(const QString &option : options) {
+                if(!option.startsWith(domainOption)) {
+                    parseOption(option);
+                    continue;
+                }
+
+                // domain=example.com|~subdomain.example.com
+                const auto domainList = option.mid(domainOption.size()).split(QLatin1Literal("|"));
+                for(const QString &entry : domainList) {
+                    const bool isBlocked = entry.startsWith(QLatin1Literal("~"));
+                    const QString domain = isBlocked ? entry.mid(1) : entry;
+
+                    // ex. "domain=" or a stray '|'; there is no domain to hash
+                    if(domain.isEmpty()) {
+                        continue;
+                    }
+
+                    if(isBlocked) {
+                        blockedDomains_hashes.append(qHash(domain));
+                    } else {
+                        allowedDomains_hashes.append(qHash(domain));
+                    }
+                }
+            }
+        }
+    }
+
+    // regular expression rule
+    if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+
+        urlMatchType = RegularExpressionMatch;
+        regexp.setPattern(parsedLine);
+        return;
+    }
+
+    // string equals rule; a leading "||" is an anchor of its own (see below)
+    if(!parsedLine.startsWith(QLatin1Literal("||"))
+        && parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
+        urlMatchType = StringEquals;
+        match = parsedLine.mid(1, parsedLine.length() - 2);
+        return;
+    }
+
+    // Basic filter rules can use wildcards, which were supported by QRegExp,
+    // but were deprecated in QRegularExpression.
+
+    // remove beginning and ending wildcards
+    if(parsedLine.startsWith(QLatin1Literal("*"))) {
+        parsedLine.remove(0, 1);
+    }
+
+    if(parsedLine.endsWith(QLatin1Literal("*"))) {
+        parsedLine.chop(1);
+    }
+
+    // domain rule: ||example.com^
+    // anything more than a domain between the anchor and the separator
+    // (ex. ||example.com/ads^) can't be compared to the host of an url, and is
+    // left to the regular expression below
+    if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
+        const QString domain = parsedLine.mid(2, parsedLine.length() - 3);
+        if(isPlainDomain(domain)) {
+            urlMatchType = DomainMatch;
+            match = domain;
+            return;
+        }
+    }
+
+    // "||" anchors, wildcards and separators are translated to regexp
+    if(parsedLine.startsWith(QLatin1Literal("||"))
+        || parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) {
+        urlMatchType = RegularExpressionMatch;
+        regexp.setPattern(patternToRegExp(parsedLine));
+        return;
+    }
+
+    // anchored to the beginning of the url: |http://example.com
+    if(parsedLine.startsWith(QLatin1Literal("|"))) {
+        urlMatchType = StringStartsWith;
+        match = parsedLine.mid(1);
+        return;
+    }
+
+    // anchored to the end of the url: .swf|
+    if(parsedLine.endsWith(QLatin1Literal("|"))) {
+        parsedLine.chop(1);
+        urlMatchType = StringEndsWith;
+        match = parsedLine;
+        return;
+    }
+
+    // plain text is matched anywhere in the url
+    urlMatchType = StringContains;
+    match = parsedLine;
+}
+// Parses a single resource type option (ex. "script" or "~image") and records
+// it in m_resourceTypeOptions: true if requests of that type are to be
+// matched, false if the option is inverted with a leading '~'.
+// The option name has to match exactly; options that are not resource types
+// (ex. "third-party", "inline-script") are ignored.
+void AdBlockRule::parseOption(const QString &option)
+{
+    const QString trimmed = option.trimmed();
+    const bool isInverted = trimmed.startsWith(QLatin1Literal("~"));
+    const QString name = isInverted ? trimmed.mid(1) : trimmed;
+
+    // true to match the resource type, false to exception
+    const bool matched = !isInverted;
+
+    if(name == QLatin1Literal("script")) {
+        // external scripts loaded via HTML script tag
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeScript, matched);
+
+    } else if(name == QLatin1Literal("image")) {
+        // regular images, typically loaded via HTML img tag
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeImage, matched);
+
+    } else if(name == QLatin1Literal("stylesheet")) {
+        // external CSS stylesheet files
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, matched);
+
+    } else if(name == QLatin1Literal("object")) {
+        // content handled by browser plugins, e.g. Flash or Java
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeObject, matched);
+
+    } else if(name == QLatin1Literal("xmlhttprequest")) {
+        // requests started using the XMLHttpRequest object or fetch() API
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeXhr, matched);
+
+    } else if(name == QLatin1Literal("object-subrequest")) {
+        // requests started by plugins like Flash
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePluginResource, matched);
+
+    } else if(name == QLatin1Literal("subdocument")) {
+        // embedded pages, usually included via HTML frames
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, matched);
+
+    } else if(name == QLatin1Literal("ping")) {
+        // requests started by <a ping> or navigator.sendBeacon()
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePing, matched);
+
+    } else if(name == QLatin1Literal("websocket")) {
+        // requests initiated via WebSocket object
+        // NOTE: nothing is recorded for this option, so a rule that is only
+        // limited to it ends up without a resource type limit
+        qDebug("Resource type 'websocket' not available");
+
+    } else if(name == QLatin1Literal("webrtc")) {
+        // connections opened via RTCPeerConnection instances to ICE servers
+        // NOTE: nothing is recorded for this option either (see websocket)
+        qDebug("Resource type 'webrtc' not available");
+
+    } else if(name == QLatin1Literal("document")) {
+        // the page itself
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, matched);
+
+    } else if(name == QLatin1Literal("other")) {
+        // types of requests not covered in the list above
+        m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeUnknown, matched);
+    }
+}
diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h
new file mode 100644
index 0000000..8677c2c
--- /dev/null
+++ b/lib/urlfilter/formats/adblockrule.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://neueland.iserlohn-fortress.net/smolbote.hg
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_ADBLOCKRULE_H
+#define SMOLBOTE_ADBLOCKRULE_H
+
+#include "../filterrule.h"
+
+// A FilterRule that is parsed from one line of an adblock filter list.
+// Format documentation: https://adblockplus.org/filters
+class AdBlockRule : public FilterRule
+{
+public:
+ // Parses the filter line. Empty lines, comments ("!") and css rules
+ // ("##", "#@#") leave the rule disabled.
+ explicit AdBlockRule(const QString &filter);
+
+ // Parses a single resource type option (ex. "script", "~image") into
+ // m_resourceTypeOptions. Called by the constructor for every option other
+ // than the domain option.
+ void parseOption(const QString &option);
+
+};
+
+#endif // SMOLBOTE_ADBLOCKRULE_H