From 7d8cbdb9941532cd5bf560b21395f6ed371d1ab5 Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Mon, 1 Oct 2018 16:43:18 +0200 Subject: Split off UrlFilter into library - add more adblock filter options --- lib/urlfilter/CMakeLists.txt | 15 ++++ lib/urlfilter/filterrule.cpp | 97 ++++++++++++++++++++ lib/urlfilter/filterrule.h | 55 ++++++++++++ lib/urlfilter/formats/adblockrule.cpp | 164 ++++++++++++++++++++++++++++++++++ lib/urlfilter/formats/adblockrule.h | 23 +++++ 5 files changed, 354 insertions(+) create mode 100644 lib/urlfilter/CMakeLists.txt create mode 100644 lib/urlfilter/filterrule.cpp create mode 100644 lib/urlfilter/filterrule.h create mode 100644 lib/urlfilter/formats/adblockrule.cpp create mode 100644 lib/urlfilter/formats/adblockrule.h (limited to 'lib/urlfilter') diff --git a/lib/urlfilter/CMakeLists.txt b/lib/urlfilter/CMakeLists.txt new file mode 100644 index 0000000..842f18f --- /dev/null +++ b/lib/urlfilter/CMakeLists.txt @@ -0,0 +1,15 @@ +# Find includes in corresponding build directories +set(CMAKE_INCLUDE_CURRENT_DIR ON) + +# Instruct CMake to run moc automatically when needed. +set(CMAKE_AUTOMOC ON) + +add_library(urlfilter + filterrule.cpp + filterrule.h + + formats/adblockrule.cpp + formats/adblockrule.h +) + +target_link_libraries(urlfilter Qt5::WebEngineWidgets) diff --git a/lib/urlfilter/filterrule.cpp b/lib/urlfilter/filterrule.cpp new file mode 100644 index 0000000..22a2f06 --- /dev/null +++ b/lib/urlfilter/filterrule.cpp @@ -0,0 +1,97 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "filterrule.h" + +inline bool isMatchingDomain(const QString &domain, const QString &filter) +{ + // domain and filter are the same + if(domain == filter) { + return true; + } + + // domain can't be matched by filter if it doesn't end with filter + // ex. example2.com isn't matched by example.com + if(!domain.endsWith(filter)) { + return false; + } + + // match with subdomains + // ex. subdomain.example.com is matched by example.com + int index = domain.indexOf(filter); + + // match if (domain ends with filter) && (filter has been found) and (character before filter is '.') + return index > 0 && domain[index - 1] == QLatin1Char('.'); +} + +bool FilterRule::isEnabled() const +{ + return m_isEnabled; +} + +bool FilterRule::isBlocking() const +{ + return m_isBlocking; +} + +bool FilterRule::matchesDomain(uint domainHash) const +{ + // no domains have been allowed or blocked -> allow on all domains + if(allowedDomains_hashes.isEmpty() && blockedDomains_hashes.isEmpty()) { + return true; + } + + // blockedDomains prevents the rules from being matched on those domains + if(blockedDomains_hashes.contains(domainHash)) { + return false; + } + + // allowedDomains means the rule should only be matched on those domains + return allowedDomains_hashes.contains(domainHash); + +} + +bool FilterRule::matchesType(QWebEngineUrlRequestInfo::ResourceType type) const +{ + // no options have been specified -> match all resource types + if(m_resourceTypeOptions.isEmpty()) + return true; + + // this resource type has not been specified -> reject it + if(!m_resourceTypeOptions.contains(type)) + return false; + + // resource type has been specified; true to match, false to exception + return m_resourceTypeOptions.value(type); +} + +bool FilterRule::matchesUrl(const QUrl &url) const +{ + switch(urlMatchType) { + case InvalidMatch: + return false; + + case RegularExpressionMatch: + return regexp.match(url.toString()).hasMatch(); + + case StringContains: + return url.toString().contains(match); + + case StringStartsWith: + return url.toString().startsWith(match); + + case StringEndsWith: + return url.toString().endsWith(match); + + case StringEquals: + return url.toString() == match; + + case DomainMatch: + return isMatchingDomain(url.host(), match); + } +} diff --git a/lib/urlfilter/filterrule.h b/lib/urlfilter/filterrule.h new file mode 100644 index 0000000..95fff6a --- /dev/null +++ b/lib/urlfilter/filterrule.h @@ -0,0 +1,55 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_FILTERRULE_H +#define SMOLBOTE_FILTERRULE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +class FilterRule +{ +public: + enum UrlMatchType { + InvalidMatch, + RegularExpressionMatch, + StringContains, + StringStartsWith, + StringEndsWith, + StringEquals, + DomainMatch + }; + + bool isEnabled() const; + bool isBlocking() const; + + bool matchesDomain(uint domainHash) const; + bool matchesType(QWebEngineUrlRequestInfo::ResourceType type) const; + bool matchesUrl(const QUrl &url) const; + +protected: + bool m_isEnabled = false; + bool m_isBlocking = true; + + UrlMatchType urlMatchType = InvalidMatch; + QHash m_resourceTypeOptions; + + QVector allowedDomains_hashes, blockedDomains_hashes; + + QString match; + QRegularExpression regexp; + +}; + +#endif // SMOLBOTE_FILTERRULE_H diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp new file mode 100644 index 0000000..ef7bec1 --- /dev/null +++ b/lib/urlfilter/formats/adblockrule.cpp @@ -0,0 +1,164 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ +// Based on Falkon's AdBlockRule class + +#include "adblockrule.h" + +// adblock format documentation +// https://adblockplus.org/filters + +// QString::mid(pos, len) - Returns a string starting at the specified position index. +// QString::chop(len) - Removes n characters from the end of the string. +// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. + +AdBlockRule::AdBlockRule(const QString &filter) +{ + QString parsedLine = filter.trimmed(); + + // there is no rule, or it's a comment + if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { + return; + } + + // css rule - ignore for now + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + return; + } + + m_isEnabled = true; + + // exception rules + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + m_isBlocking = false; + parsedLine.remove(0, 2); + } else + m_isBlocking = true; + + // parse options + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + for (const QString &domain : domainList) { + if (domain.startsWith(QLatin1Literal("~"))) { + blockedDomains_hashes.append(qHash(domain.mid(1))); + } else { + allowedDomains_hashes.append(qHash(domain)); + } + } + } else { + parseOption(option); + } + } + } + } + + // regular expression rule + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + urlMatchType = RegularExpressionMatch; + regexp.setPattern(parsedLine); + return; + } + + // string equals rule + if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + urlMatchType = StringEquals; + match = parsedLine.mid(1, parsedLine.length() - 2); + return; + } + + // Basic filter rules can use wildcards, which were supported by QRegExp, + // but were deprecated in QRegularExpression. + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + urlMatchType = DomainMatch; + match = parsedLine.mid(2, parsedLine.length() - 3); + return; + } + + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + urlMatchType = RegularExpressionMatch; + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + regexp.setPattern(parsedLine); + return; + } + + match = parsedLine; +} +void AdBlockRule::parseOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by or navigator.sendBeacon() + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } +} diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h new file mode 100644 index 0000000..8677c2c --- /dev/null +++ b/lib/urlfilter/formats/adblockrule.h @@ -0,0 +1,23 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_ADBLOCKRULE_H +#define SMOLBOTE_ADBLOCKRULE_H + +#include "../filterrule.h" + +class AdBlockRule : public FilterRule +{ +public: + explicit AdBlockRule(const QString &filter); + + void parseOption(const QString &option); + +}; + +#endif // SMOLBOTE_ADBLOCKRULE_H -- cgit v1.2.1