From 7d8cbdb9941532cd5bf560b21395f6ed371d1ab5 Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Mon, 1 Oct 2018 16:43:18 +0200 Subject: Split off UrlFilter into library - add more adblock filter options --- lib/urlfilter/formats/adblockrule.cpp | 164 ++++++++++++++++++++++++++++++++++ lib/urlfilter/formats/adblockrule.h | 23 +++++ 2 files changed, 187 insertions(+) create mode 100644 lib/urlfilter/formats/adblockrule.cpp create mode 100644 lib/urlfilter/formats/adblockrule.h (limited to 'lib/urlfilter/formats') diff --git a/lib/urlfilter/formats/adblockrule.cpp b/lib/urlfilter/formats/adblockrule.cpp new file mode 100644 index 0000000..ef7bec1 --- /dev/null +++ b/lib/urlfilter/formats/adblockrule.cpp @@ -0,0 +1,164 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ +// Based on Falkon's AdBlockRule class + +#include "adblockrule.h" + +// adblock format documentation +// https://adblockplus.org/filters + +// QString::mid(pos, len) - Returns a string starting at the specified position index. +// QString::chop(len) - Removes n characters from the end of the string. +// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. + +AdBlockRule::AdBlockRule(const QString &filter) +{ + QString parsedLine = filter.trimmed(); + + // there is no rule, or it's a comment + if(parsedLine.isEmpty() || parsedLine.startsWith("!")) { + return; + } + + // css rule - ignore for now + if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) { + return; + } + + m_isEnabled = true; + + // exception rules + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + m_isBlocking = false; + parsedLine.remove(0, 2); + } else + m_isBlocking = true; + + // parse options + { + const int sepPos = parsedLine.indexOf(QLatin1Literal("$")); + if(sepPos != -1) { + const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(",")); + parsedLine = parsedLine.mid(0, sepPos); + + for(const QString &option : options) { + if(option.startsWith(QLatin1Literal("domain"))) { + const auto domainList = option.mid(7).split(QLatin1Literal("|")); + for (const QString &domain : domainList) { + if (domain.startsWith(QLatin1Literal("~"))) { + blockedDomains_hashes.append(qHash(domain.mid(1))); + } else { + allowedDomains_hashes.append(qHash(domain)); + } + } + } else { + parseOption(option); + } + } + } + } + + // regular expression rule + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + + urlMatchType = RegularExpressionMatch; + regexp.setPattern(parsedLine); + return; + } + + // string equals rule + if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + urlMatchType = StringEquals; + match = parsedLine.mid(1, parsedLine.length() - 2); + return; + } + + // Basic filter rules can use wildcards, which were supported by QRegExp, + // but were deprecated in QRegularExpression. + + // remove beginning and ending wildcards + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { + urlMatchType = DomainMatch; + match = parsedLine.mid(2, parsedLine.length() - 3); + return; + } + + // check for wildcards and translate to regexp + // wildcard "*" - any number of characters + // separator "^" - end, ? or / + if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) { + urlMatchType = RegularExpressionMatch; + parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://")); + parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|")); + parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*")); + parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)")); + regexp.setPattern(parsedLine); + return; + } + + match = parsedLine; +} +void AdBlockRule::parseOption(const QString &option) +{ + const bool exception = !option.startsWith(QLatin1Literal("~")); + + if(option.endsWith(QLatin1Literal("script"))) { + // external scripts loaded via HTML script tag + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeScript, exception); + + } else if(option.endsWith(QLatin1Literal("image"))) { + // regular images, typically loaded via HTML img tag + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeImage, exception); + + } else if(option.endsWith(QLatin1Literal("stylesheet"))) { + // external CSS stylesheet files + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception); + + } else if(option.endsWith(QLatin1Literal("object"))) { + // content handled by browser plugins, e.g. Flash or Java + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeObject, exception); + + } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) { + // requests started using the XMLHttpRequest object or fetch() API + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception); + + } else if(option.endsWith(QLatin1Literal("object-subrequest"))) { + // requests started by plugins like Flash + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception); + + } else if(option.endsWith(QLatin1Literal("subdocument"))) { + // embedded pages, usually included via HTML frames + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception); + + } else if(option.endsWith(QLatin1Literal("ping"))) { + // requests started by or navigator.sendBeacon() + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypePing, exception); + + } else if(option.endsWith(QLatin1Literal("websocket"))) { + // requests initiated via WebSocket object + qDebug("Resource type 'websocket' not available"); + + } else if(option.endsWith(QLatin1Literal("webrtc"))) { + // connections opened via RTCPeerConnection instances to ICE servers + qDebug("Resource type 'webrtc' not available"); + + } else if(option.endsWith(QLatin1Literal("document"))) { + // the page itself + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception); + + } else if(option.endsWith(QLatin1Literal("other"))) { + m_resourceTypeOptions.insert(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception); + } +} diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h new file mode 100644 index 0000000..8677c2c --- /dev/null +++ b/lib/urlfilter/formats/adblockrule.h @@ -0,0 +1,23 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/smolbote.hg + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_ADBLOCKRULE_H +#define SMOLBOTE_ADBLOCKRULE_H + +#include "../filterrule.h" + +class AdBlockRule : public FilterRule +{ +public: + explicit AdBlockRule(const QString &filter); + + void parseOption(const QString &option); + +}; + +#endif // SMOLBOTE_ADBLOCKRULE_H -- cgit v1.2.1