From aaed4ebc642d95dfb3cddab4fad196e870077bcf Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Thu, 5 Jul 2018 19:37:05 +0200 Subject: Add web/urlfilter --- .hgignore | 1 + CMakeLists.txt | 4 +- doc/UrlFilter.md | 91 ++++++++++++++++++++++++++++++++++++++ lib/web/CMakeLists.txt | 10 ++--- lib/web/urlfilter/filterdomain.cpp | 62 ++++++++++++++++++++++++++ lib/web/urlfilter/filterdomain.h | 33 ++++++++++++++ lib/web/urlfilter/filterrule.cpp | 88 ++++++++++++++++++++++++++++++++++++ lib/web/urlfilter/filterrule.h | 68 ++++++++++++++++++++++++++++ src/webengine/urlinterceptor.cpp | 9 ++++ test/CMakeLists.txt | 19 ++++---- test/HostlistTest.cpp | 25 ----------- test/HostlistTest.h | 16 ------- test/autotests.qrc | 5 --- test/data/hostlist.txt | 2 - test/urlfilter/urlfiltertest.cpp | 88 ++++++++++++++++++++++++++++++++++++ test/urlfilter/urlfiltertest.h | 41 +++++++++++++++++ 16 files changed, 496 insertions(+), 66 deletions(-) create mode 100644 doc/UrlFilter.md create mode 100644 lib/web/urlfilter/filterdomain.cpp create mode 100644 lib/web/urlfilter/filterdomain.h create mode 100644 lib/web/urlfilter/filterrule.cpp create mode 100644 lib/web/urlfilter/filterrule.h delete mode 100644 test/HostlistTest.cpp delete mode 100644 test/HostlistTest.h delete mode 100644 test/autotests.qrc delete mode 100644 test/data/hostlist.txt create mode 100644 test/urlfilter/urlfiltertest.cpp create mode 100644 test/urlfilter/urlfiltertest.h diff --git a/.hgignore b/.hgignore index 39716ac..dab1be2 100644 --- a/.hgignore +++ b/.hgignore @@ -11,6 +11,7 @@ build* # qtcreator *.user +lang/*.qm test/plugins.d 3rd-party/*/.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f0f48c9..a123603 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ option(MercurialRepo "Get version information from .hg" ON) option(CompilerWarnings "Compiler warnings" ON) option(QtDeprecatedWarnings "Qt deprecated warnings" ON) option(UseLibCpp "Use libc++ over stdlibc++ (requires clang)" OFF) 
-option(Tests "Enable/disable some basic autotests" ON) +option(Tests "Enable/disable some basic autotests" OFF) option(Plasma "Enable some fancy effects on Plasma" OFF) # Libraries @@ -86,7 +86,7 @@ add_subdirectory(src) if (Tests) enable_testing() find_package(Qt5 COMPONENTS Test REQUIRED) - #add_subdirectory(test) + add_subdirectory(test) endif() message("Version='${VerInfo}' bookmark='${VerBookmark}' commit='${VerCommit}'") diff --git a/doc/UrlFilter.md b/doc/UrlFilter.md new file mode 100644 index 0000000..cea2397 --- /dev/null +++ b/doc/UrlFilter.md @@ -0,0 +1,91 @@ +## FilterDomain +Filter Domains are groups of domains that can point to one or more filter +rules. + +### Types +There are 4 Filter Domain types: +- AllowOnDomains - only match specified domains +- BlockOnDomains - match all but specified domains +- AllowOnAllDomains - match all domains +- BlockOnAllDomains - match no domains + +### JSON + { + "type" : "AllowOnDomains", + "domains" : [ "example.com", "test.example.com" ], + "rules" : [] + } + +## FilterRule +Filter rules contain information on how a request should be modified. 
+ +### "action" +- Whitelist - allow this request +- Blacklist - block this request +- Redirect - redirect this request +- SetHeader - apply a list of headers + +### "regexp", "contains", "endswith" + +### JSON +Allow all URLs that contain "waifu.png" + { + "action" : "Whitelist", + "contains" : "waifu.png" + } + +Block specific URL + { + "action" : "Blacklist", + "equals" : "example.com/annoying-ad/masquerade/waifu.png" + } + +Block all URLs that contain "banner.gif" + { + "action" : "Blacklist", + "contains" : "banner.gif" + } + +Redirect URLs containing "ads/annoying-spam.gif" to "waifu.tld/waifu.gif" + { + "action" : "Redirect", + "contains" : "ads/annoying-spam.gif", + "url" : "waifu.tld/waifu.gif" + } + +Set some headers + { + "action" : "SetHeader", + "header" : [ "DNT" : "1" ] + } + +## QWebEngineUrlRequestInterceptor + +All network requests pass through the request interceptor. It gives the +following information: + +- firstPartyUrl - the page on which the request is made +- requestUrl - the url of the request +- requestMethod +- resourceType +- navigationType + +And provides the following methods: + +- block (bool shouldBlock) - can block the request +- redirect (const QUrl) - can redirect the requestUrl +- setHttpHeader - can set HTTP headers (such as user agent and do not track) + +### Example +This is a sample request made when loading DuckDuckGo: + + firstPartyUrl=https://duckduckgo.com/ + requestUrl=https://duckduckgo.com/o1635.css + +## How the filter works +When a request comes in, the interceptor extracts the host of the request and +matches it against the list of FilterDomains. + + firstPartyHost=duckduckgo.com + requestHost=duckduckgo.com + diff --git a/lib/web/CMakeLists.txt b/lib/web/CMakeLists.txt index c5d5eba..601c1d5 100644 --- a/lib/web/CMakeLists.txt +++ b/lib/web/CMakeLists.txt @@ -7,11 +7,11 @@ set(CMAKE_AUTOMOC ON) add_library(web webprofile.cpp webprofile.h -) -#target_include_directories(web -# PRIVATE ${Boost_INCLUDE_DIRS} -# PRIVATE ..
-#) + urlfilter/filterdomain.cpp + urlfilter/filterdomain.h + urlfilter/filterrule.cpp + urlfilter/filterrule.h +) target_link_libraries(web Qt5::WebEngineWidgets) diff --git a/lib/web/urlfilter/filterdomain.cpp b/lib/web/urlfilter/filterdomain.cpp new file mode 100644 index 0000000..53bc7db --- /dev/null +++ b/lib/web/urlfilter/filterdomain.cpp @@ -0,0 +1,62 @@ +#include "filterdomain.h" +#include + +bool isMatchingDomain(const QString &domain, const QString &filter) +{ + // domain and filter are the same + if(domain == filter) { + return true; + } + + // domain can't be matched by filter if it doesn't end with filter + // ex. example2.com isn't matched by example.com + if(!domain.endsWith(filter)) { + return false; + } + + // match with subdomains + // ex. subdomain.example.com is matched by example.com + int index = domain.indexOf(filter); + + // match if (domain ends with filter) && (filter has been found) and (character before filter is '.') + return index > 0 && domain[index - 1] == QLatin1Char('.'); +} + +FilterDomain::FilterDomain(MatchType type, QObject *parent) + : QObject(parent) +{ + setType(type); +} + +void FilterDomain::setType(MatchType type) +{ + m_type = type; +} + +void FilterDomain::addDomain(const QString &domain) +{ + if(!domain.isEmpty()) + m_domains.append(domain); +} + +bool FilterDomain::hasMatch(const QString &host) const +{ + // match all domains -> this rule applies to all domains + if(m_type == WhitelistAll) + return true; + + // match no domains -> this rule applies to no domains + if(m_type == BlacklistAll) + return false; + + // is this a whitelist or blacklist domain rule? 
+ // should it apply to added domains or not + const bool whitelist = (m_type == Whitelist); + + for(const auto &domain : qAsConst(m_domains)) { + if(isMatchingDomain(host, domain)) + return whitelist; + } + + return !whitelist; +} diff --git a/lib/web/urlfilter/filterdomain.h b/lib/web/urlfilter/filterdomain.h new file mode 100644 index 0000000..2173bfc --- /dev/null +++ b/lib/web/urlfilter/filterdomain.h @@ -0,0 +1,33 @@ +#ifndef FILTERDOMAIN_H +#define FILTERDOMAIN_H + +#include +#include + +class FilterDomain : public QObject +{ + Q_OBJECT +public: + enum MatchType { + Whitelist, // only match added domains + Blacklist, // only match domains not added + WhitelistAll, // match all domains + BlacklistAll // match no domains + }; + + explicit FilterDomain(MatchType type = Whitelist, QObject *parent = nullptr); + + void setType(MatchType type); + void addDomain(const QString &domain); + + bool hasMatch(const QString &host) const; + +private: + MatchType m_type; + QVector m_domains; +}; + +// function taken from KDE/Falkon +bool isMatchingDomain(const QString &domain, const QString &filter); + +#endif // FILTERDOMAIN_H diff --git a/lib/web/urlfilter/filterrule.cpp b/lib/web/urlfilter/filterrule.cpp new file mode 100644 index 0000000..5a9310e --- /dev/null +++ b/lib/web/urlfilter/filterrule.cpp @@ -0,0 +1,88 @@ +#include "filterrule.h" +#include +#include + +FilterRule::FilterRule(const QJsonObject &filter) +{ + const QString action = filter.value("action").toString(); + + // there is no action specified => this rule is invalid + if(action.isEmpty()) + return; + + if(action == "Whitelist") + m_action = ActionType::Whitelist; + else if (action == "Blacklist") + m_action = ActionType::Blacklist; + else if (action == "Redirect") { + m_action = ActionType::Redirect; + m_redirectUrl = QUrl::fromUserInput(filter.value("url").toString()); + } else if (action == "SetHeader") + m_action = ActionType::SetHeader; + else // invalid action + return; + + QJsonValue regexp = 
filter.value("regexp"); + QJsonValue endswith = filter.value("endswith"); + QJsonValue contains = filter.value("contains"); + + if(!regexp.isUndefined()) { + m_type = RuleType::RegExpMatchRule; + this->regexp.setPattern(regexp.toString()); + } else if(!endswith.isUndefined()) { + m_type = RuleType::StringEndsMatchRule; + pattern = endswith.toString(); + } else if(!contains.isUndefined()) { + m_type = RuleType::StringContainsMatchRule; + this->matcher.setPattern(contains.toString()); + this->matcher.setCaseSensitivity(Qt::CaseInsensitive); + } else // invalid rule + return; + + m_options.insert(QWebEngineUrlRequestInfo::ResourceTypeImage, true); +} + +bool FilterRule::isValid() const +{ + return m_type != RuleType::Invalid; +} + +bool FilterRule::process(QWebEngineUrlRequestInfo &info) const +{ + Q_ASSERT(m_type != RuleType::Invalid); + + if(matchRequestUrl(info.requestUrl().toString(), info.resourceType())) { + switch (m_action) { + case ActionType::Whitelist: + info.block(false); + return true; + case ActionType::Blacklist: + info.block(true); + return true; + case ActionType::Redirect: + info.redirect(m_redirectUrl); + return true; + case ActionType::SetHeader: + break; + } + } + + return false; +} + +bool FilterRule::matchRequestUrl(const QString &requestUrl, const QWebEngineUrlRequestInfo::ResourceType type) const +{ + if(!m_options.contains(type)) + return false; + + switch (m_type) { + case RuleType::RegExpMatchRule: + return regexp.match(requestUrl).hasMatch(); + case RuleType::StringEndsMatchRule: + return requestUrl.endsWith(pattern); + case RuleType::StringContainsMatchRule: + return matcher.indexIn(requestUrl) != -1; + default: + return false; + } +} diff --git a/lib/web/urlfilter/filterrule.h b/lib/web/urlfilter/filterrule.h new file mode 100644 index 0000000..46690b1 --- /dev/null +++ b/lib/web/urlfilter/filterrule.h @@ -0,0 +1,68 @@ +#ifndef SMOLBOTE_FILTERRULE_H +#define SMOLBOTE_FILTERRULE_H + +#include +#include +#include +#include +#include 
+#include +#include + +class QUrl; +class FilterRule +{ +public: + FilterRule(const QJsonObject &filter); + ~FilterRule() = default; + + bool isValid() const; + bool process(QWebEngineUrlRequestInfo &info) const; + bool matchRequestUrl(const QString &requestUrl, const QWebEngineUrlRequestInfo::ResourceType type) const; + +private: + Q_DISABLE_COPY(FilterRule) + + enum ActionType { + Whitelist, + Blacklist, + Redirect, + SetHeader + }; + + enum RuleType { + CssRule = 0, // + DomainMatchRule = 1, // + RegExpMatchRule = 2, // match request url with regexp + StringEndsMatchRule = 3, // request url ends with string + StringContainsMatchRule = 4, // request url contains string + MatchAllUrlsRule = 5, // + Invalid = 6 + }; + + ActionType m_action; + RuleType m_type = RuleType::Invalid; + + QHash m_options; + + // Parsed rule for string matching (CSS Selector for CSS rules) + QString m_matchString; + // Case sensitivity for string matching + Qt::CaseSensitivity m_caseSensitivity = Qt::CaseInsensitive; + + bool m_isException = false; + + // domains this rule is allowed or blocked on + QStringList m_allowedForDomains; + QStringList m_blockedForDomains; + + QUrl m_redirectUrl; + + QRegularExpression regexp; + QStringMatcher matcher; + QString pattern; +}; + +//bool isMatchingDomain(const QString &domain, const QString &filter); + +#endif // SMOLBOTE_FILTERRULE_H diff --git a/src/webengine/urlinterceptor.cpp b/src/webengine/urlinterceptor.cpp index 70d7701..4e1b2f1 100644 --- a/src/webengine/urlinterceptor.cpp +++ b/src/webengine/urlinterceptor.cpp @@ -47,6 +47,15 @@ void UrlRequestInterceptor::interceptRequest(QWebEngineUrlRequestInfo &info) if(rules.contains(info.requestUrl().host())) { info.block(rules.value(info.requestUrl().host()).isBlocking); } + +#ifdef QT_DEBUG + qDebug("request>>>"); + qDebug("firstParty url=%s", qUtf8Printable(info.firstPartyUrl().toString())); + qDebug("firstParty host=%s", qUtf8Printable(info.firstPartyUrl().host())); + qDebug("request url=%s", 
qUtf8Printable(info.requestUrl().toString())); + qDebug("request host=%s", qUtf8Printable(info.requestUrl().host())); + qDebug("<<<"); +#endif } QHash parse(const QString &filename) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ad2ca1f..a44da07 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,16 +2,13 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_AUTOMOC ON) set(CMAKE_AUTORCC ON) -macro(create_test testname) - add_executable(${testname} - ${testname}.cpp ${testname}.h - autotests.qrc - ${ARGN}) +add_executable(UrlFilterTest + urlfilter/urlfiltertest.cpp + urlfilter/urlfiltertest.h +) - target_include_directories(${testname} PRIVATE ../lib PRIVATE ../src) - target_link_libraries(${testname} Qt5::Test Qt5::Concurrent Qt5::WebEngineWidgets) +target_include_directories(UrlFilterTest + PRIVATE ../lib/web +) - add_test(NAME smolbote-${testname} COMMAND ${testname}) -endmacro() - -create_test(HostlistTest ../src/webengine/urlinterceptor.cpp ../src/webengine/urlinterceptor.h) +target_link_libraries(UrlFilterTest Qt5::Test web) diff --git a/test/HostlistTest.cpp b/test/HostlistTest.cpp deleted file mode 100644 index 31ae11c..0000000 --- a/test/HostlistTest.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "HostlistTest.h" - -void HostlistTest::initTestCase() -{ - rules = parse(":/autotests/data/hostlist.txt"); -} - -void HostlistTest::parse_ruleCount() -{ - QVERIFY(rules.count() == 3); -} - -void HostlistTest::parse_blockSomehost() -{ - QVERIFY(rules.contains("somehost.org")); - QVERIFY(rules.value("somehost.org").isBlocking); -} - -void HostlistTest::parse_blockHost2() -{ - QVERIFY(rules.contains("host2.org")); - QVERIFY(rules.value("host2.org").isBlocking); -} - -QTEST_MAIN(HostlistTest) diff --git a/test/HostlistTest.h b/test/HostlistTest.h deleted file mode 100644 index dcfd5a3..0000000 --- a/test/HostlistTest.h +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include "webengine/urlinterceptor.h" - -class HostlistTest : public QObject -{ - Q_OBJECT 
- -private slots: - void initTestCase(); - void parse_ruleCount(); - void parse_blockSomehost(); - void parse_blockHost2(); - -private: - QHash rules; -}; diff --git a/test/autotests.qrc b/test/autotests.qrc deleted file mode 100644 index 5817c00..0000000 --- a/test/autotests.qrc +++ /dev/null @@ -1,5 +0,0 @@ - - - data/hostlist.txt - - \ No newline at end of file diff --git a/test/data/hostlist.txt b/test/data/hostlist.txt deleted file mode 100644 index d228e1d..0000000 --- a/test/data/hostlist.txt +++ /dev/null @@ -1,2 +0,0 @@ -0.0.0.0 somehost.org -0.0.0.0 host1.org host2.org diff --git a/test/urlfilter/urlfiltertest.cpp b/test/urlfilter/urlfiltertest.cpp new file mode 100644 index 0000000..f7ae0fb --- /dev/null +++ b/test/urlfilter/urlfiltertest.cpp @@ -0,0 +1,88 @@ +/* ============================================================ +* Falkon - Qt web browser +* Copyright (C) 2013-2018 David Rosca +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . 
+* ============================================================ */ + +#include "urlfiltertest.h" +#include +#include + +#include + +void UrlFilterTest::matchingDomain_data() +{ + filterDomain.addDomain("example.com"); + filterDomain.addDomain("second-domain.org"); + + // Test adapted from Falkon + QTest::addColumn("domain"); + QTest::addColumn("result"); + + // description site domain result + QTest::newRow("missing tld") << "example" << false; + QTest::newRow("different tld") << "example.org" << false; + QTest::newRow("exact match") << "example.com" << true; + QTest::newRow("exact match 2") << "second-domain.org" << true; + QTest::newRow("subdomain match") << "www.example.com" << true; + QTest::newRow("subdomain match") << "www.test.example.com" << true; + QTest::newRow("similar domain") << "anotherexample.com" << false; + QTest::newRow("empty domain") << "" << false; +} + +void UrlFilterTest::matchingDomain() +{ + QFETCH(QString, domain); + QFETCH(bool, result); + + filterDomain.setType(FilterDomain::Whitelist); + QCOMPARE(filterDomain.hasMatch(domain), result); + filterDomain.setType(FilterDomain::Blacklist); + QCOMPARE(filterDomain.hasMatch(domain), !result); + filterDomain.setType(FilterDomain::WhitelistAll); + QCOMPARE(filterDomain.hasMatch(domain), true); + filterDomain.setType(FilterDomain::BlacklistAll); + QCOMPARE(filterDomain.hasMatch(domain), false); +} + +void UrlFilterTest::matchingType_data() +{ + QJsonObject j; + j.insert("action", "Blacklist"); + j.insert("contains", "annoying-ad.banner"); + + filterRule = new FilterRule(j); + Q_ASSERT(filterRule->isValid()); + + QTest::addColumn("requestUrl"); + QTest::addColumn("resourceType"); + QTest::addColumn("result"); + + QTest::newRow("contains 1") << "http://example.com/ads/annoying-ad.banner/something" << static_cast(QWebEngineUrlRequestInfo::ResourceTypeImage) << true; + QTest::newRow("contains 2") << "http://example.com/ads/annoying-ad.banner/something" << 
static_cast(QWebEngineUrlRequestInfo::ResourceTypeMedia) << false; + QTest::newRow("contains 3") << "http://example.com/ads/banner" << static_cast(QWebEngineUrlRequestInfo::ResourceTypeImage) << false; + QTest::newRow("blank") << "" << static_cast(QWebEngineUrlRequestInfo::ResourceTypeUnknown) << false; +} + +void UrlFilterTest::matchingType() +{ + QFETCH(QString, requestUrl); + QFETCH(int, resourceType); + QFETCH(bool, result); + + QCOMPARE(filterRule->matchRequestUrl(requestUrl, static_cast(resourceType)), result); +} + +QTEST_GUILESS_MAIN(UrlFilterTest) diff --git a/test/urlfilter/urlfiltertest.h b/test/urlfilter/urlfiltertest.h new file mode 100644 index 0000000..1b158e8 --- /dev/null +++ b/test/urlfilter/urlfiltertest.h @@ -0,0 +1,41 @@ +/* ============================================================ +* Falkon - Qt web browser +* Copyright (C) 2013-2018 David Rosca +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . +* ============================================================ */ +#ifndef URLFILTER_TEST_H +#define URLFILTER_TEST_H + +#include "urlfilter/filterrule.h" +#include "urlfilter/filterdomain.h" +#include + +class UrlFilterTest : public QObject +{ + Q_OBJECT + +private slots: + void matchingDomain_data(); + void matchingDomain(); + + void matchingType_data(); + void matchingType(); + +private: + FilterDomain filterDomain; + FilterRule *filterRule; +}; + +#endif // URLFILTER_TEST_H -- cgit v1.2.1