From c74367d82c1c7bec393548d2e5014c794333822f Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Tue, 16 Oct 2018 17:25:40 +0200 Subject: urlfilter: Add FilterTree class FilterTree is a class that holds filter rules, sorted by the domain they are to be applied on. The rules are to follow FilterLeaf as interface. - Add a hostlist rule format to FilterTree. - Add a test for hostlist format. --- lib/urlfilter/CMakeLists.txt | 9 ++++ lib/urlfilter/filterleaf.cpp | 26 ++++++++++ lib/urlfilter/filterleaf.h | 43 +++++++++++++++++ lib/urlfilter/filtertree.cpp | 86 ++++++++++++++++++++++++++++++++++ lib/urlfilter/filtertree.h | 46 ++++++++++++++++++ lib/urlfilter/formats/hostlistrule.cpp | 21 +++++++++ lib/urlfilter/formats/hostlistrule.h | 16 +++++++ linux/.config | 2 +- test/CMakeLists.txt | 15 ++++-- test/adblock/adblocktest.cpp | 65 +++++++++++++++++++++++++ test/adblock/adblocktest.h | 13 +++++ test/hostlist.txt | 6 +++ test/hostlist/hostlisttest.cpp | 42 +++++++++++++++++ test/hostlist/hostlisttest.h | 19 ++++++++ test/urlfilter/adblocktest.cpp | 65 ------------------------- test/urlfilter/adblocktest.h | 13 ----- 16 files changed, 405 insertions(+), 82 deletions(-) create mode 100644 lib/urlfilter/filterleaf.cpp create mode 100644 lib/urlfilter/filterleaf.h create mode 100644 lib/urlfilter/filtertree.cpp create mode 100644 lib/urlfilter/filtertree.h create mode 100644 lib/urlfilter/formats/hostlistrule.cpp create mode 100644 lib/urlfilter/formats/hostlistrule.h create mode 100644 test/adblock/adblocktest.cpp create mode 100644 test/adblock/adblocktest.h create mode 100644 test/hostlist.txt create mode 100644 test/hostlist/hostlisttest.cpp create mode 100644 test/hostlist/hostlisttest.h delete mode 100644 test/urlfilter/adblocktest.cpp delete mode 100644 test/urlfilter/adblocktest.h diff --git a/lib/urlfilter/CMakeLists.txt b/lib/urlfilter/CMakeLists.txt index 842f18f..375ffa7 100644 --- a/lib/urlfilter/CMakeLists.txt +++ b/lib/urlfilter/CMakeLists.txt @@ -10,6 +10,15 @@ add_library(urlfilter formats/adblockrule.cpp formats/adblockrule.h + + formats/hostlistrule.cpp + formats/hostlistrule.h + + # filter tree + filtertree.cpp + filtertree.h + filterleaf.cpp + filterleaf.h ) target_link_libraries(urlfilter Qt5::WebEngineWidgets) diff --git a/lib/urlfilter/filterleaf.cpp b/lib/urlfilter/filterleaf.cpp new file mode 100644 index 0000000..3bd10bf --- /dev/null +++ b/lib/urlfilter/filterleaf.cpp @@ -0,0 +1,26 @@ +#include "filterleaf.h" + +FilterLeaf::FilterLeaf(FilterLeaf && other) +{ + m_isBlocking = other.m_isBlocking; + m_request = std::move(other.m_request); + m_redirect = std::move(other.m_redirect); +} + +FilterLeaf & FilterLeaf::operator=(FilterLeaf && other) +{ + m_isBlocking = other.m_isBlocking; + m_request = std::move(other.m_request); + m_redirect = std::move(other.m_redirect); + return *this; +} + +const QString FilterLeaf::request() const +{ + return QString::fromStdString(m_request); +} + +const QString FilterLeaf::redirect() const +{ + return QString::fromStdString(m_redirect); +} diff --git a/lib/urlfilter/filterleaf.h b/lib/urlfilter/filterleaf.h new file mode 100644 index 0000000..6d9caae --- /dev/null +++ b/lib/urlfilter/filterleaf.h @@ -0,0 +1,43 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_FILTERLEAF_H +#define SMOLBOTE_FILTERLEAF_H + +#include +#include + +class FilterLeaf +{ +public: + enum Action { + NotMatched, + Allow, + Block, + Redirect + }; + + FilterLeaf(FilterLeaf &&other); + FilterLeaf& operator=(FilterLeaf &&other); + ~FilterLeaf() = default; + + virtual bool match(const QString &requestUrl) const = 0; + virtual Action action() const = 0; + + const QString request() const; + const QString redirect() const; + +protected: + explicit FilterLeaf() = default; + + bool m_isBlocking; + std::string m_request; + std::string m_redirect; +}; + +#endif // SMOLBOTE_FILTERLEAF_H diff --git a/lib/urlfilter/filtertree.cpp b/lib/urlfilter/filtertree.cpp new file mode 100644 index 0000000..8844a76 --- /dev/null +++ b/lib/urlfilter/filtertree.cpp @@ -0,0 +1,86 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#include "filtertree.h" +#include "filterleaf.h" +#include +#include "formats/hostlistrule.h" + +bool loadHostlist(QIODevice &from, FilterTree* tree) +{ + Q_ASSERT(from.isReadable()); + QTextStream stream(&from); + while(!stream.atEnd()) { + const QString line = stream.readLine().trimmed(); + if(line.isEmpty() || line.startsWith(QLatin1Literal("#"))) + continue; + + const QStringList &parts = line.split(QLatin1Literal(" ")); + if(parts.length() < 2) { +#ifdef QT_DEBUG + qDebug("Cannot parse: %s", qUtf8Printable(line)); +#endif + return false; + } + + for(int i = 1; i < parts.length(); ++i) { + // HostlistRule(domain, redirect) + auto *rule = new HostlistRule(parts.at(i), parts.constFirst()); + // addRule(rule, enable_on_domain) + const bool added = tree->addRule(rule, QString()); + if(!added) + return false; + } + + } + return true; +} + +const QStringList FilterTree::branches() const +{ + QStringList branches; + for(auto &branch : m_branches) { + branches.append(QString::fromStdString(branch.domain)); + } + return branches; +} + +QVector FilterTree::match(const QString& domain, const QString& requestUrl) const +{ + QVector leaves; + for(const auto &branch : m_branches) { + if(branch.domain == domain.toStdString()) { + + for(const auto leaf : branch.leaves) { + if(leaf->match(requestUrl)) { + leaves.append(leaf); + } + } + + } + } + return leaves; +} + +bool FilterTree::addRule(FilterLeaf *rule, const QString& domain) +{ + for(auto &branch : m_branches) { + if(branch.domain == domain.toStdString()) { + branch.leaves.emplace_back(rule); + return true; + } + } + + // no branch was found + Branch branch; + branch.domain = domain.toStdString(); + // TODO: for some reason, can't add rule here + //branch.leaves.emplace_back(rule); + m_branches.emplace_back(std::move(branch)); + return this->addRule(rule, domain); +} diff --git a/lib/urlfilter/filtertree.h b/lib/urlfilter/filtertree.h new file mode 100644 index 0000000..8cecf50 --- /dev/null +++ b/lib/urlfilter/filtertree.h @@ -0,0 +1,46 @@ +/* + * This file is part of smolbote. It's copyrighted by the contributors recorded + * in the version control history of the file, available from its original + * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote + * + * SPDX-License-Identifier: GPL-3.0 + */ + +#ifndef SMOLBOTE_FILTERTREE_H +#define SMOLBOTE_FILTERTREE_H + +#include +#include +#include +#include +#include "filterleaf.h" + +/** FilterTree: B+ tree of filter rules + * The root of the tree contains branches that represent domains, on which their rules are to be applied. + * Each branch contains leaves - rules + */ +class FilterTree : public QObject +{ + Q_OBJECT + +public: + const QStringList branches() const; + QVector match(const QString &domain, const QString &requestUrl) const; + + bool addRule(FilterLeaf *rule, const QString &domain); + +private: + struct Branch { + ~Branch() { qDeleteAll(leaves); } + + // TODO: replace domain type with domain-matching class + std::string domain; + std::vector leaves; + }; + + std::vector m_branches; +}; + +bool loadHostlist(QIODevice &from, FilterTree *tree); + +#endif // SMOLBOTE_FILTERTREE_H diff --git a/lib/urlfilter/formats/hostlistrule.cpp b/lib/urlfilter/formats/hostlistrule.cpp new file mode 100644 index 0000000..1df6b3e --- /dev/null +++ b/lib/urlfilter/formats/hostlistrule.cpp @@ -0,0 +1,21 @@ +#include "hostlistrule.h" + +HostlistRule::HostlistRule(const QString &domain, const QString& redirect) +{ + this->m_isBlocking = (redirect == QLatin1Literal("0.0.0.0")); + this->m_request = domain.toStdString(); + this->m_redirect = redirect.toStdString(); +} + +bool HostlistRule::match(const QString& requestUrl) const +{ + return (m_request == requestUrl.toStdString()); +} + +FilterLeaf::Action HostlistRule::action() const +{ + if(m_isBlocking) + return FilterLeaf::Block; + return FilterLeaf::Redirect; +} + diff --git a/lib/urlfilter/formats/hostlistrule.h b/lib/urlfilter/formats/hostlistrule.h new file mode 100644 index 0000000..764a2e2 --- /dev/null +++ b/lib/urlfilter/formats/hostlistrule.h @@ -0,0 +1,16 @@ +#ifndef SMOLBOTE_HOSTLIST_RULE_H +#define SMOLBOTE_HOSTLIST_RULE_H + +#include "../filterleaf.h" +#include + +class HostlistRule : public FilterLeaf +{ +public: + explicit HostlistRule(const QString &domain, const QString &redirect); + + bool match(const QString &requestUrl) const override; + FilterLeaf::Action action() const override; +}; + +#endif // SMOLBOTE_HOSTLIST_RULE_H diff --git a/linux/.config b/linux/.config index ea2d51c..eb0fb92 100644 --- a/linux/.config +++ b/linux/.config @@ -48,7 +48,7 @@ CONFIG_USEPLASMA=y # Devel # CONFIG_QTWARNINGS=y -# CONFIG_TESTS is not set +CONFIG_TESTS=y # CONFIG_LLVMLIBCPP is not set # diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4302ab2..6b740d7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,10 +3,19 @@ set(CMAKE_AUTOMOC ON) set(CMAKE_AUTORCC ON) add_executable(AdBlockTest - urlfilter/adblocktest.cpp - urlfilter/adblocktest.h + adblock/adblocktest.cpp + adblock/adblocktest.h ) target_include_directories(AdBlockTest PRIVATE ../lib/urlfilter) - target_link_libraries(AdBlockTest Qt5::Test urlfilter) + +add_executable(HostlistTest + hostlist/hostlisttest.cpp + hostlist/hostlisttest.h +) +target_include_directories(HostlistTest PRIVATE ../lib/urlfilter/) +target_link_libraries(HostlistTest Qt5::Test urlfilter) + +add_test(NAME urlfilter-adblock COMMAND AdBlockTest WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) +add_test(NAME urlfilter-hostlist COMMAND HostlistTest WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) diff --git a/test/adblock/adblocktest.cpp b/test/adblock/adblocktest.cpp new file mode 100644 index 0000000..b31d965 --- /dev/null +++ b/test/adblock/adblocktest.cpp @@ -0,0 +1,65 @@ +#include "adblocktest.h" +#include +#include "filterrule.h" +#include "formats/adblockrule.h" + +inline bool check(const std::vector rules, const QUrl &url) +{ + for(const AdBlockRule &rule : rules) { + if(rule.matchesDomain(qHash(url.host())) && rule.matchesUrl(url)) + return true; + } + return false; +} + +void AdBlockTest::parseList() +{ + std::vector rules; + + QFile list("adblock.txt"); + int ruleCount = 0; + QCOMPARE(list.open(QIODevice::ReadOnly | QIODevice::Text), true); + { + QTextStream l(&list); + QString line; + while(l.readLineInto(&line)) { + AdBlockRule rule(line); + if(rule.isEnabled()) { + rules.emplace_back(std::move(rule)); + ruleCount++; + qDebug("added rule: %s", qUtf8Printable(line)); + } + } + } + list.close(); + + // there should be 3 rules + QCOMPARE(rules.size(), ruleCount); + + // block by address part + QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/img")), true); + QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/bar/img?param")), true); + QCOMPARE(check(rules, QUrl("http://example.com/banner//img/foo")), true); + QCOMPARE(check(rules, QUrl("http://example.com/banner/img")), false); + QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/imgraph")), false); + QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/img.gif")), false); + + // block by domain + QCOMPARE(check(rules, QUrl("http://ads.example.com/foo.gif")), true); + QCOMPARE(check(rules, QUrl("http://server1.ads.example.com/foo.gif")), true); + QCOMPARE(check(rules, QUrl("https://ads.example.com:8000/")), true); + QCOMPARE(check(rules, QUrl("http://ads.example.com.ua/foo.gif")), false); + QCOMPARE(check(rules, QUrl("http://example.com/redirect/http://ads.example.com/")), false); + + // block exact address + QCOMPARE(check(rules, QUrl("http://example.com/")), true); + QCOMPARE(check(rules, QUrl("http://example.com/foo.gif")), false); + QCOMPARE(check(rules, QUrl("http://example.info/redirect/http://example.com/")), false); + + // regular expression + QCOMPARE(check(rules, QUrl("http://another.com/banner123")), true); + QCOMPARE(check(rules, QUrl("http://another.com/banner321")), true); + QCOMPARE(check(rules, QUrl("http://another.com/banners")), false); +} + +QTEST_GUILESS_MAIN(AdBlockTest) diff --git a/test/adblock/adblocktest.h b/test/adblock/adblocktest.h new file mode 100644 index 0000000..95cb7e2 --- /dev/null +++ b/test/adblock/adblocktest.h @@ -0,0 +1,13 @@ +#ifndef ADBLOCKTEST_H +#define ADBLOCKTEST_H + +#include +class AdBlockTest : public QObject +{ + Q_OBJECT + +private slots: + void parseList(); +}; + +#endif // ADBLOCKTEST_H diff --git a/test/hostlist.txt b/test/hostlist.txt new file mode 100644 index 0000000..a0b4e5c --- /dev/null +++ b/test/hostlist.txt @@ -0,0 +1,6 @@ +# This is a comment, and after it comes a blank line + +127.0.0.1 localhost.localdomain + +0.0.0.0 blockeddomain.com +0.0.0.0 blockeddomain.first blockeddomain.second diff --git a/test/hostlist/hostlisttest.cpp b/test/hostlist/hostlisttest.cpp new file mode 100644 index 0000000..46f6a85 --- /dev/null +++ b/test/hostlist/hostlisttest.cpp @@ -0,0 +1,42 @@ +#include "hostlisttest.h" +#include + +void HostlistTest::parseList() +{ + //FilterTree tree; + + // load filters + QFile hostlist("hostlist.txt"); + QCOMPARE(hostlist.open(QIODevice::ReadOnly | QIODevice::Text), true); + QCOMPARE(loadHostlist(hostlist, &tree), true); + + QCOMPARE(tree.branches().length(), 1); +} + +void HostlistTest::checkRules() +{ + // test block + QVector block = tree.match(QString(), "blockeddomain.com"); + QCOMPARE(block.length(), 1); + QCOMPARE(block.constFirst()->action(), FilterLeaf::Block); + + // test redirect + QVector redirectResult = tree.match(QString(), "localhost.localdomain"); + QCOMPARE(redirectResult.length(), 1); + QCOMPARE(redirectResult.at(0)->action(), FilterLeaf::Redirect); + QCOMPARE(redirectResult.at(0)->redirect(), "127.0.0.1"); + + // two domains on one line + QVector blockFirst = tree.match(QString(), "blockeddomain.first"); + QCOMPARE(blockFirst.length(), 1); + QCOMPARE(blockFirst.constFirst()->action(), FilterLeaf::Block); + QVector blockSecond = tree.match(QString(), "blockeddomain.second"); + QCOMPARE(blockSecond.length(), 1); + QCOMPARE(blockSecond.constFirst()->action(), FilterLeaf::Block); + + // domain not on list + QVector missing = tree.match(QString(), "other.domain"); + QCOMPARE(missing.length(), 0); +} + +QTEST_GUILESS_MAIN(HostlistTest) diff --git a/test/hostlist/hostlisttest.h b/test/hostlist/hostlisttest.h new file mode 100644 index 0000000..9a87e0d --- /dev/null +++ b/test/hostlist/hostlisttest.h @@ -0,0 +1,19 @@ +#ifndef HOSTLIST_TEST +#define HOSTLIST_TEST + +#include +#include "filtertree.h" + +class HostlistTest : public QObject +{ + Q_OBJECT + +private slots: + void parseList(); + void checkRules(); + +private: + FilterTree tree; +}; + +#endif diff --git a/test/urlfilter/adblocktest.cpp b/test/urlfilter/adblocktest.cpp deleted file mode 100644 index b31d965..0000000 --- a/test/urlfilter/adblocktest.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "adblocktest.h" -#include -#include "filterrule.h" -#include "formats/adblockrule.h" - -inline bool check(const std::vector rules, const QUrl &url) -{ - for(const AdBlockRule &rule : rules) { - if(rule.matchesDomain(qHash(url.host())) && rule.matchesUrl(url)) - return true; - } - return false; -} - -void AdBlockTest::parseList() -{ - std::vector rules; - - QFile list("adblock.txt"); - int ruleCount = 0; - QCOMPARE(list.open(QIODevice::ReadOnly | QIODevice::Text), true); - { - QTextStream l(&list); - QString line; - while(l.readLineInto(&line)) { - AdBlockRule rule(line); - if(rule.isEnabled()) { - rules.emplace_back(std::move(rule)); - ruleCount++; - qDebug("added rule: %s", qUtf8Printable(line)); - } - } - } - list.close(); - - // there should be 3 rules - QCOMPARE(rules.size(), ruleCount); - - // block by address part - QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/img")), true); - QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/bar/img?param")), true); - QCOMPARE(check(rules, QUrl("http://example.com/banner//img/foo")), true); - QCOMPARE(check(rules, QUrl("http://example.com/banner/img")), false); - QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/imgraph")), false); - QCOMPARE(check(rules, QUrl("http://example.com/banner/foo/img.gif")), false); - - // block by domain - QCOMPARE(check(rules, QUrl("http://ads.example.com/foo.gif")), true); - QCOMPARE(check(rules, QUrl("http://server1.ads.example.com/foo.gif")), true); - QCOMPARE(check(rules, QUrl("https://ads.example.com:8000/")), true); - QCOMPARE(check(rules, QUrl("http://ads.example.com.ua/foo.gif")), false); - QCOMPARE(check(rules, QUrl("http://example.com/redirect/http://ads.example.com/")), false); - - // block exact address - QCOMPARE(check(rules, QUrl("http://example.com/")), true); - QCOMPARE(check(rules, QUrl("http://example.com/foo.gif")), false); - QCOMPARE(check(rules, QUrl("http://example.info/redirect/http://example.com/")), false); - - // regular expression - QCOMPARE(check(rules, QUrl("http://another.com/banner123")), true); - QCOMPARE(check(rules, QUrl("http://another.com/banner321")), true); - QCOMPARE(check(rules, QUrl("http://another.com/banners")), false); -} - -QTEST_GUILESS_MAIN(AdBlockTest) diff --git a/test/urlfilter/adblocktest.h b/test/urlfilter/adblocktest.h deleted file mode 100644 index 95cb7e2..0000000 --- a/test/urlfilter/adblocktest.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef ADBLOCKTEST_H -#define ADBLOCKTEST_H - -#include -class AdBlockTest : public QObject -{ - Q_OBJECT - -private slots: - void parseList(); -}; - -#endif // ADBLOCKTEST_H -- cgit v1.2.1