diff options
author | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-12-21 14:55:18 +0100 |
---|---|---|
committer | Aqua-sama <aqua@iserlohn-fortress.net> | 2018-12-21 14:55:18 +0100 |
commit | 380c05028306680972f848808da17d9e6f55635e (patch) | |
tree | 985e7c2e732dc12df56f67633ad29b13c34359ae | |
parent | Add ContentsMatcher class (diff) | |
download | smolbote-380c05028306680972f848808da17d9e6f55635e.tar.xz |
Add adblocklist class
-rw-r--r-- | lib/urlfilter/formats/adblocklist.cpp | 116 | ||||
-rw-r--r-- | lib/urlfilter/formats/adblocklist.h | 36 | ||||
-rw-r--r-- | lib/urlfilter/formats/adblockrule.h | 30 | ||||
-rw-r--r-- | lib/urlfilter/formats/adblockrule_parse.cpp | 1 | ||||
-rw-r--r-- | lib/urlfilter/meson.build | 3 | ||||
-rw-r--r-- | test/meson.build | 5 | ||||
-rw-r--r-- | test/urlfilter/urlfiltertest.cpp | 68 |
7 files changed, 254 insertions, 5 deletions
diff --git a/lib/urlfilter/formats/adblocklist.cpp b/lib/urlfilter/formats/adblocklist.cpp
new file mode 100644
index 0000000..772c252
--- /dev/null
+++ b/lib/urlfilter/formats/adblocklist.cpp
@@ -0,0 +1,116 @@
+#include "adblocklist.h"
+
+AdBlockList::AdBlockList()
+{
+}
+
+// Returns the value of the metadata comment "! key: value", or an empty string if there is none.
+QString AdBlockList::metadata(const QString &key) const
+{
+    return m_metadata.value(key, QString());
+}
+
+// Returns the action of the first rule that matches requestUrl, or FilterLeaf::NotMatched.
+FilterLeaf::Action AdBlockList::match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    // rule options are not parsed yet, so these two cannot influence the result
+    Q_UNUSED(firstParty);
+    Q_UNUSED(type);
+
+    const QString request = requestUrl.toString();
+
+    for(const auto &filter : m_rules) {
+        if(filter.matcher->hasMatch(request))
+            return filter.action;
+    }
+    return FilterLeaf::NotMatched;
+}
+
+// Parses one line of a filter list.
+// Returns true if the line added a rule or a metadata entry, false if it was skipped.
+bool AdBlockList::parseLine(const QString &line)
+{
+    // remove whitespace from start/end of the line
+    QString parsedLine = line.trimmed();
+
+    // check if the line is empty
+    if(parsedLine.isEmpty())
+        return false;
+
+    // parse comment
+    if(parsedLine.startsWith(QLatin1Literal("!")))
+        return parseComment(parsedLine);
+
+    Filter filter;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
+        filter.action = FilterLeaf::Allow;
+        parsedLine.remove(0, 2);
+    }
+
+    // remove '*' at the beginning and the end
+    if(parsedLine.startsWith(QLatin1Literal("*")))
+        parsedLine = parsedLine.mid(1);
+    if(parsedLine.endsWith(QLatin1Literal("*")))
+        parsedLine.chop(1);
+
+    // nothing left to match on (a bare "@@" or "*"): the rule would hit every request
+    if(parsedLine.isEmpty())
+        return false;
+
+    if(parsedLine.length() > 2 && parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
+        // regular expression rule; "//" is excluded, an empty pattern matches everything
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        filter.matcher = std::make_unique<ContentsMatcher<QRegularExpression>>(parsedLine, FilterLeaf::RegularExpressionMatch);
+
+    } else if(parsedLine.contains(QLatin1Literal("*"))) {
+        // wildcard rule: '*' stands for any run of characters, anywhere in the url.
+        // wildcardToRegularExpression() is a file glob (anchored, '*' stops at '/'),
+        // so escape the rule and translate the wildcards by hand
+        parsedLine = QRegularExpression::escape(parsedLine).replace(QLatin1Literal("\\*"), QLatin1Literal(".*"));
+        filter.matcher = std::make_unique<ContentsMatcher<QRegularExpression>>(parsedLine, FilterLeaf::RegularExpressionMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
+        // domain rule
+        parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
+        filter.matcher = std::make_unique<ContentsMatcher<QString>>(parsedLine, FilterLeaf::DomainMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string equals rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        filter.matcher = std::make_unique<ContentsMatcher<QStringMatcher>>(parsedLine, FilterLeaf::StringEquals);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
+        // string starts with rule
+        parsedLine = parsedLine.mid(2);
+        filter.matcher = std::make_unique<ContentsMatcher<QStringMatcher>>(parsedLine, FilterLeaf::StringStartsWith);
+
+    } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string ends with rule
+        parsedLine.chop(1);
+        filter.matcher = std::make_unique<ContentsMatcher<QStringMatcher>>(parsedLine, FilterLeaf::StringEndsWith);
+
+    } else {
+        // generic contains rule
+        filter.matcher = std::make_unique<ContentsMatcher<QStringMatcher>>(parsedLine, FilterLeaf::StringContains);
+    }
+
+    m_rules.emplace_back(std::move(filter));
+    return true;
+}
+
+// Parses a metadata comment of the form "! key: value".
+// Returns false for a comment that carries no metadata.
+bool AdBlockList::parseComment(const QString &commentLine)
+{
+    const QString comment = commentLine.mid(1);
+
+    // split on the first ": " only; a plain comment has none and must not be indexed into
+    const int separator = comment.indexOf(QLatin1Literal(": "));
+    if(separator < 0)
+        return false;
+
+    m_metadata[comment.left(separator).trimmed()] = comment.mid(separator + 2).trimmed();
+    return true;
+}
diff --git a/lib/urlfilter/formats/adblocklist.h b/lib/urlfilter/formats/adblocklist.h
new file mode 100644
index 0000000..34a2120
--- /dev/null
+++ b/lib/urlfilter/formats/adblocklist.h
@@ -0,0 +1,36 @@
+#ifndef ADBLOCKLIST_H
+#define ADBLOCKLIST_H
+
+#include <QHash>
+#include <memory>
+#include <vector>
+#include "adblockrule.h"
+
+// A list of AdBlock-style filter rules, together with the metadata found in its comments.
+class AdBlockList
+{
+public:
+    AdBlockList();
+
+    QString metadata(const QString &key) const;
+    FilterLeaf::Action match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type = QWebEngineUrlRequestInfo::ResourceTypeUnknown) const;
+
+    bool parseLine(const QString &line);
+
+protected:
+    bool parseComment(const QString &commentLine);
+
+private:
+    struct Filter
+    {
+        FilterLeaf::Action action = FilterLeaf::Block;
+        // owning: the matcher is released together with the rule
+        std::unique_ptr<Matcher> matcher;
+    };
+
+    QHash<QString, QString> m_metadata;
+    //QMap<QString, Filter> m_rules;
+    std::vector<Filter> m_rules;
+};
+
+#endif // ADBLOCKLIST_H
diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h
index 3331cac..6be3cdf 100644
--- a/lib/urlfilter/formats/adblockrule.h
+++ b/lib/urlfilter/formats/adblockrule.h
@@ -14,24 +14,37 @@
 #include <QRegularExpression>
 #include <QStringMatcher>
 
+class Matcher
+{
+public:
+    // matchers are owned and destroyed through this base class
+    virtual ~Matcher() = default;
+    virtual bool hasMatch(const QString &where) const = 0;
+};
+
 template <typename T>
-class ContentsMatcher
+class ContentsMatcher : public Matcher
 {
 public:
     ContentsMatcher(const QString &pattern, FilterLeaf::UrlMatchType matchType)
     {
         this->matchType = matchType;
         patternLength = pattern.length();
 
-        matcher.setPattern(pattern);
+
         if constexpr(std::is_same_v<T, QRegularExpression>) {
             matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption);
+            matcher.setPattern(pattern);
         } else if constexpr(std::is_same_v<T, QStringMatcher>) {
             matcher.setCaseSensitivity(Qt::CaseInsensitive);
+            matcher.setPattern(pattern);
+        } else if constexpr(std::is_same_v<T, QString>) {
+            matcher = QUrl::fromUserInput(pattern).host();
+//            qDebug("matcher: %s", qUtf8Printable(matcher));
         }
     }
 
-    bool hasMatch(const QString &where) const
+    bool hasMatch(const QString &where) const override
     {
         if constexpr(std::is_same_v<T, QStringMatcher>) {
             switch (matchType) {
@@ -58,6 +71,15 @@ public:
             if(matchType != FilterLeaf::RegularExpressionMatch)
                 qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!");
             return matcher.match(where).hasMatch();
+        } else if constexpr(std::is_same_v<T, QString>) {
+            // TODO: fix
+            if(matchType == FilterLeaf::DomainMatch) {
+//                qDebug("matching %s", qUtf8Printable(QUrl(where).host()));
+                // the host itself or any of its subdomains, but not "badads.example.com"
+                const QString host = QUrl(where).host();
+                return host == matcher || host.endsWith(QLatin1Char('.') + matcher);
+            } else
+                return matcher == where;
         } else {
             qWarning("Matcher has no backend, returning false");
             return false;
@@ -65,7 +87,7 @@ public:
     }
 
 private:
-   int patternLength;
+    int patternLength;
     T matcher;
     FilterLeaf::UrlMatchType matchType;
 };
diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp
index 927a6a3..c01ddfd 100644
--- a/lib/urlfilter/formats/adblockrule_parse.cpp
+++ b/lib/urlfilter/formats/adblockrule_parse.cpp
@@ -15,6 +15,7 @@
 // QString::mid(pos, len) const - Returns a string starting at the specified position index.
 // QString::chop(len) - Removes n characters from the end of the string.
 // QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
+// QString::trimmed() const - Remove whitespace from start and end
 
 AdBlockRule *parseRule_adblock(const QString &filter)
 {
diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build
index 5d0a970..1f4f47c 100644
--- a/lib/urlfilter/meson.build
+++ b/lib/urlfilter/meson.build
@@ -8,7 +8,8 @@ urlfilter_moc = qt5.preprocess(
 urlfilter_lib = static_library('urlfilter',
     ['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc,
      'domain.cpp', 'domain.h',
-     'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', ],
+     'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp',
+     'formats/adblocklist.cpp'],
     dependencies: dep_qt5
 )
 
diff --git a/test/meson.build b/test/meson.build
index e2e25f6..75e38ed 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -1,5 +1,10 @@
 dep_gtest = dependency('gtest')
 
+test('urlfilter-AdBlockList', executable('AdBlockList',
+    dependencies: [dep_gtest, dep_qt5, dep_urlfilter],
+    sources: ['urlfilter/urlfiltertest.cpp']
+))
+
 # Adblock parsing test
 adblock = executable('AdblockTest',
     dependencies: [dep_gtest, dep_qt5, dep_urlfilter],
diff --git a/test/urlfilter/urlfiltertest.cpp b/test/urlfilter/urlfiltertest.cpp
new file mode 100644
index 0000000..f6cdbd4
--- /dev/null
+++ b/test/urlfilter/urlfiltertest.cpp
@@ -0,0 +1,68 @@
+#include "formats/adblockrule.h"
+#include "formats/adblockrule_parse.h"
+#include "formats/adblocklist.h"
+#include <gtest/gtest.h>
+
+AdBlockList list;
+
+TEST(AdBlockList, MetaData) {
+    EXPECT_STREQ(qUtf8Printable(list.metadata("Homepage")), "http://example.com/");
+    EXPECT_STREQ(qUtf8Printable(list.metadata("Title")), "FooList");
+    EXPECT_STREQ(qUtf8Printable(list.metadata("Expires")), "5 days");
+    EXPECT_STREQ(qUtf8Printable(list.metadata("Redirect")), "http://example.com/list.txt");
+    EXPECT_STREQ(qUtf8Printable(list.metadata("Version")), "1234");
+}
+
+TEST(AdBlockList, PlainComment) {
+    AdBlockList plain;
+    EXPECT_FALSE(plain.parseLine("! just a comment"));
+    EXPECT_TRUE(plain.metadata("just a comment").isEmpty());
+}
+
+TEST(AdBlockList, Contains) {
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner/foo.png")));
+    EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/banner/foo/img")));
+
+//    AdBlockRule *rule = parseRule_adblock("/banner/*/img^");
+//    EXPECT_TRUE(rule->match(QUrl("http://example.com/banner/foo/img")));
+//    EXPECT_TRUE(rule->match(QUrl("http://example.com/banner/foo/bar/img?param")));
+//    EXPECT_TRUE(rule->match(QUrl("http://example.com/banner//img/foo")));
+//    EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/img")));
+//    EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/foo/imgraph")));
+//    EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/foo/img.gif")));
+}
+
+TEST(AdBlockList, ContainsWildcard) {
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner/ads/img.png")));
+}
+
+TEST(AdBlockList, Domain) {
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://ads.example.com/foo.gif")));
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://server1.ads.example.com/foo.gif")));
+    EXPECT_TRUE(list.match(QUrl(), QUrl("https://ads.example.com:8000/")));
+    EXPECT_FALSE(list.match(QUrl(), QUrl("http://ads.example.com.ua/foo.gif")));
+    EXPECT_FALSE(list.match(QUrl(), QUrl("http://badads.example.com/foo.gif")));
+    EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/redirect/http://ads.example.com/")));
+}
+
+TEST(AdBlockList, RegularExpression) {
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner123")));
+    EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner321")));
+    EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/banners")));
+}
+
+int main(int argc, char **argv) {
+    list.parseLine("! Homepage: http://example.com/");
+    list.parseLine("! Title: FooList");
+    list.parseLine("! Expires: 5 days");
+    list.parseLine("! Redirect: http://example.com/list.txt");
+    list.parseLine("! Version: 1234");
+
+    EXPECT_TRUE(list.parseLine("/banner/foo.png"));
+    EXPECT_TRUE(list.parseLine("/banner/*/img.png"));
+    EXPECT_TRUE(list.parseLine("||ads.example.com^"));
+    EXPECT_TRUE(list.parseLine("/banner\\d+/"));
+
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}