aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Aqua-sama <aqua@iserlohn-fortress.net> 2018-12-21 14:55:18 +0100
committer Aqua-sama <aqua@iserlohn-fortress.net> 2018-12-21 14:55:18 +0100
commit 380c05028306680972f848808da17d9e6f55635e (patch)
tree 985e7c2e732dc12df56f67633ad29b13c34359ae
parent Add ContentsMatcher class (diff)
download smolbote-380c05028306680972f848808da17d9e6f55635e.tar.xz
Add adblocklist class
-rw-r--r-- lib/urlfilter/formats/adblocklist.cpp 95
-rw-r--r-- lib/urlfilter/formats/adblocklist.h 32
-rw-r--r-- lib/urlfilter/formats/adblockrule.h 26
-rw-r--r-- lib/urlfilter/formats/adblockrule_parse.cpp 1
-rw-r--r-- lib/urlfilter/meson.build 3
-rw-r--r-- test/meson.build 5
-rw-r--r-- test/urlfilter/urlfiltertest.cpp 61
7 files changed, 218 insertions, 5 deletions
diff --git a/lib/urlfilter/formats/adblocklist.cpp b/lib/urlfilter/formats/adblocklist.cpp
new file mode 100644
index 0000000..772c252
--- /dev/null
+++ b/lib/urlfilter/formats/adblocklist.cpp
@@ -0,0 +1,95 @@
+#include "adblocklist.h"
+
+// Nothing to set up explicitly: the metadata table and the rule list are
+// default-constructed.
+AdBlockList::AdBlockList() = default;
+
+// Looks up the metadata entry stored under key.
+// Returns a default-constructed QString when no such entry exists.
+QString AdBlockList::metadata(const QString &key) const
+{
+    const auto entry = m_metadata.constFind(key);
+    if(entry == m_metadata.constEnd())
+        return QString();
+    return entry.value();
+}
+
+// Returns the action of the first stored rule whose matcher accepts the request
+// URL (rules are tried in the order they were added), or FilterLeaf::NotMatched
+// when no rule applies. firstParty and type are accepted but not consulted here.
+FilterLeaf::Action AdBlockList::match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const
+{
+    const QString url = requestUrl.toString();
+
+    auto rule = m_rules.begin();
+    while(rule != m_rules.end()) {
+        if(rule->matcher->hasMatch(url))
+            return rule->action;
+        ++rule;
+    }
+
+    return FilterLeaf::NotMatched;
+}
+
+// Parses a single line of an AdBlock-style filter list.
+//
+// Returns false for blank lines, the result of parseComment() for lines
+// starting with '!', and true once a rule has been appended to m_rules.
+// Rules block by default; a leading "@@" turns the rule into an exception
+// (FilterLeaf::Allow). The rule text is then classified, in this order, as:
+//   /regex/   regular expression
+//   a*b       wildcard ('*' matches any run of characters)
+//   ||host^   domain match
+//   |text|    string equals
+//   ||text    string starts with
+//   text|     string ends with
+//   text      string contains
+bool AdBlockList::parseLine(const QString &line)
+{
+    // remove whitespace from start/end of the line
+    QString parsedLine = line.trimmed();
+
+    // check if the line is empty
+    if(parsedLine.isEmpty())
+        return false;
+
+    // parse comment
+    if(parsedLine.startsWith(QLatin1Literal("!")))
+        return parseComment(parsedLine);
+
+    Filter filter;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
+        filter.action = FilterLeaf::Allow;
+        parsedLine.remove(0, 2);
+    }
+
+    // remove '*' at the beginning and the end
+    if(parsedLine.startsWith(QLatin1Literal("*")))
+        parsedLine = parsedLine.mid(1);
+    if(parsedLine.endsWith(QLatin1Literal("*")))
+        parsedLine.chop(1);
+
+    if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
+        // regular expression rule: strip the enclosing slashes
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        filter.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, FilterLeaf::RegularExpressionMatch);
+
+    } else if(parsedLine.contains(QLatin1Literal("*"))) {
+        // wildcard rule: '*' stands for any run of characters, everything else is literal.
+        // QRegularExpression::wildcardToRegularExpression() is deliberately not used: it
+        // produces a fully anchored file-glob pattern whose '*' does not cross '/', so the
+        // result could never match somewhere in the middle of a URL.
+        QStringList literals = parsedLine.split(QLatin1Literal("*"));
+        for(QString &literal : literals)
+            literal = QRegularExpression::escape(literal);
+        parsedLine = literals.join(QLatin1Literal(".*"));
+        filter.matcher = new ContentsMatcher<QRegularExpression>(parsedLine, FilterLeaf::RegularExpressionMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
+        // domain rule: strip the leading "||" and the trailing "^"
+        parsedLine = parsedLine.mid(2, parsedLine.length() - 3);
+        filter.matcher = new ContentsMatcher<QString>(parsedLine, FilterLeaf::DomainMatch);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string equals rule
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+        filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringEquals);
+
+    } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
+        // string starts with rule
+        parsedLine = parsedLine.mid(2);
+        filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringStartsWith);
+
+    } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
+        // string ends with rule
+        parsedLine.chop(1);
+        filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringEndsWith);
+
+    } else {
+        // generic contains rule
+        filter.matcher = new ContentsMatcher<QStringMatcher>(parsedLine, FilterLeaf::StringContains);
+    }
+
+    Q_CHECK_PTR(filter.matcher);
+    m_rules.emplace_back(std::move(filter));
+    return true;
+}
+
+// Extracts a "! Key: value" metadata entry from a comment line and stores it in m_metadata.
+//
+// commentLine is expected to still carry its leading '!'. The key is the text before the
+// first ": ", the value is everything after it (so a value may itself contain ": "); both
+// are trimmed. Returns true when an entry was stored, and false for comments that have no
+// ": " separator, which are ignored.
+bool AdBlockList::parseComment(const QString &commentLine)
+{
+    const QString separator(QLatin1Literal(": "));
+    const QString comment = commentLine.mid(1);
+
+    // a plain comment without a key/value separator carries no metadata
+    const int separatorPos = comment.indexOf(separator);
+    if(separatorPos < 0)
+        return false;
+
+    const QString key = comment.left(separatorPos).trimmed();
+    m_metadata[key] = comment.mid(separatorPos + separator.length()).trimmed();
+    return true;
+}
diff --git a/lib/urlfilter/formats/adblocklist.h b/lib/urlfilter/formats/adblocklist.h
new file mode 100644
index 0000000..34a2120
--- /dev/null
+++ b/lib/urlfilter/formats/adblocklist.h
@@ -0,0 +1,32 @@
+#ifndef ADBLOCKLIST_H
+#define ADBLOCKLIST_H
+
+#include <QHash>
+#include "adblockrule.h"
+
+// In-memory representation of an AdBlock-style filter list: metadata collected
+// from comment lines plus an ordered collection of URL rules.
+class AdBlockList
+{
+public:
+    AdBlockList();
+
+    // Returns the metadata value stored under key.
+    QString metadata(const QString &key) const;
+    // Checks requestUrl against the stored rules and returns the resulting action.
+    FilterLeaf::Action match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type = QWebEngineUrlRequestInfo::ResourceTypeUnknown) const;
+
+    // Parses one line of the list (a comment or a rule).
+    bool parseLine(const QString &line);
+
+protected:
+    // Handles a comment line, i.e. one starting with '!'.
+    bool parseComment(const QString &commentLine);
+
+private:
+    // One parsed rule: what to do on a match, and how to test for one.
+    struct Filter
+    {
+        FilterLeaf::Action action = FilterLeaf::Block;
+        // Starts out null so that a rule whose matcher was never assigned is
+        // detectable instead of holding an indeterminate pointer.
+        // NOTE(review): nothing in this class releases the matcher - confirm who owns it.
+        Matcher *matcher = nullptr;
+    };
+
+    QHash<QString, QString> m_metadata;
+    //QMap<QString, Filter> m_rules;
+    std::vector<Filter> m_rules;
+};
+
+#endif // ADBLOCKLIST_H
diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h
index 3331cac..6be3cdf 100644
--- a/lib/urlfilter/formats/adblockrule.h
+++ b/lib/urlfilter/formats/adblockrule.h
@@ -14,24 +14,35 @@
#include <QRegularExpression>
#include <QStringMatcher>
+// Common interface for URL matchers, so that matchers with different backends
+// can be stored and queried through a single pointer type.
+class Matcher
+{
+public:
+    // Matchers are handled through Matcher pointers; the virtual destructor
+    // makes destroying a concrete matcher through such a pointer well-defined.
+    virtual ~Matcher() = default;
+
+    // Returns true if this matcher's pattern matches the string where.
+    virtual bool hasMatch(const QString &where) const = 0;
+};
+
template <typename T>
-class ContentsMatcher
+class ContentsMatcher : public Matcher
{
public:
ContentsMatcher(const QString &pattern, FilterLeaf::UrlMatchType matchType)
{
this->matchType = matchType;
patternLength = pattern.length();
- matcher.setPattern(pattern);
+
if constexpr(std::is_same_v<T, QRegularExpression>) {
matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption);
+ matcher.setPattern(pattern);
} else if constexpr(std::is_same_v<T, QStringMatcher>) {
matcher.setCaseSensitivity(Qt::CaseInsensitive);
+ matcher.setPattern(pattern);
+ } else if constexpr(std::is_same_v<T, QString>) {
+ matcher = QUrl::fromUserInput(pattern).host();
+// qDebug("matcher: %s", qUtf8Printable(matcher));
}
}
- bool hasMatch(const QString &where) const
+ bool hasMatch(const QString &where) const override
{
if constexpr(std::is_same_v<T, QStringMatcher>) {
switch (matchType) {
@@ -58,6 +69,13 @@ public:
if(matchType != FilterLeaf::RegularExpressionMatch)
qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!");
return matcher.match(where).hasMatch();
+ } else if constexpr(std::is_same_v<T, QString>) {
+ // TODO: fix
+ if(matchType == FilterLeaf::DomainMatch) {
+// qDebug("matching %s", qUtf8Printable(QUrl(where).host()));
+ return QUrl(where).host().endsWith(matcher);
+ } else
+ return matcher == where;
} else {
qWarning("Matcher has no backend, returning false");
return false;
@@ -65,7 +83,7 @@ public:
}
private:
- int patternLength;
+ int patternLength;
T matcher;
FilterLeaf::UrlMatchType matchType;
};
diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp
index 927a6a3..c01ddfd 100644
--- a/lib/urlfilter/formats/adblockrule_parse.cpp
+++ b/lib/urlfilter/formats/adblockrule_parse.cpp
@@ -15,6 +15,7 @@
// QString::mid(pos, len) const - Returns a string starting at the specified position index.
// QString::chop(len) - Removes n characters from the end of the string.
// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
+// QString::trimmed() const - Remove whitespace from start and end
AdBlockRule *parseRule_adblock(const QString &filter)
{
diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build
index 5d0a970..1f4f47c 100644
--- a/lib/urlfilter/meson.build
+++ b/lib/urlfilter/meson.build
@@ -8,7 +8,8 @@ urlfilter_moc = qt5.preprocess(
urlfilter_lib = static_library('urlfilter',
['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc,
'domain.cpp', 'domain.h',
- 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', ],
+ 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp',
+ 'formats/adblocklist.cpp'],
dependencies: dep_qt5
)
diff --git a/test/meson.build b/test/meson.build
index e2e25f6..75e38ed 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -1,5 +1,10 @@
dep_gtest = dependency('gtest')
+test('urlfilter-AdBlockList', executable('AdBlockList',
+ dependencies: [dep_gtest, dep_qt5, dep_urlfilter],
+ sources: ['urlfilter/urlfiltertest.cpp']
+))
+
# Adblock parsing test
adblock = executable('AdblockTest',
dependencies: [dep_gtest, dep_qt5, dep_urlfilter],
diff --git a/test/urlfilter/urlfiltertest.cpp b/test/urlfilter/urlfiltertest.cpp
new file mode 100644
index 0000000..f6cdbd4
--- /dev/null
+++ b/test/urlfilter/urlfiltertest.cpp
@@ -0,0 +1,61 @@
+#include "formats/adblockrule.h"
+#include "formats/adblockrule_parse.h"
+#include "formats/adblocklist.h"
+#include <gtest/gtest.h>
+
+// List under test, shared by every test case; main() fills it before the tests run.
+AdBlockList list;
+
+// Every "! Key: value" comment line fed to the list in main() must be
+// retrievable through metadata() under its key.
+TEST(AdBlockList, MetaData) {
+ EXPECT_STREQ(qUtf8Printable(list.metadata("Homepage")), "http://example.com/");
+ EXPECT_STREQ(qUtf8Printable(list.metadata("Title")), "FooList");
+ EXPECT_STREQ(qUtf8Printable(list.metadata("Expires")), "5 days");
+ EXPECT_STREQ(qUtf8Printable(list.metadata("Redirect")), "http://example.com/list.txt");
+ EXPECT_STREQ(qUtf8Printable(list.metadata("Version")), "1234");
+}
+
+// A URL containing the text of a plain rule must match, a URL that does not
+// contain it must not (presumably exercising the "/banner/foo.png" rule added in main()).
+// NOTE(review): match() returns a FilterLeaf::Action, so EXPECT_TRUE/EXPECT_FALSE rely on
+// its conversion to bool - confirm that FilterLeaf::NotMatched converts to false.
+TEST(AdBlockList, Contains) {
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner/foo.png")));
+ EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/banner/foo/img")));
+
+// Disabled checks that go through parseRule_adblock() rather than the list:
+// AdBlockRule *rule = parseRule_adblock("/banner/*/img^");
+// EXPECT_TRUE(rule->match(QUrl("http://example.com/banner/foo/img")));
+// EXPECT_TRUE(rule->match(QUrl("http://example.com/banner/foo/bar/img?param")));
+// EXPECT_TRUE(rule->match(QUrl("http://example.com/banner//img/foo")));
+// EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/img")));
+// EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/foo/imgraph")));
+// EXPECT_FALSE(rule->match(QUrl("http://example.com/banner/foo/img.gif")));
+}
+
+// A '*' inside a rule must stand in for an arbitrary path segment
+// (presumably exercising the "/banner/*/img.png" rule added in main()).
+TEST(AdBlockList, ContainsWildcard) {
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner/ads/img.png")));
+}
+
+// Domain rule expectations (presumably for "||ads.example.com^" added in main()):
+// the host itself, its subdomains and any port must match; a longer host that
+// merely starts with the domain, or the domain appearing only in the path, must not.
+TEST(AdBlockList, Domain) {
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://ads.example.com/foo.gif")));
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://server1.ads.example.com/foo.gif")));
+ EXPECT_TRUE(list.match(QUrl(), QUrl("https://ads.example.com:8000/")));
+ EXPECT_FALSE(list.match(QUrl(), QUrl("http://ads.example.com.ua/foo.gif")));
+ EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/redirect/http://ads.example.com/")));
+}
+
+// Regular expression rule expectations (presumably for "/banner\d+/" added in main()):
+// "banner" followed by digits must match, "banner" followed by a letter must not.
+TEST(AdBlockList, RegularExpression) {
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner123")));
+ EXPECT_TRUE(list.match(QUrl(), QUrl("http://example.com/banner321")));
+ EXPECT_FALSE(list.match(QUrl(), QUrl("http://example.com/banners")));
+}
+
+// Fills the shared list with metadata comments and one rule per tested rule kind,
+// then initialises GoogleTest and runs all registered tests.
+int main(int argc, char **argv) {
+ // metadata comments, read back by the MetaData test
+ list.parseLine("! Homepage: http://example.com/");
+ list.parseLine("! Title: FooList");
+ list.parseLine("! Expires: 5 days");
+ list.parseLine("! Redirect: http://example.com/list.txt");
+ list.parseLine("! Version: 1234");
+
+ // rules: plain contains, wildcard, domain and regular expression
+ // NOTE(review): these EXPECT_TRUE checks run before InitGoogleTest() and outside
+ // any TEST body - confirm that this is intended.
+ EXPECT_TRUE(list.parseLine("/banner/foo.png"));
+ EXPECT_TRUE(list.parseLine("/banner/*/img.png"));
+ EXPECT_TRUE(list.parseLine("||ads.example.com^"));
+ EXPECT_TRUE(list.parseLine("/banner\\d+/"));
+
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}