From 9808d18fa6cd19400f08897014a9948f168927df Mon Sep 17 00:00:00 2001
From: Aqua-sama
Date: Mon, 9 Jul 2018 20:19:30 +0200
Subject: AdBlock rule testing

---
 lib/web/CMakeLists.txt            |   3 ++
 lib/web/urlfilter/adblockrule.cpp | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/web/urlfilter/adblockrule.h   |  48 ++++++++++++++++++
 test/CMakeLists.txt               |   9 ++++
 test/urlfilter/adblocktest.cpp    |  58 ++++++++++++++++++++++
 test/urlfilter/adblocktest.h      |  17 +++++++
 6 files changed, 303 insertions(+)
 create mode 100644 lib/web/urlfilter/adblockrule.cpp
 create mode 100644 lib/web/urlfilter/adblockrule.h
 create mode 100644 test/urlfilter/adblocktest.cpp
 create mode 100644 test/urlfilter/adblocktest.h

diff --git a/lib/web/CMakeLists.txt b/lib/web/CMakeLists.txt
index 601c1d5..9515566 100644
--- a/lib/web/CMakeLists.txt
+++ b/lib/web/CMakeLists.txt
@@ -12,6 +12,9 @@ add_library(web
     urlfilter/filterdomain.h
     urlfilter/filterrule.cpp
     urlfilter/filterrule.h
+
+    urlfilter/adblockrule.cpp
+    urlfilter/adblockrule.h
 )
 
 target_link_libraries(web Qt5::WebEngineWidgets)
diff --git a/lib/web/urlfilter/adblockrule.cpp b/lib/web/urlfilter/adblockrule.cpp
new file mode 100644
index 0000000..d707783
--- /dev/null
+++ b/lib/web/urlfilter/adblockrule.cpp
@@ -0,0 +1,168 @@
+#include "adblockrule.h"
+
+/**
+ * Checks whether a host name is covered by a domain filter.
+ *
+ * @param domain host name to test
+ * @param filter domain named by the rule
+ * @return true if domain equals filter or is a subdomain of it
+ */
+bool isMatchingDomain(const QString &domain, const QString &filter)
+{
+    // domain and filter are the same
+    if(domain == filter) {
+        return true;
+    }
+
+    // domain can't be matched by filter if it doesn't end with filter
+    // ex. example2.com isn't matched by example.com
+    // (an empty filter names no domain, so it can't match either)
+    if(filter.isEmpty() || !domain.endsWith(filter)) {
+        return false;
+    }
+
+    // match with subdomains
+    // ex. subdomain.example.com is matched by example.com
+    // filter is a suffix of domain here, so take its position from the end;
+    // indexOf() finds the first occurrence, which is the wrong one when the
+    // host repeats the filter (ex. example.com.example.com)
+    const int index = domain.length() - filter.length();
+
+    // match if the character before filter is '.'
+    return index > 0 && domain[index - 1] == QLatin1Char('.');
+}
+
+// adblock format documentation
+// https://adblockplus.org/filters
+
+// QString::mid(pos, len) - Returns a string starting at the specified position index.
+// QString::chop(len) - Removes n characters from the end of the string.
+// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
+
+/**
+ * Parses one line of an adblock filter list.
+ *
+ * Empty lines, comments and css rules leave the rule disabled. Any other
+ * line enables the rule and selects how shouldBlock() compares addresses.
+ *
+ * @param filter filter line; surrounding whitespace is ignored
+ */
+AdBlockRule::AdBlockRule(const QString &filter)
+{
+    QString parsedLine = filter.trimmed();
+
+    // there is no rule, or it's a comment
+    if(parsedLine.isEmpty() || parsedLine.startsWith(QLatin1Char('!'))) {
+        return;
+    }
+
+    // css rule - ignore for now
+    if(parsedLine.contains(QLatin1String("##")) || parsedLine.contains(QLatin1String("#@#"))) {
+        return;
+    }
+
+    m_isEnabled = true;
+
+    // exception rules
+    if(parsedLine.startsWith(QLatin1String("@@"))) {
+        m_isException = true;
+        parsedLine.remove(0, 2);
+    }
+
+    // a lone "/" or "|" has nothing between its delimiters
+    const bool isDelimited = parsedLine.length() > 1;
+
+    // regular expression rules
+    if(isDelimited && parsedLine.startsWith(QLatin1Char('/')) && parsedLine.endsWith(QLatin1Char('/'))) {
+        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);
+
+        matchType = RegularExpressionMatch;
+        regexp.setPattern(parsedLine);
+        return;
+    }
+
+    // exact address rules
+    if(isDelimited && parsedLine.startsWith(QLatin1Char('|')) && parsedLine.endsWith(QLatin1Char('|'))) {
+        matchType = StringEquals;
+        match = parsedLine.mid(1, parsedLine.length() - 2);
+        return;
+    }
+
+    // Basic filter rules can use wildcards, which were supported by QRegExp,
+    // but were deprecated in QRegularExpression.
+
+    // remove beginning and ending wildcards
+    if(parsedLine.startsWith(QLatin1Char('*')))
+        parsedLine = parsedLine.mid(1);
+
+    if(parsedLine.endsWith(QLatin1Char('*')))
+        parsedLine.chop(1);
+
+    // domain rules
+    if(parsedLine.startsWith(QLatin1String("||")) && parsedLine.endsWith(QLatin1Char('^'))) {
+        matchType = DomainMatch;
+        match = parsedLine.mid(2, parsedLine.length() - 3);
+        return;
+    }
+
+    // check for wildcards and translate to regexp
+    // wildcard "*" - any number of characters
+    // separator "^" - end, ? or /
+    if(parsedLine.contains(QLatin1Char('*')) || parsedLine.contains(QLatin1Char('^'))) {
+        matchType = RegularExpressionMatch;
+        // escape first, so every other character is matched literally
+        // ex. "." and "?" are plain characters in a filter
+        parsedLine = QRegularExpression::escape(parsedLine);
+        parsedLine.replace(QLatin1String("\\*"), QLatin1String(".*"));
+        parsedLine.replace(QLatin1String("\\^"), QLatin1String("($|\\?|\\/)"));
+        regexp.setPattern(parsedLine);
+        return;
+    }
+
+    // anything else is plain text to be found inside the address
+    matchType = StringContains;
+    matcher.setPattern(parsedLine);
+}
+
+/**
+ * @return false for lines that hold no usable rule
+ */
+bool AdBlockRule::isEnabled() const
+{
+    return m_isEnabled;
+}
+
+/**
+ * Tests an address against the rule.
+ *
+ * @param url address of the request
+ * @return true if the rule matches url and is not an exception rule
+ */
+bool AdBlockRule::shouldBlock(const QUrl &url) const
+{
+    bool isMatch = false;
+
+    switch (matchType) {
+    case RegularExpressionMatch:
+        isMatch = regexp.match(url.toString()).hasMatch();
+        break;
+
+    case StringContains:
+        isMatch = matcher.indexIn(url.toString()) != -1;
+        break;
+
+    case StringEquals:
+        isMatch = url.toString() == match;
+        break;
+
+    case DomainMatch:
+        isMatch = isMatchingDomain(url.host(), match);
+        break;
+
+    default:
+        break;
+    }
+
+    // exception rules only whitelist, they never block
+    return isMatch && !m_isException;
+}
diff --git a/lib/web/urlfilter/adblockrule.h b/lib/web/urlfilter/adblockrule.h
new file mode 100644
index 0000000..e1cabae
--- /dev/null
+++ b/lib/web/urlfilter/adblockrule.h
@@ -0,0 +1,48 @@
+#ifndef ADBLOCKRULE_H
+#define ADBLOCKRULE_H
+
+#include <QHash>
+#include <QRegularExpression>
+#include <QString>
+#include <QStringMatcher>
+#include <QUrl>
+
+/**
+ * One rule of an adblock filter list.
+ */
+class AdBlockRule
+{
+public:
+
+    // how shouldBlock() compares an address against the rule
+    enum MatchType {
+        InvalidMatch,
+        RegularExpressionMatch,
+        StringContains,
+        StringStartsWith,
+        StringEndsWith,
+        StringEquals,
+        DomainMatch
+    };
+
+    AdBlockRule(const QString &filter);
+
+    bool isEnabled() const;
+    bool shouldBlock(const QUrl &url) const;
+
+private:
+    Q_DISABLE_COPY(AdBlockRule)
+
+    bool m_isEnabled = false;
+    bool m_isException = false;
+
+    MatchType matchType = InvalidMatch;
+    // NOTE(review): the template arguments were missing from this copy of the patch; <QString, bool> is a placeholder - confirm
+    QHash<QString, bool> m_resourceTypeOptions;
+
+    QString match;
+    QRegularExpression regexp;
+    QStringMatcher matcher;
+};
+
+#endif // ADBLOCKRULE_H
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a44da07..414d616 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -12,3 +12,12 @@ target_include_directories(UrlFilterTest
 )
 
 target_link_libraries(UrlFilterTest Qt5::Test web)
+
+add_executable(AdBlockTest
+    urlfilter/adblocktest.cpp
+    urlfilter/adblocktest.h
+)
+
+target_include_directories(AdBlockTest PRIVATE ../lib/web)
+
+target_link_libraries(AdBlockTest Qt5::Test web)
diff --git a/test/urlfilter/adblocktest.cpp b/test/urlfilter/adblocktest.cpp
new file mode 100644
index 0000000..d1060f1
--- /dev/null
+++ b/test/urlfilter/adblocktest.cpp
@@ -0,0 +1,58 @@
+#include "adblocktest.h"
+#include <QtTest>
+#include "urlfilter/adblockrule.h"
+
+void AdBlockTest::blockByAddressPart()
+{
+    AdBlockRule rule("/banner/*/img^");
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner/foo/img")), true);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner/foo/bar/img?param")), true);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner//img/foo")), true);
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner/img")), false);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner/foo/imgraph")), false);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/banner/foo/img.gif")), false);
+}
+
+void AdBlockTest::blockByDomain()
+{
+    AdBlockRule rule("||ads.example.com^");
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://ads.example.com/foo.gif")), true);
+    QCOMPARE(rule.shouldBlock(QUrl("http://server1.ads.example.com/foo.gif")), true);
+    QCOMPARE(rule.shouldBlock(QUrl("https://ads.example.com:8000/")), true);
+    QCOMPARE(rule.shouldBlock(QUrl("http://ads.example.com.ads.example.com/foo.gif")), true);
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://ads.example.com.ua/foo.gif")), false);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/redirect/http://ads.example.com/")), false);
+}
+
+void AdBlockTest::blockExactAddress()
+{
+    AdBlockRule rule("|http://example.com/|");
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/")), true);
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/foo.gif")), false);
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.info/redirect/http://example.com/")), false);
+}
+
+void AdBlockTest::blockBySubstring()
+{
+    AdBlockRule rule("adframe");
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/adframe/foo.gif")), true);
+
+    QCOMPARE(rule.shouldBlock(QUrl("http://example.com/foo.gif")), false);
+}
+
+void AdBlockTest::allowByException()
+{
+    AdBlockRule rule("@@||ads.example.com^");
+
+    QCOMPARE(rule.isEnabled(), true);
+    QCOMPARE(rule.shouldBlock(QUrl("http://ads.example.com/foo.gif")), false);
+}
+
+QTEST_GUILESS_MAIN(AdBlockTest)
diff --git a/test/urlfilter/adblocktest.h b/test/urlfilter/adblocktest.h
new file mode 100644
index 0000000..a7b9f12
--- /dev/null
+++ b/test/urlfilter/adblocktest.h
@@ -0,0 +1,17 @@
+#ifndef ADBLOCKTEST_H
+#define ADBLOCKTEST_H
+
+#include <QObject>
+class AdBlockTest : public QObject
+{
+    Q_OBJECT
+
+private slots:
+    void blockByAddressPart();
+    void blockByDomain();
+    void blockExactAddress();
+    void blockBySubstring();
+    void allowByException();
+};
+
+#endif // ADBLOCKTEST_H
-- 
cgit v1.2.1