From 380c05028306680972f848808da17d9e6f55635e Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Fri, 21 Dec 2018 14:55:18 +0100 Subject: Add adblocklist class --- lib/urlfilter/formats/adblocklist.cpp | 95 +++++++++++++++++++++++++++++ lib/urlfilter/formats/adblocklist.h | 32 ++++++++++ lib/urlfilter/formats/adblockrule.h | 26 ++++++-- lib/urlfilter/formats/adblockrule_parse.cpp | 1 + lib/urlfilter/meson.build | 3 +- 5 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 lib/urlfilter/formats/adblocklist.cpp create mode 100644 lib/urlfilter/formats/adblocklist.h (limited to 'lib') diff --git a/lib/urlfilter/formats/adblocklist.cpp b/lib/urlfilter/formats/adblocklist.cpp new file mode 100644 index 0000000..772c252 --- /dev/null +++ b/lib/urlfilter/formats/adblocklist.cpp @@ -0,0 +1,95 @@ +#include "adblocklist.h" + +AdBlockList::AdBlockList() +{ +} + +QString AdBlockList::metadata(const QString &key) const +{ + return m_metadata.value(key, QString()); +} + +FilterLeaf::Action AdBlockList::match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type) const +{ + const QString request = requestUrl.toString(); + + for(auto &filter : m_rules) { + if(filter.matcher->hasMatch(request)) + return filter.action; + } + return FilterLeaf::NotMatched; +} + +bool AdBlockList::parseLine(const QString &line) +{ + // remove whitespace from start/end of the line + QString parsedLine = line.trimmed(); + + // check if the line is empty + if(parsedLine.isEmpty()) + return false; + + // parse comment + if(parsedLine.startsWith(QLatin1Literal("!"))) + return parseComment(parsedLine); + + Filter filter; + + // exception rules + if(parsedLine.startsWith(QLatin1Literal("@@"))) { + filter.action = FilterLeaf::Allow; + parsedLine.remove(0, 2); + } + + // remove '*' at the beginning and the end + if(parsedLine.startsWith(QLatin1Literal("*"))) + parsedLine = parsedLine.mid(1); + if(parsedLine.endsWith(QLatin1Literal("*"))) + parsedLine.chop(1); + + if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) { + // regular expression rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::RegularExpressionMatch); + + } else if(parsedLine.contains(QLatin1Literal("*"))) { + parsedLine = QRegularExpression::wildcardToRegularExpression(parsedLine); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::RegularExpressionMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) { +// matchType = FilterLeaf::DomainMatch; + parsedLine = parsedLine.mid(2, parsedLine.length() - 3); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::DomainMatch); + + } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) { + // string equals rule + parsedLine = parsedLine.mid(1, parsedLine.length() - 2); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringEquals); + + } else if(parsedLine.startsWith(QLatin1Literal("||"))) { + // string starts with rule + parsedLine = parsedLine.mid(2); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringStartsWith); + + } else if(parsedLine.endsWith(QLatin1Literal("|"))) { + // string ends with rule + parsedLine.chop(1); + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringEndsWith); + + } else { + // generic contains rule + filter.matcher = new ContentsMatcher(parsedLine, FilterLeaf::StringContains); + } + + + Q_CHECK_PTR(filter.matcher); + m_rules.emplace_back(std::move(filter)); + return true; +} + +bool AdBlockList::parseComment(const QString &commentLine) +{ + const QStringList comment = commentLine.mid(1).split(QLatin1Literal(": ")); + m_metadata[comment.at(0).trimmed()] = comment.at(1).trimmed(); + return true; +} diff --git a/lib/urlfilter/formats/adblocklist.h b/lib/urlfilter/formats/adblocklist.h new file mode 100644 index 0000000..34a2120 --- /dev/null +++ b/lib/urlfilter/formats/adblocklist.h @@ -0,0 +1,32 @@ +#ifndef ADBLOCKLIST_H +#define ADBLOCKLIST_H + +#include +#include "adblockrule.h" + +class AdBlockList +{ +public: + AdBlockList(); + + QString metadata(const QString &key) const; + FilterLeaf::Action match(const QUrl &firstParty, const QUrl &requestUrl, QWebEngineUrlRequestInfo::ResourceType type = QWebEngineUrlRequestInfo::ResourceTypeUnknown) const; + + bool parseLine(const QString &line); + +protected: + bool parseComment(const QString &commentLine); + +private: + struct Filter + { + FilterLeaf::Action action = FilterLeaf::Block; + Matcher *matcher; + }; + + QHash m_metadata; + //QMap m_rules; + std::vector m_rules; +}; + +#endif // ADBLOCKLIST_H diff --git a/lib/urlfilter/formats/adblockrule.h b/lib/urlfilter/formats/adblockrule.h index 3331cac..6be3cdf 100644 --- a/lib/urlfilter/formats/adblockrule.h +++ b/lib/urlfilter/formats/adblockrule.h @@ -14,24 +14,35 @@ #include #include +class Matcher +{ +public: + virtual bool hasMatch(const QString &where) const = 0; +}; + template -class ContentsMatcher +class ContentsMatcher : public Matcher { public: ContentsMatcher(const QString &pattern, FilterLeaf::UrlMatchType matchType) { this->matchType = matchType; patternLength = pattern.length(); - matcher.setPattern(pattern); + if constexpr(std::is_same_v) { matcher.setPatternOptions(matcher.patternOptions() | QRegularExpression::CaseInsensitiveOption); + matcher.setPattern(pattern); } else if constexpr(std::is_same_v) { matcher.setCaseSensitivity(Qt::CaseInsensitive); + matcher.setPattern(pattern); + } else if constexpr(std::is_same_v) { + matcher = QUrl::fromUserInput(pattern).host(); +// qDebug("matcher: %s", qUtf8Printable(matcher)); } } - bool hasMatch(const QString &where) const + bool hasMatch(const QString &where) const override { if constexpr(std::is_same_v) { switch (matchType) { @@ -58,6 +69,13 @@ public: if(matchType != FilterLeaf::RegularExpressionMatch) qWarning("ContentsMatcher is a regular expression, but not doing a regular expression match!"); return matcher.match(where).hasMatch(); + } else if constexpr(std::is_same_v) { + // TODO: fix + if(matchType == FilterLeaf::DomainMatch) { +// qDebug("matching %s", qUtf8Printable(QUrl(where).host())); + return QUrl(where).host().endsWith(matcher); + } else + return matcher == where; } else { qWarning("Matcher has no backend, returning false"); return false; @@ -65,7 +83,7 @@ public: } private: - int patternLength; + int patternLength; T matcher; FilterLeaf::UrlMatchType matchType; }; diff --git a/lib/urlfilter/formats/adblockrule_parse.cpp b/lib/urlfilter/formats/adblockrule_parse.cpp index 927a6a3..c01ddfd 100644 --- a/lib/urlfilter/formats/adblockrule_parse.cpp +++ b/lib/urlfilter/formats/adblockrule_parse.cpp @@ -15,6 +15,7 @@ // QString::mid(pos, len) const - Returns a string starting at the specified position index. // QString::chop(len) - Removes n characters from the end of the string. // QString::remove(pos, len) - Removes n characters from the string, starting at the given position index. +// QString::trimmed() const - Remove whitespace from start and end AdBlockRule *parseRule_adblock(const QString &filter) { diff --git a/lib/urlfilter/meson.build b/lib/urlfilter/meson.build index 5d0a970..1f4f47c 100644 --- a/lib/urlfilter/meson.build +++ b/lib/urlfilter/meson.build @@ -8,7 +8,8 @@ urlfilter_moc = qt5.preprocess( urlfilter_lib = static_library('urlfilter', ['filtertree.cpp', 'filterleaf.cpp', urlfilter_moc, 'domain.cpp', 'domain.h', - 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', ], + 'formats/adblockrule.cpp', 'formats/adblockrule_parse.cpp', 'formats/hostlistrule.cpp', + 'formats/adblocklist.cpp'], dependencies: dep_qt5 ) -- cgit v1.2.1