From c3559b24eba76052deb9d4ce79e4704815d902a5 Mon Sep 17 00:00:00 2001 From: Aqua-sama Date: Mon, 10 Feb 2020 20:58:39 +0200 Subject: staging: rewrite AdblockPlus parser yet again --- meson.build | 3 + staging/adblock/filterlist.cpp | 137 +++++++++++++++++++++++++++++ staging/adblock/filterlist.h | 50 +++++++++++ staging/adblock/meson.build | 16 ++++ staging/adblock/options.cpp | 111 +++++++++++++++++++++++ staging/adblock/options.h | 37 ++++++++ staging/adblock/rule.cpp | 35 ++++++++ staging/adblock/rule.h | 55 ++++++++++++ staging/adblock/test/parser.cpp | 32 +++++++ test/matcherbenchmark/matcherbenchmark.cpp | 85 ++++++++++++++++++ test/matcherbenchmark/matcherbenchmark.h | 19 ++++ test/matcherbenchmark/meson.build | 5 ++ 12 files changed, 585 insertions(+) create mode 100644 staging/adblock/filterlist.cpp create mode 100644 staging/adblock/filterlist.h create mode 100644 staging/adblock/meson.build create mode 100644 staging/adblock/options.cpp create mode 100644 staging/adblock/options.h create mode 100644 staging/adblock/rule.cpp create mode 100644 staging/adblock/rule.h create mode 100644 staging/adblock/test/parser.cpp create mode 100644 test/matcherbenchmark/matcherbenchmark.cpp create mode 100644 test/matcherbenchmark/matcherbenchmark.h create mode 100644 test/matcherbenchmark/meson.build diff --git a/meson.build b/meson.build index f2a51bc..8619d5b 100644 --- a/meson.build +++ b/meson.build @@ -92,6 +92,9 @@ subdir('tools') #subdir('plugins/ProfileEditor') subdir('test/firefox-bookmarks-json-parser') +subdir('test/matcherbenchmark') + +subdir('staging/adblock') ssconfig = poi_sourceset.apply(cdata) diff --git a/staging/adblock/filterlist.cpp b/staging/adblock/filterlist.cpp new file mode 100644 index 0000000..be2bd4e --- /dev/null +++ b/staging/adblock/filterlist.cpp @@ -0,0 +1,137 @@ +/* + * This file is part of smolbote. 
It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "filterlist.h"
+#include "rule.h"
+#include
+#include
+
+/**
+ * Documentation:
+ *
+ * https://help.eyeo.com/en/adblockplus/how-to-write-filters
+ *
+ * https://github.com/gorhill/uBlock/wiki/Introduction-to-basic-filtering-syntax
+ * https://github.com/gorhill/uBlock/wiki/Static-filter-syntax
+ *
+ */
+
+using namespace AdblockPlus;
+
+/**
+ * Constructs an empty filter list; rules are only added by parse().
+ * @param parent forwarded to the QObject constructor
+ */
+FilterList::FilterList(QObject *parent)
+    : QObject(parent)
+{
+}
+
+/**
+ * Deletes every rule object that parseRule() appended to m_rules.
+ */
+FilterList::~FilterList()
+{
+    qDeleteAll(m_rules);
+}
+
+/**
+ * Reads a filter list from stream, line by line.
+ *
+ * The first line, trimmed, has to be "[Adblock Plus 2.0]"; otherwise nothing
+ * else is read and the result carries the state InvalidFormat with all
+ * counters at zero.
+ *
+ * Every following non-empty line is counted in lines_total and in exactly one
+ * of lines_comments, lines_ignored, lines_parsed or lines_failed.
+ *
+ * Setting the environment variables PRINT_IGNORED or PRINT_FAILED to "1"
+ * prints the ignored or failed lines through qDebug.
+ *
+ * @param stream text stream positioned at the start of the list
+ * @return the counters and the final state
+ */
+FilterList::ParseResult FilterList::parse(QTextStream &stream)
+{
+    FilterList::ParseResult result;
+
+    if(stream.readLine().trimmed() != "[Adblock Plus 2.0]") {
+        result.state = FilterList::InvalidFormat;
+        return result;
+    }
+
+    // the debug switches cannot change while parsing, read them only once
+    const bool printIgnored = (qgetenv("PRINT_IGNORED") == "1");
+    const bool printFailed = (qgetenv("PRINT_FAILED") == "1");
+
+    QString line;
+    while(stream.readLineInto(&line)) {
+        if(line.isEmpty())
+            continue;
+
+        ++result.lines_total;
+
+        if(line.startsWith('!')) {
+            ++result.lines_comments;
+            parseComment(line);
+
+        } else if(line.contains("##") || line.contains("#@#")) {
+            // ## is element hiding rule
+            // #@# is element hiding exception rule
+            if(printIgnored)
+                qDebug("ignored: >%s<", qUtf8Printable(line));
+            ++result.lines_ignored;
+
+        } else if(parseRule(line)) {
+            ++result.lines_parsed;
+
+        } else {
+            if(printFailed)
+                qDebug("failed: >%s<", qUtf8Printable(line));
+            ++result.lines_failed;
+        }
+    }
+
+    result.state = FilterList::Ok;
+    return result;
+}
+
+/**
+ * Stores a comment line (one that starts with '!') unchanged in m_comments.
+ */
+void FilterList::parseComment(QString &line)
+{
+    m_comments.append(line);
+}
+
+/**
+ * Turns one filter line into a rule object and appends it to m_rules.
+ *
+ * A leading "@@" marks an exception rule. Everything after the last '$' is a
+ * comma separated option list that is handed to Options::set() one option at
+ * a time. The remaining pattern is classified by its anchors.
+ *
+ * NOTE(review): the anchor that was stripped is not recorded anywhere; the
+ * rule constructors only receive the bare pattern text and the options.
+ * NOTE(review): a pattern such as "||example.com|" is taken by the
+ * "string equals" branch and keeps one leading '|' in its text.
+ *
+ * @param line a non-empty line that is neither a comment nor an element
+ *             hiding rule
+ * @return false when an option is not recognised by Options::set(); no rule
+ *         is stored in that case
+ */
+bool FilterList::parseRule(const QString &line)
+{
+    QString pattern = line;
+    Options opt;
+
+    if(pattern.startsWith(QLatin1String("@@"))) {
+        pattern.remove(0, 2);
+        opt.exception = true;
+    }
+
+    // parse options
+    // The option list follows the last '$'. Splitting on every '$' cut off
+    // patterns that contain a '$' themselves (a regular expression end
+    // anchor, for example) and silently dropped all fields after the second.
+    const int separator = pattern.lastIndexOf('$');
+    if(separator != -1) {
+        const auto options = pattern.mid(separator + 1).split(',');
+        pattern.truncate(separator);
+
+        for(const auto &option : options) {
+            if(!opt.set(option))
+                return false;
+        }
+    }
+
+    // a single character cannot be the opening and the closing delimiter at
+    // the same time; without this check a lone "/" became an empty regular
+    // expression
+    const bool delimited = pattern.length() > 1;
+
+    if(pattern.startsWith("||") && pattern.endsWith("^")) {
+        // domain match
+        pattern = pattern.mid(2, pattern.length() - 3);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(delimited && pattern.startsWith("|") && pattern.endsWith("|")) {
+        // string equals
+        pattern = pattern.mid(1, pattern.length() - 2);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.startsWith("||")) {
+        // string starts with
+        pattern = pattern.mid(2, pattern.length() - 2);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.endsWith("|")) {
+        // string ends with
+        pattern = pattern.mid(0, pattern.length() - 1);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(delimited && pattern.startsWith("/") && pattern.endsWith("/")) {
+        // regular expression
+        pattern = pattern.mid(1, pattern.length() - 2);
+        m_rules.append(new RegexRule(pattern, opt));
+
+    } else {
+        // wildcard pattern
+        pattern = QRegularExpression::wildcardToRegularExpression(pattern);
+        m_rules.append(new RegexRule(pattern, opt));
+    }
+
+    return true;
+}
diff --git a/staging/adblock/filterlist.h b/staging/adblock/filterlist.h
new file mode 100644
index 0000000..801700f
--- /dev/null
+++ b/staging/adblock/filterlist.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of smolbote.
It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_ADBLOCK_FILTERLIST_H
+#define SMOLBOTE_ADBLOCK_FILTERLIST_H
+
+#include
+#include
+#include
+#include
+
+class QTextStream;
+
+namespace AdblockPlus
+{
+class Rule;
+
+/**
+ * A parsed Adblock Plus filter list.
+ *
+ * parse() fills the list from a text stream. The rule objects created while
+ * parsing are deleted in the destructor.
+ */
+class FilterList : public QObject
+{
+    Q_DISABLE_COPY(FilterList)
+
+public:
+    explicit FilterList(QObject *parent = nullptr);
+    ~FilterList();
+
+    /** Outcome of parse() as a whole. */
+    enum ParseResultState {
+        Ok,
+        InvalidFormat,
+    };
+
+    /** Per-line statistics collected by parse(). */
+    struct ParseResult {
+        int lines_total = 0; // non-empty lines after the header line
+        int lines_comments = 0; // lines starting with '!'
+        int lines_ignored = 0; // element hiding rules ("##" and "#@#")
+        int lines_parsed = 0; // lines accepted by parseRule()
+        int lines_failed = 0; // lines rejected by parseRule()
+
+        // parse() always overwrites this; the initializer only keeps a
+        // default-constructed result from holding an indeterminate value
+        ParseResultState state = InvalidFormat;
+    };
+
+    /**
+     * Parses the filter list read from stream.
+     * @return line counters and the resulting state
+     */
+    ParseResult parse(QTextStream &stream);
+
+private:
+    void parseComment(QString &line);
+    bool parseRule(const QString &line);
+
+    QStringList m_comments;
+    // owned: deleted with qDeleteAll in the destructor
+    QVector<Rule *> m_rules;
+};
+
+} // namespace AdblockPlus
+
+#endif // SMOLBOTE_ADBLOCK_FILTERLIST_H
diff --git a/staging/adblock/meson.build b/staging/adblock/meson.build
new file mode 100644
index 0000000..1fc4c65
--- /dev/null
+++ b/staging/adblock/meson.build
@@ -0,0 +1,16 @@
+dep_adblockfilter = declare_dependency(
+    include_directories: include_directories('.'),
+    link_with: static_library('adblockfilter',
+        [ 'filterlist.cpp', 'rule.cpp', 'options.cpp' ],
+        dependencies: dep_qt5
+    )
+)
+
+test('adblockfilter: parser',
+    executable('adblockfilter-parsefilter', dependencies: [ dep_qt5, dep_gtest, dep_adblockfilter ],
+        sources: [ 'test/parser.cpp' ]
+    ),
+    workdir: meson.current_source_dir() / 'test',
+    should_fail: true
+)
+
diff --git a/staging/adblock/options.cpp b/staging/adblock/options.cpp
new file mode 100644
index 0000000..0f70570
--- /dev/null
+++ b/staging/adblock/options.cpp
@@ -0,0 +1,111 @@
+/*
+ * This file is part of smolbote.
It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "options.h"
+
+using namespace AdblockPlus;
+
+/**
+ * Applies a single filter option, for example "script" or "~third-party".
+ *
+ * A leading '~' selects the inverse form of the option. For resource type
+ * options the value stored in resource_options is true for the inverse form
+ * and false for the plain form.
+ *
+ * @param opt one element of the comma separated option list
+ * @return true when the option is known and was stored, false otherwise
+ */
+bool Options::set(const QString &opt)
+{
+    const bool exception = opt.startsWith("~");
+    const QString option = exception ? opt.mid(1) : opt;
+
+    if(option == "match-case") {
+        // plain "match-case" turns case sensitive matching on; assigning the
+        // inverse flag itself left the plain form without any effect
+        matchcase = !exception;
+        return true;
+    }
+
+    // TODO: map all ResourceType's to their respective strings
+    // TODO: websocket, webrtc, elemhide, generichide, genericblock, popup
+    /*
+     * Resource types that have no option name here yet:
+     *
+     * QWebEngineUrlRequestInfo::ResourceTypeWorker 9 The main resource of a dedicated worker.
+     * QWebEngineUrlRequestInfo::ResourceTypeSharedWorker 10 The main resource of a shared worker.
+     * QWebEngineUrlRequestInfo::ResourceTypePrefetch 11 An explicitly requested prefetch.
+     * QWebEngineUrlRequestInfo::ResourceTypeFavicon 12 A favicon.
+     * QWebEngineUrlRequestInfo::ResourceTypeServiceWorker 15 The main resource of a service worker.
+     * QWebEngineUrlRequestInfo::ResourceTypeCspReport 16 A report of Content Security Policy (CSP) violations. CSP reports are in JSON format and they are delivered by HTTP POST requests to specified servers. (Added in Qt 5.7)
+     * QWebEngineUrlRequestInfo::ResourceTypePluginResource 17 A resource requested by a plugin. (Added in Qt 5.7)
+     * QWebEngineUrlRequestInfo::ResourceTypeNavigationPreloadMainFrame 19 A main-frame service worker navigation preload request. (Added in Qt 5.14)
+     * QWebEngineUrlRequestInfo::ResourceTypeNavigationPreloadSubFrame 20 A sub-frame service worker navigation preload request. (Added in Qt 5.14)
+     * QWebEngineUrlRequestInfo::ResourceTypeUnknown 255 Unknown request type.
+     */
+    static const QHash<QString, QWebEngineUrlRequestInfo::ResourceType> resourceTypes {
+        { QStringLiteral("document"), QWebEngineUrlRequestInfo::ResourceTypeMainFrame },
+        { QStringLiteral("subdocument"), QWebEngineUrlRequestInfo::ResourceTypeSubFrame },
+        { QStringLiteral("stylesheet"), QWebEngineUrlRequestInfo::ResourceTypeStylesheet },
+        { QStringLiteral("script"), QWebEngineUrlRequestInfo::ResourceTypeScript },
+        { QStringLiteral("image"), QWebEngineUrlRequestInfo::ResourceTypeImage },
+        { QStringLiteral("font"), QWebEngineUrlRequestInfo::ResourceTypeFontResource },
+        // An "other" subresource.
+        { QStringLiteral("other"), QWebEngineUrlRequestInfo::ResourceTypeSubResource },
+        { QStringLiteral("object"), QWebEngineUrlRequestInfo::ResourceTypeObject },
+        { QStringLiteral("media"), QWebEngineUrlRequestInfo::ResourceTypeMedia },
+        { QStringLiteral("xmlhttprequest"), QWebEngineUrlRequestInfo::ResourceTypeXhr },
+        { QStringLiteral("ping"), QWebEngineUrlRequestInfo::ResourceTypePing },
+    };
+
+    const auto type = resourceTypes.constFind(option);
+    if(type != resourceTypes.constEnd()) {
+        // NOTE(review): the value is the inverse flag, so plain "script"
+        // stores false; confirm that this is what the matcher will expect
+        resource_options[type.value()] = exception;
+        return true;
+    }
+
+    // Restriction to third-party/first-party requests
+    if(option == "third-party") {
+        if(exception)
+            thirdparty = false;
+        else
+            firstparty = false;
+        return true;
+    }
+
+    return false;
+}
diff --git a/staging/adblock/options.h b/staging/adblock/options.h
new file mode 100644
index 0000000..327e0ec
--- /dev/null
+++ b/staging/adblock/options.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_ADBLOCK_OPTIONS_H
+#define SMOLBOTE_ADBLOCK_OPTIONS_H
+
+#include
+#include
+#include
+
+namespace AdblockPlus
+{
+
+// NOTE(review): not referenced by Options below
+enum OptionState {
+    Allow,
+    Block,
+    Unset
+};
+
+/**
+ * The options attached to one filter rule.
+ */
+struct Options {
+    // true for rules that start with "@@"
+    bool exception = false;
+    // set by the "match-case" option
+    bool matchcase = false;
+    // "third-party" clears firstparty, "~third-party" clears thirdparty
+    bool firstparty = true;
+    bool thirdparty = true;
+    // one entry per resource type option that was set; the value is true
+    // when the option was given in its inverse form ("~script")
+    QHash<QWebEngineUrlRequestInfo::ResourceType, bool> resource_options;
+
+    /**
+     * Applies one option from the option list of a rule.
+     * @return false when the option is not recognised
+     */
+    bool set(const QString &option);
+};
+
+}
+
+#endif // SMOLBOTE_ADBLOCK_OPTIONS_H
diff --git a/staging/adblock/rule.cpp b/staging/adblock/rule.cpp
new file mode 100644
index 0000000..38d6b40
--- /dev/null
+++ b/staging/adblock/rule.cpp
@@ -0,0 +1,35 @@
+/*
+ * This file is part of smolbote.
It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "rule.h"
+#include
+#include
+
+using namespace AdblockPlus;
+
+/**
+ * Builds a case insensitive string matcher for the text in rule.
+ * NOTE(review): opt.matchcase is stored but not consulted here.
+ */
+MatcherRule::MatcherRule(const QString &rule, const Options &opt)
+    : options(opt)
+    , matcher(new QStringMatcher(rule, Qt::CaseInsensitive))
+{
+}
+
+MatcherRule::~MatcherRule()
+{
+    delete matcher;
+}
+
+/**
+ * Builds a case insensitive regular expression from the pattern in rule.
+ * NOTE(review): opt.matchcase is stored but not consulted here.
+ */
+RegexRule::RegexRule(const QString &rule, const Options &opt)
+    : options(opt)
+    , regex(new QRegularExpression(rule, QRegularExpression::CaseInsensitiveOption))
+{
+}
+
+RegexRule::~RegexRule()
+{
+    delete regex;
+}
diff --git a/staging/adblock/rule.h b/staging/adblock/rule.h
new file mode 100644
index 0000000..a9a9592
--- /dev/null
+++ b/staging/adblock/rule.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "options.h"
+#include
+#include
+
+#ifndef SMOLBOTE_ADBLOCK_RULE_H
+#define SMOLBOTE_ADBLOCK_RULE_H
+
+class QStringMatcher;
+class QRegularExpression;
+
+namespace AdblockPlus
+{
+/**
+ * Common base of all filter rules.
+ */
+class Rule
+{
+public:
+    // Rules are stored and deleted as Rule pointers, so the destructor has
+    // to be virtual for the derived destructors to run.
+    virtual ~Rule() = default;
+
+    // virtual bool hasMatch(const QString &url) const = 0;
+};
+
+/**
+ * A rule whose pattern is plain text, held in a QStringMatcher.
+ */
+class MatcherRule : public Rule
+{
+    Q_DISABLE_COPY(MatcherRule)
+
+public:
+    MatcherRule(const QString &rule, const Options &opt);
+    ~MatcherRule() override;
+
+private:
+    Options options;
+    QStringMatcher *matcher; // owned, deleted in the destructor
+};
+
+/**
+ * A rule whose pattern is a regular expression.
+ */
+class RegexRule : public Rule
+{
+    Q_DISABLE_COPY(RegexRule)
+
+public:
+    RegexRule(const QString &rule, const Options &opt);
+    ~RegexRule() override;
+
+private:
+    Options options;
+    QRegularExpression *regex; // owned, deleted in the destructor
+};
+
+} // namespace AdblockPlus
+
+#endif // SMOLBOTE_ADBLOCK_RULE_H
diff --git a/staging/adblock/test/parser.cpp
b/staging/adblock/test/parser.cpp
new file mode 100644
index 0000000..0ce1121
--- /dev/null
+++ b/staging/adblock/test/parser.cpp
@@ -0,0 +1,32 @@
+#include "filterlist.h"
+#include
+#include
+
+/*
+ * Parses every filter list named on the command line and prints the line
+ * statistics of each one.
+ *
+ * Returns 77 when no list is given, -1 as soon as a file cannot be opened
+ * and 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        qDebug("usage: %s list1.txt ...", argv[0]);
+        return 77;
+    }
+
+    for(int index = 1; index != argc; ++index) {
+        const char *path = argv[index];
+
+        QFile input(path);
+        const bool opened = input.open(QIODevice::ReadOnly | QIODevice::Text);
+        if(!opened) {
+            qDebug("could not open %s", path);
+            return -1;
+        }
+
+        AdblockPlus::FilterList filters;
+        QTextStream reader(&input);
+        const auto summary = filters.parse(reader);
+
+        const char *verdict = "failed";
+        if(summary.state == AdblockPlus::FilterList::Ok)
+            verdict = "okay";
+
+        qDebug("[%s]: %s", path, verdict);
+        qDebug(" total: %i", summary.lines_total);
+        qDebug("comments: %i", summary.lines_comments);
+        qDebug(" ignored: %i", summary.lines_ignored);
+        qDebug(" parsed: %i", summary.lines_parsed);
+        qDebug(" failed: %i", summary.lines_failed);
+
+        input.close();
+    }
+
+    return 0;
+}
diff --git a/test/matcherbenchmark/matcherbenchmark.cpp b/test/matcherbenchmark/matcherbenchmark.cpp
new file mode 100644
index 0000000..1fd87cf
--- /dev/null
+++ b/test/matcherbenchmark/matcherbenchmark.cpp
@@ -0,0 +1,85 @@
+#include "matcherbenchmark.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Substring search with QString::contains.
+void MatcherBenchmark::qstringcontains()
+{
+    const auto pattern = QStringLiteral("spamdomain");
+    const auto request = QStringLiteral("subdomain.spamdomain.com");
+
+    // the pattern has to be found before it is worth timing the search
+    QCOMPARE(request.contains(pattern), true);
+
+    QBENCHMARK {
+        request.contains(pattern);
+    }
+}
+
+// Substring search with a QStringMatcher that is built once.
+void MatcherBenchmark::qstringmatcher()
+{
+    const QStringMatcher pattern(QStringLiteral("spamdomain"));
+    const auto request = QStringLiteral("subdomain.spamdomain.com");
+
+    QCOMPARE(pattern.indexIn(request) != -1, true);
+
+    QBENCHMARK {
+        pattern.indexIn(request);
+    }
+}
+
+// Search with a QRegExp that is built once.
+void MatcherBenchmark::qregexp()
+{
+    const QRegExp pattern(QStringLiteral("spamdomain"));
+    const auto request = QStringLiteral("subdomain.spamdomain.com");
+
+    QCOMPARE(pattern.indexIn(request) != -1, true);
+
+    QBENCHMARK {
+        pattern.indexIn(request);
+    }
+}
+
+// Search with a QRegularExpression that is built once.
+void MatcherBenchmark::qregularexpressionmatch()
+{
+    const QRegularExpression pattern("spamdomain");
+    const QString request("subdomain.spamdomain.com");
+
+    QCOMPARE(pattern.match(request).hasMatch(), true);
+    QBENCHMARK
+    {
+        pattern.match(request).hasMatch();
+    }
+}
+
+// Search with a std::regex that is built once.
+void MatcherBenchmark::stdregex()
+{
+    const std::regex pattern("spamdomain");
+    const std::string request("subdomain.spamdomain.com");
+
+    QCOMPARE(std::regex_search(request, pattern), true);
+    QBENCHMARK
+    {
+        std::regex_search(request, pattern);
+    }
+}
+
+// Search with a POSIX regex_t that is compiled once.
+void MatcherBenchmark::cregex()
+{
+    // QCOMPARE returns from this function when the comparison fails, so the
+    // compiled pattern is released by a scope guard and not by a trailing
+    // regfree() call that a failed check would skip.
+    struct CompiledPattern {
+        regex_t regex;
+        bool compiled = false;
+
+        ~CompiledPattern()
+        {
+            // only a successfully compiled pattern may be freed
+            if(compiled)
+                regfree(&regex);
+        }
+    } pattern;
+
+    const int status = regcomp(&pattern.regex, "spamdomain", 0);
+    pattern.compiled = (status == 0);
+    QCOMPARE(status, 0);
+
+    const std::string request("subdomain.spamdomain.com");
+
+    // regexec returns 0 when the pattern matches
+    QCOMPARE(regexec(&pattern.regex, request.c_str(), 0, nullptr, 0), 0);
+    QBENCHMARK
+    {
+        regexec(&pattern.regex, request.c_str(), 0, nullptr, 0);
+    }
+}
+
+QTEST_GUILESS_MAIN(MatcherBenchmark)
diff --git a/test/matcherbenchmark/matcherbenchmark.h b/test/matcherbenchmark/matcherbenchmark.h
new file mode 100644
index 0000000..70fd859
--- /dev/null
+++ b/test/matcherbenchmark/matcherbenchmark.h
@@ -0,0 +1,19 @@
+#ifndef MATCHER_BENCHMARK
+#define MATCHER_BENCHMARK
+
+#include
+
+/**
+ * Benchmarks that search the same host name for the same fixed text with
+ * different string and regular expression implementations.
+ */
+class MatcherBenchmark : public QObject
+{
+    Q_OBJECT
+
+private slots:
+    void qstringcontains();
+    void qstringmatcher();
+    void qregexp();
+    void qregularexpressionmatch();
+    void stdregex();
+    void cregex();
+};
+
+#endif
diff --git a/test/matcherbenchmark/meson.build b/test/matcherbenchmark/meson.build
new file mode 100644
index 0000000..bc8327b
--- /dev/null
+++ b/test/matcherbenchmark/meson.build
@@ -0,0 +1,5 @@
+executable('matcherbenchmarks',
+    sources: [ 'matcherbenchmark.cpp',
+               mod_qt5.preprocess(moc_headers: 'matcherbenchmark.h')],
+    dependencies: [ dep_qt5 ]
+)
-- 
cgit v1.2.1