aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Aqua-sama <aqua@iserlohn-fortress.net> 2020-02-10 20:58:39 +0200
committer Aqua-sama <aqua@iserlohn-fortress.net> 2020-04-21 20:14:50 +0300
commit c3559b24eba76052deb9d4ce79e4704815d902a5 (patch)
tree 900dd0a5b8039e5e7bdd9ad609d9b3ae45d9dbde
parent Add libfuzzer test to libconfiguration (diff)
download smolbote-c3559b24eba76052deb9d4ce79e4704815d902a5.tar.xz
staging: rewrite AdblockPlus parser yet again
-rw-r--r-- meson.build 3
-rw-r--r-- staging/adblock/filterlist.cpp 137
-rw-r--r-- staging/adblock/filterlist.h 50
-rw-r--r-- staging/adblock/meson.build 16
-rw-r--r-- staging/adblock/options.cpp 111
-rw-r--r-- staging/adblock/options.h 37
-rw-r--r-- staging/adblock/rule.cpp 35
-rw-r--r-- staging/adblock/rule.h 55
-rw-r--r-- staging/adblock/test/parser.cpp 32
-rw-r--r-- test/matcherbenchmark/matcherbenchmark.cpp 85
-rw-r--r-- test/matcherbenchmark/matcherbenchmark.h 19
-rw-r--r-- test/matcherbenchmark/meson.build 5
12 files changed, 585 insertions, 0 deletions
diff --git a/meson.build b/meson.build
index f2a51bc..8619d5b 100644
--- a/meson.build
+++ b/meson.build
@@ -92,6 +92,9 @@ subdir('tools')
#subdir('plugins/ProfileEditor')
subdir('test/firefox-bookmarks-json-parser')
+subdir('test/matcherbenchmark')
+
+subdir('staging/adblock')
ssconfig = poi_sourceset.apply(cdata)
diff --git a/staging/adblock/filterlist.cpp b/staging/adblock/filterlist.cpp
new file mode 100644
index 0000000..be2bd4e
--- /dev/null
+++ b/staging/adblock/filterlist.cpp
@@ -0,0 +1,137 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "filterlist.h"
+#include "rule.h"
+#include <QRegularExpression>
+#include <QTextStream>
+
+/**
+ * Documentation:
+ *
+ * https://help.eyeo.com/en/adblockplus/how-to-write-filters
+ *
+ * https://github.com/gorhill/uBlock/wiki/Introduction-to-basic-filtering-syntax
+ * https://github.com/gorhill/uBlock/wiki/Static-filter-syntax
+ *
+ */
+
+using namespace AdblockPlus;
+
+// Creates an empty filter list; parent is passed on to the QObject base class.
+FilterList::FilterList(QObject *parent)
+ : QObject(parent)
+{
+}
+
+// Deletes every rule object whose pointer is stored in m_rules.
+FilterList::~FilterList()
+{
+ qDeleteAll(m_rules);
+}
+
+/**
+ * Reads a filter list from @p stream and stores its comments and rules.
+ *
+ * The first line, with surrounding whitespace trimmed, must be the
+ * "[Adblock Plus 2.0]" header; otherwise the result state is InvalidFormat
+ * and nothing else is read. Every following non-empty line is counted in
+ * lines_total and then sorted into exactly one of:
+ *  - lines_comments: starts with '!'
+ *  - lines_ignored:  contains "##" or "#@#" (element hiding rules)
+ *  - lines_parsed:   parseRule() accepted the line
+ *  - lines_failed:   parseRule() rejected the line
+ *
+ * Setting the environment variables PRINT_IGNORED or PRINT_FAILED to "1"
+ * prints the ignored or failed lines through qDebug.
+ *
+ * @param stream text stream positioned at the start of the list
+ * @return the line counters together with the overall state
+ */
+FilterList::ParseResult FilterList::parse(QTextStream &stream)
+{
+    FilterList::ParseResult result;
+
+    if(stream.readLine().trimmed() != "[Adblock Plus 2.0]") {
+        result.state = FilterList::InvalidFormat;
+        return result;
+    }
+
+    // Look the debug switches up once instead of once per ignored/failed line.
+    const bool printIgnored = (qgetenv("PRINT_IGNORED") == "1");
+    const bool printFailed = (qgetenv("PRINT_FAILED") == "1");
+
+    QString line;
+    while(stream.readLineInto(&line)) {
+        if(line.isEmpty())
+            continue;
+
+        ++result.lines_total;
+
+        if(line.startsWith('!')) {
+            ++result.lines_comments;
+            parseComment(line);
+
+        } else if(line.contains("##") || line.contains("#@#")) {
+            // ## is element hiding rule
+            // #@# is element hiding exception rule
+            if(printIgnored)
+                qDebug("ignored: >%s<", qUtf8Printable(line));
+            ++result.lines_ignored;
+
+        } else if(parseRule(line)) {
+            ++result.lines_parsed;
+
+        } else {
+            if(printFailed)
+                qDebug("failed: >%s<", qUtf8Printable(line));
+            ++result.lines_failed;
+        }
+    }
+
+    result.state = FilterList::Ok;
+    return result;
+}
+
+// Appends line to m_comments unchanged; parse() calls this for lines that start with '!'.
+void FilterList::parseComment(QString &line)
+{
+ m_comments.append(line);
+}
+
+/**
+ * Parses one network filter line and, on success, appends a rule to m_rules.
+ *
+ * A leading "@@" marks the rule as an exception. Everything after the last
+ * '$' is treated as a comma-separated option list and passed, one entry at a
+ * time, to Options::set(); the text before that '$' is the pattern. Splitting
+ * at the last '$' keeps patterns that themselves contain '$' intact.
+ *
+ * @param line a non-empty line that is neither a comment nor an element
+ *             hiding rule
+ * @return false if any option is rejected by Options::set() (no rule is
+ *         stored in that case), true otherwise
+ */
+bool FilterList::parseRule(const QString &line)
+{
+    QString pattern = line;
+    Options opt;
+
+    if(pattern.startsWith(QLatin1String("@@"))) {
+        pattern.remove(0, 2);
+        opt.exception = true;
+    }
+
+    // parse options
+    const int separator = pattern.lastIndexOf('$');
+    if(separator != -1) {
+        const auto options = pattern.mid(separator + 1).split(',');
+        pattern.truncate(separator);
+
+        for(const auto &option : options) {
+            if(!opt.set(option))
+                return false;
+        }
+    }
+
+    // NOTE(review): the Adblock Plus filter documentation describes a single
+    // leading '|' as the start-of-address anchor and "||" as the domain
+    // anchor; confirm that the "string starts with" branch below is meant to
+    // test for "||" rather than "|".
+    if(pattern.startsWith("||") && pattern.endsWith("^")) {
+        // domain match: strip the leading "||" and the trailing "^"
+        pattern = pattern.mid(2, pattern.length() - 3);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.startsWith("|") && pattern.endsWith("|")) {
+        // string equals: strip one '|' from each end
+        pattern = pattern.mid(1, pattern.length() - 2);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.startsWith("||")) {
+        // string starts with: strip the leading "||"
+        pattern = pattern.mid(2, pattern.length() - 2);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.endsWith("|")) {
+        // string ends with: strip the trailing '|'
+        pattern = pattern.mid(0, pattern.length() - 1);
+        m_rules.append(new MatcherRule(pattern, opt));
+
+    } else if(pattern.startsWith("/") && pattern.endsWith("/")) {
+        // regular expression: strip the enclosing slashes
+        pattern = pattern.mid(1, pattern.length() - 2);
+        m_rules.append(new RegexRule(pattern, opt));
+
+    } else {
+        // wildcard pattern
+        pattern = QRegularExpression::wildcardToRegularExpression(pattern);
+        m_rules.append(new RegexRule(pattern, opt));
+    }
+
+    return true;
+}
diff --git a/staging/adblock/filterlist.h b/staging/adblock/filterlist.h
new file mode 100644
index 0000000..801700f
--- /dev/null
+++ b/staging/adblock/filterlist.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_ADBLOCK_FILTERLIST_H
+#define SMOLBOTE_ADBLOCK_FILTERLIST_H
+
+#include <QObject>
+#include <QString>
+#include <QStringList>
+#include <QVector>
+
+class QTextStream;
+
+namespace AdblockPlus
+{
+class Rule;
+
+/**
+ * Holds the comments and rules read from one Adblock Plus filter list.
+ * The rule objects referenced by m_rules are deleted in the destructor.
+ */
+class FilterList : public QObject
+{
+    Q_DISABLE_COPY(FilterList)
+
+public:
+    explicit FilterList(QObject *parent = nullptr);
+    ~FilterList() override;
+
+    enum ParseResultState {
+        Ok,
+        InvalidFormat,
+    };
+
+    /// Line statistics and overall outcome of one parse() call.
+    struct ParseResult {
+        int lines_total = 0;    ///< non-empty lines after the header
+        int lines_comments = 0; ///< lines starting with '!'
+        int lines_ignored = 0;  ///< element hiding rules, which are skipped
+        int lines_parsed = 0;   ///< lines accepted by parseRule()
+        int lines_failed = 0;   ///< lines rejected by parseRule()
+
+        // parse() always assigns this before returning; the initialiser only
+        // keeps a default-constructed result from holding an indeterminate value.
+        ParseResultState state = InvalidFormat;
+    };
+
+    /// Parses the list in @p stream; see ParseResult for what is reported.
+    ParseResult parse(QTextStream &stream);
+
+private:
+    void parseComment(QString &line);
+    bool parseRule(const QString &line);
+
+    QStringList m_comments;
+    QVector<Rule *> m_rules;
+};
+
+} // namespace AdblockPlus
+
+#endif // SMOLBOTE_ADBLOCK_FILTERLIST_H
diff --git a/staging/adblock/meson.build b/staging/adblock/meson.build
new file mode 100644
index 0000000..1fc4c65
--- /dev/null
+++ b/staging/adblock/meson.build
@@ -0,0 +1,16 @@
+# Static library with the filter list parser, exposed as a dependency that
+# also adds this directory to the include path.
+dep_adblockfilter = declare_dependency(
+ include_directories: include_directories('.'),
+ link_with: static_library('adblockfilter',
+ [ 'filterlist.cpp', 'rule.cpp', 'options.cpp' ],
+ dependencies: dep_qt5
+ )
+)
+
+# Parser test executable, run from the test/ subdirectory. No list files are
+# passed on the command line here, so test/parser.cpp prints its usage text
+# and returns 77.
+# NOTE(review): confirm how exit code 77 interacts with should_fail.
+test('adblockfilter: parser',
+ executable('adblockfilter-parsefilter', dependencies: [ dep_qt5, dep_gtest, dep_adblockfilter ],
+ sources: [ 'test/parser.cpp' ]
+ ),
+ workdir: meson.current_source_dir() / 'test',
+ should_fail: true
+)
+
diff --git a/staging/adblock/options.cpp b/staging/adblock/options.cpp
new file mode 100644
index 0000000..0f70570
--- /dev/null
+++ b/staging/adblock/options.cpp
@@ -0,0 +1,111 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "options.h"
+
+using namespace AdblockPlus;
+
+/**
+ * Applies a single filter option to this option set.
+ *
+ * A leading '~' negates the option. Recognised names are "match-case", the
+ * resource types listed in the table below, and "third-party".
+ *
+ * @param opt one entry of a rule's comma-separated option list
+ * @return true if the option was recognised and applied, false otherwise
+ */
+bool Options::set(const QString &opt)
+{
+    const bool exception = opt.startsWith("~");
+    const QString option = exception ? opt.mid(1) : opt;
+
+    if(option == "match-case") {
+        // "match-case" turns case-sensitive matching on, "~match-case" off
+        matchcase = !exception;
+        return true;
+    }
+
+    // TODO: map all ResourceType's to their respective strings
+    // TODO: websocket, webrtc, elemhide, generichide, genericblock, popup
+    //
+    // ResourceTypes without a mapping yet: Worker, SharedWorker, Prefetch,
+    // Favicon, ServiceWorker, CspReport, PluginResource,
+    // NavigationPreloadMainFrame, NavigationPreloadSubFrame, Unknown.
+    static const struct {
+        const char *name;
+        QWebEngineUrlRequestInfo::ResourceType type;
+    } resourceTypes[] = {
+        { "document", QWebEngineUrlRequestInfo::ResourceTypeMainFrame },
+        { "subdocument", QWebEngineUrlRequestInfo::ResourceTypeSubFrame },
+        { "stylesheet", QWebEngineUrlRequestInfo::ResourceTypeStylesheet },
+        { "script", QWebEngineUrlRequestInfo::ResourceTypeScript },
+        { "image", QWebEngineUrlRequestInfo::ResourceTypeImage },
+        { "font", QWebEngineUrlRequestInfo::ResourceTypeFontResource },
+        { "other", QWebEngineUrlRequestInfo::ResourceTypeSubResource }, // an "other" subresource
+        { "object", QWebEngineUrlRequestInfo::ResourceTypeObject },
+        { "media", QWebEngineUrlRequestInfo::ResourceTypeMedia },
+        { "xmlhttprequest", QWebEngineUrlRequestInfo::ResourceTypeXhr },
+        { "ping", QWebEngineUrlRequestInfo::ResourceTypePing },
+    };
+
+    for(const auto &entry : resourceTypes) {
+        if(option == QLatin1String(entry.name)) {
+            // the stored flag records whether the option was negated with '~'
+            resource_options[entry.type] = exception;
+            return true;
+        }
+    }
+
+    // Restriction to third-party/first-party requests
+    if(option == "third-party") {
+        if(exception)
+            thirdparty = false;
+        else
+            firstparty = false;
+        return true;
+    }
+
+    return false;
+}
diff --git a/staging/adblock/options.h b/staging/adblock/options.h
new file mode 100644
index 0000000..327e0ec
--- /dev/null
+++ b/staging/adblock/options.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#ifndef SMOLBOTE_ADBLOCK_OPTIONS_H
+#define SMOLBOTE_ADBLOCK_OPTIONS_H
+
+#include <QHash>
+#include <QString>
+#include <QWebEngineUrlRequestInfo>
+
+namespace AdblockPlus
+{
+
+// NOTE(review): nothing visible in this change refers to OptionState yet.
+enum OptionState {
+ Allow,
+ Block,
+ Unset
+};
+
+// Options attached to a single filter rule.
+struct Options {
+ // true for exception rules (FilterList::parseRule sets it for a leading "@@")
+ bool exception = false;
+ // written by set() for the "match-case" option
+ bool matchcase = false;
+ // set() clears firstparty for "third-party" and thirdparty for "~third-party"
+ bool firstparty = true;
+ bool thirdparty = true;
+ // per resource type: whether the option naming that type was prefixed with '~'
+ QHash<QWebEngineUrlRequestInfo::ResourceType, bool> resource_options;
+
+ // Applies one entry of a rule's option list; returns false if it is not recognised.
+ bool set(const QString &option);
+};
+
+}
+
+#endif // SMOLBOTE_ADBLOCK_OPTIONS_H
diff --git a/staging/adblock/rule.cpp b/staging/adblock/rule.cpp
new file mode 100644
index 0000000..38d6b40
--- /dev/null
+++ b/staging/adblock/rule.cpp
@@ -0,0 +1,35 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "rule.h"
+#include <QRegularExpression>
+#include <QStringMatcher>
+
+using namespace AdblockPlus;
+
+// Keeps a copy of opt and allocates a case-insensitive QStringMatcher for rule.
+MatcherRule::MatcherRule(const QString &rule, const Options &opt)
+    : options(opt)
+    , matcher(new QStringMatcher(rule, Qt::CaseInsensitive))
+{
+}
+
+// Frees the QStringMatcher allocated in the constructor.
+MatcherRule::~MatcherRule()
+{
+ delete matcher;
+}
+
+// Keeps a copy of opt and allocates a case-insensitive QRegularExpression for rule.
+RegexRule::RegexRule(const QString &rule, const Options &opt)
+    : options(opt)
+    , regex(new QRegularExpression(rule, QRegularExpression::CaseInsensitiveOption))
+{
+}
+
+// Frees the QRegularExpression allocated in the constructor.
+RegexRule::~RegexRule()
+{
+ delete regex;
+}
diff --git a/staging/adblock/rule.h b/staging/adblock/rule.h
new file mode 100644
index 0000000..a9a9592
--- /dev/null
+++ b/staging/adblock/rule.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of smolbote. It's copyrighted by the contributors recorded
+ * in the version control history of the file, available from its original
+ * location: https://library.iserlohn-fortress.net/aqua/smolbote.git
+ *
+ * SPDX-License-Identifier: GPL-3.0
+ */
+
+#include "options.h"
+#include <QObject>
+#include <QString>
+
+#ifndef SMOLBOTE_ADBLOCK_RULE_H
+#define SMOLBOTE_ADBLOCK_RULE_H
+
+class QStringMatcher;
+class QRegularExpression;
+
+namespace AdblockPlus
+{
+/**
+ * Common base class of all filter rules.
+ */
+class Rule
+{
+public:
+    // FilterList stores rules as Rule pointers and deletes them through that
+    // type, so the destructor has to be virtual for the derived destructors
+    // (which free the matcher / regex) to run.
+    virtual ~Rule() = default;
+
+    // virtual bool hasMatch(const QString &url) const = 0;
+};
+
+/**
+ * Rule backed by a QStringMatcher that is allocated in the constructor and
+ * freed in the destructor.
+ */
+class MatcherRule : public Rule
+{
+    Q_DISABLE_COPY(MatcherRule)
+
+public:
+    MatcherRule(const QString &rule, const Options &opt);
+    ~MatcherRule() override;
+
+private:
+    Options options;
+    QStringMatcher *matcher;
+};
+
+/**
+ * Rule backed by a QRegularExpression that is allocated in the constructor
+ * and freed in the destructor.
+ */
+class RegexRule : public Rule
+{
+    Q_DISABLE_COPY(RegexRule)
+
+public:
+    RegexRule(const QString &rule, const Options &opt);
+    ~RegexRule() override;
+
+private:
+    Options options;
+    QRegularExpression *regex;
+};
+
+} // namespace AdblockPlus
+
+#endif // SMOLBOTE_ADBLOCK_RULE_H
diff --git a/staging/adblock/test/parser.cpp b/staging/adblock/test/parser.cpp
new file mode 100644
index 0000000..0ce1121
--- /dev/null
+++ b/staging/adblock/test/parser.cpp
@@ -0,0 +1,32 @@
+#include "filterlist.h"
+#include <QFile>
+#include <QTextStream>
+
+// Parses every filter list named on the command line and prints its line
+// statistics. Returns 77 when no list is given and -1 when a file cannot be
+// opened; otherwise 0, regardless of the individual parse results.
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        qDebug("usage: %s list1.txt ...", argv[0]);
+        return 77;
+    }
+
+    for(int arg = 1; arg < argc; ++arg) {
+        const char *path = argv[arg];
+
+        QFile file(path);
+        const bool opened = file.open(QIODevice::ReadOnly | QIODevice::Text);
+        if(!opened) {
+            qDebug("could not open %s", path);
+            return -1;
+        }
+
+        AdblockPlus::FilterList filters;
+        QTextStream input(&file);
+        const auto stats = filters.parse(input);
+
+        const char *verdict = "failed";
+        if(stats.state == AdblockPlus::FilterList::Ok)
+            verdict = "okay";
+
+        qDebug("[%s]: %s", path, verdict);
+        qDebug(" total: %i", stats.lines_total);
+        qDebug("comments: %i", stats.lines_comments);
+        qDebug(" ignored: %i", stats.lines_ignored);
+        qDebug(" parsed: %i", stats.lines_parsed);
+        qDebug(" failed: %i", stats.lines_failed);
+
+        file.close();
+    }
+
+    return 0;
+}
diff --git a/test/matcherbenchmark/matcherbenchmark.cpp b/test/matcherbenchmark/matcherbenchmark.cpp
new file mode 100644
index 0000000..1fd87cf
--- /dev/null
+++ b/test/matcherbenchmark/matcherbenchmark.cpp
@@ -0,0 +1,85 @@
+#include "matcherbenchmark.h"
+#include <QRegExp>
+#include <QRegularExpression>
+#include <QStringMatcher>
+#include <QtTest/QTest>
+#include <regex.h>
+#include <regex>
+#include <string>
+
+// Benchmarks QString::contains() searching "subdomain.spamdomain.com" for
+// "spamdomain". The QCOMPARE first checks that the search really matches.
+void MatcherBenchmark::qstringcontains()
+{
+ const QString pattern("spamdomain");
+ const QString request("subdomain.spamdomain.com");
+
+ QCOMPARE(request.contains(pattern), true);
+ QBENCHMARK
+ {
+ request.contains(pattern);
+ }
+}
+
+// Benchmarks QStringMatcher::indexIn() with a matcher built once, outside the
+// measured section. The QCOMPARE first checks that the pattern is found.
+void MatcherBenchmark::qstringmatcher()
+{
+ const QStringMatcher pattern("spamdomain");
+ const QString request("subdomain.spamdomain.com");
+
+ QCOMPARE(pattern.indexIn(request) != -1, true);
+ QBENCHMARK
+ {
+ pattern.indexIn(request);
+ }
+}
+
+// Benchmarks QRegExp::indexIn() with the expression constructed outside the
+// measured section. The QCOMPARE first checks that the pattern is found.
+void MatcherBenchmark::qregexp()
+{
+ const QRegExp pattern("spamdomain");
+ const QString request("subdomain.spamdomain.com");
+
+ QCOMPARE(pattern.indexIn(request) != -1, true);
+ QBENCHMARK
+ {
+ pattern.indexIn(request);
+ }
+}
+
+// Benchmarks QRegularExpression::match() followed by hasMatch(), with the
+// expression constructed outside the measured section. The QCOMPARE first
+// checks that the pattern matches.
+void MatcherBenchmark::qregularexpressionmatch()
+{
+ const QRegularExpression pattern("spamdomain");
+ const QString request("subdomain.spamdomain.com");
+
+ QCOMPARE(pattern.match(request).hasMatch(), true);
+ QBENCHMARK
+ {
+ pattern.match(request).hasMatch();
+ }
+}
+
+// Benchmarks std::regex_search() on a std::string, with the std::regex
+// constructed outside the measured section. The QCOMPARE first checks that
+// the pattern is found.
+void MatcherBenchmark::stdregex()
+{
+ const std::regex pattern("spamdomain");
+ const std::string request("subdomain.spamdomain.com");
+
+ QCOMPARE(std::regex_search(request, pattern), true);
+ QBENCHMARK
+ {
+ std::regex_search(request, pattern);
+ }
+}
+
+/**
+ * Benchmarks POSIX regexec() with a pattern compiled once by regcomp(),
+ * outside the measured section. Both calls are first checked for their
+ * success value, which is 0.
+ */
+void MatcherBenchmark::cregex()
+{
+    regex_t pattern;
+    QCOMPARE(regcomp(&pattern, "spamdomain", 0), 0);
+
+    // QCOMPARE returns from the function when it fails, so release the
+    // compiled pattern from a destructor instead of a trailing regfree().
+    struct PatternGuard {
+        regex_t *compiled;
+        ~PatternGuard() { regfree(compiled); }
+    } guard { &pattern };
+
+    const std::string request("subdomain.spamdomain.com");
+
+    // regexec() returns 0 when the pattern matches
+    QCOMPARE(regexec(&pattern, request.c_str(), 0, nullptr, 0), 0);
+    QBENCHMARK
+    {
+        regexec(&pattern, request.c_str(), 0, nullptr, 0);
+    }
+}
+
+QTEST_GUILESS_MAIN(MatcherBenchmark)
diff --git a/test/matcherbenchmark/matcherbenchmark.h b/test/matcherbenchmark/matcherbenchmark.h
new file mode 100644
index 0000000..70fd859
--- /dev/null
+++ b/test/matcherbenchmark/matcherbenchmark.h
@@ -0,0 +1,19 @@
+#ifndef MATCHER_BENCHMARK
+#define MATCHER_BENCHMARK
+
+#include <QObject>
+
+// Qt Test fixture comparing several ways to search one request string for a
+// fixed pattern. Each private slot benchmarks one search mechanism.
+class MatcherBenchmark : public QObject
+{
+ Q_OBJECT
+
+private slots:
+ void qstringcontains(); // QString::contains
+ void qstringmatcher(); // QStringMatcher::indexIn
+ void qregexp(); // QRegExp::indexIn
+ void qregularexpressionmatch(); // QRegularExpression::match
+ void stdregex(); // std::regex_search
+ void cregex(); // POSIX regcomp/regexec
+};
+
+#endif
diff --git a/test/matcherbenchmark/meson.build b/test/matcherbenchmark/meson.build
new file mode 100644
index 0000000..bc8327b
--- /dev/null
+++ b/test/matcherbenchmark/meson.build
@@ -0,0 +1,5 @@
+# Benchmark executable; matcherbenchmark.h is run through moc because it
+# declares a Q_OBJECT class. It is built as a plain executable and is not
+# registered with test().
+executable('matcherbenchmarks',
+ sources: [ 'matcherbenchmark.cpp',
+ mod_qt5.preprocess(moc_headers: 'matcherbenchmark.h')],
+ dependencies: [ dep_qt5 ]
+)