aboutsummaryrefslogtreecommitdiff
path: root/lib/urlfilter/formats/adblockrule_parse.cpp
blob: c01ddfd234ba057a2fbfe38309e4e7d7fb819fea (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/*
 * This file is part of smolbote. It's copyrighted by the contributors recorded
 * in the version control history of the file, available from its original
 * location: https://neueland.iserlohn-fortress.net/gitea/aqua/smolbote
 *
 * SPDX-License-Identifier: GPL-3.0
 */

#include "adblockrule.h"
#include "adblockrule_parse.h"

// adblock format documentation
// https://adblockplus.org/filters

// QString::mid(pos, len) const - Returns a string starting at the specified position index.
// QString::chop(len) - Removes n characters from the end of the string.
// QString::remove(pos, len) - Removes n characters from the string, starting at the given position index.
// QString::trimmed() const - Remove whitespace from start and end

AdBlockRule *parseRule_adblock(const QString &filter)
{
    QString parsedLine = filter.trimmed();

    // there is no rule, or it's a comment
    if(parsedLine.isEmpty() || parsedLine.startsWith("!")) {
        return nullptr;
    }

    // css rule -> filterleaves cannot do element blocking
    if(parsedLine.contains(QLatin1Literal("##")) || parsedLine.contains(QLatin1Literal("#@#"))) {
        return nullptr;
    }

    // exception rules
    FilterLeaf::Action action = FilterLeaf::Block;
    if(parsedLine.startsWith(QLatin1Literal("@@"))) {
        action = FilterLeaf::Allow;
        parsedLine.remove(0, 2);
    }

    // parse options
    QStringList enabledOn, disabledOn;
    QHash<QWebEngineUrlRequestInfo::ResourceType, bool> optionsHash;
    {
        const int sepPos = parsedLine.indexOf(QLatin1Literal("$"));
        if(sepPos != -1) {
            const auto options = parsedLine.mid(sepPos + 1).split(QLatin1Literal(","));
            parsedLine = parsedLine.mid(0, sepPos);

            for(const QString &option : options) {
                if(option.startsWith(QLatin1Literal("domain"))) {
                    const auto domainList = option.mid(7).split(QLatin1Literal("|"));

                    for(const QString &domain : domainList) {
                        if(domain.startsWith(QLatin1Literal("~"))) {
                            disabledOn.append(domain.mid(1));
                        } else {
                            enabledOn.append(domain);
                        }
                    }
                } else {
                    const auto pair = parseOption(option);
                    if(pair)
                        optionsHash.insert(pair.value().first, pair.value().second);
                }
            }
        }
    }

    FilterLeaf::UrlMatchType matchType = FilterLeaf::InvalidMatch;

    if(parsedLine.startsWith(QLatin1Literal("/")) && parsedLine.endsWith(QLatin1Literal("/"))) {
        // regular expression rule
        matchType = FilterLeaf::RegularExpressionMatch;
        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);

    } else if(parsedLine.startsWith(QLatin1Literal("||")) && parsedLine.endsWith(QLatin1Literal("^"))) {
        matchType = FilterLeaf::DomainMatch;
        parsedLine = parsedLine.mid(2, parsedLine.length() - 3);

    } else if(parsedLine.startsWith(QLatin1Literal("|")) && parsedLine.endsWith(QLatin1Literal("|"))) {
        // string equals rule
        matchType = FilterLeaf::StringEquals;
        parsedLine = parsedLine.mid(1, parsedLine.length() - 2);

    } else if(parsedLine.startsWith(QLatin1Literal("||"))) {
        // string starts with rule
        matchType = FilterLeaf::StringStartsWith;
        parsedLine = parsedLine.mid(2);

    } else if(parsedLine.endsWith(QLatin1Literal("|"))) {
        // string ends with rule
        matchType = FilterLeaf::StringEndsWith;
        parsedLine.chop(1);

    } else {
        // generic contains rule
        matchType = FilterLeaf::StringContains;

        // Basic filter rules can use wildcards, which were supported by QRegExp,
        // but were deprecated in QRegularExpression.

        // remove beginning and ending wildcards
        if(parsedLine.startsWith(QLatin1Literal("*")))
            parsedLine = parsedLine.mid(1);

        if(parsedLine.endsWith(QLatin1Literal("*")))
            parsedLine.chop(1);

        if(parsedLine.contains(QLatin1Literal("*")) || parsedLine.contains(QLatin1Literal("^"))) {
            // check for wildcards and translate to regexp
            // wildcard "*" - any number of characters
            // separator "^" - end, ? or /
            parsedLine.replace(QLatin1Literal("||"), QLatin1Literal("^\\w+://"));
            parsedLine.replace(QLatin1Literal("|"), QLatin1Literal("\\|"));
            parsedLine.replace(QLatin1Literal("*"), QLatin1Literal(".*"));
            parsedLine.replace(QLatin1Literal("^"), QLatin1Literal("($|\\?|\\/)"));

            matchType = FilterLeaf::RegularExpressionMatch;
        }
    }

    AdBlockRule *rule = new AdBlockRule(matchType, parsedLine, action);
    rule->mergeOptions(optionsHash);
    return rule;
}

std::optional<QPair<QWebEngineUrlRequestInfo::ResourceType, bool>> parseOption(const QString &option)
{
    const bool exception = !option.startsWith(QLatin1Literal("~"));

    if(option.endsWith(QLatin1Literal("script"))) {
        //  external scripts loaded via HTML script tag
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeScript, exception);

    } else if(option.endsWith(QLatin1Literal("image"))) {
        // regular images, typically loaded via HTML img tag
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeImage, exception);

    } else if(option.endsWith(QLatin1Literal("stylesheet"))) {
        // external CSS stylesheet files
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeStylesheet, exception);

    } else if(option.endsWith(QLatin1Literal("object"))) {
        // content handled by browser plugins, e.g. Flash or Java
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeObject, exception);

    } else if(option.endsWith(QLatin1Literal("xmlhttprequest"))) {
        //  requests started using the XMLHttpRequest object or fetch() API
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeXhr, exception);

    } else if(option.endsWith(QLatin1Literal("object-subrequest"))) {
        // requests started by plugins like Flash
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePluginResource, exception);

    } else if(option.endsWith(QLatin1Literal("subdocument"))) {
        // embedded pages, usually included via HTML frames
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeSubFrame, exception);

    } else if(option.endsWith(QLatin1Literal("ping"))) {
        // requests started by <a ping> or navigator.sendBeacon()
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypePing, exception);

    } else if(option.endsWith(QLatin1Literal("websocket"))) {
        // requests initiated via WebSocket object
        qDebug("Resource type 'websocket' not available");

    } else if(option.endsWith(QLatin1Literal("webrtc"))) {
        // connections opened via RTCPeerConnection instances to ICE servers
        qDebug("Resource type 'webrtc' not available");

    } else if(option.endsWith(QLatin1Literal("document"))) {
        // the page itself
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeMainFrame, exception);

    } else if(option.endsWith(QLatin1Literal("other"))) {
        return qMakePair(QWebEngineUrlRequestInfo::ResourceTypeUnknown, exception);
    }

    return std::nullopt;
}