From e5815346cd4870640051856abed0887717aa0bf7 Mon Sep 17 00:00:00 2001 From: Benjamin Poulain Date: Wed, 18 Aug 2010 23:41:05 +0200 Subject: Improve the performance of AdBlockRuleTextMatchImpl Comparing unicode string without case sensitive is rather expensive because each codepoint must be converted, which is non trivial for unicode. This patch introduce a new argument of ::match() taking the encoded url in lowercase. This way, the conversion can be done only once for a lot of rules. --- src/adblock/adblockmanager.cpp | 10 +++++++--- src/adblock/adblockrule.h | 5 +++-- src/adblock/adblockrulefallbackimpl.cpp | 2 +- src/adblock/adblockrulefallbackimpl.h | 2 +- src/adblock/adblockruleimpl.h | 2 +- src/adblock/adblockruletextmatchimpl.cpp | 11 ++++++++--- src/adblock/adblockruletextmatchimpl.h | 2 +- 7 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/adblock/adblockmanager.cpp b/src/adblock/adblockmanager.cpp index 644ecff5..e0a109f0 100644 --- a/src/adblock/adblockmanager.cpp +++ b/src/adblock/adblockmanager.cpp @@ -161,11 +161,14 @@ QNetworkReply *AdBlockManager::block(const QNetworkRequest &request, WebPage *pa return 0; QString urlString = request.url().toString(); + // We compute a lowercase version of the URL so each rule does not + // have to do it. + const QString urlStringLowerCase = urlString.toLower(); // check white rules before :) foreach(const AdBlockRule &filter, _whiteList) { - if (filter.match(urlString)) + if (filter.match(urlString, urlStringLowerCase)) { kDebug() << "****ADBLOCK: WHITE RULE (@@) Matched: ***********"; kDebug() << "UrlString: " << urlString; @@ -176,7 +179,7 @@ QNetworkReply *AdBlockManager::block(const QNetworkRequest &request, WebPage *pa // then check the black ones :( foreach(const AdBlockRule &filter, _blackList) { - if (filter.match(urlString)) + if (filter.match(urlString, urlStringLowerCase)) { kDebug() << "****ADBLOCK: BLACK RULE Matched: ***********"; kDebug() << "UrlString: " << urlString; @@ -185,7 +188,8 @@ QNetworkReply *AdBlockManager::block(const QNetworkRequest &request, WebPage *pa QWebElementCollection elements = document.findAll("*"); foreach(QWebElement el, elements) { - if (filter.match(el.attribute("src"))) + const QString srcAttribute = el.attribute("src"); + if (filter.match(srcAttribute, srcAttribute.toLower())) { kDebug() << "MATCHES ATTRIBUTE!!!!!"; el.setStyleProperty(QL1S("visibility"), QL1S("hidden")); diff --git a/src/adblock/adblockrule.h b/src/adblock/adblockrule.h index 04409688..ef7b2f5f 100644 --- a/src/adblock/adblockrule.h +++ b/src/adblock/adblockrule.h @@ -70,9 +70,10 @@ class AdBlockRule public: AdBlockRule(const QString &filter); - bool match(const QString &encodedUrl) const + bool match(const QString &encodedUrl, const QString &encodedUrlLowerCase) const { - return m_implementation->match(encodedUrl); + Q_ASSERT(encodedUrl.toLower() == encodedUrlLowerCase); + return m_implementation->match(encodedUrl, encodedUrlLowerCase); } private: diff --git a/src/adblock/adblockrulefallbackimpl.cpp b/src/adblock/adblockrulefallbackimpl.cpp index decb895d..988f2895 100644 --- a/src/adblock/adblockrulefallbackimpl.cpp +++ b/src/adblock/adblockrulefallbackimpl.cpp @@ -61,7 +61,7 @@ AdBlockRuleFallbackImpl::AdBlockRuleFallbackImpl(const QString &filter) m_regExp.setPattern(parsedLine); } -bool AdBlockRuleFallbackImpl::match(const QString &encodedUrl) const +bool AdBlockRuleFallbackImpl::match(const QString &encodedUrl, const QString &) const { return m_regExp.indexIn(encodedUrl) != -1; } diff --git a/src/adblock/adblockrulefallbackimpl.h b/src/adblock/adblockrulefallbackimpl.h index 4e7ca555..ed0f6dc6 100644 --- a/src/adblock/adblockrulefallbackimpl.h +++ b/src/adblock/adblockrulefallbackimpl.h @@ -36,7 +36,7 @@ class AdBlockRuleFallbackImpl : public AdBlockRuleImpl { public: AdBlockRuleFallbackImpl(const QString &filter); - bool match(const QString &encodedUrl) const; + bool match(const QString &encodedUrl, const QString &encodedUrlLowerCase) const; private: QString convertPatternToRegExp(const QString &wildcardPattern); diff --git a/src/adblock/adblockruleimpl.h b/src/adblock/adblockruleimpl.h index da367aeb..db5cec30 100644 --- a/src/adblock/adblockruleimpl.h +++ b/src/adblock/adblockruleimpl.h @@ -33,7 +33,7 @@ class AdBlockRuleImpl public: AdBlockRuleImpl(const QString &) {} virtual ~AdBlockRuleImpl() {} - virtual bool match(const QString &encodedUrl) const = 0; + virtual bool match(const QString &encodedUrl, const QString &encodedUrlLowerCase) const = 0; }; #endif // ADBLOCKRULEIMPL_H diff --git a/src/adblock/adblockruletextmatchimpl.cpp b/src/adblock/adblockruletextmatchimpl.cpp index 7c02ea37..892d78e0 100644 --- a/src/adblock/adblockruletextmatchimpl.cpp +++ b/src/adblock/adblockruletextmatchimpl.cpp @@ -34,13 +34,18 @@ AdBlockRuleTextMatchImpl::AdBlockRuleTextMatchImpl(const QString &filter) { Q_ASSERT(AdBlockRuleTextMatchImpl::isTextMatchFilter(filter)); - m_textToMatch = filter; + m_textToMatch = filter.toLower(); m_textToMatch.remove(QL1C('*')); } -bool AdBlockRuleTextMatchImpl::match(const QString &encodedUrl) const +bool AdBlockRuleTextMatchImpl::match(const QString &encodedUrl, const QString &encodedUrlLowerCase) const { - return encodedUrl.contains(m_textToMatch, Qt::CaseInsensitive); + Q_UNUSED(encodedUrl); + // Case sensitive compare is faster, but would be incorrect with encodedUrl since + // we do want case insensitive. + // What we do is work on a lowercase version of m_textToMatch, and compare to the lowercase + // version of encodedUrl. + return encodedUrlLowerCase.contains(m_textToMatch, Qt::CaseSensitive); } bool AdBlockRuleTextMatchImpl::isTextMatchFilter(const QString &filter) diff --git a/src/adblock/adblockruletextmatchimpl.h b/src/adblock/adblockruletextmatchimpl.h index f0e78be0..28b0656c 100644 --- a/src/adblock/adblockruletextmatchimpl.h +++ b/src/adblock/adblockruletextmatchimpl.h @@ -36,7 +36,7 @@ class AdBlockRuleTextMatchImpl : public AdBlockRuleImpl { public: AdBlockRuleTextMatchImpl(const QString &filter); - bool match(const QString &encodedUrl) const; + bool match(const QString &encodedUrl, const QString &encodedUrlLowerCase) const; static bool isTextMatchFilter(const QString &filter); -- cgit v1.2.1 From a4631fb9ec2541f99e1aba05cf1e7d2b5ebecb98 Mon Sep 17 00:00:00 2001 From: Benjamin Poulain Date: Thu, 19 Aug 2010 02:26:40 +0200 Subject: Add a special matcher for ad block filters for host name Quite a few rules of ad block are just matching domains. Those are of the form: ||trolltech.com^$options This patch add a new class to deal with this kind of filter, AdBlockHostMatcher. Matching a host address is much faster (O(1)) than going through the entire list of rules. --- src/CMakeLists.txt | 1 + src/adblock/adblockhostmatcher.cpp | 55 ++++++++++++++++++++++++++++++++++++++ src/adblock/adblockhostmatcher.h | 51 +++++++++++++++++++++++++++++++++++ src/adblock/adblockmanager.cpp | 24 ++++++++++++++++- src/adblock/adblockmanager.h | 3 +++ 5 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 src/adblock/adblockhostmatcher.cpp create mode 100644 src/adblock/adblockhostmatcher.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dd90fc10..e873052b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -50,6 +50,7 @@ SET( rekonq_KDEINIT_SRCS bookmarks/bookmarksproxy.cpp bookmarks/bookmarkcontextmenu.cpp #---------------------------------------- + adblock/adblockhostmatcher.cpp adblock/adblockmanager.cpp adblock/adblocknetworkreply.cpp adblock/adblockrule.cpp diff --git a/src/adblock/adblockhostmatcher.cpp b/src/adblock/adblockhostmatcher.cpp new file mode 100644 index 00000000..b11dab2c --- /dev/null +++ b/src/adblock/adblockhostmatcher.cpp @@ -0,0 +1,55 @@ +/* ============================================================ +* +* This file is a part of the rekonq project +* +* Copyright (C) 2010 by Benjamin Poulain +* +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of +* the License or (at your option) version 3 or any later version +* accepted by the membership of KDE e.V. (or its successor approved +* by the membership of KDE e.V.), which shall act as a proxy +* defined in Section 14 of version 3 of the license. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . +* +* ============================================================ */ + +// Self Includes +#include "adblockhostmatcher.h" + +// Rekonq Includes +#include "rekonq_defines.h" + +bool AdBlockHostMatcher::tryAddFilter(const QString &filter) +{ + if (filter.startsWith(QL1S("||"))) { + QString domain = filter.mid(2); + + const int indexOfFirstSeparator = domain.indexOf(QL1C('^')); + if (indexOfFirstSeparator < 0) + return false; + + const int indexOfLastDollar = domain.lastIndexOf(QL1C('$')); + if (indexOfLastDollar >= 0 && indexOfLastDollar != indexOfFirstSeparator + 1) + return false; + + domain = domain.left(indexOfFirstSeparator); + if (domain.contains(QL1C('/')) || domain.contains(QL1C('*'))) + return false; + + domain = domain.toLower(); + m_hostList.insert(domain); + m_hostList.insert(QL1S("www.") + domain); + return true; + } + return false; +} diff --git a/src/adblock/adblockhostmatcher.h b/src/adblock/adblockhostmatcher.h new file mode 100644 index 00000000..0a15bd4e --- /dev/null +++ b/src/adblock/adblockhostmatcher.h @@ -0,0 +1,51 @@ +/* ============================================================ +* +* This file is a part of the rekonq project +* +* Copyright (C) 2010 by Benjamin Poulain +* +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of +* the License or (at your option) version 3 or any later version +* accepted by the membership of KDE e.V. (or its successor approved +* by the membership of KDE e.V.), which shall act as a proxy +* defined in Section 14 of version 3 of the license. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . +* +* ============================================================ */ + +#ifndef ADBLOCKHOSTMATCHER_H +#define ADBLOCKHOSTMATCHER_H + +#include +#include + +class AdBlockHostMatcher +{ +public: + // Try to add an adblock filter to this host matcher. + // If the filter is not an hostname, the filter is not added + // and the method return false; + bool tryAddFilter(const QString &filter); + + bool match(const QString &host) const + { + return m_hostList.contains(host.toLower()); + } + + void clear() { m_hostList.clear(); } + +private: + QSet m_hostList; +}; + +#endif // ADBLOCKHOSTMATCHER_H diff --git a/src/adblock/adblockmanager.cpp b/src/adblock/adblockmanager.cpp index e0a109f0..600dc5ce 100644 --- a/src/adblock/adblockmanager.cpp +++ b/src/adblock/adblockmanager.cpp @@ -67,6 +67,8 @@ void AdBlockManager::loadSettings(bool checkUpdateDate) _index = 0; _buffer.clear(); + _hostWhiteList.clear(); + _hostBlackList.clear(); _whiteList.clear(); _blackList.clear(); _hideList.clear(); @@ -133,7 +135,10 @@ void AdBlockManager::loadRules(const QStringList &rules) // white rules if (stringRule.startsWith(QL1S("@@"))) { - AdBlockRule rule(stringRule.mid(2)); + const QString filter = stringRule.mid(2); + if (_hostWhiteList.tryAddFilter(filter)) + continue; + AdBlockRule rule(filter); _whiteList << rule; continue; } @@ -145,6 +150,8 @@ void AdBlockManager::loadRules(const QStringList &rules) continue; } + if (_hostBlackList.tryAddFilter(stringRule)) + continue; AdBlockRule rule(stringRule); _blackList << rule; } @@ -164,8 +171,16 @@ QNetworkReply *AdBlockManager::block(const QNetworkRequest &request, WebPage *pa // We compute a lowercase version of the URL so each rule does not // have to do it. const QString urlStringLowerCase = urlString.toLower(); + const QString host = request.url().host(); // check white rules before :) + + if (_hostWhiteList.match(host)) { + kDebug() << "****ADBLOCK: WHITE RULE (@@) Matched by host matcher: ***********"; + kDebug() << "UrlString: " << urlString; + return 0; + } + foreach(const AdBlockRule &filter, _whiteList) { if (filter.match(urlString, urlStringLowerCase)) @@ -177,6 +192,13 @@ QNetworkReply *AdBlockManager::block(const QNetworkRequest &request, WebPage *pa } // then check the black ones :( + if (_hostBlackList.match(host)) { + kDebug() << "****ADBLOCK: BLACK RULE Matched by host matcher: ***********"; + kDebug() << "UrlString: " << urlString; + AdBlockNetworkReply *reply = new AdBlockNetworkReply(request, urlString, this); + return reply; + } + foreach(const AdBlockRule &filter, _blackList) { if (filter.match(urlString, urlStringLowerCase)) diff --git a/src/adblock/adblockmanager.h b/src/adblock/adblockmanager.h index eae761e0..69548994 100644 --- a/src/adblock/adblockmanager.h +++ b/src/adblock/adblockmanager.h @@ -108,6 +108,7 @@ #include "rekonq_defines.h" // Local Includes +#include "adblockhostmatcher.h" #include "adblockrule.h" // KDE Includes @@ -155,6 +156,8 @@ private: bool _isAdblockEnabled; bool _isHideAdsEnabled; + AdBlockHostMatcher _hostBlackList; + AdBlockHostMatcher _hostWhiteList; AdBlockRuleList _blackList; AdBlockRuleList _whiteList; QStringList _hideList; -- cgit v1.2.1 From 718c6d4c33b24723ee6861e017e269662f193b96 Mon Sep 17 00:00:00 2001 From: Benjamin Poulain Date: Thu, 19 Aug 2010 02:35:38 +0200 Subject: Skip the hiding rules specific to domains The rules to hide elements for a specific domains were interpreted as regular RegExp rules, which grows the list of filter to test. Those rules are not working with the current implementation, we should just skip them for efficiency. --- src/adblock/adblockmanager.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/adblock/adblockmanager.cpp b/src/adblock/adblockmanager.cpp index 600dc5ce..8deb0bcd 100644 --- a/src/adblock/adblockmanager.cpp +++ b/src/adblock/adblockmanager.cpp @@ -150,6 +150,10 @@ void AdBlockManager::loadRules(const QStringList &rules) continue; } + // TODO implement domain-specific hiding + if (stringRule.contains(QL1S("##"))) + continue; + if (_hostBlackList.tryAddFilter(stringRule)) continue; AdBlockRule rule(stringRule); -- cgit v1.2.1