// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "extensions/common/url_pattern.h"
#include <stddef.h>
#include <ostream>
#include "base/stl_util.h"
#include "base/strings/pattern.h"
#include "base/strings/strcat.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "content/public/common/url_constants.h"
#include "extensions/common/constants.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/url_util.h"
#include "url/gurl.h"
#include "url/url_util.h"
// Special pattern string that Parse() treats as matching every valid URL.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
namespace {
// Schemes that a URL pattern may name. This table is scanned by
// IsValidSchemeForExtensions(), IsValidScheme() and GetExplicitSchemes().
// TODO(aa): What about more obscure schemes like javascript: ?
// Note: keep this array in sync with kValidSchemeMasks.
const char* const kValidSchemes[] = {
url::kHttpScheme, url::kHttpsScheme,
url::kFileScheme, url::kFtpScheme,
content::kChromeUIScheme, extensions::kExtensionScheme,
url::kFileSystemScheme, url::kWsScheme,
url::kWssScheme, url::kDataScheme,
// Bitmask for each entry of kValidSchemes, paired up by index (see
// IsValidScheme()).
// NOTE(review): the terminator of kValidSchemes and the entries of
// kValidSchemeMasks do not appear here - confirm nothing was lost.
const int kValidSchemeMasks[] = {
// Compile-time guard that the two parallel arrays have the same length.
static_assert(base::size(kValidSchemes) == base::size(kValidSchemeMasks),
"must keep these arrays in sync");
// Human-readable descriptions of the possible outcomes of parsing a pattern:
// one success message followed by one message per kind of failure.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";
// Message explaining each URLPattern::ParseResult. GetParseResultString()
// indexes this array with the enum value cast to int.
const char* const kParseResultMessages[] = {
// NOTE(review): the array entries and the right-hand operand of the
// comparison below do not appear here - confirm against the original file.
// The assertion message indicates it is meant to require one entry per
// ParseResult value.
static_assert(static_cast<int>(URLPattern::ParseResult::kNumParseResults) ==
"must add message for each parse result");
// Separator that Parse() searches for to find where the host ends and the
// path begins.
const char kPathSeparator[] = "/";
// Returns true if |scheme| should be handled as a standard scheme: either the
// "*" wildcard or a scheme that url::IsStandard() accepts.
bool IsStandardScheme(base::StringPiece scheme) {
// "*" gets the same treatment as a standard scheme.
if (scheme == "*")
return true;
// The url::Component covers the whole of |scheme| (offset 0, full length).
// NOTE(review): the first argument to url::IsStandard() appears to be missing
// on the next line - confirm.
return url::IsStandard(,
url::Component(0, static_cast<int>(scheme.length())));
// Returns true if |port| is acceptable in a pattern whose scheme is |scheme|:
// either the "*" wildcard or a decimal integer in the range [0, 65535].
bool IsValidPortForScheme(base::StringPiece scheme, base::StringPiece port) {
if (port == "*")
return true;
// Only accept non-wildcard ports if the scheme uses ports.
// NOTE(review): the condition below appears truncated (its first argument
// and its right-hand operand are missing) - confirm.
if (url::DefaultPortForScheme(, scheme.length()) ==
return false;
// Anything that does not parse as an integer is rejected.
int parsed_port = url::PORT_UNSPECIFIED;
if (!base::StringToInt(port, &parsed_port))
return false;
return (parsed_port >= 0) && (parsed_port < 65536);
// Returns |path| with the trailing wildcard stripped if one existed.
// The functions that rely on this (OverlapsWith and Contains) are only
// called for the patterns inside URLPatternSet. In those cases, we know that
// the path will have only a single wildcard at the end. This makes figuring
// out overlap much easier. It seems like there is probably a computer-sciency
// way to solve the general case, but we don't need that yet.
base::StringPiece StripTrailingWildcard(base::StringPiece path) {
// NOTE(review): no statement that actually removes the trailing "*" appears
// between the check and the return - confirm.
if (path.ends_with("*"))
return path;
// Removes trailing dot from |host_piece| if any.
// Used by MatchesHost() on both the test host and the pattern host before
// they are compared.
base::StringPiece CanonicalizeHostForMatching(base::StringPiece host_piece) {
// NOTE(review): no statement that actually removes the trailing "." appears
// between the check and the return - confirm.
if (host_piece.ends_with("."))
return host_piece;
} // namespace
// static
// Returns true if |scheme| is exactly equal to one of the entries of
// kValidSchemes, and false otherwise.
bool URLPattern::IsValidSchemeForExtensions(base::StringPiece scheme) {
  for (const char* valid_scheme : kValidSchemes) {
    if (scheme == valid_scheme)
      return true;
  }
  return false;
}
// static
// Returns the bitwise OR of every entry in kValidSchemeMasks.
int URLPattern::GetValidSchemeMaskForExtensions() {
  int result = 0;
  for (int mask : kValidSchemeMasks)
    result |= mask;
  return result;
}
// Member initializers: no valid schemes (SCHEME_NONE) and a wildcard port.
// NOTE(review): the constructor signature that this initializer list belongs
// to does not appear above it - confirm.
: valid_schemes_(SCHEME_NONE),
port_("*") {}
// Constructs an unparsed pattern whose allowed schemes are the
// |valid_schemes| bitmask. The port starts out as the "*" wildcard.
URLPattern::URLPattern(int valid_schemes)
: valid_schemes_(valid_schemes),
port_("*") {}
// Constructs a pattern by parsing |pattern|, with |valid_schemes| as the
// bitmask of allowed schemes. A parse failure is logged and then hits
// NOTREACHED().
URLPattern::URLPattern(int valid_schemes, base::StringPiece pattern)
// Strict error checking is used, because this constructor is only
// appropriate when we know |pattern| is valid.
: valid_schemes_(valid_schemes),
port_("*") {
ParseResult result = Parse(pattern);
if (result != ParseResult::kSuccess) {
const char* error_string = GetParseResultString(result);
// Temporarily add more logging to investigate why this code path is
// reached.
LOG(ERROR) << "Invalid pattern was given " << pattern << " result "
<< error_string;
NOTREACHED() << "URLPattern invalid: '" << pattern
<< "'; error: " << error_string;
// Copy and move construction use the compiler-generated implementations.
URLPattern::URLPattern(const URLPattern& other) = default;
URLPattern::URLPattern(URLPattern&& other) = default;
// Destructor.
URLPattern::~URLPattern() {
// Copy and move assignment use the compiler-generated implementations.
URLPattern& URLPattern::operator=(const URLPattern& other) = default;
URLPattern& URLPattern::operator=(URLPattern&& other) = default;
// Orders patterns by comparing their GetAsString() forms as strings.
bool URLPattern::operator<(const URLPattern& other) const {
return GetAsString() < other.GetAsString();
// Reverse ordering; also a string comparison of the GetAsString() forms.
bool URLPattern::operator>(const URLPattern& other) const {
return GetAsString() > other.GetAsString();
// Two patterns are equal when their GetAsString() forms are identical. Only
// the string form is compared here (CreateIntersection() notes that
// valid_schemes_ is not taken into account).
bool URLPattern::operator==(const URLPattern& other) const {
return GetAsString() == other.GetAsString();
// Streams the pattern's GetAsString() form wrapped in double quotes and
// returns |out| so that calls can be chained.
std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
return out << '"' << url_pattern.GetAsString() << '"';
// Parses |pattern| piece by piece (scheme, then host/port, then path) and
// returns a ParseResult that is kSuccess or identifies the first problem
// found. The scheme and port are stored through SetScheme() and SetPort();
// host_ and match_subdomains_ are written directly.
URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern) {
// Special case pattern to match every valid URL.
if (pattern == kAllUrlsPattern) {
return ParseResult::kSuccess;
// Parse out the scheme.
size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
bool has_standard_scheme_separator = true;
// Some urls also use ':' alone as the scheme separator.
if (scheme_end_pos == base::StringPiece::npos) {
scheme_end_pos = pattern.find(':');
has_standard_scheme_separator = false;
if (scheme_end_pos == base::StringPiece::npos)
return ParseResult::kMissingSchemeSeparator;
if (!SetScheme(pattern.substr(0, scheme_end_pos)))
return ParseResult::kInvalidScheme;
// A standard scheme must have been written with the standard separator and a
// non-standard scheme with the bare ':'; any mismatch is rejected.
bool standard_scheme = IsStandardScheme(scheme_);
if (standard_scheme != has_standard_scheme_separator)
return ParseResult::kWrongSchemeSeparator;
// Advance past the scheme separator.
scheme_end_pos +=
(standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
// Nothing follows the separator.
if (scheme_end_pos >= pattern.size())
return ParseResult::kEmptyHost;
// Parse out the host and path.
size_t host_start_pos = scheme_end_pos;
size_t path_start_pos = 0;
if (!standard_scheme) {
// Non-standard schemes get no host parsing: everything after the separator
// is treated as the path.
path_start_pos = host_start_pos;
} else if (scheme_ == url::kFileScheme) {
size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
if (host_end_pos == base::StringPiece::npos) {
// Allow hostname omission.
// e.g. file://* is interpreted as file:///*,
// file://foo* is interpreted as file:///foo*.
// Backing up one character makes the path start on the last character of
// the scheme separator.
path_start_pos = host_start_pos - 1;
} else {
// Ignore hostname if scheme is file://.
// e.g. file://localhost/foo is equal to file:///foo.
path_start_pos = host_end_pos;
} else {
size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
// Host is required.
if (host_start_pos == host_end_pos)
return ParseResult::kEmptyHost;
if (host_end_pos == base::StringPiece::npos)
return ParseResult::kEmptyPath;
// |host_and_port| is non-empty from here on: the empty-host and missing-path
// cases have already returned, so indexing element 0 below is safe.
base::StringPiece host_and_port =
pattern.substr(host_start_pos, host_end_pos - host_start_pos);
size_t port_separator_pos = base::StringPiece::npos;
if (host_and_port[0] != '[') {
// Not IPv6 (either IPv4 or just a normal address).
port_separator_pos = host_and_port.find(':');
} else { // IPv6.
// Index of the closing ']' within |host_and_port| (distinct from the
// host_end_pos declared earlier, which indexes into |pattern|).
size_t host_end_pos = host_and_port.find(']');
if (host_end_pos == base::StringPiece::npos)
return ParseResult::kInvalidHost;
// "[]" with nothing between the brackets.
if (host_end_pos == 1)
return ParseResult::kEmptyHost;
if (host_end_pos < host_and_port.length() - 1) {
// The host isn't the only component. Check for a port. This would
// require a ':' to follow the closing ']' from the host.
if (host_and_port[host_end_pos + 1] != ':')
return ParseResult::kInvalidHost;
port_separator_pos = host_end_pos + 1;
// A port was written explicitly; it has to pass SetPort().
if (port_separator_pos != base::StringPiece::npos &&
!SetPort(host_and_port.substr(port_separator_pos + 1))) {
return ParseResult::kInvalidPort;
// Note: this substr() will be the entire string if the port position
// wasn't found.
base::StringPiece host_piece = host_and_port.substr(0, port_separator_pos);
if (host_piece.empty())
return ParseResult::kEmptyHost;
// A leading "*" (alone or as "*.") turns on subdomain matching.
if (host_piece == "*") {
match_subdomains_ = true;
} else if (host_piece.starts_with("*.")) {
if (host_piece.length() == 2) {
// We don't allow just '*.' as a host.
return ParseResult::kEmptyHost;
match_subdomains_ = true;
host_piece = host_piece.substr(2);
host_ = host_piece.as_string();
path_start_pos = host_end_pos;
// No other '*' can occur in the host, though. This isn't necessary, but is
// done as a convenience to developers who might otherwise be confused and
// think '*' works as a glob in the host.
if (host_.find('*') != std::string::npos)
return ParseResult::kInvalidHostWildcard;
if (!host_.empty()) {
// If |host_| is present (i.e., isn't a wildcard), we need to canonicalize
// it.
url::CanonHostInfo host_info;
host_ = net::CanonicalizeHost(host_, &host_info);
// net::CanonicalizeHost() returns an empty string on failure.
if (host_.empty())
return ParseResult::kInvalidHost;
// Null characters are not allowed in hosts.
if (host_.find('\0') != std::string::npos)
return ParseResult::kInvalidHost;
return ParseResult::kSuccess;
// Replaces the bitmask of schemes that this pattern is allowed to match. No
// consistency check against |scheme_| is performed (see the TODO below).
void URLPattern::SetValidSchemes(int valid_schemes) {
// TODO(devlin): Should we check that valid_schemes agrees with |scheme_|
// here? Otherwise, valid_schemes_ and schemes_ may stop agreeing with each
// other (e.g., in the case of `*://*/*`, where the scheme should only be
// http or https).
valid_schemes_ = valid_schemes;
// Presumably stores |host| as the pattern's host.
// NOTE(review): no body appears for this function - confirm.
void URLPattern::SetHost(base::StringPiece host) {
// Sets whether this pattern matches all URLs. Turning the flag on also
// enables subdomain matching and sets the scheme to the "*" wildcard; turning
// it off changes only the flag itself.
void URLPattern::SetMatchAllURLs(bool val) {
match_all_urls_ = val;
if (val) {
match_subdomains_ = true;
scheme_ = "*";
// Sets whether subdomains of the pattern's host are matched (this flag is
// consulted by MatchesHost()).
void URLPattern::SetMatchSubdomains(bool val) {
match_subdomains_ = val;
// Validates the pattern's scheme and returns false if it is not allowed by
// valid_schemes_. For the "*" wildcard, valid_schemes_ is narrowed to the
// HTTP and HTTPS bits.
// NOTE(review): |scheme| is not used in the code below; the assignment to
// |scheme_| presumably belongs before these checks - confirm.
bool URLPattern::SetScheme(base::StringPiece scheme) {
if (scheme_ == "*") {
valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
} else if (!IsValidScheme(scheme_)) {
return false;
return true;
// Returns true if |scheme| is permitted by valid_schemes_: always when
// valid_schemes_ is SCHEME_ALL, otherwise only when |scheme| appears in
// kValidSchemes and its bit from kValidSchemeMasks is set in valid_schemes_.
bool URLPattern::IsValidScheme(base::StringPiece scheme) const {
if (valid_schemes_ == SCHEME_ALL)
return true;
// The two tables are parallel, so index i pairs a scheme with its mask.
for (size_t i = 0; i < base::size(kValidSchemes); ++i) {
if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
return true;
return false;
// Rebuilds path_escaped_ (the form MatchesPath() passes to
// base::MatchPattern()) from path_ by backslash-escaping every backslash and
// every '?'.
// NOTE(review): |path| is not used in the code below; the assignment to
// |path_| presumably belongs before these statements - confirm.
void URLPattern::SetPath(base::StringPiece path) {
path_escaped_ = path_;
// Backslashes are escaped first so that the backslashes introduced for '?'
// on the following line are not themselves doubled.
base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
// Returns true if |port| is valid for the current scheme_ according to
// IsValidPortForScheme(), and false otherwise.
// NOTE(review): no assignment of |port| to |port_| appears on the success
// path - confirm.
bool URLPattern::SetPort(base::StringPiece port) {
if (IsValidPortForScheme(scheme_, port)) {
return true;
return false;
// Returns true if |test| is matched by this pattern. When |test| has an inner
// URL it must be a filesystem: URL, and the scheme and origin checks are then
// applied to that inner URL instead.
bool URLPattern::MatchesURL(const GURL& test) const {
const GURL* test_url = &test;
bool has_inner_url = test.inner_url() != nullptr;
if (has_inner_url) {
if (!test.SchemeIsFileSystem())
return false; // The only nested URLs we handle are filesystem URLs.
test_url = test.inner_url();
// Ensure the scheme matches first, since <all_urls> may not match this URL if
// the scheme is excluded.
if (!MatchesScheme(test_url->scheme_piece()))
return false;
if (match_all_urls_)
return true;
// Unless |match_all_urls_| is true, the grammar only permits matching
// URLs with nonempty paths.
if (!test.has_path())
return false;
std::string path_for_request = test.PathForRequest();
if (has_inner_url) {
// NOTE(review): the remaining arguments of this StringPrintf() call and the
// second operand of the final && do not appear here - confirm.
path_for_request = base::StringPrintf("%s%s", test_url->path_piece().data(),
return MatchesSecurityOriginHelper(*test_url) &&
// Returns true if the security origin of |test| is matched by this pattern;
// the path is not considered. When |test| has an inner URL it must be a
// filesystem: URL, and the checks are then applied to that inner URL.
bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
  const GURL* test_url = &test;
  const bool has_inner_url = test.inner_url() != nullptr;
  if (has_inner_url) {
    if (!test.SchemeIsFileSystem())
      return false;  // The only nested URLs we handle are filesystem URLs.

    test_url = test.inner_url();
  }

  // The scheme is checked first, before the match-all shortcut below, so that
  // a match-all pattern still rejects schemes that are not allowed.
  // scheme_piece() is used, as in MatchesURL(), to avoid copying the scheme
  // into a temporary std::string.
  if (!MatchesScheme(test_url->scheme_piece()))
    return false;

  if (match_all_urls_)
    return true;

  return MatchesSecurityOriginHelper(*test_url);
}
// Returns true if |test| is a scheme allowed by valid_schemes_ and the
// pattern's own scheme is either the "*" wildcard or equal to |test|.
bool URLPattern::MatchesScheme(base::StringPiece test) const {
if (!IsValidScheme(test))
return false;
return scheme_ == "*" || test == scheme_;
// String overload of MatchesHost(): wraps |host| in a GURL and forwards to
// the GURL overload so that the host is canonicalized before it is compared.
bool URLPattern::MatchesHost(base::StringPiece host) const {
// TODO(devlin): This is a bit sad. Parsing urls is expensive. However, it's
// important that we do this conversion to a GURL in order to canonicalize the
// host (the pattern's host_ already is canonicalized from Parse()). We can't
// just do string comparison.
// NOTE(review): the remaining arguments of the StringPrintf() call below do
// not appear here - confirm.
return MatchesHost(
GURL(base::StringPrintf("%s%s%s/", url::kHttpScheme,
// Returns true if the host of |test| is matched by this pattern's host,
// honoring match_subdomains_. Both hosts are passed through
// CanonicalizeHostForMatching() before being compared.
bool URLPattern::MatchesHost(const GURL& test) const {
base::StringPiece test_host(CanonicalizeHostForMatching(test.host_piece()));
const base::StringPiece pattern_host(CanonicalizeHostForMatching(host_));
// If the hosts are exactly equal, we have a match.
if (test_host == pattern_host)
return true;
// If we're matching subdomains, and we have no host in the match pattern,
// that means that we're matching all hosts, which means we have a match no
// matter what the test host is.
if (match_subdomains_ && pattern_host.empty())
return true;
// Otherwise, we can only match if our match pattern matches subdomains.
if (!match_subdomains_)
return false;
// We don't do subdomain matching against IP addresses, so we can give up now
// if the test host is an IP address.
if (test.HostIsIPAddress())
return false;
// Check if the test host is a subdomain of our host.
// A subdomain needs at least one character plus a '.' in front of the
// pattern host, so the test host must be longer than pattern_host + 1.
if (test_host.length() <= (pattern_host.length() + 1))
return false;
if (!test_host.ends_with(pattern_host))
return false;
// The character immediately before the matched suffix must be a '.', so that
// the suffix starts on a label boundary.
return test_host[test_host.length() - pattern_host.length() - 1] == '.';
// Returns true if this pattern matches every host under an effective TLD (or
// every host outright). |private_filter| and |unknown_filter| are forwarded
// to the registry-controlled-domain lookups below.
bool URLPattern::MatchesEffectiveTld(
net::registry_controlled_domains::PrivateRegistryFilter private_filter,
net::registry_controlled_domains::UnknownRegistryFilter unknown_filter)
const {
// Check if it matches all urls or is a pattern like http://*/*.
if (match_all_urls_ || (match_subdomains_ && host_.empty()))
return true;
// If this doesn't even match subdomains, it can't possibly be a TLD wildcard.
if (!match_subdomains_)
return false;
// If there was more than just a TLD in the host (i.e., the host already has
// a registry-controlled domain), it doesn't match all hosts in an effective
// TLD.
if (net::registry_controlled_domains::HostHasRegistryControlledDomain(
host_, unknown_filter, private_filter)) {
return false;
// At this point the host could either be just a TLD ("com") or some unknown
// TLD-like string ("notatld"). To disambiguate between them construct a
// fake URL, and check the registry.
// If we recognized this TLD, then this is a pattern like *.com, and it
// matches an effective TLD.
return net::registry_controlled_domains::HostHasRegistryControlledDomain(
"notatld." + host_, unknown_filter, private_filter);
// Returns true if the pattern does not match an effective TLD, has an
// explicit (non-"*") scheme, and does not match subdomains.
bool URLPattern::MatchesSingleOrigin() const {
// Strictly speaking, the port is part of the origin, but in URLPattern it
// defaults to *. It's not very interesting anyway, so leave it out.
return !MatchesEffectiveTld() && scheme_ != "*" && !match_subdomains_;
// Returns true if |test| is matched by this pattern's path, using
// base::MatchPattern() against path_escaped_. As a special case, |test| also
// matches when the pattern path is exactly |test| followed by "/*".
bool URLPattern::MatchesPath(base::StringPiece test) const {
// Make the behaviour of OverlapsWith consistent with MatchesURL, which is
// needed so that a pattern path ending in "/*" also matches the same path
// written without the trailing slash.
// The below if is a no-copy way of doing (test + "/*" == path_escaped_).
if (path_escaped_.length() == test.length() + 2 &&
base::StartsWith(path_escaped_.c_str(), test,
base::CompareCase::SENSITIVE) &&
base::EndsWith(path_escaped_, "/*", base::CompareCase::SENSITIVE)) {
return true;
return base::MatchPattern(test, path_escaped_);
// Returns the string form of this pattern. The result is cached in spec_ and
// returned directly on later calls while spec_ is non-empty.
const std::string& URLPattern::GetAsString() const {
if (!spec_.empty())
return spec_;
if (match_all_urls_) {
spec_ = kAllUrlsPattern;
return spec_;
// Standard schemes are followed by the standard separator, others by ':'.
bool standard_scheme = IsStandardScheme(scheme_);
std::string spec = scheme_ +
(standard_scheme ? url::kStandardSchemeSeparator : ":");
// Host and port are only written for standard, non-file schemes.
if (scheme_ != url::kFileScheme && standard_scheme) {
if (match_subdomains_) {
spec += "*";
if (!host_.empty())
spec += ".";
if (!host_.empty())
spec += host_;
// A wildcard port is left out of the string form.
if (port_ != "*") {
spec += ":";
spec += port_;
if (!path_.empty())
spec += path_;
spec_ = std::move(spec);
return spec_;
// Returns true if either pattern matches all URLs, or if the scheme, host,
// port and path checks below each succeed in at least one direction (this
// pattern against |other|, or |other| against this pattern).
// NOTE(review): several operands of the expression below do not appear here
// (after the first ||, inside the first MatchesHost() call, and after the
// last ||) - confirm.
bool URLPattern::OverlapsWith(const URLPattern& other) const {
if (match_all_urls() || other.match_all_urls())
return true;
return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
&& (MatchesHost( || other.MatchesHost(host()))
&& (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
&& (MatchesPath(StripTrailingWildcard(other.path())) ||
// Checks whether |other| is covered by this pattern: every explicit scheme of
// |other| must match, the host and port must match, and |other| may only
// match subdomains if this pattern does too.
// NOTE(review): the argument of the MatchesHost() call and the final operand
// of the && chain do not appear here - confirm.
bool URLPattern::Contains(const URLPattern& other) const {
// Important: it's not enough to just check match_all_urls(); we also need to
// make sure that the schemes in this pattern are a superset of those in
// |other|.
if (match_all_urls() &&
(valid_schemes_ & other.valid_schemes_) == other.valid_schemes_) {
return true;
return MatchesAllSchemes(other.GetExplicitSchemes()) &&
MatchesHost( &&
(!other.match_subdomains_ || match_subdomains_) &&
MatchesPortPattern(other.port()) &&
// Builds the intersection of this pattern and |other|. Returns base::nullopt
// when the valid schemes or any of scheme/host/port/path fail to overlap;
// otherwise returns a new URLPattern (or a copy of one of the inputs in the
// easy cases described below).
base::Optional<URLPattern> URLPattern::CreateIntersection(
const URLPattern& other) const {
// Easy case: Schemes don't overlap. Return nullopt.
int intersection_schemes = URLPattern::SCHEME_NONE;
if (valid_schemes_ == URLPattern::SCHEME_ALL)
intersection_schemes = other.valid_schemes_;
else if (other.valid_schemes_ == URLPattern::SCHEME_ALL)
intersection_schemes = valid_schemes_;
// NOTE(review): as written, the next assignment runs unconditionally and
// overwrites the two branches above; an `else` may have been lost - confirm.
intersection_schemes = valid_schemes_ & other.valid_schemes_;
if (intersection_schemes == URLPattern::SCHEME_NONE)
return base::nullopt;
// In a few cases, we can (mostly) return a copy of one of the patterns.
// This can happen when either:
// - The URLPattern's are identical (possibly excluding valid_schemes_)
// - One of the patterns has match_all_urls() equal to true.
// NOTE(devlin): Theoretically, we could use Contains() instead of
// match_all_urls() here. However, Contains() strips the trailing wildcard
// from the path, which could yield the incorrect result.
const URLPattern* copy_source = nullptr;
if (*this == other || other.match_all_urls())
copy_source = this;
else if (match_all_urls())
copy_source = &other;
if (copy_source) {
// NOTE: equality checks don't take into account valid_schemes_, and
// schemes can be different in the case of match_all_urls() as well, so
// we can't always just return *copy_source.
if (intersection_schemes == copy_source->valid_schemes_)
return *copy_source;
// Re-parse the source's string form under the narrower scheme mask.
URLPattern result(intersection_schemes);
ParseResult parse_result = result.Parse(copy_source->GetAsString());
CHECK_EQ(ParseResult::kSuccess, parse_result);
return result;
// No more easy cases. Go through component by component to find the patterns
// that intersect.
// Note: Alias the function type (rather than using auto) because
// MatchesHost() is overloaded.
using match_function_type = bool (URLPattern::*)(base::StringPiece) const;
// For one component, writes to |out| whichever of the two strings is matched
// by the opposite pattern (trying |other_str| against this pattern first) and
// returns true; returns false if neither pattern matches the other's string.
auto get_intersection = [this, &other](base::StringPiece own_str,
base::StringPiece other_str,
match_function_type match_function,
base::StringPiece* out) {
if ((this->*match_function)(other_str)) {
*out = other_str;
return true;
if ((other.*match_function)(own_str)) {
*out = own_str;
return true;
return false;
base::StringPiece scheme;
base::StringPiece host;
base::StringPiece port;
base::StringPiece path;
// If any pieces fail to overlap, then there is no intersection.
if (!get_intersection(scheme_, other.scheme_, &URLPattern::MatchesScheme,
&scheme) ||
!get_intersection(host_, other.host_, &URLPattern::MatchesHost, &host) ||
!get_intersection(port_, other.port_, &URLPattern::MatchesPortPattern,
&port) ||
!get_intersection(path_, other.path_, &URLPattern::MatchesPath, &path)) {
return base::nullopt;
// Only match subdomains if both patterns match subdomains.
base::StringPiece subdomains;
if (match_subdomains_ && other.match_subdomains_) {
// The host may be empty (e.g., in the case of *://*/* - in that case, only
// append '*' instead of '*.'.
subdomains = host.empty() ? "*" : "*.";
base::StringPiece scheme_separator =
IsStandardScheme(scheme) ? url::kStandardSchemeSeparator : ":";
// Reassemble the chosen pieces into a pattern string and parse it under the
// intersected scheme mask.
std::string pattern_str = base::StrCat(
{scheme, scheme_separator, subdomains, host, ":", port, path});
URLPattern pattern(intersection_schemes);
ParseResult result = pattern.Parse(pattern_str);
// TODO(devlin): I don't think there's any way this should ever fail, but
// use a CHECK() to flush any cases out. If nothing crops up, downgrade this
// to a DCHECK in M72.
CHECK_EQ(ParseResult::kSuccess, result);
return pattern;
// Returns true if MatchesScheme() accepts at least one entry of |schemes|.
// An empty |schemes| yields false.
bool URLPattern::MatchesAnyScheme(
    const std::vector<std::string>& schemes) const {
  for (const std::string& scheme : schemes) {
    if (MatchesScheme(scheme))
      return true;
  }
  return false;
}
// Returns true if MatchesScheme() accepts every entry of |schemes|. An empty
// |schemes| yields true.
bool URLPattern::MatchesAllSchemes(
    const std::vector<std::string>& schemes) const {
  for (const std::string& scheme : schemes) {
    if (!MatchesScheme(scheme))
      return false;
  }
  return true;
}
// Returns true if the host and effective port of |test| match this pattern.
// The host check is skipped when the pattern's scheme is file. The scheme
// itself is not checked in this function.
bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
// Ignore hostname if scheme is file://.
if (scheme_ != url::kFileScheme && !MatchesHost(test))
return false;
// The effective port is compared in its decimal string form.
if (!MatchesPortPattern(base::NumberToString(test.EffectiveIntPort())))
return false;
return true;
// Returns true if the pattern's port is the "*" wildcard or is equal to
// |port| as a string.
bool URLPattern::MatchesPortPattern(base::StringPiece port) const {
return port_ == "*" || port_ == port;
// Returns a list of concrete scheme names for this pattern. One path handles
// a single explicit, valid scheme_; the other walks kValidSchemes and tests
// each entry with MatchesScheme().
// NOTE(review): no statement that adds an element to |result| appears on
// either path - confirm.
std::vector<std::string> URLPattern::GetExplicitSchemes() const {
std::vector<std::string> result;
if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
return result;
for (size_t i = 0; i < base::size(kValidSchemes); ++i) {
if (MatchesScheme(kValidSchemes[i])) {
return result;
// Iterates over GetExplicitSchemes(), copying this pattern into a temporary
// for each scheme, and returns a vector of URLPattern.
// NOTE(review): the statements that would set the scheme on |temp| and add it
// to |result| do not appear here - confirm.
std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
std::vector<std::string> explicit_schemes = GetExplicitSchemes();
std::vector<URLPattern> result;
for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
i != explicit_schemes.end(); ++i) {
URLPattern temp = *this;
return result;
// static
// Returns the entry of kParseResultMessages at the index given by
// |parse_result| cast to int. No range check is performed here.
const char* URLPattern::GetParseResultString(
URLPattern::ParseResult parse_result) {
return kParseResultMessages[static_cast<int>(parse_result)];