Internal change
PiperOrigin-RevId: 123874609
Change-Id: I41c29967273446a17f60bd8472f776d88bd00f42
diff --git a/README.google b/README.google
index eab9cf3..f6a9a6a 100644
--- a/README.google
+++ b/README.google
@@ -1,5 +1,5 @@
-URL: https://chromium.googlesource.com/chromium/src/+archive/6e0744b15b09421eac6634fb3fb7fe0a03427d56/url.tar.gz
-Version: 6e0744b15b09421eac6634fb3fb7fe0a03427d56 (matching Chromium 41.0.2272.118)
+URL: https://chromium.googlesource.com/chromium/src.git/+archive/51.0.2704.63.tar.gz
+Version: commit 8726b6fe03a76ef686c9e1606e67169c177cb883 (matching Chromium 51.0.2704.63)
License: BSD, MPL, ICU (one source file under MPL, one source file under ICU)
License File: LICENSE
@@ -8,27 +8,34 @@
Local Modifications:
1. src/base directory:
-- Remove BASE_EXPORT macros.
+- Remove the include of base/base_export.h. Define BASE_EXPORT as an empty macro.
- Wrap namespace base with namespace url to distinguish from google3 base.
- src/base/strings/string16.*
- * Include src/build/build_config.h to detect wchar_t size.
* PrintTo function and operator << are removed to eliminate dependency on
src/base/strings/utf_string_conversion.
+- src/base/strings/string_piece.*
+ * Add constructor for ::string.
+ * Change DCHECK_IS_ON() to !NDEBUG.
- src/base/strings/string_util.*
- * Only one MatchPattern function is kept for src/url/origin.cc.
- * Change the argument type from StringPiece to std::string to remove
- dependency on google3 StringPiece.
+ * Removed the include of base/compiler_specific.h.
+ * Only the needed functions are kept.
- src/base/third_party/icu/icu_utf.cc
- * Add FALLTHROUGH_INTENDED for fall-through switch cases.
+ * Add base/macros.h and FALLTHROUGH_INTENDED for fall-through switch cases.
2. src/url directory:
-- Use google3 version of //base, //util/gtl/lazy_static_ptr.h
+- Use google3 version of //base, //util/gtl/lazy_static_ptr.h,
//third_party/icu and //testing/base/public:gunit_main. Some users don't want
googleurl to be dependent on google3 (e.g. geo/render/mirth/net:googleurl),
so we try our best to do it.
-- src/url/gurl.cc
- * Replace scoped_ptr with std::unique_ptr to eliminate dependency on google3
- scoped_ptr.
+- Change NOTREACHED() to DCHECK(false).
+- src/url/origin.cc
+ * Remove the include of base/strings/string_number_conversions.h.
+- src/url/scheme_host_port.cc
+ * Remove the includes of base/numerics/safe_conversions.h and
+ base/strings/string_number_conversions.h.
+ * Include url/url_canon_internal.h.
+ * Replace base::checked_cast with static_cast.
+ * Change base::UintToString to _itoa_s, which is used in other places.
- src/url/url_canon_icu.cc
* Replace LazyInstance with google3 LazyStaticPtr, modify intialization
and access methods accordingly.
@@ -36,16 +43,13 @@
* Replace ANNOTATE_LEAKING_OBJECT_PTR() with google3
HeapLeakChecker::IgnoreObject(), and only use it when GOOGLEURL_IN_GOOGLE3
is defined.
-- src/url/url_canon_internal.h
- * Expand NOT_REACHED() as DCHECK(false).
- src/url/url_canon_icu.h and src/url/url_canon_stdstring.h
* Remove the include of src/base/compiler_specific.h.
- src/url/third_party/mozilla/url_parse.cc
* Compile filesystemurl related function only when NO_FILESYSTEMURL_SUPPORT
- is not defined, so that
- wireless/android/icing/lib/core:liburl_parse_icing_static doesn't need to
+ is not defined, so that wireless/android/icing/lib/core doesn't need to
depend on other googleurl srcs as well as third_party/icu.
-3. google3_addidions directory:
+3. google3_additions directory:
2014-07-30: added google3_additions/googleurl_init.cc, which properly
initializes googleurl during InitGoogle().
diff --git a/src/base/strings/string16.h b/src/base/strings/string16.h
index be488c3..925e52f 100644
--- a/src/base/strings/string16.h
+++ b/src/base/strings/string16.h
@@ -26,12 +26,17 @@
// libc functions with custom, 2-byte-char compatible routines. It is capable
// of carrying UTF-16-encoded data.
+#include <stddef.h>
+#include <stdint.h>
#include <stdio.h>
+
+#include <functional>
#include <string>
-#include "base/basictypes.h"
#include "build/build_config.h"
+#define BASE_EXPORT
+
#if defined(WCHAR_T_IS_UTF16)
namespace url {
@@ -49,17 +54,17 @@
namespace url {
namespace base {
-typedef uint16 char16;
+typedef uint16_t char16;
// char16 versions of the functions required by string16_char_traits; these
// are based on the wide character functions of similar names ("w" or "wcs"
// instead of "c16").
-int c16memcmp(const char16* s1, const char16* s2, size_t n);
-size_t c16len(const char16* s);
-const char16* c16memchr(const char16* s, char16 c, size_t n);
-char16* c16memmove(char16* s1, const char16* s2, size_t n);
-char16* c16memcpy(char16* s1, const char16* s2, size_t n);
-char16* c16memset(char16* s, char16 c, size_t n);
+BASE_EXPORT int c16memcmp(const char16* s1, const char16* s2, size_t n);
+BASE_EXPORT size_t c16len(const char16* s);
+BASE_EXPORT const char16* c16memchr(const char16* s, char16 c, size_t n);
+BASE_EXPORT char16* c16memmove(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memcpy(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memset(char16* s, char16 c, size_t n);
struct string16_char_traits {
typedef char16 char_type;
@@ -67,7 +72,8 @@
// int_type needs to be able to hold each possible value of char_type, and in
// addition, the distinct value of eof().
- COMPILE_ASSERT(sizeof(int_type) > sizeof(char_type), unexpected_type_width);
+ static_assert(sizeof(int_type) > sizeof(char_type),
+ "int must be larger than 16 bits wide");
typedef std::streamoff off_type;
typedef mbstate_t state_type;
@@ -97,7 +103,7 @@
return c16memchr(s, a, n);
}
- static char_type* move(char_type* s1, const char_type* s2, int_type n) {
+ static char_type* move(char_type* s1, const char_type* s2, size_t n) {
return c16memmove(s1, s2, n);
}
@@ -130,7 +136,7 @@
}
};
-typedef std::basic_string<char16, url::base::string16_char_traits> string16;
+typedef std::basic_string<char16, base::string16_char_traits> string16;
} // namespace base
} // namespace url
@@ -175,7 +181,22 @@
// TODO(mark): File this bug with Apple and update this note with a bug number.
extern template
-class std::basic_string<url::base::char16, url::base::string16_char_traits>;
+class BASE_EXPORT std::basic_string<url::base::char16, url::base::string16_char_traits>;
+
+// Specialize std::hash for base::string16. Although the style guide forbids
+// this in general, it is necessary for consistency with WCHAR_T_IS_UTF16
+// platforms, where base::string16 is a type alias for std::wstring.
+namespace std {
+template <>
+struct hash<url::base::string16> {
+ std::size_t operator()(const url::base::string16& s) const {
+ std::size_t result = 0;
+ for (url::base::char16 c : s)
+ result = (result * 131) + c;
+ return result;
+ }
+};
+} // namespace std
#endif // WCHAR_T_IS_UTF32
diff --git a/src/base/strings/string_piece.cc b/src/base/strings/string_piece.cc
new file mode 100644
index 0000000..b8006c1
--- /dev/null
+++ b/src/base/strings/string_piece.cc
@@ -0,0 +1,454 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Copied from strings/stringpiece.cc with modifications
+
+#include "base/strings/string_piece.h"
+
+#include <limits.h>
+
+#include <algorithm>
+#include <ostream>
+
+#include "base/logging.h"
+
+namespace url {
+namespace base {
+namespace {
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table. This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char. Thus it should be declared
+// as follows:
+// bool table[UCHAR_MAX + 1]
+inline void BuildLookupTable(const StringPiece& characters_wanted,
+ bool* table) {
+ const size_t length = characters_wanted.length();
+ const char* const data = characters_wanted.data();
+ for (size_t i = 0; i < length; ++i) {
+ table[static_cast<unsigned char>(data[i])] = true;
+ }
+}
+
+} // namespace
+
+// MSVC doesn't like complex extern templates and DLLs.
+#if !defined(COMPILER_MSVC)
+template class BasicStringPiece<std::string>;
+template class BasicStringPiece<string16>;
+#endif
+
+bool operator==(const StringPiece& x, const StringPiece& y) {
+ if (x.size() != y.size())
+ return false;
+
+ return StringPiece::wordmemcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+ o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+ return o;
+}
+
+namespace internal {
+
+template<typename STR>
+void CopyToStringT(const BasicStringPiece<STR>& self, STR* target) {
+ if (self.empty())
+ target->clear();
+ else
+ target->assign(self.data(), self.size());
+}
+
+void CopyToString(const StringPiece& self, std::string* target) {
+ CopyToStringT(self, target);
+}
+
+void CopyToString(const StringPiece16& self, string16* target) {
+ CopyToStringT(self, target);
+}
+
+template<typename STR>
+void AppendToStringT(const BasicStringPiece<STR>& self, STR* target) {
+ if (!self.empty())
+ target->append(self.data(), self.size());
+}
+
+void AppendToString(const StringPiece& self, std::string* target) {
+ AppendToStringT(self, target);
+}
+
+void AppendToString(const StringPiece16& self, string16* target) {
+ AppendToStringT(self, target);
+}
+
+// Copies up to |n| characters starting at |pos| into |buf|; returns the count.
+template<typename STR>
+size_t copyT(const BasicStringPiece<STR>& self,
+             typename STR::value_type* buf, size_t n, size_t pos) {
+  if (pos >= self.size()) return 0;  // Nothing to copy; avoids size_t wrap.
+  const size_t ret = std::min(self.size() - pos, n);
+  memcpy(buf, self.data() + pos, ret * sizeof(typename STR::value_type));
+  return ret;
+}
+
+size_t copy(const StringPiece& self, char* buf, size_t n, size_t pos) {
+ return copyT(self, buf, n, pos);
+}
+
+size_t copy(const StringPiece16& self, char16* buf, size_t n, size_t pos) {
+ return copyT(self, buf, n, pos);
+}
+
+template<typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+ const BasicStringPiece<STR>& s,
+ size_t pos) {
+ if (pos > self.size())
+ return BasicStringPiece<STR>::npos;
+
+ typename BasicStringPiece<STR>::const_iterator result =
+ std::search(self.begin() + pos, self.end(), s.begin(), s.end());
+ const size_t xpos =
+ static_cast<size_t>(result - self.begin());
+ return xpos + s.size() <= self.size() ? xpos : BasicStringPiece<STR>::npos;
+}
+
+size_t find(const StringPiece& self, const StringPiece& s, size_t pos) {
+ return findT(self, s, pos);
+}
+
+size_t find(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+ return findT(self, s, pos);
+}
+
+template<typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+ typename STR::value_type c,
+ size_t pos) {
+ if (pos >= self.size())
+ return BasicStringPiece<STR>::npos;
+
+ typename BasicStringPiece<STR>::const_iterator result =
+ std::find(self.begin() + pos, self.end(), c);
+ return result != self.end() ?
+ static_cast<size_t>(result - self.begin()) : BasicStringPiece<STR>::npos;
+}
+
+size_t find(const StringPiece& self, char c, size_t pos) {
+ return findT(self, c, pos);
+}
+
+size_t find(const StringPiece16& self, char16 c, size_t pos) {
+ return findT(self, c, pos);
+}
+
+template<typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+ const BasicStringPiece<STR>& s,
+ size_t pos) {
+ if (self.size() < s.size())
+ return BasicStringPiece<STR>::npos;
+
+ if (s.empty())
+ return std::min(self.size(), pos);
+
+ typename BasicStringPiece<STR>::const_iterator last =
+ self.begin() + std::min(self.size() - s.size(), pos) + s.size();
+ typename BasicStringPiece<STR>::const_iterator result =
+ std::find_end(self.begin(), last, s.begin(), s.end());
+ return result != last ?
+ static_cast<size_t>(result - self.begin()) : BasicStringPiece<STR>::npos;
+}
+
+size_t rfind(const StringPiece& self, const StringPiece& s, size_t pos) {
+ return rfindT(self, s, pos);
+}
+
+size_t rfind(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+ return rfindT(self, s, pos);
+}
+
+template<typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+ typename STR::value_type c,
+ size_t pos) {
+ if (self.size() == 0)
+ return BasicStringPiece<STR>::npos;
+
+ for (size_t i = std::min(pos, self.size() - 1); ;
+ --i) {
+ if (self.data()[i] == c)
+ return i;
+ if (i == 0)
+ break;
+ }
+ return BasicStringPiece<STR>::npos;
+}
+
+size_t rfind(const StringPiece& self, char c, size_t pos) {
+ return rfindT(self, c, pos);
+}
+
+size_t rfind(const StringPiece16& self, char16 c, size_t pos) {
+ return rfindT(self, c, pos);
+}
+
+// 8-bit version using lookup table.
+size_t find_first_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos) {
+ if (self.size() == 0 || s.size() == 0)
+ return StringPiece::npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.size() == 1)
+ return find(self, s.data()[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_t i = pos; i < self.size(); ++i) {
+ if (lookup[static_cast<unsigned char>(self.data()[i])]) {
+ return i;
+ }
+ }
+ return StringPiece::npos;
+}
+
+// 16-bit brute force version. A |pos| at or past the end yields npos.
+size_t find_first_of(const StringPiece16& self,
+                     const StringPiece16& s,
+                     size_t pos) {
+  if (pos >= self.size()) return StringPiece16::npos;  // Keep the range valid.
+  StringPiece16::const_iterator found =
+      std::find_first_of(self.begin() + pos, self.end(), s.begin(), s.end());
+  return found == self.end() ? StringPiece16::npos
+                             : static_cast<size_t>(found - self.begin());
+}
+
+// 8-bit version using lookup table. Returns the index of the first character
+// at or after |pos| that is not in |s|, or npos if every one of them is.
+size_t find_first_not_of(const StringPiece& self,
+                         const StringPiece& s,
+                         size_t pos) {
+  if (self.size() == 0)
+    return StringPiece::npos;
+  // An empty set excludes nothing, so the first candidate (|pos|) matches.
+  if (s.size() == 0)
+    return pos < self.size() ? pos : StringPiece::npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return find_first_not_of(self, s.data()[0], pos);
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_t i = pos; i < self.size(); ++i) {
+    if (!lookup[static_cast<unsigned char>(self.data()[i])]) {
+      return i;
+    }
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version.
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos) {
+ if (self.size() == 0)
+ return StringPiece16::npos;
+
+ for (size_t self_i = pos; self_i < self.size(); ++self_i) {
+ bool found = false;
+ for (size_t s_i = 0; s_i < s.size(); ++s_i) {
+ if (self[self_i] == s[s_i]) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ return self_i;
+ }
+ return StringPiece16::npos;
+}
+
+template<typename STR>
+size_t find_first_not_ofT(const BasicStringPiece<STR>& self,
+ typename STR::value_type c,
+ size_t pos) {
+ if (self.size() == 0)
+ return BasicStringPiece<STR>::npos;
+
+ for (; pos < self.size(); ++pos) {
+ if (self.data()[pos] != c) {
+ return pos;
+ }
+ }
+ return BasicStringPiece<STR>::npos;
+}
+
+size_t find_first_not_of(const StringPiece& self,
+ char c,
+ size_t pos) {
+ return find_first_not_ofT(self, c, pos);
+}
+
+size_t find_first_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos) {
+ return find_first_not_ofT(self, c, pos);
+}
+
+// 8-bit version using lookup table.
+size_t find_last_of(const StringPiece& self, const StringPiece& s, size_t pos) {
+ if (self.size() == 0 || s.size() == 0)
+ return StringPiece::npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.size() == 1)
+ return rfind(self, s.data()[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_t i = std::min(pos, self.size() - 1); ; --i) {
+ if (lookup[static_cast<unsigned char>(self.data()[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return StringPiece::npos;
+}
+
+// 16-bit brute-force version.
+size_t find_last_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos) {
+ if (self.size() == 0)
+ return StringPiece16::npos;
+
+ for (size_t self_i = std::min(pos, self.size() - 1); ;
+ --self_i) {
+ for (size_t s_i = 0; s_i < s.size(); s_i++) {
+ if (self.data()[self_i] == s[s_i])
+ return self_i;
+ }
+ if (self_i == 0)
+ break;
+ }
+ return StringPiece16::npos;
+}
+
+// 8-bit version using lookup table.
+size_t find_last_not_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos) {
+ if (self.size() == 0)
+ return StringPiece::npos;
+
+ size_t i = std::min(pos, self.size() - 1);
+ if (s.size() == 0)
+ return i;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.size() == 1)
+ return find_last_not_of(self, s.data()[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (; ; --i) {
+ if (!lookup[static_cast<unsigned char>(self.data()[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return StringPiece::npos;
+}
+
+// 16-bit brute-force version: last index <= pos whose char is not in |s|.
+size_t find_last_not_of(const StringPiece16& self,
+                        const StringPiece16& s,
+                        size_t pos) {
+  if (self.size() == 0)
+    return StringPiece16::npos;
+  // Scan backwards from min(pos, last index); the loop exits after index 0.
+  for (size_t self_i = std::min(pos, self.size() - 1); ; --self_i) {
+    bool found = false;
+    for (size_t s_i = 0; s_i < s.size(); s_i++) {
+      if (self.data()[self_i] == s[s_i]) {
+        found = true;
+        break;
+      }
+    }
+    if (!found)
+      return self_i;
+    if (self_i == 0)
+      break;
+  }
+  return StringPiece16::npos;
+}
+
+template<typename STR>
+size_t find_last_not_ofT(const BasicStringPiece<STR>& self,
+ typename STR::value_type c,
+ size_t pos) {
+ if (self.size() == 0)
+ return BasicStringPiece<STR>::npos;
+
+ for (size_t i = std::min(pos, self.size() - 1); ; --i) {
+ if (self.data()[i] != c)
+ return i;
+ if (i == 0)
+ break;
+ }
+ return BasicStringPiece<STR>::npos;
+}
+
+size_t find_last_not_of(const StringPiece& self,
+ char c,
+ size_t pos) {
+ return find_last_not_ofT(self, c, pos);
+}
+
+size_t find_last_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos) {
+ return find_last_not_ofT(self, c, pos);
+}
+
+template<typename STR>
+BasicStringPiece<STR> substrT(const BasicStringPiece<STR>& self,
+ size_t pos,
+ size_t n) {
+ if (pos > self.size()) pos = self.size();
+ if (n > self.size() - pos) n = self.size() - pos;
+ return BasicStringPiece<STR>(self.data() + pos, n);
+}
+
+StringPiece substr(const StringPiece& self,
+ size_t pos,
+ size_t n) {
+ return substrT(self, pos, n);
+}
+
+StringPiece16 substr(const StringPiece16& self,
+ size_t pos,
+ size_t n) {
+ return substrT(self, pos, n);
+}
+
+#if !NDEBUG
+void AssertIteratorsInOrder(std::string::const_iterator begin,
+ std::string::const_iterator end) {
+ DCHECK(begin <= end) << "StringPiece iterators swapped or invalid.";
+}
+void AssertIteratorsInOrder(string16::const_iterator begin,
+ string16::const_iterator end) {
+ DCHECK(begin <= end) << "StringPiece iterators swapped or invalid.";
+}
+#endif
+
+} // namespace internal
+} // namespace base
+} // namespace url
diff --git a/src/base/strings/string_piece.h b/src/base/strings/string_piece.h
new file mode 100644
index 0000000..7396eb4
--- /dev/null
+++ b/src/base/strings/string_piece.h
@@ -0,0 +1,472 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// You can use StringPiece as a function or method parameter. A StringPiece
+// parameter can receive a double-quoted string literal argument, a "const
+// char*" argument, a string argument, or a StringPiece argument with no data
+// copying. Systematic use of StringPiece for arguments reduces data
+// copies and strlen() calls.
+//
+// Prefer passing StringPieces by value:
+// void MyFunction(StringPiece arg);
+// If circumstances require, you may also pass by const reference:
+// void MyFunction(const StringPiece& arg); // not preferred
+// Both of these have the same lifetime semantics. Passing by value
+// generates slightly smaller code. For more discussion, Googlers can see
+// the thread go/stringpiecebyvalue on c-users.
+
+#ifndef BASE_STRINGS_STRING_PIECE_H_
+#define BASE_STRINGS_STRING_PIECE_H_
+
+#include <stddef.h>
+
+#include <iosfwd>
+#include <string>
+
+#include "base/logging.h"
+#include "base/strings/string16.h"
+
+#define BASE_EXPORT
+
+namespace url {
+namespace base {
+
+template <typename STRING_TYPE> class BasicStringPiece;
+typedef BasicStringPiece<std::string> StringPiece;
+typedef BasicStringPiece<string16> StringPiece16;
+
+// internal --------------------------------------------------------------------
+
+// Many of the StringPiece functions use different implementations for the
+// 8-bit and 16-bit versions, and we don't want lots of template expansions in
+// this (very common) header that will slow down compilation.
+//
+// So here we define overloaded functions called by the StringPiece template.
+// For those that share an implementation, the two versions will expand to a
+// template internal to the .cc file.
+namespace internal {
+
+BASE_EXPORT void CopyToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void CopyToString(const StringPiece16& self, string16* target);
+
+BASE_EXPORT void AppendToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void AppendToString(const StringPiece16& self, string16* target);
+
+BASE_EXPORT size_t copy(const StringPiece& self,
+ char* buf,
+ size_t n,
+ size_t pos);
+BASE_EXPORT size_t copy(const StringPiece16& self,
+ char16* buf,
+ size_t n,
+ size_t pos);
+
+BASE_EXPORT size_t find(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t rfind(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t find_first_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+ char c,
+ size_t pos);
+
+BASE_EXPORT StringPiece substr(const StringPiece& self,
+ size_t pos,
+ size_t n);
+BASE_EXPORT StringPiece16 substr(const StringPiece16& self,
+ size_t pos,
+ size_t n);
+
+#if !NDEBUG
+// Asserts that begin <= end to catch some errors with iterator usage.
+BASE_EXPORT void AssertIteratorsInOrder(std::string::const_iterator begin,
+ std::string::const_iterator end);
+BASE_EXPORT void AssertIteratorsInOrder(string16::const_iterator begin,
+ string16::const_iterator end);
+#endif
+
+} // namespace internal
+
+// BasicStringPiece ------------------------------------------------------------
+
+// Defines the types, methods, operators, and data members common to both
+// StringPiece and StringPiece16. Do not refer to this class directly, but
+// rather to BasicStringPiece, StringPiece, or StringPiece16.
+//
+// This is templatized by string class type rather than character type, so
+// BasicStringPiece<std::string> or BasicStringPiece<base::string16>.
+template <typename STRING_TYPE> class BasicStringPiece {
+ public:
+ // Standard STL container boilerplate.
+ typedef size_t size_type;
+ typedef typename STRING_TYPE::value_type value_type;
+ typedef const value_type* pointer;
+ typedef const value_type& reference;
+ typedef const value_type& const_reference;
+ typedef ptrdiff_t difference_type;
+ typedef const value_type* const_iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+ static const size_type npos;
+
+ public:
+ // We provide non-explicit singleton constructors so users can pass
+ // in a "const char*" or a "string" wherever a "StringPiece" is
+ // expected (likewise for char16, string16, StringPiece16).
+ BasicStringPiece() : ptr_(NULL), length_(0) {}
+ BasicStringPiece(const value_type* str)
+ : ptr_(str),
+ length_((str == NULL) ? 0 : STRING_TYPE::traits_type::length(str)) {}
+ BasicStringPiece(const STRING_TYPE& str)
+ : ptr_(str.data()), length_(str.size()) {}
+#ifdef HAS_GLOBAL_STRING
+ // ::basic_string<char> == ::string != std::string (in google3)
+ BasicStringPiece(const ::basic_string<value_type>& str)
+ : ptr_(str.data()), length_(str.size()) {}
+#endif
+ BasicStringPiece(const value_type* offset, size_type len)
+ : ptr_(offset), length_(len) {}
+ BasicStringPiece(const typename STRING_TYPE::const_iterator& begin,
+ const typename STRING_TYPE::const_iterator& end) {
+#if !NDEBUG
+ // This assertion is done out-of-line to avoid bringing in logging.h and
+ // instantiating logging macros for every instantiation.
+ internal::AssertIteratorsInOrder(begin, end);
+#endif
+ length_ = static_cast<size_t>(std::distance(begin, end));
+
+ // The length test before assignment is to avoid dereferencing an iterator
+ // that may point to the end() of a string.
+ ptr_ = length_ > 0 ? &*begin : nullptr;
+ }
+
+ // data() may return a pointer to a buffer with embedded NULs, and the
+ // returned buffer may or may not be null terminated. Therefore it is
+ // typically a mistake to pass data() to a routine that expects a NUL
+ // terminated string.
+ const value_type* data() const { return ptr_; }
+ size_type size() const { return length_; }
+ size_type length() const { return length_; }
+ bool empty() const { return length_ == 0; }
+
+ void clear() {
+ ptr_ = NULL;
+ length_ = 0;
+ }
+ void set(const value_type* data, size_type len) {
+ ptr_ = data;
+ length_ = len;
+ }
+ void set(const value_type* str) {
+ ptr_ = str;
+ length_ = str ? STRING_TYPE::traits_type::length(str) : 0;
+ }
+
+ value_type operator[](size_type i) const { return ptr_[i]; }
+ value_type front() const { return ptr_[0]; }
+ value_type back() const { return ptr_[length_ - 1]; }
+
+ void remove_prefix(size_type n) {
+ ptr_ += n;
+ length_ -= n;
+ }
+
+ void remove_suffix(size_type n) {
+ length_ -= n;
+ }
+
+ int compare(const BasicStringPiece<STRING_TYPE>& x) const {
+ int r = wordmemcmp(
+ ptr_, x.ptr_, (length_ < x.length_ ? length_ : x.length_));
+ if (r == 0) {
+ if (length_ < x.length_) r = -1;
+ else if (length_ > x.length_) r = +1;
+ }
+ return r;
+ }
+
+ STRING_TYPE as_string() const {
+ // std::string doesn't like to take a NULL pointer even with a 0 size.
+ return empty() ? STRING_TYPE() : STRING_TYPE(data(), size());
+ }
+
+ const_iterator begin() const { return ptr_; }
+ const_iterator end() const { return ptr_ + length_; }
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(ptr_ + length_);
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(ptr_);
+ }
+
+ size_type max_size() const { return length_; }
+ size_type capacity() const { return length_; }
+
+ static int wordmemcmp(const value_type* p,
+ const value_type* p2,
+ size_type N) {
+ return STRING_TYPE::traits_type::compare(p, p2, N);
+ }
+
+ // Sets the value of the given string target type to be the current string.
+ // This saves a temporary over doing |a = b.as_string()|
+ void CopyToString(STRING_TYPE* target) const {
+ internal::CopyToString(*this, target);
+ }
+
+ void AppendToString(STRING_TYPE* target) const {
+ internal::AppendToString(*this, target);
+ }
+
+ size_type copy(value_type* buf, size_type n, size_type pos = 0) const {
+ return internal::copy(*this, buf, n, pos);
+ }
+
+ // Does "this" start with "x"
+ bool starts_with(const BasicStringPiece& x) const {
+ return ((this->length_ >= x.length_) &&
+ (wordmemcmp(this->ptr_, x.ptr_, x.length_) == 0));
+ }
+
+ // Does "this" end with "x"
+ bool ends_with(const BasicStringPiece& x) const {
+ return ((this->length_ >= x.length_) &&
+ (wordmemcmp(this->ptr_ + (this->length_-x.length_),
+ x.ptr_, x.length_) == 0));
+ }
+
+ // find: Search for a character or substring at a given offset.
+ size_type find(const BasicStringPiece<STRING_TYPE>& s,
+ size_type pos = 0) const {
+ return internal::find(*this, s, pos);
+ }
+ size_type find(value_type c, size_type pos = 0) const {
+ return internal::find(*this, c, pos);
+ }
+
+ // rfind: Reverse find.
+ size_type rfind(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::rfind(*this, s, pos);
+ }
+ size_type rfind(value_type c, size_type pos = BasicStringPiece::npos) const {
+ return internal::rfind(*this, c, pos);
+ }
+
+ // find_first_of: Find the first occurrence of one of a set of characters.
+ size_type find_first_of(const BasicStringPiece& s,
+ size_type pos = 0) const {
+ return internal::find_first_of(*this, s, pos);
+ }
+ size_type find_first_of(value_type c, size_type pos = 0) const {
+ return find(c, pos);
+ }
+
+ // find_first_not_of: Find the first occurrence not of a set of characters.
+ size_type find_first_not_of(const BasicStringPiece& s,
+ size_type pos = 0) const {
+ return internal::find_first_not_of(*this, s, pos);
+ }
+ size_type find_first_not_of(value_type c, size_type pos = 0) const {
+ return internal::find_first_not_of(*this, c, pos);
+ }
+
+ // find_last_of: Find the last occurrence of one of a set of characters.
+ size_type find_last_of(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_of(*this, s, pos);
+ }
+ size_type find_last_of(value_type c,
+ size_type pos = BasicStringPiece::npos) const {
+ return rfind(c, pos);
+ }
+
+ // find_last_not_of: Find the last occurrence not of a set of characters.
+ size_type find_last_not_of(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_not_of(*this, s, pos);
+ }
+ size_type find_last_not_of(value_type c,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_not_of(*this, c, pos);
+ }
+
+ // substr.
+ BasicStringPiece substr(size_type pos,
+ size_type n = BasicStringPiece::npos) const {
+ return internal::substr(*this, pos, n);
+ }
+
+ protected:
+ const value_type* ptr_;
+ size_type length_;
+};
+
+template <typename STRING_TYPE>
+const typename BasicStringPiece<STRING_TYPE>::size_type
+BasicStringPiece<STRING_TYPE>::npos =
+ typename BasicStringPiece<STRING_TYPE>::size_type(-1);
+
+// MSVC doesn't like complex extern templates and DLLs.
+#if !defined(COMPILER_MSVC)
+extern template class BASE_EXPORT BasicStringPiece<std::string>;
+extern template class BASE_EXPORT BasicStringPiece<string16>;
+#endif
+
+// StringPiece operators -------------------------------------------------------
+
+BASE_EXPORT bool operator==(const StringPiece& x, const StringPiece& y);
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+ return !(x == y);
+}
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+ const int r = StringPiece::wordmemcmp(
+ x.data(), y.data(), (x.size() < y.size() ? x.size() : y.size()));
+ return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+ return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+ return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+ return !(x < y);
+}
+
+// StringPiece16 operators -----------------------------------------------------
+
+inline bool operator==(const StringPiece16& x, const StringPiece16& y) {
+ if (x.size() != y.size())
+ return false;
+
+ return StringPiece16::wordmemcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece16& x, const StringPiece16& y) {
+ return !(x == y);
+}
+
+inline bool operator<(const StringPiece16& x, const StringPiece16& y) {
+ const int r = StringPiece16::wordmemcmp(
+ x.data(), y.data(), (x.size() < y.size() ? x.size() : y.size()));
+ return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece16& x, const StringPiece16& y) {
+ return y < x;
+}
+
+inline bool operator<=(const StringPiece16& x, const StringPiece16& y) {
+ return !(x > y);
+}
+
+inline bool operator>=(const StringPiece16& x, const StringPiece16& y) {
+ return !(x < y);
+}
+
+BASE_EXPORT std::ostream& operator<<(std::ostream& o,
+ const StringPiece& piece);
+
+// Hashing ---------------------------------------------------------------------
+
+// We provide appropriate hash functions so StringPiece and StringPiece16 can
+// be used as keys in hash sets and maps.
+
+// This hash function is copied from base/strings/string16.h. We don't use the
+// ones already defined for string and string16 directly because it would
+// require the string constructors to be called, which we don't want.
+#define HASH_STRING_PIECE(StringPieceType, string_piece) \
+ std::size_t result = 0; \
+ for (StringPieceType::const_iterator i = string_piece.begin(); \
+ i != string_piece.end(); ++i) \
+ result = (result * 131) + *i; \
+ return result;
+
+struct StringPieceHash {
+ std::size_t operator()(const StringPiece& sp) const {
+ HASH_STRING_PIECE(StringPiece, sp);
+ }
+};
+struct StringPiece16Hash {
+ std::size_t operator()(const StringPiece16& sp16) const {
+ HASH_STRING_PIECE(StringPiece16, sp16);
+ }
+};
+
+} // namespace base
+} // namespace url
+
+#endif // BASE_STRINGS_STRING_PIECE_H_
diff --git a/src/base/strings/string_util.cc b/src/base/strings/string_util.cc
index 8b2e068..095fbfa 100644
--- a/src/base/strings/string_util.cc
+++ b/src/base/strings/string_util.cc
@@ -4,144 +4,173 @@
#include "base/strings/string_util.h"
-#include "base/basictypes.h"
-#include "base/third_party/icu/icu_utf.h"
+#include <ctype.h>
+#include <errno.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
-static bool IsWildcard(base_icu::UChar32 character) {
- return character == '*' || character == '?';
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+namespace url {
+namespace base {
+
+namespace {
+
+// A pointer is assumed to be one "machine word" wide, so the pointer-sized
+// integer uintptr_t doubles as the word type; the mask covers its low bits.
+typedef uintptr_t MachineWord;
+const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
+inline bool IsAlignedToMachineWord(const void* pointer) {
+  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
+  return (address & kMachineWordAlignmentMask) == 0;
}
-// Move the strings pointers to the point where they start to differ.
-template <typename CHAR, typename NEXT>
-static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
- const CHAR** string, const CHAR* string_end,
- NEXT next) {
- const CHAR* escape = NULL;
- while (*pattern != pattern_end && *string != string_end) {
- if (!escape && IsWildcard(**pattern)) {
- // We don't want to match wildcard here, except if it's escaped.
- return;
- }
-
- // Check if the escapement char is found. If so, skip it and move to the
- // next character.
- if (!escape && **pattern == '\\') {
- escape = *pattern;
- next(pattern, pattern_end);
- continue;
- }
-
- // Check if the chars match, if so, increment the ptrs.
- const CHAR* pattern_next = *pattern;
- const CHAR* string_next = *string;
- base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
- if (pattern_char == next(&string_next, string_end) &&
- pattern_char != CBU_SENTINEL) {
- *pattern = pattern_next;
- *string = string_next;
- } else {
- // Uh oh, it did not match, we are done. If the last char was an
- // escapement, that means that it was an error to advance the ptr here,
- // let's put it back where it was. This also mean that the MatchPattern
- // function will return false because if we can't match an escape char
- // here, then no one will.
- if (escape) {
- *pattern = escape;
- }
- return;
- }
-
- escape = NULL;
- }
+template<typename T> inline T* AlignToMachineWord(T* pointer) {
+  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
+  return reinterpret_cast<T*>(address & ~kMachineWordAlignmentMask);  // round down
}
-template <typename CHAR, typename NEXT>
-static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
- while (*pattern != end) {
- if (!IsWildcard(**pattern))
- return;
- next(pattern, end);
- }
-}
-
-template <typename CHAR, typename NEXT>
-static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
- const CHAR* pattern, const CHAR* pattern_end,
- int depth,
- NEXT next) {
- const int kMaxDepth = 16;
- if (depth > kMaxDepth)
- return false;
-
- // Eat all the matching chars.
- EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
-
- // If the string is empty, then the pattern must be empty too, or contains
- // only wildcards.
- if (eval == eval_end) {
- EatWildcard(&pattern, pattern_end, next);
- return pattern == pattern_end;
- }
-
- // Pattern is empty but not string, this is not a match.
- if (pattern == pattern_end)
- return false;
-
- // If this is a question mark, then we need to compare the rest with
- // the current string or the string with one character eaten.
- const CHAR* next_pattern = pattern;
- next(&next_pattern, pattern_end);
- if (pattern[0] == '?') {
- if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
- depth + 1, next))
- return true;
- const CHAR* next_eval = eval;
- next(&next_eval, eval_end);
- if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
- depth + 1, next))
- return true;
- }
-
- // This is a *, try to match all the possible substrings with the remainder
- // of the pattern.
- if (pattern[0] == '*') {
- // Collapse duplicate wild cards (********** into *) so that the
- // method does not recurse unnecessarily. http://crbug.com/52839
- EatWildcard(&next_pattern, pattern_end, next);
-
- while (eval != eval_end) {
- if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
- depth + 1, next))
- return true;
- eval++;
- }
-
- // We reached the end of the string, let see if the pattern contains only
- // wildcards.
- if (eval == eval_end) {
- EatWildcard(&pattern, pattern_end, next);
- if (pattern != pattern_end)
- return false;
- return true;
- }
- }
-
- return false;
-}
-
-struct NextCharUTF8 {
- base_icu::UChar32 operator()(const char** p, const char* end) {
- base_icu::UChar32 c;
- int offset = 0;
- CBU8_NEXT(*p, offset, end - *p, c);
- *p += offset;
- return c;
- }
+template<size_t size, typename CharacterType> struct NonASCIIMask;
+template<> struct NonASCIIMask<4, char16> {  // <word bytes, code unit type>
+ static inline uint32_t value() { return 0xFF80FF80U; }  // 2 x 0xFF80
};
+template<> struct NonASCIIMask<4, char> {
+ static inline uint32_t value() { return 0x80808080U; }  // 4 x 0x80
+};
+template<> struct NonASCIIMask<8, char16> {
+ static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }  // 4 x 0xFF80
+};
+template<> struct NonASCIIMask<8, char> {
+ static inline uint64_t value() { return 0x8080808080808080ULL; }  // 8 x 0x80
+};
+#if defined(WCHAR_T_IS_UTF32)
+template<> struct NonASCIIMask<4, wchar_t> {
+ static inline uint32_t value() { return 0xFFFFFF80U; }  // 1 x 0xFFFFFF80
+};
+template<> struct NonASCIIMask<8, wchar_t> {
+ static inline uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; }  // 2 x 0xFFFFFF80
+};
+#endif // WCHAR_T_IS_UTF32
-bool MatchPattern(const std::string& eval,
- const std::string& pattern) {
- return MatchPatternT(eval.data(), eval.data() + eval.size(),
- pattern.data(), pattern.data() + pattern.size(),
- 0, NextCharUTF8());
+template<typename StringType>
+StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
+  StringType lowered;
+  lowered.reserve(str.size());  // One output unit per input unit.
+  for (auto unit : str)
+    lowered.push_back(ToLowerASCII(unit));
+  return lowered;
}
+
+// NOTE(review): no ToUpperASCII overload is visible in this patch - unused?
+template<typename StringType>
+StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
+  StringType upper;
+  upper.reserve(str.size());  // One output unit per input unit.
+  for (auto unit : str) upper.push_back(ToUpperASCII(unit));
+  return upper;
+}
+
+} // namespace
+
+std::string ToLowerASCII(StringPiece str) {
+ return ToLowerASCIIImpl<std::string>(str);  // std::string instantiation.
+}
+
+string16 ToLowerASCII(StringPiece16 str) {
+ return ToLowerASCIIImpl<string16>(str);  // string16 instantiation.
+}
+
+// True when every code unit in [characters, characters + length) is 7-bit.
+template <class Char>
+inline bool DoIsStringASCII(const Char* characters, size_t length) {
+  const Char* const end = characters + length;
+  MachineWord seen_bits = 0;  // OR of everything read so far.
+
+  // Head: single code units until the cursor is word-aligned or exhausted.
+  for (; !IsAlignedToMachineWord(characters) && characters != end;
+       ++characters) {
+    seen_bits |= *characters;
+  }
+
+  // Middle: whole machine words, stopping at |end| rounded down to a word.
+  const size_t units_per_word = sizeof(MachineWord) / sizeof(Char);
+  for (const Char* const aligned_end = AlignToMachineWord(end);
+       characters < aligned_end; characters += units_per_word) {
+    seen_bits |= *reinterpret_cast<const MachineWord*>(characters);
+  }
+
+  // Tail: code units left over after the last full word.
+  for (; characters != end; ++characters) {
+    seen_bits |= *characters;
+  }
+
+  // A set bit above the 7-bit range in any unit marks the input non-ASCII.
+  const MachineWord high_bits =
+      NonASCIIMask<sizeof(MachineWord), Char>::value();
+  return (seen_bits & high_bits) == 0;
+}
+
+bool IsStringASCII(const StringPiece& str) {
+ return DoIsStringASCII(str.data(), str.length());  // Char deduced from data().
+}
+
+bool IsStringASCII(const StringPiece16& str) {
+ return DoIsStringASCII(str.data(), str.length());  // Char deduced from data().
+}
+
+bool IsStringASCII(const string16& str) {
+ return DoIsStringASCII(str.data(), str.length());  // Char deduced from data().
+}
+
+#if defined(WCHAR_T_IS_UTF32)  // Overload compiled only under this macro.
+bool IsStringASCII(const std::wstring& str) {
+ return DoIsStringASCII(str.data(), str.length());  // Char deduced from data().
+}
+#endif
+
+// Implementation note: Normally this function will be called with a hardcoded
+// constant for the lowercase_ascii parameter. Constructing a StringPiece from
+// a C constant requires running strlen, so the result will be two passes
+// through the buffers, one to find the length of lowercase_ascii, and one to
+// compare each letter.
+//
+// This function could have taken a const char* to avoid this and only do one
+// pass through the string. But the strlen is faster than the case-insensitive
+// compares and lets us early-exit in the case that the strings are different
+// lengths (will often be the case for non-matches). So whether one approach or
+// the other will be faster depends on the case.
+//
+// The hardcoded strings are typically very short so it doesn't matter, and the
+// string piece gives additional flexibility for the caller (doesn't have to be
+// null terminated) so we choose the StringPiece route.
+template<typename Str>
+static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
+                                          StringPiece lowercase_ascii) {
+  const size_t count = str.size();
+  if (count != lowercase_ascii.size())
+    return false;  // Different lengths can never match.
+  size_t pos = 0;
+  while (pos < count && ToLowerASCII(str[pos]) == lowercase_ascii[pos])
+    ++pos;
+  return pos == count;  // True only if no position mismatched.
+}
+
+bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) {
+ return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);  // 8-bit
+}
+
+bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) {
+ return DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii);  // string16
+}
+
+} // namespace base
+} // namespace url
diff --git a/src/base/strings/string_util.h b/src/base/strings/string_util.h
index ffc1579..458d202 100644
--- a/src/base/strings/string_util.h
+++ b/src/base/strings/string_util.h
@@ -7,14 +7,53 @@
#ifndef BASE_STRINGS_STRING_UTIL_H_
#define BASE_STRINGS_STRING_UTIL_H_
-#include "base/basictypes.h"
+#include <ctype.h>
+#include <stdarg.h> // va_list
+#include <stddef.h>
+#include <stdint.h>
-// Returns true if the string passed in matches the pattern. The pattern
-// string can contain wildcards like * and ?
-// The backslash character (\) is an escape character for * and ?
-// We limit the patterns to having a max of 16 * or ? characters.
-// ? matches 0 or 1 character, while * matches 0 or more characters.
-bool MatchPattern(const std::string& string,
- const std::string& pattern);
+#include <string>
+#include <vector>
+
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h" // For implicit conversions.
+#include "build/build_config.h"
+
+#define BASE_EXPORT
+
+namespace url {
+namespace base {
+
+// Locale-independent lower-casing of one ASCII code unit: only 'A'..'Z' are
+// shifted. The standard library's tolower is locale sensitive, so not used.
+inline char ToLowerASCII(char c) {
+  return (c < 'A' || c > 'Z') ? c : (c + ('a' - 'A'));
+}
+inline char16 ToLowerASCII(char16 c) {
+  return (c < 'A' || c > 'Z') ? c : (c + ('a' - 'A'));  // Others pass through.
+}
+
+// Converts the given string to its ASCII-lowercase equivalent.
+BASE_EXPORT std::string ToLowerASCII(StringPiece str);
+BASE_EXPORT string16 ToLowerASCII(StringPiece16 str);
+
+BASE_EXPORT bool IsStringASCII(const StringPiece& str);
+BASE_EXPORT bool IsStringASCII(const StringPiece16& str);
+// A convenience adaptor for WebStrings, as they don't convert into
+// StringPieces directly.
+BASE_EXPORT bool IsStringASCII(const string16& str);
+#if defined(WCHAR_T_IS_UTF32)
+BASE_EXPORT bool IsStringASCII(const std::wstring& str);
+#endif
+
+// Returns true when |str|, ASCII-lowercased, equals |lowercase_ascii|. The
+// latter must already be lower-case ASCII (typically a constant).
+BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece str,
+                                      StringPiece lowercase_ascii);
+BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece16 str,
+                                      StringPiece lowercase_ascii);
+
+} // namespace base
+} // namespace url
#endif // BASE_STRINGS_STRING_UTIL_H_
diff --git a/src/base/strings/utf_string_conversion_utils.cc b/src/base/strings/utf_string_conversion_utils.cc
index e71605b..a22b109 100644
--- a/src/base/strings/utf_string_conversion_utils.cc
+++ b/src/base/strings/utf_string_conversion_utils.cc
@@ -12,15 +12,15 @@
// ReadUnicodeCharacter --------------------------------------------------------
bool ReadUnicodeCharacter(const char* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point_out) {
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point_out) {
// U8_NEXT expects to be able to use -1 to signal an error, so we must
// use a signed type for code_point. But this function returns false
// on error anyway, so code_point_out is unsigned.
- int32 code_point;
+ int32_t code_point;
CBU8_NEXT(src, *char_index, src_len, code_point);
- *code_point_out = static_cast<uint32>(code_point);
+ *code_point_out = static_cast<uint32_t>(code_point);
// The ICU macro above moves to the next char, we want to point to the last
// char consumed.
@@ -31,9 +31,9 @@
}
bool ReadUnicodeCharacter(const char16* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point) {
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point) {
if (CBU16_IS_SURROGATE(src[*char_index])) {
if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
*char_index + 1 >= src_len ||
@@ -56,9 +56,9 @@
#if defined(WCHAR_T_IS_UTF32)
bool ReadUnicodeCharacter(const wchar_t* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point) {
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point) {
// Conversion is easy since the source is 32-bit.
*code_point = src[*char_index];
@@ -69,7 +69,7 @@
// WriteUnicodeCharacter -------------------------------------------------------
-size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
if (code_point <= 0x7f) {
// Fast path the common case of one byte.
output->push_back(static_cast<char>(code_point));
@@ -90,7 +90,7 @@
return char_offset - original_char_offset;
}
-size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
+size_t WriteUnicodeCharacter(uint32_t code_point, string16* output) {
if (CBU16_LENGTH(code_point) == 1) {
// Thie code point is in the Basic Multilingual Plane (BMP).
output->push_back(static_cast<char16>(code_point));
diff --git a/src/base/strings/utf_string_conversion_utils.h b/src/base/strings/utf_string_conversion_utils.h
index b24f03b..294670e 100644
--- a/src/base/strings/utf_string_conversion_utils.h
+++ b/src/base/strings/utf_string_conversion_utils.h
@@ -7,12 +7,17 @@
// This should only be used by the various UTF string conversion files.
+#include <stddef.h>
+#include <stdint.h>
+
#include "base/strings/string16.h"
+#define BASE_EXPORT
+
namespace url {
namespace base {
-inline bool IsValidCodepoint(uint32 code_point) {
+inline bool IsValidCodepoint(uint32_t code_point) {
// Excludes the surrogate code points ([0xD800, 0xDFFF]) and
// codepoints larger than 0x10FFFF (the highest codepoint allowed).
// Non-characters and unassigned codepoints are allowed.
@@ -20,7 +25,7 @@
(code_point >= 0xE000u && code_point <= 0x10FFFFu);
}
-inline bool IsValidCharacter(uint32 code_point) {
+inline bool IsValidCharacter(uint32_t code_point) {
// Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
// 0xFFFE or 0xFFFF) from the set of valid code points.
return code_point < 0xD800u || (code_point >= 0xE000u &&
@@ -37,41 +42,40 @@
// (as in a for loop) will take the reader to the next character.
//
// Returns true on success. On false, |*code_point| will be invalid.
-bool ReadUnicodeCharacter(const char* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point_out);
+BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point_out);
// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const char16* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point);
+BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point);
#if defined(WCHAR_T_IS_UTF32)
// Reads UTF-32 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const wchar_t* src,
- int32 src_len,
- int32* char_index,
- uint32* code_point);
+BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point);
#endif // defined(WCHAR_T_IS_UTF32)
// WriteUnicodeCharacter -------------------------------------------------------
// Appends a UTF-8 character to the given 8-bit string. Returns the number of
// bytes written.
-// TODO(brettw) Bug 79631: This function should not be exposed.
-size_t WriteUnicodeCharacter(uint32 code_point,
- std::string* output);
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point,
+ std::string* output);
// Appends the given code point as a UTF-16 character to the given 16-bit
// string. Returns the number of 16-bit values written.
-size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point, string16* output);
#if defined(WCHAR_T_IS_UTF32)
// Appends the given UTF-32 character to the given 32-bit string. Returns the
// number of 32-bit values written.
-inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+inline size_t WriteUnicodeCharacter(uint32_t code_point, std::wstring* output) {
// This is the easy case, just append the character.
output->push_back(code_point);
return 1;
diff --git a/src/base/third_party/icu/icu_utf.cc b/src/base/third_party/icu/icu_utf.cc
index 55edce1..9d48707 100644
--- a/src/base/third_party/icu/icu_utf.cc
+++ b/src/base/third_party/icu/icu_utf.cc
@@ -18,6 +18,7 @@
*/
#include "base/third_party/icu/icu_utf.h"
+#include "base/macros.h"
namespace base_icu {
@@ -74,32 +75,28 @@
* lead bytes above 0xf4 are illegal.
* We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
*/
-const uint8
-utf8_countTrailBytes[256]={
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+const uint8_t utf8_countTrailBytes[256] =
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3,
- 3, 3, 3, /* illegal in Unicode */
- 4, 4, 4, 4, /* illegal in Unicode */
- 5, 5, /* illegal in Unicode */
- 0, 0 /* illegal bytes 0xfe and 0xff */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
+ 3, 3, /* illegal in Unicode */
+ 4, 4, 4, 4, /* illegal in Unicode */
+ 5, 5, /* illegal in Unicode */
+ 0, 0 /* illegal bytes 0xfe and 0xff */
};
static const UChar32
@@ -133,12 +130,15 @@
*
* Note that a UBool is the same as an int8_t.
*/
-UChar32
-utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict) {
- int32 i=*pi;
- uint8 count=CBU8_COUNT_TRAIL_BYTES(c);
+UChar32 utf8_nextCharSafeBody(const uint8_t* s,
+ int32_t* pi,
+ int32_t length,
+ UChar32 c,
+ UBool strict) {
+ int32_t i = *pi;
+ uint8_t count = CBU8_COUNT_TRAIL_BYTES(c);
if((i)+count<=(length)) {
- uint8 trail, illegal=0;
+ uint8_t trail, illegal = 0;
CBU8_MASK_LEAD_BYTE((c), count);
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
@@ -194,7 +194,7 @@
/* illegal is also set if count>=4 */
if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict!=-2)) {
/* error handling */
- uint8 errorCount=count;
+ uint8_t errorCount = count;
/* don't go beyond this sequence */
i=*pi;
while(count>0 && CBU8_IS_TRAIL(s[i])) {
@@ -212,7 +212,7 @@
}
} else /* too few bytes left */ {
/* error handling */
- int32 i0=i;
+ int32_t i0 = i;
/* don't just set (i)=(length) in case there is an illegal sequence */
while((i)<(length) && CBU8_IS_TRAIL(s[i])) {
++(i);
diff --git a/src/base/third_party/icu/icu_utf.h b/src/base/third_party/icu/icu_utf.h
index 2b993b0..4370fde 100644
--- a/src/base/third_party/icu/icu_utf.h
+++ b/src/base/third_party/icu/icu_utf.h
@@ -17,13 +17,13 @@
#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
-#include "base/basictypes.h"
+#include <stdint.h>
namespace base_icu {
-typedef int32 UChar32;
-typedef uint16 UChar;
-typedef int8 UBool;
+typedef int32_t UChar32;
+typedef uint16_t UChar;
+typedef int8_t UBool;
// General ---------------------------------------------------------------------
// from utf.h
@@ -54,10 +54,9 @@
* @return TRUE or FALSE
* @stable ICU 2.4
*/
-#define CBU_IS_UNICODE_NONCHAR(c) \
- ((c)>=0xfdd0 && \
- ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
- (uint32)(c)<=0x10ffff)
+#define CBU_IS_UNICODE_NONCHAR(c) \
+ ((c) >= 0xfdd0 && ((uint32_t)(c) <= 0xfdef || ((c)&0xfffe) == 0xfffe) && \
+ (uint32_t)(c) <= 0x10ffff)
/**
* Is c a Unicode code point value (0..U+10ffff)
@@ -76,11 +75,10 @@
* @return TRUE or FALSE
* @stable ICU 2.4
*/
-#define CBU_IS_UNICODE_CHAR(c) \
- ((uint32)(c)<0xd800 || \
- ((uint32)(c)>0xdfff && \
- (uint32)(c)<=0x10ffff && \
- !CBU_IS_UNICODE_NONCHAR(c)))
+#define CBU_IS_UNICODE_CHAR(c) \
+ ((uint32_t)(c) < 0xd800 || \
+ ((uint32_t)(c) > 0xdfff && (uint32_t)(c) <= 0x10ffff && \
+ !CBU_IS_UNICODE_NONCHAR(c)))
/**
* Is this code point a surrogate (U+d800..U+dfff)?
@@ -103,13 +101,14 @@
// UTF-8 macros ----------------------------------------------------------------
// from utf8.h
-extern const uint8 utf8_countTrailBytes[256];
+extern const uint8_t utf8_countTrailBytes[256];
/**
* Count the trail bytes for a UTF-8 lead byte.
* @internal
*/
-#define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)leadByte])
+#define CBU8_COUNT_TRAIL_BYTES(leadByte) \
+ (base_icu::utf8_countTrailBytes[(uint8_t)leadByte])
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
@@ -131,7 +130,7 @@
* @return TRUE or FALSE
* @stable ICU 2.4
*/
-#define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e)
+#define CBU8_IS_LEAD(c) ((uint8_t)((c)-0xc0) < 0x3e)
/**
* Is this code unit (byte) a UTF-8 trail byte?
@@ -148,16 +147,16 @@
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
* @stable ICU 2.4
*/
-#define CBU8_LENGTH(c) \
- ((uint32)(c)<=0x7f ? 1 : \
- ((uint32)(c)<=0x7ff ? 2 : \
- ((uint32)(c)<=0xd7ff ? 3 : \
- ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \
- ((uint32)(c)<=0xffff ? 3 : 4)\
- ) \
- ) \
- ) \
- )
+#define CBU8_LENGTH(c) \
+ ((uint32_t)(c) <= 0x7f \
+ ? 1 \
+ : ((uint32_t)(c) <= 0x7ff \
+ ? 2 \
+ : ((uint32_t)(c) <= 0xd7ff \
+ ? 3 \
+ : ((uint32_t)(c) <= 0xdfff || (uint32_t)(c) > 0x10ffff \
+ ? 0 \
+ : ((uint32_t)(c) <= 0xffff ? 3 : 4)))))
/**
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
@@ -170,7 +169,11 @@
* Function for handling "next code point" with error-checking.
* @internal
*/
-UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict);
+UChar32 utf8_nextCharSafeBody(const uint8_t* s,
+ int32_t* pi,
+ int32_t length,
+ UChar32 c,
+ UBool strict);
/**
* Get a code point from a string at a code point boundary offset,
@@ -183,55 +186,59 @@
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to a negative value.
*
- * @param s const uint8 * string
+ * @param s const uint8_t * string
* @param i string offset, i<length
* @param length string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see CBU8_NEXT_UNSAFE
* @stable ICU 2.4
*/
-#define CBU8_NEXT(s, i, length, c) { \
- (c)=(s)[(i)++]; \
- if(((uint8)(c))>=0x80) { \
- if(CBU8_IS_LEAD(c)) { \
- (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)s, &(i), (int32)(length), c, -1); \
- } else { \
- (c)=CBU_SENTINEL; \
- } \
- } \
-}
+#define CBU8_NEXT(s, i, length, c) \
+ { \
+ (c) = (s)[(i)++]; \
+ if (((uint8_t)(c)) >= 0x80) { \
+ if (CBU8_IS_LEAD(c)) { \
+ (c) = base_icu::utf8_nextCharSafeBody((const uint8_t*)s, &(i), \
+ (int32_t)(length), c, -1); \
+ } else { \
+ (c) = CBU_SENTINEL; \
+ } \
+ } \
+ }
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
- * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the
+ * string.
* Otherwise, the result is undefined.
*
- * @param s const uint8 * string buffer
+ * @param s const uint8_t * string buffer
* @param i string offset
* @param c code point to append
* @see CBU8_APPEND
* @stable ICU 2.4
*/
-#define CBU8_APPEND_UNSAFE(s, i, c) { \
- if((uint32)(c)<=0x7f) { \
- (s)[(i)++]=(uint8)(c); \
- } else { \
- if((uint32)(c)<=0x7ff) { \
- (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \
- } else { \
- if((uint32)(c)<=0xffff) { \
- (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \
- } else { \
- (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \
- (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \
- } \
- (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \
- } \
- (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \
- } \
-}
+#define CBU8_APPEND_UNSAFE(s, i, c) \
+ { \
+ if ((uint32_t)(c) <= 0x7f) { \
+ (s)[(i)++] = (uint8_t)(c); \
+ } else { \
+ if ((uint32_t)(c) <= 0x7ff) { \
+ (s)[(i)++] = (uint8_t)(((c) >> 6) | 0xc0); \
+ } else { \
+ if ((uint32_t)(c) <= 0xffff) { \
+ (s)[(i)++] = (uint8_t)(((c) >> 12) | 0xe0); \
+ } else { \
+ (s)[(i)++] = (uint8_t)(((c) >> 18) | 0xf0); \
+ (s)[(i)++] = (uint8_t)((((c) >> 12) & 0x3f) | 0x80); \
+ } \
+ (s)[(i)++] = (uint8_t)((((c) >> 6) & 0x3f) | 0x80); \
+ } \
+ (s)[(i)++] = (uint8_t)(((c)&0x3f) | 0x80); \
+ } \
+ }
// UTF-16 macros ---------------------------------------------------------------
// from utf16.h
@@ -325,7 +332,7 @@
* @return 1 or 2
* @stable ICU 2.4
*/
-#define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2)
+#define CBU16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
@@ -353,16 +360,17 @@
* @param c output UChar32 variable
* @stable ICU 2.4
*/
-#define CBU16_NEXT(s, i, length, c) { \
- (c)=(s)[(i)++]; \
- if(CBU16_IS_LEAD(c)) { \
- uint16 __c2; \
- if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \
- ++(i); \
- (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
- } \
- } \
-}
+#define CBU16_NEXT(s, i, length, c) \
+ { \
+ (c) = (s)[(i)++]; \
+ if (CBU16_IS_LEAD(c)) { \
+ uint16_t __c2; \
+ if ((i) < (length) && CBU16_IS_TRAIL(__c2 = (s)[(i)])) { \
+ ++(i); \
+ (c) = CBU16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } \
+ }
/**
* Append a code point to a string, overwriting 1 or 2 code units.
@@ -377,14 +385,15 @@
* @see CBU16_APPEND
* @stable ICU 2.4
*/
-#define CBU16_APPEND_UNSAFE(s, i, c) { \
- if((uint32)(c)<=0xffff) { \
- (s)[(i)++]=(uint16)(c); \
- } else { \
- (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \
- (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \
- } \
-}
+#define CBU16_APPEND_UNSAFE(s, i, c) \
+ { \
+ if ((uint32_t)(c) <= 0xffff) { \
+ (s)[(i)++] = (uint16_t)(c); \
+ } else { \
+ (s)[(i)++] = (uint16_t)(((c) >> 10) + 0xd7c0); \
+ (s)[(i)++] = (uint16_t)(((c)&0x3ff) | 0xdc00); \
+ } \
+ }
} // namesapce base_icu
diff --git a/src/build/build_config.h b/src/build/build_config.h
index b07660d..d8c3db6 100644
--- a/src/build/build_config.h
+++ b/src/build/build_config.h
@@ -61,8 +61,8 @@
#error Please add support for your platform in build/build_config.h
#endif
-#if defined(USE_OPENSSL) && defined(USE_NSS)
-#error Cannot use both OpenSSL and NSS
+#if defined(USE_OPENSSL_CERTS) && defined(USE_NSS_CERTS)
+#error Cannot use both OpenSSL and NSS for certificates
#endif
// For access to standard BSD features, use OS_BSD instead of a
diff --git a/src/url/gurl.cc b/src/url/gurl.cc
index 6801dda..b75c8f5 100644
--- a/src/url/gurl.cc
+++ b/src/url/gurl.cc
@@ -2,21 +2,25 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include "url/gurl.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <ostream>
+
+#include "base/logging.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_util.h"
+
#ifdef WIN32
#include <windows.h>
#else
#include <pthread.h>
#endif
-#include <algorithm>
-#include <ostream>
-
-#include "url/gurl.h"
-
-#include "base/logging.h"
-#include "url/url_canon_stdstring.h"
-#include "url/url_util.h"
-
namespace {
static std::string* empty_string = NULL;
@@ -59,7 +63,7 @@
#endif // WIN32
-} // namespace
+} // namespace
GURL::GURL() : is_valid_(false) {
}
@@ -74,16 +78,16 @@
DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
}
-GURL::GURL(const std::string& url_string) {
+GURL::GURL(url::base::StringPiece url_string) {
InitCanonical(url_string, true);
}
-GURL::GURL(const url::base::string16& url_string) {
+GURL::GURL(url::base::StringPiece16 url_string) {
InitCanonical(url_string, true);
}
GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
- InitCanonical(url_string, false);
+ InitCanonical(url::base::StringPiece(url_string), false);
}
GURL::GURL(const char* canonical_spec,
@@ -104,7 +108,8 @@
}
template<typename STR>
-void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
+void GURL::InitCanonical(url::base::BasicStringPiece<STR> input_spec,
+ bool trim_path_end) {
// Reserve enough room in the output for the input, plus some extra so that
// we have room if we have to escape a few things without reallocating.
spec_.reserve(input_spec.size() + 32);
@@ -130,7 +135,7 @@
#ifndef NDEBUG
// For testing purposes, check that the parsed canonical URL is identical to
// what we would have produced. Skip checking for invalid URLs have no meaning
- // and we can't always canonicalize then reproducabely.
+ // and we can't always canonicalize then reproducibly.
if (is_valid_) {
url::Component scheme;
// We can't do this check on the inner_url of a filesystem URL, as
@@ -193,17 +198,8 @@
return spec_ > other.spec_;
}
-GURL GURL::Resolve(const std::string& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-GURL GURL::Resolve(const url::base::string16& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-
// Note: code duplicated below (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const std::string& relative,
- url::CharsetConverter* charset_converter) const {
+GURL GURL::Resolve(const std::string& relative) const {
// Not allowed for invalid URLs.
if (!is_valid_)
return GURL();
@@ -218,7 +214,7 @@
if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
parsed_, relative.data(),
static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
+ nullptr, &output, &result.parsed_)) {
// Error resolving, return an empty URL.
return GURL();
}
@@ -234,9 +230,7 @@
}
// Note: code duplicated above (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const url::base::string16& relative,
- url::CharsetConverter* charset_converter) const {
+GURL GURL::Resolve(const url::base::string16& relative) const {
// Not allowed for invalid URLs.
if (!is_valid_)
return GURL();
@@ -251,7 +245,7 @@
if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
parsed_, relative.data(),
static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
+ nullptr, &output, &result.parsed_)) {
// Error resolving, return an empty URL.
return GURL();
}
@@ -320,7 +314,7 @@
GURL GURL::GetOrigin() const {
// This doesn't make sense for invalid or nonstandard URLs, so return
- // the empty URL
+ // the empty URL.
if (!is_valid_ || !IsStandard())
return GURL();
@@ -338,7 +332,7 @@
}
GURL GURL::GetAsReferrer() const {
- if (!is_valid_ || !SchemeIsHTTPOrHTTPS())
+ if (!SchemeIsValidForReferrer())
return GURL();
if (!has_ref() && !has_username() && !has_password())
@@ -379,18 +373,23 @@
return url::IsStandard(spec_.data(), parsed_.scheme);
}
-bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
+bool GURL::SchemeIs(url::base::StringPiece lower_ascii_scheme) const {
+ DCHECK(url::base::IsStringASCII(lower_ascii_scheme));
+ DCHECK(url::base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme);
+
if (parsed_.scheme.len <= 0)
- return lower_ascii_scheme == NULL;
- return url::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
- spec_.data() + parsed_.scheme.end(),
- lower_ascii_scheme);
+ return lower_ascii_scheme.empty();
+ return scheme_piece() == lower_ascii_scheme;
}
bool GURL::SchemeIsHTTPOrHTTPS() const {
return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
}
+bool GURL::SchemeIsValidForReferrer() const {
+ return is_valid_ && IsReferrerScheme(spec_.data(), parsed_.scheme);
+}
+
bool GURL::SchemeIsWSOrWSS() const {
return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
}
@@ -416,16 +415,17 @@
}
std::string GURL::PathForRequest() const {
- DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
+ DCHECK(parsed_.path.len > 0)
+ << "Canonical path for requests should be non-empty";
if (parsed_.ref.len >= 0) {
- // Clip off the reference when it exists. The reference starts after the #
- // sign, so we have to subtract one to also remove it.
+ // Clip off the reference when it exists. The reference starts after the
+ // #-sign, so we have to subtract one to also remove it.
return std::string(spec_, parsed_.path.begin,
parsed_.ref.begin - parsed_.path.begin - 1);
}
// Compute the actual path length, rather than depending on the spec's
- // terminator. If we're an inner_url, our spec continues on into our outer
- // url's path/query/ref.
+ // terminator. If we're an inner_url, our spec continues on into our outer
+ // URL's path/query/ref.
int path_len = parsed_.path.len;
if (parsed_.query.is_valid())
path_len = parsed_.query.end() - parsed_.path.begin;
@@ -490,48 +490,45 @@
#endif // WIN32
-bool GURL::DomainIs(const char* lower_ascii_domain,
- int domain_len) const {
- // Return false if this URL is not valid or domain is empty.
- if (!is_valid_ || !domain_len)
+bool GURL::DomainIs(url::base::StringPiece lower_ascii_domain) const {
+ if (!is_valid_ || lower_ascii_domain.empty())
return false;
// FileSystem URLs have empty parsed_.host, so check this first.
if (SchemeIsFileSystem() && inner_url_)
- return inner_url_->DomainIs(lower_ascii_domain, domain_len);
+ return inner_url_->DomainIs(lower_ascii_domain);
if (!parsed_.host.is_nonempty())
return false;
- // Check whether the host name is end with a dot. If yes, treat it
- // the same as no-dot unless the input comparison domain is end
- // with dot.
- const char* last_pos = spec_.data() + parsed_.host.end() - 1;
+ // If the host name ends with a dot but the input domain doesn't,
+ // then we ignore the dot in the host name.
+ const char* host_last_pos = spec_.data() + parsed_.host.end() - 1;
int host_len = parsed_.host.len;
- if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
- last_pos--;
+ int domain_len = lower_ascii_domain.length();
+ if ('.' == *host_last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
+ host_last_pos--;
host_len--;
}
- // Return false if host's length is less than domain's length.
if (host_len < domain_len)
return false;
- // Compare this url whether belong specific domain.
- const char* start_pos = spec_.data() + parsed_.host.begin +
- host_len - domain_len;
+ // |host_first_pos| is the start of the compared part of the host name, not
+ // the start of the whole host name.
+ const char* host_first_pos = spec_.data() + parsed_.host.begin +
+ host_len - domain_len;
- if (!url::LowerCaseEqualsASCII(start_pos,
- last_pos + 1,
- lower_ascii_domain,
- lower_ascii_domain + domain_len))
+ if (!url::base::LowerCaseEqualsASCII(
+ url::base::StringPiece(host_first_pos, domain_len), lower_ascii_domain))
return false;
- // Check whether host has right domain start with dot, make sure we got
- // right domain range. For example www.google.com has domain
- // "google.com" but www.iamnotgoogle.com does not.
+ // Make sure there aren't extra characters in host before the compared part;
+ // if the host name is longer than the input domain name, then the character
+ // immediately before the compared part should be a dot. For example,
+ // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
- '.' != *(start_pos - 1))
+ '.' != *(host_first_pos - 1))
return false;
return true;
diff --git a/src/url/gurl.h b/src/url/gurl.h
index 16d9a2a..70f70ec 100644
--- a/src/url/gurl.h
+++ b/src/url/gurl.h
@@ -5,21 +5,48 @@
#ifndef URL_GURL_H_
#define URL_GURL_H_
+#include <stddef.h>
+
#include <iosfwd>
#include <memory>
#include <string>
#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
#include "url/url_constants.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
+// Represents a URL.
+//
+// A parsed canonicalized URL will be guaranteed UTF-8. Only the ref (if
+ // specified) can be non-ASCII; the host, path, etc. will be guaranteed ASCII
+// and any non-ASCII characters will be encoded and % escaped.
+//
+// The string representation of a URL is called the spec(). Getting the
+// spec will assert if the URL is invalid to help protect against malicious
+// URLs. If you want the "best effort" canonicalization of an invalid URL, you
+// can use possibly_invalid_spec(). Test validity with is_valid(). Data and
+// javascript URLs use GetContent() to extract the data.
+//
+// This class has existence checkers and getters for the various components of
+// a URL. Existence is different than being nonempty. "http://www.google.com/?"
+// has a query that just happens to be empty, and has_query() will return true
+// while the query getters will return the empty string.
+//
+// Prefer not to modify a URL using string operations (though sometimes this is
+// unavoidable). Instead, use ReplaceComponents which can replace or delete
+// multiple parts of a URL in one step, doesn't re-canonicalize unchanged
+// sections, and avoids some screw-ups. An example is creating a URL with a
+// path that contains a literal '#'. Using string concatenation will generate a
+// URL with a truncated path and a reference fragment, while ReplaceComponents
+// will know to escape this and produce the desired result.
class URL_EXPORT GURL {
public:
- typedef url::StdStringReplacements<std::string> Replacements;
- typedef url::StdStringReplacements<url::base::string16> ReplacementsW;
+ typedef url::StringPieceReplacements<std::string> Replacements;
+ typedef url::StringPieceReplacements<url::base::string16> ReplacementsW;
// Creates an empty, invalid URL.
GURL();
@@ -28,15 +55,9 @@
// to reallocating the string. It does not re-parse.
GURL(const GURL& other);
- // The narrow version requires the input be UTF-8. Invalid UTF-8 input will
- // result in an invalid URL.
- //
- // The wide version should also take an encoding parameter so we know how to
- // encode the query parameters. It is probably sufficient for the narrow
- // version to assume the query parameter encoding should be the same as the
- // input encoding.
- explicit GURL(const std::string& url_string /*, output_param_encoding*/);
- explicit GURL(const url::base::string16& url_string /*, output_param_encoding*/);
+ // The strings to this constructor should be UTF-8 / UTF-16.
+ explicit GURL(url::base::StringPiece url_string);
+ explicit GURL(url::base::StringPiece16 url_string);
// Constructor for URLs that have already been parsed and canonicalized. This
// is used for conversions from KURL, for example. The caller must supply all
@@ -91,7 +112,7 @@
// Returns the potentially invalid spec for a the URL. This spec MUST NOT be
// modified or sent over the network. It is designed to be displayed in error
- // messages to the user, as the apperance of the spec may explain the error.
+ // messages to the user, as the appearance of the spec may explain the error.
// If the spec is valid, the valid spec will be returned.
//
// The returned string is guaranteed to be valid UTF-8.
@@ -124,9 +145,8 @@
// pages.
//
// It may be impossible to resolve the URLs properly. If the input is not
- // "standard" (SchemeIsStandard() == false) and the input looks relative, we
- // can't resolve it. In these cases, the result will be an empty, invalid
- // GURL.
+ // "standard" (IsStandard() == false) and the input looks relative, we can't
+ // resolve it. In these cases, the result will be an empty, invalid GURL.
//
// The result may also be a nonempty, invalid URL if the input has some kind
// of encoding error. In these cases, we will try to construct a "good" URL
@@ -137,20 +157,6 @@
GURL Resolve(const std::string& relative) const;
GURL Resolve(const url::base::string16& relative) const;
- // Like Resolve() above but takes a character set encoder which will be used
- // for any query text specified in the input. The charset converter parameter
- // may be NULL, in which case it will be treated as UTF-8.
- //
- // TODO(brettw): These should be replaced with versions that take something
- // more friendly than a raw CharsetConverter (maybe like an ICU character set
- // name).
- GURL ResolveWithCharsetConverter(
- const std::string& relative,
- url::CharsetConverter* charset_converter) const;
- GURL ResolveWithCharsetConverter(
- const url::base::string16& relative,
- url::CharsetConverter* charset_converter) const;
-
// Creates a new GURL by replacing the current URL's components with the
// supplied versions. See the Replacements class in url_canon.h for more.
//
@@ -194,21 +200,24 @@
// returned.
GURL GetAsReferrer() const;
- // Returns true if the scheme for the current URL is a known "standard"
- // scheme. Standard schemes have an authority and a path section. This
- // includes file: and filesystem:, which some callers may want to filter out
- // explicitly by calling SchemeIsFile[System].
+ // Returns true if the scheme for the current URL is a known "standard-format"
+ // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic
+ // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes
+ // file: and filesystem:, which some callers may want to filter out explicitly
+ // by calling SchemeIsFile[System].
bool IsStandard() const;
// Returns true if the given parameter (should be lower-case ASCII to match
- // the canonicalized scheme) is the scheme for this URL. This call is more
- // efficient than getting the scheme and comparing it because no copies or
- // object constructions are done.
- bool SchemeIs(const char* lower_ascii_scheme) const;
+ // the canonicalized scheme) is the scheme for this URL. Do not include a
+ // colon.
+ bool SchemeIs(url::base::StringPiece lower_ascii_scheme) const;
// Returns true if the scheme is "http" or "https".
bool SchemeIsHTTPOrHTTPS() const;
+ // Returns true if the scheme is valid for use as a referrer.
+ bool SchemeIsValidForReferrer() const;
+
// Returns true is the scheme is "ws" or "wss".
bool SchemeIsWSOrWSS() const;
@@ -223,10 +232,15 @@
return SchemeIs(url::kFileSystemScheme);
}
- // If the scheme indicates a secure connection
- bool SchemeIsSecure() const {
- return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme) ||
- (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure());
+ // Returns true if the scheme indicates a network connection that uses TLS or
+ // some other cryptographic protocol (e.g. QUIC) for security.
+ //
+ // This function is not a complete test of whether or not an origin's code
+ // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for a
+ // higher-level and more complete semantics. See that function's documentation
+ // for more detail.
+ bool SchemeIsCryptographic() const {
+ return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme);
}
// Returns true if the scheme is "blob".
@@ -235,82 +249,112 @@
}
// The "content" of the URL is everything after the scheme (skipping the
- // scheme delimiting colon). It is an error to get the origin of an invalid
- // URL. The result will be an empty string.
+ // scheme delimiting colon). It is an error to get the content of an invalid
+ // URL: the result will be an empty string.
std::string GetContent() const;
// Returns true if the hostname is an IP address. Note: this function isn't
// as cheap as a simple getter because it re-parses the hostname to verify.
- // This currently identifies only IPv4 addresses (bug 822685).
bool HostIsIPAddress() const;
- // Getters for various components of the URL. The returned string will be
- // empty if the component is empty or is not present.
- std::string scheme() const { // Not including the colon. See also SchemeIs.
+ // Not including the colon. If you are comparing schemes, prefer SchemeIs.
+ bool has_scheme() const {
+ return parsed_.scheme.len >= 0;
+ }
+ std::string scheme() const {
return ComponentString(parsed_.scheme);
}
+ url::base::StringPiece scheme_piece() const {
+ return ComponentStringPiece(parsed_.scheme);
+ }
+
+ bool has_username() const {
+ return parsed_.username.len >= 0;
+ }
std::string username() const {
return ComponentString(parsed_.username);
}
- std::string password() const {
- return ComponentString(parsed_.password);
- }
- // Note that this may be a hostname, an IPv4 address, or an IPv6 literal
- // surrounded by square brackets, like "[2001:db8::1]". To exclude these
- // brackets, use HostNoBrackets() below.
- std::string host() const {
- return ComponentString(parsed_.host);
- }
- std::string port() const { // Returns -1 if "default"
- return ComponentString(parsed_.port);
- }
- std::string path() const { // Including first slash following host
- return ComponentString(parsed_.path);
- }
- std::string query() const { // Stuff following '?'
- return ComponentString(parsed_.query);
- }
- std::string ref() const { // Stuff following '#'
- return ComponentString(parsed_.ref);
+ url::base::StringPiece username_piece() const {
+ return ComponentStringPiece(parsed_.username);
}
- // Existance querying. These functions will return true if the corresponding
- // URL component exists in this URL. Note that existance is different than
- // being nonempty. http://www.google.com/? has a query that just happens to
- // be empty, and has_query() will return true.
- bool has_scheme() const {
- return parsed_.scheme.len >= 0;
- }
- bool has_username() const {
- return parsed_.username.len >= 0;
- }
bool has_password() const {
return parsed_.password.len >= 0;
}
+ std::string password() const {
+ return ComponentString(parsed_.password);
+ }
+ url::base::StringPiece password_piece() const {
+ return ComponentStringPiece(parsed_.password);
+ }
+
+ // The host may be a hostname, an IPv4 address, or an IPv6 literal surrounded
+ // by square brackets, like "[2001:db8::1]". To exclude these brackets, use
+ // HostNoBrackets() below.
bool has_host() const {
- // Note that hosts are special, absense of host means length 0.
+ // Note that hosts are special, absence of host means length 0.
return parsed_.host.len > 0;
}
+ std::string host() const {
+ return ComponentString(parsed_.host);
+ }
+ url::base::StringPiece host_piece() const {
+ return ComponentStringPiece(parsed_.host);
+ }
+
+ // The port if one is explicitly specified. Most callers will want IntPort()
+ // or EffectiveIntPort() instead of these. The getters will not include the
+ // ':'.
bool has_port() const {
return parsed_.port.len >= 0;
}
+ std::string port() const {
+ return ComponentString(parsed_.port);
+ }
+ url::base::StringPiece port_piece() const {
+ return ComponentStringPiece(parsed_.port);
+ }
+
+ // Including first slash following host, up to the query. The URL
+ // "http://www.google.com/" has a path of "/".
bool has_path() const {
- // Note that http://www.google.com/" has a path, the path is "/". This can
- // return false only for invalid or nonstandard URLs.
return parsed_.path.len >= 0;
}
+ std::string path() const {
+ return ComponentString(parsed_.path);
+ }
+ url::base::StringPiece path_piece() const {
+ return ComponentStringPiece(parsed_.path);
+ }
+
+ // Stuff following '?' up to the ref. The getters will not include the '?'.
bool has_query() const {
return parsed_.query.len >= 0;
}
+ std::string query() const {
+ return ComponentString(parsed_.query);
+ }
+ url::base::StringPiece query_piece() const {
+ return ComponentStringPiece(parsed_.query);
+ }
+
+ // Stuff following '#' to the end of the string. This will be UTF-8 encoded
+ // (not necessarily ASCII). The getters will not include the '#'.
bool has_ref() const {
return parsed_.ref.len >= 0;
}
+ std::string ref() const {
+ return ComponentString(parsed_.ref);
+ }
+ url::base::StringPiece ref_piece() const {
+ return ComponentStringPiece(parsed_.ref);
+ }
// Returns a parsed version of the port. Can also be any of the special
// values defined in Parsed for ExtractPort.
int IntPort() const;
- // Returns the port number of the url, or the default port number.
+ // Returns the port number of the URL, or the default port number.
// If the scheme has no concept of port (or unknown default) returns
// PORT_UNSPECIFIED.
int EffectiveIntPort() const;
@@ -324,54 +368,48 @@
std::string PathForRequest() const;
// Returns the host, excluding the square brackets surrounding IPv6 address
- // literals. This can be useful for passing to getaddrinfo().
+ // literals. This can be useful for passing to getaddrinfo().
std::string HostNoBrackets() const;
// Returns true if this URL's host matches or is in the same domain as
- // the given input string. For example if this URL was "www.google.com",
- // this would match "com", "google.com", and "www.google.com
- // (input domain should be lower-case ASCII to match the canonicalized
- // scheme). This call is more efficient than getting the host and check
+ // the given input string. For example, if the hostname of the URL is
+ // "www.google.com", this will return true for "com", "google.com", and
+ // "www.google.com".
+ //
+ // The input domain should be lower-case ASCII to match the canonicalized
+ // scheme. This call is more efficient than getting the host and checking
// whether host has the specific domain or not because no copies or
// object constructions are done.
- //
- // If function DomainIs has parameter domain_len, which means the parameter
- // lower_ascii_domain does not gurantee to terminate with NULL character.
- bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+ bool DomainIs(url::base::StringPiece lower_ascii_domain) const;
- // If function DomainIs only has parameter lower_ascii_domain, which means
- // domain string should be terminate with NULL character.
- bool DomainIs(const char* lower_ascii_domain) const {
- return DomainIs(lower_ascii_domain,
- static_cast<int>(strlen(lower_ascii_domain)));
- }
-
- // Swaps the contents of this GURL object with the argument without doing
+ // Swaps the contents of this GURL object with |other|, without doing
// any memory allocations.
void Swap(GURL* other);
// Returns a reference to a singleton empty GURL. This object is for callers
// who return references but don't have anything to return in some cases.
- // This function may be called from any thread.
+ // If you just want an empty URL for normal use, prefer GURL(). This function
+ // may be called from any thread.
static const GURL& EmptyGURL();
- // Returns the inner URL of a nested URL [currently only non-null for
- // filesystem: URLs].
+ // Returns the inner URL of a nested URL (currently only non-null for
+ // filesystem URLs).
const GURL* inner_url() const {
return inner_url_.get();
}
private:
// Variant of the string parsing constructor that allows the caller to elect
- // retain trailing whitespace, if any, on the passed URL spec but only if the
- // scheme is one that allows trailing whitespace. The primary use-case is
+ // to retain trailing whitespace, if any, on the passed URL spec, but only if
+ // the scheme is one that allows trailing whitespace. The primary use-case is
// for data: URLs. In most cases, you want to use the single parameter
// constructor above.
enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
GURL(const std::string& url_string, RetainWhiteSpaceSelector);
template<typename STR>
- void InitCanonical(const STR& input_spec, bool trim_path_end);
+ void InitCanonical(url::base::BasicStringPiece<STR> input_spec,
+ bool trim_path_end);
void InitializeFromCanonicalSpec();
@@ -381,6 +419,11 @@
return std::string();
return std::string(spec_, comp.begin, comp.len);
}
+ url::base::StringPiece ComponentStringPiece(const url::Component& comp) const {
+ if (comp.len <= 0)
+ return url::base::StringPiece();
+ return url::base::StringPiece(&spec_[comp.begin], comp.len);
+ }
// The actual text of the URL, in canonical ASCII form.
std::string spec_;
@@ -395,8 +438,6 @@
// Used for nested schemes [currently only filesystem:].
std::unique_ptr<GURL> inner_url_;
-
- // TODO bug 684583: Add encoding for query params.
};
// Stream operator so GURL can be used in assertion statements.
diff --git a/src/url/gurl_unittest.cc b/src/url/gurl_unittest.cc
index 112ee5f..7b83468 100644
--- a/src/url/gurl_unittest.cc
+++ b/src/url/gurl_unittest.cc
@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <stddef.h>
+
#include "base/macros.h"
#include "testing/base/public/gunit.h"
#include "url/gurl.h"
@@ -45,14 +47,15 @@
EXPECT_EQ("something:///HOSTNAME.com/",
TypesTestCase("something:///HOSTNAME.com/"));
- // In the reverse, known schemes should always trigger standard URL handling.
+ // Conversely, URLs with known schemes should always trigger standard URL
+ // handling.
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
#ifdef WIN32
- // URLs that look like absolute Windows drive specs.
+ // URLs that look like Windows absolute path specs.
EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
@@ -60,11 +63,16 @@
#endif
}
-// Test the basic creation and querying of components in a GURL. We assume
+// Test the basic creation and querying of components in a GURL. We assume that
// the parser is already tested and works, so we are mostly interested if the
// object does the right thing with the results.
TEST(GURLTest, Components) {
+ GURL empty_url(WStringToUTF16(L""));
+ EXPECT_TRUE(empty_url.is_empty());
+ EXPECT_FALSE(empty_url.is_valid());
+
GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+ EXPECT_FALSE(url.is_empty());
EXPECT_TRUE(url.is_valid());
EXPECT_TRUE(url.SchemeIs("http"));
EXPECT_FALSE(url.SchemeIsFile());
@@ -175,7 +183,7 @@
EXPECT_EQ("", invalid2.ref());
}
-// This is a regression test for http://crbug.com/309975 .
+// This is a regression test for http://crbug.com/309975.
TEST(GURLTest, SelfAssign) {
GURL a("filesystem:http://example.com/temporary/");
// This should not crash.
@@ -245,9 +253,9 @@
}
TEST(GURLTest, ExtraSlashesBeforeAuthority) {
- // According to RFC3986, the hier-part for URI with an authority must use only
- // two slashes, GURL intentionally just ignores slashes more than 2 and parses
- // the following part as an authority.
+ // According to RFC3986, the hierarchical part for URI with an authority
+ // must use only two slashes; GURL intentionally just ignores extra slashes
+ // if there are more than 2, and parses the following part as an authority.
GURL url("http:///host");
EXPECT_EQ("host", url.host());
EXPECT_EQ("/", url.path());
@@ -281,6 +289,9 @@
const char* expected;
} resolve_cases[] = {
{"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"},
+ {"http://www.google.com/foo/", "bar", true, "http://www.google.com/foo/bar"},
+ {"http://www.google.com/foo/", "/bar", true, "http://www.google.com/bar"},
+ {"http://www.google.com/foo", "bar", true, "http://www.google.com/bar"},
{"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"},
{"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
{"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
@@ -375,7 +386,7 @@
}
TEST(GURLTest, Replacements) {
- // The url canonicalizer replacement test will handle most of these case.
+ // The URL canonicalizer replacement test will handle most of these cases.
// The most important thing to do here is to check that the proper
// canonicalizer gets called based on the scheme of the input.
struct ReplaceCase {
@@ -392,7 +403,7 @@
} replace_cases[] = {
{"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"},
{"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
- {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"},
+ {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo", "search", "ref", "http://www.google.com:99/foo?search#ref"},
#ifdef WIN32
{"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"},
#endif
@@ -432,7 +443,7 @@
EXPECT_EQ("data: one ? two ", url_no_ref.spec());
- // Importing a parsed url via this constructor overload will retain trailing
+ // Importing a parsed URL via this constructor overload will retain trailing
// whitespace.
GURL import_url(url_no_ref.spec(),
url_no_ref.parsed_for_possibly_invalid_spec(),
@@ -558,43 +569,56 @@
}
TEST(GURLTest, DomainIs) {
- const char google_domain[] = "google.com";
+ GURL url_1("http://google.com/foo");
+ EXPECT_TRUE(url_1.DomainIs("google.com"));
- GURL url_1("http://www.google.com:99/foo");
- EXPECT_TRUE(url_1.DomainIs(google_domain));
+ // Subdomain and port are ignored.
+ GURL url_2("http://www.google.com:99/foo");
+ EXPECT_TRUE(url_2.DomainIs("google.com"));
- GURL url_2("http://google.com:99/foo");
- EXPECT_TRUE(url_2.DomainIs(google_domain));
+ // Different top-level domain.
+ GURL url_3("http://www.google.com.cn/foo");
+ EXPECT_FALSE(url_3.DomainIs("google.com"));
- GURL url_3("http://google.com./foo");
- EXPECT_TRUE(url_3.DomainIs(google_domain));
+ // Different host name.
+ GURL url_4("http://www.iamnotgoogle.com/foo");
+ EXPECT_FALSE(url_4.DomainIs("google.com"));
- GURL url_4("http://google.com/foo");
- EXPECT_FALSE(url_4.DomainIs("google.com."));
+ // The input must be lower-cased otherwise DomainIs returns false.
+ GURL url_5("http://www.google.com/foo");
+ EXPECT_FALSE(url_5.DomainIs("Google.com"));
- GURL url_5("http://google.com./foo");
- EXPECT_TRUE(url_5.DomainIs("google.com."));
+ // If the URL is invalid, DomainIs returns false.
+ GURL invalid_url("google.com");
+ EXPECT_FALSE(invalid_url.is_valid());
+ EXPECT_FALSE(invalid_url.DomainIs("google.com"));
+}
- GURL url_6("http://www.google.com./foo");
- EXPECT_TRUE(url_6.DomainIs(".com."));
+TEST(GURLTest, DomainIsTerminatingDotBehavior) {
+ // If the host part ends with a dot, it matches input domains
+ // with or without a dot.
+ GURL url_with_dot("http://www.google.com./foo");
+ EXPECT_TRUE(url_with_dot.DomainIs("google.com"));
+ EXPECT_TRUE(url_with_dot.DomainIs("google.com."));
+ EXPECT_TRUE(url_with_dot.DomainIs(".com"));
+ EXPECT_TRUE(url_with_dot.DomainIs(".com."));
- GURL url_7("http://www.balabala.com/foo");
- EXPECT_FALSE(url_7.DomainIs(google_domain));
+ // But, if the host name doesn't end with a dot and the input
+ // domain does, then it's considered to not match.
+ GURL url_without_dot("http://google.com/foo");
+ EXPECT_FALSE(url_without_dot.DomainIs("google.com."));
- GURL url_8("http://www.google.com.cn/foo");
- EXPECT_FALSE(url_8.DomainIs(google_domain));
+ // If the URL ends with two dots, it doesn't match.
+ GURL url_with_two_dots("http://www.google.com../foo");
+ EXPECT_FALSE(url_with_two_dots.DomainIs("google.com"));
+}
- GURL url_9("http://www.iamnotgoogle.com/foo");
- EXPECT_FALSE(url_9.DomainIs(google_domain));
+TEST(GURLTest, DomainIsWithFilesystemScheme) {
+ GURL url_1("filesystem:http://www.google.com:99/foo/");
+ EXPECT_TRUE(url_1.DomainIs("google.com"));
- GURL url_10("http://www.iamnotgoogle.com../foo");
- EXPECT_FALSE(url_10.DomainIs(".com"));
-
- GURL url_11("filesystem:http://www.google.com:99/foo/");
- EXPECT_TRUE(url_11.DomainIs(google_domain));
-
- GURL url_12("filesystem:http://www.iamnotgoogle.com/foo/");
- EXPECT_FALSE(url_12.DomainIs(google_domain));
+ GURL url_2("filesystem:http://www.iamnotgoogle.com/foo/");
+ EXPECT_FALSE(url_2.DomainIs("google.com"));
}
// Newlines should be stripped from inputs.
@@ -639,4 +663,29 @@
EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob());
}
+TEST(GURLTest, ContentAndPathForNonStandardURLs) {
+ struct TestCase {
+ const char* url;
+ const char* expected;
+ } cases[] = {
+ {"null", ""},
+ {"not-a-standard-scheme:this is arbitrary content",
+ "this is arbitrary content"},
+ {"view-source:http://example.com/path", "http://example.com/path"},
+ {"blob:http://example.com/GUID", "http://example.com/GUID"},
+ {"blob://http://example.com/GUID", "//http://example.com/GUID"},
+ {"blob:http://user:password@example.com/GUID",
+ "http://user:password@example.com/GUID"},
+
+ // TODO(mkwst): This seems like a bug. https://crbug.com/513600
+ {"filesystem:http://example.com/path", "/"},
+ };
+
+ for (const auto& test : cases) {
+ GURL url(test.url);
+ EXPECT_EQ(test.expected, url.path()) << test.url;
+ EXPECT_EQ(test.expected, url.GetContent()) << test.url;
+ }
+}
+
} // namespace url
diff --git a/src/url/origin.cc b/src/url/origin.cc
index fdb8913..43b5e7e 100644
--- a/src/url/origin.cc
+++ b/src/url/origin.cc
@@ -1,20 +1,86 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "url/origin.h"
+#include <stdint.h>
+#include <string.h>
+
#include "base/logging.h"
-#include "base/strings/string_util.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
namespace url {
-Origin::Origin() : string_("null") {}
+Origin::Origin() : unique_(true) {
+}
-Origin::Origin(const std::string& origin) : string_(origin) {
- DCHECK(origin == "null" || MatchPattern(origin, "?*://?*"));
- DCHECK_GT(origin.size(), 0u);
- DCHECK(origin == "file://" || origin[origin.size() - 1] != '/');
+Origin::Origin(const GURL& url) : unique_(true) {
+ if (!url.is_valid() || (!url.IsStandard() && !url.SchemeIsBlob()))
+ return;
+
+ if (url.SchemeIsFileSystem()) {
+ tuple_ = SchemeHostPort(*url.inner_url());
+ } else if (url.SchemeIsBlob()) {
+ // If we're dealing with a 'blob:' URL, https://url.spec.whatwg.org/#origin
+ // defines the origin as the origin of the URL which results from parsing
+ // the "path", which boils down to everything after the scheme. GURL's
+ // 'GetContent()' gives us exactly that.
+ tuple_ = SchemeHostPort(GURL(url.GetContent()));
+ } else {
+ tuple_ = SchemeHostPort(url);
+ }
+
+ unique_ = tuple_.IsInvalid();
+}
+
+Origin::Origin(base::StringPiece scheme, base::StringPiece host, uint16_t port)
+ : tuple_(scheme, host, port) {
+ unique_ = tuple_.IsInvalid();
+}
+
+Origin::~Origin() {
+}
+
+// static
+Origin Origin::UnsafelyCreateOriginWithoutNormalization(
+ base::StringPiece scheme,
+ base::StringPiece host,
+ uint16_t port) {
+ return Origin(scheme, host, port);
+}
+
+std::string Origin::Serialize() const {
+ if (unique())
+ return "null";
+
+ if (scheme() == kFileScheme)
+ return "file://";
+
+ return tuple_.Serialize();
+}
+
+bool Origin::IsSameOriginWith(const Origin& other) const {
+ if (unique_ || other.unique_)
+ return false;
+
+ return tuple_.Equals(other.tuple_);
+}
+
+bool Origin::operator<(const Origin& other) const {
+ return tuple_ < other.tuple_;
+}
+
+std::ostream& operator<<(std::ostream& out, const url::Origin& origin) {
+ return out << origin.Serialize();
+}
+
+bool IsSameOriginWith(const GURL& a, const GURL& b) {
+ return Origin(a).IsSameOriginWith(Origin(b));
}
} // namespace url
diff --git a/src/url/origin.h b/src/url/origin.h
index 777e4e1..aab1f05 100644
--- a/src/url/origin.h
+++ b/src/url/origin.h
@@ -1,33 +1,142 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef URL_ORIGIN_H_
#define URL_ORIGIN_H_
+#include <stdint.h>
+
#include <string>
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/scheme_host_port.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_constants.h"
#include "url/url_export.h"
+class GURL;
+
namespace url {
-// Origin represents a Web Origin serialized to a string.
-// See RFC6454 for details.
+// An Origin is a tuple of (scheme, host, port), as described in RFC 6454.
+//
+// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'.
+// If you only need to extract the bits of a URL which are relevant for a
+// network connection, use 'url::SchemeHostPort'.
+//
+// STL;SDR: If you aren't making actual network connections, use 'url::Origin'.
+//
+// 'Origin', like 'SchemeHostPort', is composed of a tuple of (scheme, host,
+// port), but contains a number of additional concepts which make it appropriate
+// for use as a security boundary and access control mechanism between contexts.
+//
+// This class ought to be used when code needs to determine if two resources
+// are "same-origin", and when a canonical serialization of an origin is
+// required. Note that some origins are "unique", meaning that they are not
+// same-origin with any other origin (including themselves).
+//
+// There are a few subtleties to note:
+//
+// * Invalid and non-standard GURLs are parsed as unique origins. This includes
+// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'.
+//
+// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the
+// internals of the URL. That is, 'filesystem:https://example.com/temporary/f'
+// is parsed as ('https', 'example.com', 443).
+//
+// * Unique origins all serialize to the string "null"; this means that the
+// serializations of two unique origins are identical to each other, though
+// the origins themselves are not "the same". This means that origins'
+// serializations must not be relied upon for security checks.
+//
+// * GURLs with a 'file' scheme are tricky. They are parsed as ('file', '', 0),
+// but their behavior may differ from embedder to embedder.
+//
+// * The host component of an IPv6 address includes brackets, just like the URL
+// representation.
+//
+// Usage:
+//
+// * Origins are generally constructed from an already-canonicalized GURL:
+//
+// GURL url("https://example.com/");
+// url::Origin origin(url);
+// origin.scheme(); // "https"
+// origin.host(); // "example.com"
+// origin.port(); // 443
+// origin.unique(); // false
+//
+// * To answer the question "Are |this| and |that| "same-origin" with each
+// other?", use |Origin::IsSameOriginWith|:
+//
+// if (this.IsSameOriginWith(that)) {
+// // Amazingness goes here.
+// }
class URL_EXPORT Origin {
public:
+ // Creates a unique Origin.
Origin();
- explicit Origin(const std::string& origin);
- const std::string& string() const { return string_; }
+ // Creates an Origin from |url|, as described at
+ // https://url.spec.whatwg.org/#origin, with the following additions:
+ //
+ // 1. If |url| is invalid or non-standard, a unique Origin is constructed.
+ // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed
+ // out of everything in the URL which follows the scheme).
+ // 3. 'file' URLs all parse as ("file", "", 0).
+ explicit Origin(const GURL& url);
- bool IsSameAs(const Origin& that) const {
- return string_ == that.string_;
+ // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters
+ // must be valid and canonicalized; input that is not yields a unique Origin.
+ // To create a unique origin on purpose, 'url::Origin()' is the right way.
+ //
+ // This factory should be used in order to pass 'Origin' objects back and
+ // forth over IPC (as transitioning through GURL would risk potentially
+ // dangerous recanonicalization); other potential callers should prefer the
+ // 'GURL'-based constructor.
+ static Origin UnsafelyCreateOriginWithoutNormalization(
+ base::StringPiece scheme,
+ base::StringPiece host,
+ uint16_t port);
+
+ ~Origin();
+
+ // For unique origins, these return ("", "", 0).
+ const std::string& scheme() const { return tuple_.scheme(); }
+ const std::string& host() const { return tuple_.host(); }
+ uint16_t port() const { return tuple_.port(); }
+
+ bool unique() const { return unique_; }
+
+ // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with
+ // the addition that all Origins with a 'file' scheme serialize to "file://".
+ std::string Serialize() const;
+
+ // Two Origins are "same-origin" if their schemes, hosts, and ports are exact
+ // matches; and neither is unique.
+ bool IsSameOriginWith(const Origin& other) const;
+ bool operator==(const Origin& other) const {
+ return IsSameOriginWith(other);
}
+ // Allows Origin to be used as a key in STL (for example, a std::set or
+ // std::map).
+ bool operator<(const Origin& other) const;
+
private:
- std::string string_;
+ Origin(base::StringPiece scheme, base::StringPiece host, uint16_t port);
+
+ SchemeHostPort tuple_;
+ bool unique_;
};
+URL_EXPORT std::ostream& operator<<(std::ostream& out, const Origin& origin);
+
+URL_EXPORT bool IsSameOriginWith(const GURL& a, const GURL& b);
+
} // namespace url
#endif // URL_ORIGIN_H_
diff --git a/src/url/origin_unittest.cc b/src/url/origin_unittest.cc
index 910a1cf..68371a8 100644
--- a/src/url/origin_unittest.cc
+++ b/src/url/origin_unittest.cc
@@ -1,41 +1,255 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "testing/base/public/gunit.h"
-#include "url/origin.h"
+#include <stddef.h>
+#include <stdint.h>
-namespace url {
+#include "base/logging.h"
+#include "base/macros.h"
+#include "testing/base/public/gunit.h"
+#include "url/gurl.h"
+#include "url/origin.h"
namespace {
-// Each test examines the Origin is constructed correctly without
-// violating DCHECKs.
-TEST(OriginTest, constructEmpty) {
- Origin origin;
- EXPECT_EQ("null", origin.string());
+TEST(OriginTest, UniqueOriginComparison) {
+ url::Origin unique_origin;
+ EXPECT_EQ("", unique_origin.scheme());
+ EXPECT_EQ("", unique_origin.host());
+ EXPECT_EQ(0, unique_origin.port());
+ EXPECT_TRUE(unique_origin.unique());
+ EXPECT_FALSE(unique_origin.IsSameOriginWith(unique_origin));
+
+ const char* const urls[] = {"data:text/html,Hello!",
+ "javascript:alert(1)",
+ "file://example.com:443/etc/passwd",
+ "yay",
+ "http::///invalid.example.com/"};
+
+ for (const auto& test_url : urls) {
+ SCOPED_TRACE(test_url);
+ GURL url(test_url);
+ url::Origin origin(url);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(unique_origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(origin.IsSameOriginWith(unique_origin));
+ }
}
-TEST(OriginTest, constructNull) {
- Origin origin("null");
- EXPECT_EQ("null", origin.string());
+TEST(OriginTest, ConstructFromGURL) {
+ url::Origin different_origin(GURL("https://not-in-the-list.test/"));
+
+ struct TestCases {
+ const char* const url;
+ const char* const expected_scheme;
+ const char* const expected_host;
+ const uint16_t expected_port;
+ } cases[] = {
+ // IP Addresses
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+
+ // Punycode
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"blob:http://☃.net/", "http", "xn--n3h.net", 80},
+
+ // Generic URLs
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"http://user:pass@example.com/", "http", "example.com", 80},
+ {"http://example.com:123/?query", "http", "example.com", 123},
+ {"https://example.com/#1234", "https", "example.com", 443},
+ {"https://u:p@example.com:123/?query#1234", "https", "example.com", 123},
+
+ // Registered URLs
+ {"ftp://example.com/", "ftp", "example.com", 21},
+ {"gopher://example.com/", "gopher", "example.com", 70},
+ {"ws://example.com/", "ws", "example.com", 80},
+ {"wss://example.com/", "wss", "example.com", 443},
+
+ // file: URLs
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+
+ // Filesystem:
+ {"filesystem:http://example.com/type/", "http", "example.com", 80},
+ {"filesystem:http://example.com:123/type/", "http", "example.com", 123},
+ {"filesystem:https://example.com/type/", "https", "example.com", 443},
+ {"filesystem:https://example.com:123/type/", "https", "example.com", 123},
+
+ // Blob:
+ {"blob:http://example.com/guid-goes-here", "http", "example.com", 80},
+ {"blob:http://example.com:123/guid-goes-here", "http", "example.com", 123},
+ {"blob:https://example.com/guid-goes-here", "https", "example.com", 443},
+ {"blob:http://u:p@example.com/guid-goes-here", "http", "example.com", 80},
+ };
+
+ for (const auto& test_case : cases) {
+ SCOPED_TRACE(test_case.url);
+ GURL url(test_case.url);
+ EXPECT_TRUE(url.is_valid());
+ url::Origin origin(url);
+ EXPECT_EQ(test_case.expected_scheme, origin.scheme());
+ EXPECT_EQ(test_case.expected_host, origin.host());
+ EXPECT_EQ(test_case.expected_port, origin.port());
+ EXPECT_FALSE(origin.unique());
+ EXPECT_TRUE(origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(different_origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(origin.IsSameOriginWith(different_origin));
+ }
}
-TEST(OriginTest, constructValidOrigin) {
- Origin origin("http://example.com:8080");
- EXPECT_EQ("http://example.com:8080", origin.string());
+TEST(OriginTest, Serialization) {
+ struct TestCases {
+ const char* const url;
+ const char* const expected;
+ } cases[] = {
+ {"http://192.168.9.1/", "http://192.168.9.1"},
+ {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+ {"http://☃.net/", "http://xn--n3h.net"},
+ {"http://example.com/", "http://example.com"},
+ {"http://example.com:123/", "http://example.com:123"},
+ {"https://example.com/", "https://example.com"},
+ {"https://example.com:123/", "https://example.com:123"},
+ {"file:///etc/passwd", "file://"},
+ {"file://example.com/etc/passwd", "file://"},
+ };
+
+ for (const auto& test_case : cases) {
+ SCOPED_TRACE(test_case.url);
+ GURL url(test_case.url);
+ EXPECT_TRUE(url.is_valid());
+ url::Origin origin(url);
+ EXPECT_EQ(test_case.expected, origin.Serialize());
+
+ // The '<<' operator should produce the same serialization as Serialize().
+ std::stringstream out;
+ out << origin;
+ EXPECT_EQ(test_case.expected, out.str());
+ }
}
-TEST(OriginTest, constructValidFileOrigin) {
- Origin origin("file://");
- EXPECT_EQ("file://", origin.string());
+TEST(OriginTest, Comparison) {
+ // These URLs are arranged in increasing order:
+ const char* const urls[] = {
+ "data:uniqueness",
+ "http://a:80",
+ "http://b:80",
+ "https://a:80",
+ "https://b:80",
+ "http://a:81",
+ "http://b:81",
+ "https://a:81",
+ "https://b:81",
+ };
+
+ for (size_t i = 0; i < arraysize(urls); i++) {
+ GURL current_url(urls[i]);
+ url::Origin current(current_url);
+ for (size_t j = i; j < arraysize(urls); j++) {
+ GURL compare_url(urls[j]);
+ url::Origin to_compare(compare_url);
+ EXPECT_EQ(i < j, current < to_compare) << i << " < " << j;
+ EXPECT_EQ(j < i, to_compare < current) << j << " < " << i;
+ }
+ }
}
-TEST(OriginTest, constructValidOriginWithoutPort) {
- Origin origin("wss://example2.com");
- EXPECT_EQ("wss://example2.com", origin.string());
+TEST(OriginTest, UnsafelyCreate) {
+ struct TestCase {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {
+ {"http", "example.com", 80},
+ {"http", "example.com", 123},
+ {"https", "example.com", 443},
+ {"https", "example.com", 123},
+ {"file", "", 0},
+ {"file", "example.com", 0},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ test.scheme, test.host, test.port);
+ EXPECT_EQ(test.scheme, origin.scheme());
+ EXPECT_EQ(test.host, origin.host());
+ EXPECT_EQ(test.port, origin.port());
+ EXPECT_FALSE(origin.unique());
+ EXPECT_TRUE(origin.IsSameOriginWith(origin));
+ }
}
-} // namespace
+TEST(OriginTest, UnsafelyCreateUniqueOnInvalidInput) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {{"", "", 0},
+ {"data", "", 0},
+ {"blob", "", 0},
+ {"filesystem", "", 0},
+ {"data", "example.com", 80},
+ {"http", "☃.net", 80},
+ {"http\nmore", "example.com", 80},
+ {"http\rmore", "example.com", 80},
+ {"http\n", "example.com", 80},
+ {"http\r", "example.com", 80},
+ {"http", "example.com\nnot-example.com", 80},
+ {"http", "example.com\rnot-example.com", 80},
+ {"http", "example.com\n", 80},
+ {"http", "example.com\r", 80},
+ {"http", "example.com", 0},
+ {"file", "", 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ test.scheme, test.host, test.port);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ }
+}
+
+TEST(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) {
+ struct TestCases {
+ const char* scheme;
+ size_t scheme_length;
+ const char* host;
+ size_t host_length;
+ uint16_t port;
+ } cases[] = {{"http\0more", 9, "example.com", 11, 80},
+ {"http\0", 5, "example.com", 11, 80},
+ {"\0http", 5, "example.com", 11, 80},
+ {"http", 4, "example.com\0not-example.com", 27, 80},
+ {"http", 4, "example.com\0", 12, 80},
+ {"http", 4, "\0example.com", 12, 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ std::string(test.scheme, test.scheme_length),
+ std::string(test.host, test.host_length), test.port);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ }
+}
} // namespace url
diff --git a/src/url/scheme_host_port.cc b/src/url/scheme_host_port.cc
new file mode 100644
index 0000000..ebc5232
--- /dev/null
+++ b/src/url/scheme_host_port.cc
@@ -0,0 +1,182 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/scheme_host_port.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <tuple>
+
+#include "base/logging.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
+
+namespace url {
+
+namespace {
+
+bool IsCanonicalHost(const base::StringPiece& host) {
+ std::string canon_host;
+
+ // Try to canonicalize the host (copy/pasted from net/base. :( ).
+ const Component raw_host_component(0,
+ static_cast<int>(host.length()));
+ StdStringCanonOutput canon_host_output(&canon_host);
+ CanonHostInfo host_info;
+ CanonicalizeHostVerbose(host.data(), raw_host_component,
+ &canon_host_output, &host_info);
+
+ if (host_info.out_host.is_nonempty() &&
+ host_info.family != CanonHostInfo::BROKEN) {
+ // Success! Assert that there's no extra garbage.
+ canon_host_output.Complete();
+ DCHECK_EQ(host_info.out_host.len, static_cast<int>(canon_host.length()));
+ } else {
+ // Empty host, or canonicalization failed.
+ canon_host.clear();
+ }
+
+ return host == canon_host;
+}
+
+bool IsValidInput(const base::StringPiece& scheme,
+ const base::StringPiece& host,
+ uint16_t port) {
+ SchemeType scheme_type = SCHEME_WITH_PORT;
+ bool is_standard = GetStandardSchemeType(
+ scheme.data(),
+ Component(0, static_cast<int>(scheme.length())),
+ &scheme_type);
+ if (!is_standard)
+ return false;
+
+ // These schemes do not follow the generic URL syntax, so we treat them as
+ // invalid (scheme, host, port) tuples (even though such URLs' _Origin_ might
+ // have a (scheme, host, port) tuple, they themselves do not).
+ if (scheme == kFileSystemScheme || scheme == kBlobScheme)
+ return false;
+
+ switch (scheme_type) {
+ case SCHEME_WITH_PORT:
+ // A URL with |scheme| is required to have the host and port (may be
+ // omitted in a serialization if it's the same as the default value).
+ // Return an invalid instance if either of them is not given.
+ if (host.empty() || port == 0)
+ return false;
+
+ if (!IsCanonicalHost(host))
+ return false;
+
+ return true;
+
+ case SCHEME_WITHOUT_PORT:
+ if (port != 0) {
+ // Return an invalid object if a URL with the scheme never represents
+ // the port data but the given |port| is non-zero.
+ return false;
+ }
+
+ if (!IsCanonicalHost(host))
+ return false;
+
+ return true;
+
+ case SCHEME_WITHOUT_AUTHORITY:
+ return false;
+
+ default:
+ DCHECK(false); // NOTREACHED();
+ return false;
+ }
+}
+
+} // namespace
+
+SchemeHostPort::SchemeHostPort() : port_(0) {
+}
+
+SchemeHostPort::SchemeHostPort(base::StringPiece scheme,
+ base::StringPiece host,
+ uint16_t port)
+ : port_(0) {
+ if (!IsValidInput(scheme, host, port))
+ return;
+
+ scheme.CopyToString(&scheme_);
+ host.CopyToString(&host_);
+ port_ = port;
+}
+
+SchemeHostPort::SchemeHostPort(const GURL& url) : port_(0) {
+ if (!url.is_valid())
+ return;
+
+ base::StringPiece scheme = url.scheme_piece();
+ base::StringPiece host = url.host_piece();
+
+ // A valid GURL never returns PORT_INVALID.
+ int port = url.EffectiveIntPort();
+ if (port == PORT_UNSPECIFIED)
+ port = 0;
+
+ if (!IsValidInput(scheme, host, port))
+ return;
+
+ scheme.CopyToString(&scheme_);
+ host.CopyToString(&host_);
+ port_ = port;
+}
+
+SchemeHostPort::~SchemeHostPort() {
+}
+
+bool SchemeHostPort::IsInvalid() const {
+ return scheme_.empty() && host_.empty() && !port_;
+}
+
+std::string SchemeHostPort::Serialize() const {
+ std::string result;
+ if (IsInvalid())
+ return result;
+
+ result.append(scheme_);
+ result.append(kStandardSchemeSeparator);
+ result.append(host_);
+
+ if (port_ == 0)
+ return result;
+
+ // Omit the port component if the port matches with the default port
+ // defined for the scheme, if any.
+ int default_port = DefaultPortForScheme(scheme_.data(),
+ static_cast<int>(scheme_.length()));
+ if (default_port == PORT_UNSPECIFIED)
+ return result;
+ if (port_ != default_port) {
+ result.push_back(':');
+ const int buf_size = 6;
+ char buf[buf_size];
+ _itoa_s(port_, buf, buf_size, 10);
+ result.append(buf);
+ }
+
+ return result;
+}
+
+bool SchemeHostPort::Equals(const SchemeHostPort& other) const {
+ return port_ == other.port() && scheme_ == other.scheme() &&
+ host_ == other.host();
+}
+
+bool SchemeHostPort::operator<(const SchemeHostPort& other) const {
+ return std::tie(port_, scheme_, host_) <
+ std::tie(other.port_, other.scheme_, other.host_);
+}
+
+} // namespace url
diff --git a/src/url/scheme_host_port.h b/src/url/scheme_host_port.h
new file mode 100644
index 0000000..47a9041
--- /dev/null
+++ b/src/url/scheme_host_port.h
@@ -0,0 +1,134 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_SCHEME_HOST_PORT_H_
+#define URL_SCHEME_HOST_PORT_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "base/strings/string_piece.h"
+#include "url/url_export.h"
+
+class GURL;
+
+namespace url {
+
+// This class represents a (scheme, host, port) tuple extracted from a URL.
+//
+// The primary purpose of this class is to represent relevant network-authority
+// information for a URL. It is _not_ an Origin, as described in RFC 6454. In
+// particular, it is generally NOT the right thing to use for security
+// decisions.
+//
+// Instead, this class is a mechanism for simplifying URLs with standard schemes
+// (that is, those which follow the generic syntax of RFC 3986) down to the
+// uniquely identifying information necessary for network fetches. This makes it
+// suitable as a cache key for a collection of active connections, for instance.
+// It may, however, be inappropriate to use as a cache key for persistent
+// storage associated with a host.
+//
+// In particular, note that:
+//
+// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax
+// (e.g. those registered with GURL as "standard schemes"). Non-standard
+// schemes such as "blob", "filesystem", "data", and "javascript" can only be
+// represented as invalid SchemeHostPort objects.
+//
+// * For example, the "file" scheme follows the standard syntax, but it is
+// important to note that the authority portion (host, port) is optional.
+// URLs without an authority portion will be represented with an empty string
+// for the host, and a port of 0 (e.g. "file:///etc/hosts" =>
+// ("file", "", 0)), and URLs with a host-only authority portion will be
+// represented with a port of 0 (e.g. "file://example.com/etc/hosts" =>
+// ("file", "example.com", 0)). See Section 3 of RFC 3986 to better understand
+// these constructs.
+//
+// * SchemeHostPort has no notion of the Origin concept (RFC 6454), and in
+// particular, it has no notion of a "unique" Origin. If you need to take
+// uniqueness into account (and, if you're making security-relevant decisions
+// then you absolutely do), please use 'url::Origin' instead.
+//
+// Usage:
+//
+// * SchemeHostPort objects are commonly created from GURL objects:
+//
+// GURL url("https://example.com/");
+// url::SchemeHostPort tuple(url);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// * Objects may also be explicitly created and compared:
+//
+// url::SchemeHostPort tuple(url::kHttpsScheme, "example.com", 443);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// GURL url("https://example.com/");
+// tuple.Equals(url::SchemeHostPort(url)); // true
+class URL_EXPORT SchemeHostPort {
+ public:
+ // Creates an invalid (scheme, host, port) tuple, which represents an invalid
+ // or non-standard URL.
+ SchemeHostPort();
+
+ // Creates a (scheme, host, port) tuple. |host| must be a canonicalized
+ // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme|
+ // must be a standard scheme. |port| must not be 0, unless |scheme| does not
+ // support ports (e.g. 'file'). In that case, |port| must be 0.
+ //
+ // Copies the data in |scheme| and |host|.
+ SchemeHostPort(base::StringPiece scheme,
+ base::StringPiece host,
+ uint16_t port);
+
+ // Creates a (scheme, host, port) tuple from |url|, as described at
+ // https://tools.ietf.org/html/rfc6454#section-4
+ //
+ // If |url| is invalid or non-standard, the result will be an invalid
+ // SchemeHostPort object.
+ explicit SchemeHostPort(const GURL& url);
+
+ ~SchemeHostPort();
+
+ // Returns the host component, in URL form. That is, all IDN domain names will
+ // be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'), and
+ // all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]").
+ const std::string& host() const { return host_; }
+ const std::string& scheme() const { return scheme_; }
+ uint16_t port() const { return port_; }
+ bool IsInvalid() const;
+
+ // Serializes the SchemeHostPort tuple to a canonical form.
+ //
+ // While this string form resembles the Origin serialization specified in
+ // Section 6.2 of RFC 6454, it is important to note that invalid
+ // SchemeHostPort tuples serialize to the empty string, rather than being
+ // serialized as a unique Origin.
+ std::string Serialize() const;
+
+ // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports
+ // are exact matches.
+ //
+ // Note that this comparison is _not_ the same as an origin-based comparison.
+ // In particular, invalid SchemeHostPort objects match each other (and
+ // themselves). Unique origins, on the other hand, would not.
+ bool Equals(const SchemeHostPort& other) const;
+
+ // Allows SchemeHostPort to be used as a key in STL (for example, a std::set
+ // or std::map).
+ bool operator<(const SchemeHostPort& other) const;
+
+ private:
+ std::string scheme_;
+ std::string host_;
+ uint16_t port_;
+};
+
+} // namespace url
+
+#endif // URL_SCHEME_HOST_PORT_H_
diff --git a/src/url/scheme_host_port_unittest.cc b/src/url/scheme_host_port_unittest.cc
new file mode 100644
index 0000000..790a5f1
--- /dev/null
+++ b/src/url/scheme_host_port_unittest.cc
@@ -0,0 +1,219 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "base/macros.h"
+#include "testing/base/public/gunit.h"
+#include "url/gurl.h"
+#include "url/scheme_host_port.h"
+
+namespace {
+
+TEST(SchemeHostPortTest, Invalid) {
+ url::SchemeHostPort invalid;
+ EXPECT_EQ("", invalid.scheme());
+ EXPECT_EQ("", invalid.host());
+ EXPECT_EQ(0, invalid.port());
+ EXPECT_TRUE(invalid.IsInvalid());
+ EXPECT_TRUE(invalid.Equals(invalid));
+
+ const char* urls[] = {"data:text/html,Hello!",
+ "javascript:alert(1)",
+ "file://example.com:443/etc/passwd",
+ "blob:https://example.com/uuid-goes-here",
+ "filesystem:https://example.com/temporary/yay.png"};
+
+ for (const auto& test : urls) {
+ SCOPED_TRACE(test);
+ GURL url(test);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ EXPECT_TRUE(tuple.Equals(invalid));
+ EXPECT_TRUE(invalid.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, ExplicitConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {
+ {"http", "example.com", 80},
+ {"http", "example.com", 123},
+ {"https", "example.com", 443},
+ {"https", "example.com", 123},
+ {"file", "", 0},
+ {"file", "example.com", 0},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, InvalidConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {{"", "", 0},
+ {"data", "", 0},
+ {"blob", "", 0},
+ {"filesystem", "", 0},
+ {"http", "", 80},
+ {"data", "example.com", 80},
+ {"http", "☃.net", 80},
+ {"http\nmore", "example.com", 80},
+ {"http\rmore", "example.com", 80},
+ {"http\n", "example.com", 80},
+ {"http\r", "example.com", 80},
+ {"http", "example.com\nnot-example.com", 80},
+ {"http", "example.com\rnot-example.com", 80},
+ {"http", "example.com\n", 80},
+ {"http", "example.com\r", 80},
+ {"http", "example.com", 0},
+ {"file", "", 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) {
+ struct TestCases {
+ const char* scheme;
+ size_t scheme_length;
+ const char* host;
+ size_t host_length;
+ uint16_t port;
+ } cases[] = {{"http\0more", 9, "example.com", 11, 80},
+ {"http\0", 5, "example.com", 11, 80},
+ {"\0http", 5, "example.com", 11, 80},
+ {"http", 4, "example.com\0not-example.com", 27, 80},
+ {"http", 4, "example.com\0", 12, 80},
+ {"http", 4, "\0example.com", 12, 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length),
+ std::string(test.host, test.host_length),
+ test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ }
+}
+
+TEST(SchemeHostPortTest, GURLConstruction) {
+ struct TestCases {
+ const char* url;
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+ {"http://u:p@example.com/", "http", "example.com", 80},
+ {"http://u:p@example.com/path", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123#hash", "http", "example.com", 80},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ EXPECT_TRUE(url.is_valid());
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, Serialization) {
+ struct TestCases {
+ const char* url;
+ const char* expected;
+ } cases[] = {
+ {"http://192.168.9.1/", "http://192.168.9.1"},
+ {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+ {"http://☃.net/", "http://xn--n3h.net"},
+ {"http://example.com/", "http://example.com"},
+ {"http://example.com:123/", "http://example.com:123"},
+ {"https://example.com/", "https://example.com"},
+ {"https://example.com:123/", "https://example.com:123"},
+ {"file:///etc/passwd", "file://"},
+ {"file://example.com/etc/passwd", "file://example.com"},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.expected, tuple.Serialize());
+ }
+}
+
+TEST(SchemeHostPortTest, Comparison) {
+ // These tuples are arranged in increasing order:
+ struct SchemeHostPorts {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } tuples[] = {
+ {"http", "a", 80},
+ {"http", "b", 80},
+ {"https", "a", 80},
+ {"https", "b", 80},
+ {"http", "a", 81},
+ {"http", "b", 81},
+ {"https", "a", 81},
+ {"https", "b", 81},
+ };
+
+ for (size_t i = 0; i < arraysize(tuples); i++) {
+ url::SchemeHostPort current(tuples[i].scheme, tuples[i].host,
+ tuples[i].port);
+ for (size_t j = i; j < arraysize(tuples); j++) {
+ url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host,
+ tuples[j].port);
+ EXPECT_EQ(i < j, current < to_compare) << i << " < " << j;
+ EXPECT_EQ(j < i, to_compare < current) << j << " < " << i;
+ }
+ }
+}
+
+} // namespace
diff --git a/src/url/third_party/mozilla/url_parse.h b/src/url/third_party/mozilla/url_parse.h
index 71dbb78..222d605 100644
--- a/src/url/third_party/mozilla/url_parse.h
+++ b/src/url/third_party/mozilla/url_parse.h
@@ -5,18 +5,11 @@
#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
-#include <string>
-
-#include "base/basictypes.h"
#include "base/strings/string16.h"
#include "url/url_export.h"
namespace url {
-// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
-// KURLGoogle.cpp still rely on this type.
-typedef base::char16 UTF16Char;
-
// Component ------------------------------------------------------------------
// Represents a substring for URL parsing.
diff --git a/src/url/url_canon.h b/src/url/url_canon.h
index 89e3509..95d5345 100644
--- a/src/url/url_canon.h
+++ b/src/url/url_canon.h
@@ -9,8 +9,8 @@
#include <string.h>
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -285,7 +285,7 @@
// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
// username/password pairs, or empty passwords, will get converted to
-// nonexistant in the canonical version.
+// nonexistent in the canonical version.
//
// The components for the username and password refer to ranges in the
// respective source strings. Usually, these will be the same string, which
@@ -317,13 +317,13 @@
// This field summarizes how the input was classified by the canonicalizer.
enum Family {
- NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
// canonicalizer is concerned, it should be treated as a
// hostname.
- BROKEN, // - Almost an IP, but was not canonicalized. This could be an
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
// IPv4 address where truncation occurred, or something
// containing the special characters :[] which did not parse
- // as an IPv6 address. Never attempt to connect to this
+ // as an IPv6 address. Never attempt to connect to this
// address, because it might actually succeed!
IPV4, // - Successfully canonicalized as an IPv4 address.
IPV6, // - Successfully canonicalized as an IPv6 address.
@@ -331,7 +331,7 @@
Family family;
// If |family| is IPV4, then this is the number of nonempty dot-separated
- // components in the input text, from 1 to 4. If |family| is not IPV4,
+ // components in the input text, from 1 to 4. If |family| is not IPV4,
// this value is undefined.
int num_ipv4_components;
@@ -355,7 +355,7 @@
// Host.
//
-// The 8-bit version requires UTF-8 encoding. Use this version when you only
+// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
URL_EXPORT bool CanonicalizeHost(const char* spec,
const Component& host,
@@ -368,7 +368,7 @@
// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
-// A successful return is indicated by host_info->family != BROKEN. See the
+// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
URL_EXPORT void CanonicalizeHostVerbose(const char* spec,
const Component& host,
@@ -554,7 +554,7 @@
CanonOutput* output,
Parsed* new_parsed);
-// Use for mailto URLs. This "canonicalizes" the url into a path and query
+// Use for mailto URLs. This "canonicalizes" the URL into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
// really intended for an external mail program, and the encoding of a page,
@@ -578,9 +578,9 @@
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
-// A Parsed structure usually goes along with this. Those
-// components identify offsets within these strings, so that they can all be
-// in the same string, or spread arbitrarily across different ones.
+// A Parsed structure usually goes along with this. Those components identify
+// offsets within these strings, so that they can all be in the same string,
+// or spread arbitrarily across different ones.
//
// This structures does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
@@ -725,7 +725,7 @@
}
bool IsRefOverridden() const { return sources_.ref != NULL; }
- // Getters for the itnernal data. See the variables below for how the
+ // Getters for the internal data. See the variables below for how the
// information is encoded.
const URLComponentSource<CHAR>& sources() const { return sources_; }
const Parsed& components() const { return components_; }
@@ -734,8 +734,8 @@
// Returns a pointer to a static empty string that is used as a placeholder
// to indicate a component should be deleted (see below).
const CHAR* Placeholder() {
- static const CHAR empty_string = 0;
- return &empty_string;
+ static const CHAR empty_cstr = 0;
+ return &empty_cstr;
}
// We support three states:
@@ -863,7 +863,7 @@
// The base URL should be canonical and have a host (may be empty for file
// URLs) and a path. If it doesn't have these, we can't resolve relative
// URLs off of it and will return the base as the output with an error flag.
-// Becausee it is canonical is should also be ASCII.
+// Because it is canonical it should also be ASCII.
//
// The query charset converter follows the same rules as CanonicalizeQuery.
//
diff --git a/src/url/url_canon_etc.cc b/src/url/url_canon_etc.cc
index 7409efd..e9da94c 100644
--- a/src/url/url_canon_etc.cc
+++ b/src/url/url_canon_etc.cc
@@ -95,9 +95,9 @@
// The output scheme starts from the current position.
out_scheme->begin = output->length();
- // Danger: it's important that this code does not strip any characters: it
- // only emits the canonical version (be it valid or escaped) of each of
- // the input characters. Stripping would put it out of sync with
+ // Danger: it's important that this code does not strip any characters;
+ // it only emits the canonical version (be it valid or escaped) for each
+ // of the input characters. Stripping would put it out of sync with
// FindAndCompareScheme, which could cause some security checks on
// schemes to be incorrect.
bool success = true;
@@ -218,7 +218,7 @@
char buf[buf_size];
WritePortInt(buf, buf_size, port_num);
- // Append the port number to the output, preceeded by a colon.
+ // Append the port number to the output, preceded by a colon.
output->push_back(':');
out_port->begin = output->length();
for (int i = 0; i < buf_size && buf[i]; i++)
diff --git a/src/url/url_canon_host.cc b/src/url/url_canon_host.cc
index 513248a..d4cdfd5 100644
--- a/src/url/url_canon_host.cc
+++ b/src/url/url_canon_host.cc
@@ -34,7 +34,7 @@
// NOTE: I didn't actually test all the control characters. Some may be
// disallowed in the input, but they are all accepted escaped except for 0.
// I also didn't test if characters affecting HTML parsing are allowed
-// unescaped, eg. (") or (#), which would indicate the beginning of the path.
+// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
// Surprisingly, space is accepted in the input and always escaped.
// This table lists the canonical version of all characters we allow in the
@@ -165,6 +165,8 @@
// Canonicalizes a host that requires IDN conversion. Returns true on success
bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
+ int original_output_len = output->length(); // So we can rewind below.
+
// We need to escape URL before doing IDN conversion, since punicode strings
// cannot be escaped after they are created.
RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
@@ -187,7 +189,26 @@
bool success = DoSimpleHost(wide_output.data(),
wide_output.length(),
output, &has_non_ascii);
- DCHECK(!has_non_ascii);
+ if (has_non_ascii) {
+ // ICU generated something that DoSimpleHost didn't think looked like
+ // ASCII. This is quite rare, but ICU might convert some characters to
+ // percent signs which might generate new escape sequences which might in
+ // turn be invalid. An example is U+FE6A "small percent" which ICU will
+ // name prep into an ASCII percent and then we can interpret the following
+ // characters as escaped characters.
+ //
+ // If DoSimpleHost didn't think the output was ASCII, just escape the
+ // thing we gave ICU and give up. DoSimpleHost will have handled a further
+ // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
+ // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
+ // do more (like handle escaped non-ASCII sequences). Handling the escaped
+ // ASCII isn't strictly necessary, but DoSimpleHost handles this case
+ // anyway so we handle it.
+ output->set_length(original_output_len);
+ AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
+ output);
+ return false;
+ }
return success;
}
@@ -316,11 +337,11 @@
}
if (!success) {
- // Canonicalization failed. Set BROKEN to notify the caller.
+ // Canonicalization failed. Set BROKEN to notify the caller.
host_info->family = CanonHostInfo::BROKEN;
} else {
// After all the other canonicalization, check if we ended up with an IP
- // address. IP addresses are small, so writing into this temporary buffer
+ // address. IP addresses are small, so writing into this temporary buffer
// should not cause an allocation.
RawCanonOutput<64> canon_ip;
CanonicalizeIPAddress(output->data(),
@@ -328,7 +349,7 @@
&canon_ip, host_info);
// If we got an IPv4/IPv6 address, copy the canonical form back to the
- // real buffer. Otherwise, it's a hostname or broken IP, in which case
+ // real buffer. Otherwise, it's a hostname or broken IP, in which case
// we just leave it in place.
if (host_info->IsIPAddress()) {
output->set_length(output_begin);
diff --git a/src/url/url_canon_icu.cc b/src/url/url_canon_icu.cc
index 60bb004..70a2b27 100644
--- a/src/url/url_canon_icu.cc
+++ b/src/url/url_canon_icu.cc
@@ -4,6 +4,7 @@
// ICU integration functions.
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -99,8 +100,10 @@
// TODO(jungshik): Change options as different parties (browsers,
// registrars, search engines) converge toward a consensus.
value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
- if (U_FAILURE(err))
+ if (U_FAILURE(err)) {
+ CHECK(false) << "failed to open UTS46 data with error: " << err;
value = NULL;
+ }
}
UIDNA* value;
diff --git a/src/url/url_canon_icu_unittest.cc b/src/url/url_canon_icu_unittest.cc
index cfa4b49..f7ce199 100644
--- a/src/url/url_canon_icu_unittest.cc
+++ b/src/url/url_canon_icu_unittest.cc
@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <stddef.h>
+
#include "base/macros.h"
#include "testing/base/public/gunit.h"
#include "third_party/icu/include/unicode/ucnv.h"
diff --git a/src/url/url_canon_internal.cc b/src/url/url_canon_internal.cc
index 1554814..a727ca2 100644
--- a/src/url/url_canon_internal.cc
+++ b/src/url/url_canon_internal.cc
@@ -5,6 +5,7 @@
#include "url/url_canon_internal.h"
#include <errno.h>
+#include <stddef.h>
#include <stdlib.h>
#include <cstdio>
@@ -249,9 +250,9 @@
bool ReadUTFChar(const char* str, int* begin, int length,
unsigned* code_point_out) {
- // This depends on ints and int32s being the same thing. If they're not, it
+ // This depends on ints and int32s being the same thing. If they're not, it
// will fail to compile.
- // TODO(mmenke): This should probably be fixed.
+ // TODO(mmenke): This should probably be fixed.
if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
!base::IsValidCharacter(*code_point_out)) {
*code_point_out = kUnicodeReplacementCharacter;
@@ -262,9 +263,9 @@
bool ReadUTFChar(const base::char16* str, int* begin, int length,
unsigned* code_point_out) {
- // This depends on ints and int32s being the same thing. If they're not, it
+ // This depends on ints and int32s being the same thing. If they're not, it
// will fail to compile.
- // TODO(mmenke): This should probably be fixed.
+ // TODO(mmenke): This should probably be fixed.
if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
!base::IsValidCharacter(*code_point_out)) {
*code_point_out = kUnicodeReplacementCharacter;
diff --git a/src/url/url_canon_internal.h b/src/url/url_canon_internal.h
index a66cd8d..ba915e8 100644
--- a/src/url/url_canon_internal.h
+++ b/src/url/url_canon_internal.h
@@ -7,9 +7,10 @@
// This file is intended to be included in another C++ file where the character
// types are defined. This allows us to write mostly generic code, but not have
-// templace bloat because everything is inlined when anybody calls any of our
+// template bloat because everything is inlined when anybody calls any of our
// functions.
+#include <stddef.h>
#include <stdlib.h>
#include "base/logging.h"
@@ -41,7 +42,7 @@
// Valid in an ASCII-representation of an octal digit.
CHAR_OCT = 32,
- // Characters that do not require escaping in encodeURIComponent. Characters
+ // Characters that do not require escaping in encodeURIComponent. Characters
// that do not have this flag will be escaped; see url_util.cc.
CHAR_COMPONENT = 64,
};
@@ -175,7 +176,7 @@
output);
Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
output);
- } else if (char_value <= 0x10FFFF) { // Max unicode code point.
+ } else if (char_value <= 0x10FFFF) { // Max Unicode code point.
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
output);
@@ -199,7 +200,7 @@
}
// Writes the given character to the output as UTF-8. This does NO checking
-// of the validity of the unicode characters; the caller should ensure that
+// of the validity of the Unicode characters; the caller should ensure that
// the value it is appending is valid to append.
inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);
@@ -207,7 +208,7 @@
// Writes the given character to the output as UTF-8, escaping ALL
// characters (even when they are ASCII). This does NO checking of the
-// validity of the unicode characters; the caller should ensure that the value
+// validity of the Unicode characters; the caller should ensure that the value
// it is appending is valid to append.
inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);
@@ -260,7 +261,7 @@
// that any following characters are.
inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin,
int length, CanonOutput* output) {
- // UTF-16 input. Readchar16 will handle invalid characters for us and give
+ // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
// us the kUnicodeReplacementCharacter, so we don't have to do special
// checking after failure, just pass through the failure to the caller.
unsigned char_value;
diff --git a/src/url/url_canon_internal_file.h b/src/url/url_canon_internal_file.h
index 6903098..26a3eae 100644
--- a/src/url/url_canon_internal_file.h
+++ b/src/url/url_canon_internal_file.h
@@ -113,15 +113,15 @@
new_parsed->path.begin = output->length();
output->push_back('/');
- // Copies and normalizes the "c:" at the beginning, if present.
+ // Copy and normalize the "c:" at the beginning, if present.
int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
parsed.path.end(), output);
- // Copies the rest of the path
+ // Copy the rest of the path.
FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
new_parsed->path.len = output->length() - new_parsed->path.begin;
- // Things following the path we can use the standard canonicalizers for.
+ // For things following the path, we can use the standard canonicalizers.
success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
source.query, parsed.query, output, &new_parsed->query);
success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
diff --git a/src/url/url_canon_ip.cc b/src/url/url_canon_ip.cc
index 45f95de..87c30c7 100644
--- a/src/url/url_canon_ip.cc
+++ b/src/url/url_canon_ip.cc
@@ -4,9 +4,10 @@
#include "url/url_canon_ip.h"
+#include <stdint.h>
#include <stdlib.h>
+#include <limits>
-#include "base/basictypes.h"
#include "base/logging.h"
#include "url/url_canon_internal.h"
@@ -92,7 +93,7 @@
template<typename CHAR>
CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
const Component& component,
- uint32* number) {
+ uint32_t* number) {
// Figure out the base
SharedCharTypes base;
int base_prefix_len = 0; // Size of the prefix for this base.
@@ -118,7 +119,7 @@
base_prefix_len++;
// Put the component, minus any base prefix, into a NULL-terminated buffer so
- // we can call the standard library. Because leading zeros have already been
+ // we can call the standard library. Because leading zeros have already been
// discarded, filling the entire buffer is guaranteed to trigger the 32-bit
// overflow check.
const int kMaxComponentLen = 16;
@@ -133,7 +134,7 @@
if (!IsCharOfType(input, base))
return CanonHostInfo::NEUTRAL;
- // Fill the buffer, if there's space remaining. This check allows us to
+ // Fill the buffer, if there's space remaining. This check allows us to
// verify that all characters are numeric, even those that don't fit.
if (dest_i < kMaxComponentLen)
buf[dest_i++] = input;
@@ -143,14 +144,14 @@
// Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
// number can overflow a 64-bit number in <= 16 characters).
- uint64 num = _strtoui64(buf, NULL, BaseForType(base));
+ uint64_t num = _strtoui64(buf, NULL, BaseForType(base));
// Check for 32-bit overflow.
- if (num > kuint32max)
+ if (num > std::numeric_limits<uint32_t>::max())
return CanonHostInfo::BROKEN;
- // No overflow. Success!
- *number = static_cast<uint32>(num);
+ // No overflow. Success!
+ *number = static_cast<uint32_t>(num);
return CanonHostInfo::IPV4;
}
@@ -167,10 +168,10 @@
// Convert existing components to digits. Values up to
// |existing_components| will be valid.
- uint32 component_values[4];
+ uint32_t component_values[4];
int existing_components = 0;
- // Set to true if one or more components are BROKEN. BROKEN is only
+ // Set to true if one or more components are BROKEN. BROKEN is only
// returned if all components are IPV4 or BROKEN, so, for example,
// 12345678912345.de returns NEUTRAL rather than broken.
bool broken = false;
@@ -198,7 +199,7 @@
// First, process all components but the last, while making sure each fits
// within an 8-bit field.
for (int i = 0; i < existing_components - 1; i++) {
- if (component_values[i] > kuint8max)
+ if (component_values[i] > std::numeric_limits<uint8_t>::max())
return CanonHostInfo::BROKEN;
address[i] = static_cast<unsigned char>(component_values[i]);
}
@@ -209,7 +210,7 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
- uint32 last_value = component_values[existing_components - 1];
+ uint32_t last_value = component_values[existing_components - 1];
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
#pragma GCC diagnostic pop
#endif
@@ -440,11 +441,12 @@
return true;
}
-// Converts a hex comonent into a number. This cannot fail since the caller has
+// Converts a hex component into a number. This cannot fail since the caller has
// already verified that each character in the string was a hex digit, and
// that there were no more than 4 characters.
-template<typename CHAR>
-uint16 IPv6HexComponentToNumber(const CHAR* spec, const Component& component) {
+template <typename CHAR>
+uint16_t IPv6HexComponentToNumber(const CHAR* spec,
+ const Component& component) {
DCHECK(component.len <= 4);
// Copy the hex string into a C-string.
@@ -455,7 +457,7 @@
// Convert it to a number (overflow is not possible, since with 4 hex
// characters we can at most have a 16 bit number).
- return static_cast<uint16>(_strtoui64(buf, NULL, 16));
+ return static_cast<uint16_t>(_strtoui64(buf, NULL, 16));
}
// Converts an IPv6 address to a 128-bit number (network byte order), returning
@@ -497,7 +499,7 @@
// Append the hex component's value.
if (i != ipv6_parsed.num_hex_components) {
// Get the 16-bit value for this hex component.
- uint16 number = IPv6HexComponentToNumber<CHAR>(
+ uint16_t number = IPv6HexComponentToNumber<CHAR>(
spec, ipv6_parsed.hex_components[i]);
// Append to |address|, in network byte order.
address[cur_index_in_address++] = (number & 0xFF00) >> 8;
@@ -576,7 +578,7 @@
}
}
- // No invalid characters. Could still be IPv4 or a hostname.
+ // No invalid characters. Could still be IPv4 or a hostname.
host_info->family = CanonHostInfo::NEUTRAL;
return false;
}
diff --git a/src/url/url_canon_ip.h b/src/url/url_canon_ip.h
index 19ecfdb..937bd46 100644
--- a/src/url/url_canon_ip.h
+++ b/src/url/url_canon_ip.h
@@ -6,9 +6,9 @@
#define URL_URL_CANON_IP_H_
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -30,14 +30,14 @@
// Not all components may exist. If there are only 3 components, for example,
// the last one will have a length of -1 or 0 to indicate it does not exist.
//
-// Note that many platform's inet_addr will ignore everything after a space
-// in certain curcumstances if the stuff before the space looks like an IP
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
// address. IE6 is included in this. We do NOT handle this case. In many cases,
// the browser's canonicalization will get run before this which converts
-// spaces to %20 (in the case of IE7) or rejects them (in the case of
-// Mozilla), so this code path never gets hit. Our host canonicalization will
-// notice these spaces and escape them, which will make IP address finding
-// fail. This seems like better behavior than stripping after a space.
+// spaces to %20 (in the case of IE7) or rejects them (in the case of Mozilla),
+// so this code path never gets hit. Our host canonicalization will notice
+// these spaces and escape them, which will make IP address finding fail. This
+// seems like better behavior than stripping after a space.
URL_EXPORT bool FindIPv4Components(const char* spec,
const Component& host,
Component components[4]);
diff --git a/src/url/url_canon_mailtourl.cc b/src/url/url_canon_mailtourl.cc
index 7c48b95..fb6bc9a 100644
--- a/src/url/url_canon_mailtourl.cc
+++ b/src/url/url_canon_mailtourl.cc
@@ -55,7 +55,7 @@
new_parsed->path.reset();
}
- // Query -- always use the default utf8 charset converter.
+ // Query -- always use the default UTF-8 charset converter.
CanonicalizeQuery(source.query, parsed.query, NULL,
output, &new_parsed->query);
diff --git a/src/url/url_canon_path.cc b/src/url/url_canon_path.cc
index ceff689..2e088a0 100644
--- a/src/url/url_canon_path.cc
+++ b/src/url/url_canon_path.cc
@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <limits.h>
+
#include "base/logging.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
@@ -162,6 +164,76 @@
output->set_length(i + 1);
}
+// Looks for problematic nested escape sequences and escapes the output as
+// needed to ensure they can't be misinterpreted.
+//
+// Our concern is that an input escape sequence that's invalid because it
+// contains nested escape sequences might look valid once those are unescaped.
+// For example, "%%300" is not a valid escape sequence, but after unescaping the
+// inner "%30" this becomes "%00" which is valid. Leaving this in the output
+// string can result in callers re-canonicalizing the string and unescaping this
+// sequence, thus resulting in something fundamentally different than the
+// original input here. This can cause a variety of problems.
+//
+// This function is called after we've just unescaped a sequence that's within
+// two output characters of a previous '%' that we know didn't begin a valid
+// escape sequence in the input string. We look for whether the output is going
+// to turn into a valid escape sequence, and if so, convert the initial '%' into
+// an escaped "%25" so the output can't be misinterpreted.
+//
+// |spec| is the input string we're canonicalizing.
+// |next_input_index| is the index of the next unprocessed character in |spec|.
+// |input_len| is the length of |spec|.
+// |last_invalid_percent_index| is the index in |output| of a previously-seen
+// '%' character. The caller knows this '%' character isn't followed by a valid
+// escape sequence in the input string.
+// |output| is the canonicalized output thus far. The caller guarantees this
+// ends with a '%' followed by one or two characters, and the '%' is the one
+// pointed to by |last_invalid_percent_index|. The last character in the string
+// was just unescaped.
+template<typename CHAR>
+void CheckForNestedEscapes(const CHAR* spec,
+ int next_input_index,
+ int input_len,
+ int last_invalid_percent_index,
+ CanonOutput* output) {
+ const int length = output->length();
+ const char last_unescaped_char = output->at(length - 1);
+
+ // If |output| currently looks like "%c", we need to try appending the next
+ // input character to see if this will result in a problematic escape
+ // sequence. Note that this won't trigger on the first nested escape of a
+ // two-escape sequence like "%%30%30" -- we'll allow the conversion to
+ // "%0%30" -- but the second nested escape will be caught by this function
+ // when it's called again in that case.
+ const bool append_next_char = last_invalid_percent_index == length - 2;
+ if (append_next_char) {
+ // If the input doesn't contain a 7-bit character next, this case won't be a
+ // problem.
+ if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80))
+ return;
+ output->push_back(static_cast<char>(spec[next_input_index]));
+ }
+
+ // Now output ends like "%cc". Try to unescape this.
+ int begin = last_invalid_percent_index;
+ unsigned char temp;
+ if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) {
+ // New escape sequence found. Overwrite the characters following the '%'
+ // with "25", and push_back() the one or two characters that were following
+ // the '%' when we were called.
+ if (!append_next_char)
+ output->push_back(output->at(last_invalid_percent_index + 1));
+ output->set(last_invalid_percent_index + 1, '2');
+ output->set(last_invalid_percent_index + 2, '5');
+ output->push_back(last_unescaped_char);
+ } else if (append_next_char) {
+ // Not a valid escape sequence, but we still need to undo appending the next
+ // source character so the caller can process it normally.
+ output->set_length(length);
+ }
+}
+
// Appends the given path to the output. It assumes that if the input path
// starts with a slash, it should be copied to the output. If no path has
// already been appended to the output (the case when not resolving
@@ -173,7 +245,7 @@
// copied to the output.
//
// We do not collapse multiple slashes in a row to a single slash. It seems
-// no web browsers do this, and we don't want incompababilities, even though
+// no web browsers do this, and we don't want incompatibilities, even though
// it would be correct for most systems.
template<typename CHAR, typename UCHAR>
bool DoPartialPath(const CHAR* spec,
@@ -182,10 +254,15 @@
CanonOutput* output) {
int end = path.end();
+ // We use this variable to minimize the amount of work done when unescaping --
+ // we'll only call CheckForNestedEscapes() when this points at one of the last
+ // couple of characters in |output|.
+ int last_invalid_percent_index = INT_MIN;
+
bool success = true;
for (int i = path.begin; i < end; i++) {
UCHAR uch = static_cast<UCHAR>(spec[i]);
- if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
+ if (sizeof(CHAR) > 1 && uch >= 0x80) {
// We only need to test wide input for having non-ASCII characters. For
// narrow input, we'll always just use the lookup table. We don't try to
// do anything tricky with decoding/validating UTF-8. This function will
@@ -200,7 +277,7 @@
// Needs special handling of some sort.
int dotlen;
if ((dotlen = IsDot(spec, i, end)) > 0) {
- // See if this dot was preceeded by a slash in the output. We
+ // See if this dot was preceded by a slash in the output. We
// assume that when canonicalizing paths, they will always
// start with a slash and not a dot, so we don't have to
// bounds check the output.
@@ -230,7 +307,7 @@
break;
}
} else {
- // This dot is not preceeded by a slash, it is just part of some
+ // This dot is not preceded by a slash, it is just part of some
// file name.
output->push_back('.');
i += dotlen - 1;
@@ -245,33 +322,40 @@
unsigned char unescaped_value;
if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
// Valid escape sequence, see if we keep, reject, or unescape it.
+ // Note that at this point DecodeEscaped() will have advanced |i| to
+ // the last character of the escape sequence.
char unescaped_flags = kPathCharLookup[unescaped_value];
if (unescaped_flags & UNESCAPE) {
- // This escaped value shouldn't be escaped, copy it.
+ // This escaped value shouldn't be escaped. Try to copy it.
output->push_back(unescaped_value);
- } else if (unescaped_flags & INVALID_BIT) {
- // Invalid escaped character, copy it and remember the error.
- output->push_back('%');
- output->push_back(static_cast<char>(spec[i - 1]));
- output->push_back(static_cast<char>(spec[i]));
- success = false;
+ // If we just unescaped a value within 2 output characters of the
+ // '%' from a previously-detected invalid escape sequence, we
+ // might have an input string with problematic nested escape
+ // sequences; detect and fix them.
+ if (last_invalid_percent_index >= (output->length() - 3)) {
+ CheckForNestedEscapes(spec, i + 1, end,
+ last_invalid_percent_index, output);
+ }
} else {
- // Valid escaped character but we should keep it escaped. We
- // don't want to change the case of any hex letters in case
- // the server is sensitive to that, so we just copy the two
- // characters without checking (DecodeEscape will have advanced
- // to the last character of the pair).
+ // Either this is an invalid escaped character, or it's a valid
+ // escaped character we should keep escaped. In the first case we
+ // should just copy it exactly and remember the error. In the
+ // second we also copy exactly in case the server is sensitive to
+ // changing the case of any hex letters.
output->push_back('%');
output->push_back(static_cast<char>(spec[i - 1]));
output->push_back(static_cast<char>(spec[i]));
+ if (unescaped_flags & INVALID_BIT)
+ success = false;
}
} else {
- // Invalid escape sequence. IE7 rejects any URLs with such
- // sequences, while Firefox, IE6, and Safari all pass it through
- // unchanged. We are more permissive unlike IE7. I don't think this
- // can cause significant problems, if it does, we should change
- // to be more like IE7.
+ // Invalid escape sequence. IE7+ rejects any URLs with such
+ // sequences, while other browsers pass them through unchanged. We
+ // use the permissive behavior.
+ // TODO(brettw): Consider testing IE's strict behavior, which would
+ // allow removing the code to handle nested escapes above.
+ last_invalid_percent_index = output->length();
output->push_back('%');
}
diff --git a/src/url/url_canon_pathurl.cc b/src/url/url_canon_pathurl.cc
index 0d23ccb..494fbda 100644
--- a/src/url/url_canon_pathurl.cc
+++ b/src/url/url_canon_pathurl.cc
@@ -14,7 +14,7 @@
namespace {
// Canonicalize the given |component| from |source| into |output| and
-// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// |new_component|. If |separator| is non-zero, it is pre-pended to |output|
// prior to the canonicalized component; i.e. for the '?' or '#' characters.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizePathComponent(const CHAR* source,
diff --git a/src/url/url_canon_query.cc b/src/url/url_canon_query.cc
index 5494ddf..bf59d10 100644
--- a/src/url/url_canon_query.cc
+++ b/src/url/url_canon_query.cc
@@ -80,7 +80,7 @@
}
// Runs the converter with the given UTF-16 input. We don't have to do
-// anything, but this overriddden function allows us to use the same code
+// anything, but this overridden function allows us to use the same code
// for both UTF-8 and UTF-16 input.
void RunConverter(const base::char16* spec,
const Component& query,
diff --git a/src/url/url_canon_relative.cc b/src/url/url_canon_relative.cc
index 9436245..e34ea2f 100644
--- a/src/url/url_canon_relative.cc
+++ b/src/url/url_canon_relative.cc
@@ -17,14 +17,14 @@
namespace {
// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
-// 379034), whereas IE is case-insensetive.
+// 379034), whereas IE is case-insensitive.
//
// We choose to be more permissive like IE. We don't need to worry about
// unescaping or anything here: neither IE or Firefox allow this. We also
// don't have to worry about invalid scheme characters since we are comparing
// against the canonical scheme of the base.
//
-// The base URL should always be canonical, therefore is ASCII.
+// The base URL should always be canonical, therefore it should be ASCII.
template<typename CHAR>
bool AreSchemesEqual(const char* base,
const Component& base_scheme,
@@ -75,6 +75,10 @@
TrimURL(url, &begin, &url_len);
if (begin >= url_len) {
// Empty URLs are relative, but do nothing.
+ if (!is_base_hierarchical) {
+ // Don't allow relative URLs if the base scheme doesn't support it.
+ return false;
+ }
*relative_component = Component(begin, 0);
*is_relative = true;
return true;
@@ -82,7 +86,7 @@
#ifdef WIN32
// We special case paths like "C:\foo" so they can link directly to the
- // file on Windows (IE compatability). The security domain stuff should
+ // file on Windows (IE compatibility). The security domain stuff should
// prevent a link like this from actually being followed if its on a
// web page.
//
@@ -91,22 +95,22 @@
// is a file and the answer will still be correct.
//
// We require strict backslashes when detecting UNC since two forward
- // shashes should be treated a a relative URL with a hostname.
+ // slashes should be treated as a relative URL with a hostname.
if (DoesBeginWindowsDriveSpec(url, begin, url_len) ||
DoesBeginUNCPath(url, begin, url_len, true))
return true;
#endif // WIN32
// See if we've got a scheme, if not, we know this is a relative URL.
- // BUT: Just because we have a scheme, doesn't make it absolute.
+ // BUT, just because we have a scheme, doesn't make it absolute.
// "http:foo.html" is a relative URL with path "foo.html". If the scheme is
- // empty, we treat it as relative (":foo") like IE does.
+ // empty, we treat it as relative (":foo"), like IE does.
Component scheme;
const bool scheme_is_empty =
!ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
if (scheme_is_empty) {
if (url[begin] == '#') {
- // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
+ // |url| is a bare fragment (e.g. "#foo"). This can be resolved against
// any base. Fall-through.
} else if (!is_base_hierarchical) {
// Don't allow relative URLs if the base scheme doesn't support it.
@@ -145,7 +149,7 @@
int colon_offset = scheme.end();
// If it's a filesystem URL, the only valid way to make it relative is not to
- // supply a scheme. There's no equivalent to e.g. http:index.html.
+ // supply a scheme. There's no equivalent to e.g. http:index.html.
if (CompareSchemeComponent(url, scheme, kFileSystemScheme))
return true;
@@ -170,8 +174,8 @@
// up until and including the last slash. There should be a slash in the
// range, if not, nothing will be copied.
//
-// The input is assumed to be canonical, so we search only for exact slashes
-// and not backslashes as well. We also know that it's ASCII.
+// For standard URLs the input should be canonical, but when resolving relative
+// URLs on a non-standard base (like "data:") the input can be anything.
void CopyToLastSlash(const char* spec,
int begin,
int end,
@@ -179,7 +183,7 @@
// Find the last slash.
int last_slash = -1;
for (int i = end - 1; i >= begin; i--) {
- if (spec[i] == '/') {
+ if (spec[i] == '/' || spec[i] == '\\') {
last_slash = i;
break;
}
@@ -394,7 +398,7 @@
query_converter, output, out_parsed);
}
-// Resolves a relative URL that happens to be an absolute file path. Examples
+// Resolves a relative URL that happens to be an absolute file path. Examples
// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
template<typename CHAR>
bool DoResolveAbsoluteFile(const CHAR* relative_url,
@@ -460,7 +464,7 @@
// how strict the UNC finder is).
//
// We also allow Windows absolute drive specs on any scheme (for example
- // "c:\foo") like IE does. There must be no preceeding slashes in this
+ // "c:\foo") like IE does. There must be no preceding slashes in this
// case (we reject anything like "/c:/foo") because that should be treated
// as a path. For file URLs, we allow any number of slashes since that would
// be setting the path.
diff --git a/src/url/url_canon_stdstring.h b/src/url/url_canon_stdstring.h
index f8a847d..aefc76a 100644
--- a/src/url/url_canon_stdstring.h
+++ b/src/url/url_canon_stdstring.h
@@ -11,6 +11,7 @@
#include <string>
+#include "base/strings/string_piece.h"
#include "url/url_canon.h"
#include "url/url_export.h"
@@ -47,35 +48,35 @@
};
// An extension of the Replacements class that allows the setters to use
-// standard strings.
+// StringPieces (implicitly allowing strings or char*s).
//
-// The strings passed as arguments are not copied and must remain valid until
-// this class goes out of scope.
+// The contents of the StringPieces are not copied and must remain valid until
+// the StringPieceReplacements object goes out of scope.
template<typename STR>
-class StdStringReplacements : public Replacements<typename STR::value_type> {
+class StringPieceReplacements : public Replacements<typename STR::value_type> {
public:
- void SetSchemeStr(const STR& s) {
+ void SetSchemeStr(const base::BasicStringPiece<STR>& s) {
this->SetScheme(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetUsernameStr(const STR& s) {
+ void SetUsernameStr(const base::BasicStringPiece<STR>& s) {
this->SetUsername(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetPasswordStr(const STR& s) {
+ void SetPasswordStr(const base::BasicStringPiece<STR>& s) {
this->SetPassword(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetHostStr(const STR& s) {
+ void SetHostStr(const base::BasicStringPiece<STR>& s) {
this->SetHost(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetPortStr(const STR& s) {
+ void SetPortStr(const base::BasicStringPiece<STR>& s) {
this->SetPort(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetPathStr(const STR& s) {
+ void SetPathStr(const base::BasicStringPiece<STR>& s) {
this->SetPath(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetQueryStr(const STR& s) {
+ void SetQueryStr(const base::BasicStringPiece<STR>& s) {
this->SetQuery(s.data(), Component(0, static_cast<int>(s.length())));
}
- void SetRefStr(const STR& s) {
+ void SetRefStr(const base::BasicStringPiece<STR>& s) {
this->SetRef(s.data(), Component(0, static_cast<int>(s.length())));
}
};
diff --git a/src/url/url_canon_stdurl.cc b/src/url/url_canon_stdurl.cc
index 7a61de8..7d1758b 100644
--- a/src/url/url_canon_stdurl.cc
+++ b/src/url/url_canon_stdurl.cc
@@ -169,7 +169,7 @@
}
// For 16-bit replacements, we turn all the replacements into UTF-8 so the
-// regular codepath can be used.
+// regular code path can be used.
bool ReplaceStandardURL(const char* base,
const Parsed& base_parsed,
const Replacements<base::char16>& replacements,
diff --git a/src/url/url_canon_unittest.cc b/src/url/url_canon_unittest.cc
index 1917cc9..3dd617d 100644
--- a/src/url/url_canon_unittest.cc
+++ b/src/url/url_canon_unittest.cc
@@ -3,13 +3,14 @@
// found in the LICENSE file.
#include <errno.h>
+#include <stddef.h>
#include "base/macros.h"
#include "testing/base/public/gunit.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
-#include "url/url_parse.h"
#include "url/url_test_utils.h"
namespace url {
@@ -38,7 +39,7 @@
bool expected_success;
};
-// Test cases for CanonicalizeIPAddress(). The inputs are identical to
+// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields.
struct IPAddressCase {
const char* input8;
@@ -127,7 +128,7 @@
#if defined(GTEST_HAS_DEATH_TEST)
// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
-// cause the Chromium stacktrace dialog to appear and hang the test.
+// cause the Chromium stack trace dialog to appear and hang the test.
// See http://crbug.com/49580.
#if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON)
#define MAYBE_DoAppendUTF8Invalid DoAppendUTF8Invalid
@@ -157,10 +158,10 @@
} utf_cases[] = {
// Valid canonical input should get passed through & escaped.
{"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
- // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
+ // Test a character that takes > 16 bits (U+10300 = old italic letter A)
{"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
- // Non-shortest-form UTF-8 are invalid. The bad char should be replaced
- // with the invalid character (EF BF DB in UTF-8).
+ // Non-shortest-form UTF-8 characters are invalid. The bad character
+ // should be replaced with the invalid character (EF BF BD in UTF-8).
{"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"},
// Invalid UTF-8 sequences should be marked as invalid (the first
// sequence is truncated).
@@ -259,7 +260,7 @@
EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
- // Now try the wide version
+ // Now try the wide version.
out_str.clear();
StdStringCanonOutput output2(&out_str);
@@ -275,7 +276,7 @@
EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
}
- // Test the case where the scheme is declared nonexistant, it should be
+ // Test the case where the scheme is declared nonexistent, it should be
// converted into an empty scheme.
Component out_comp;
out_str.clear();
@@ -321,6 +322,17 @@
// ...%00 in fullwidth should fail (also as escaped UTF-8 input)
{"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
{"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ // ICU will convert weird percents into ASCII percents, but not unescape
+ // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
+ // "small percent". At this point we should be within our rights to mark
+ // anything as invalid since the URL is corrupt or malicious. The code
+ // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
+ // and kept as valid, so we validate that behavior here, but this level
+ // of fixing the input shouldn't be seen as required. "%81" is invalid.
+ {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
{"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// See http://unicode.org/cldr/utility/idna.jsp for other
@@ -390,6 +402,13 @@
// (added in Unicode 4.1). UTS 46 table 4 row (k)
{"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com",
Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ // Maps U+FF43 (Full Width Small Letter C) to 'c'.
+ {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz",
+ Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
+ // Maps U+1D68C (Math Monospace Small C) to 'c'.
+ // U+1D68C = \xD835\xDE8C in UTF-16
+ {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
+ Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
// BiDi check test
// "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
// Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
@@ -638,7 +657,7 @@
{"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
{"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
{"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
- // Old trunctations tests. They're all "BROKEN" now.
+ // Old truncation tests. They're all "BROKEN" now.
{"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
@@ -754,16 +773,17 @@
{"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
- // Can only have one "::" contraction in an IPv6 string literal.
+ // Can only have one "::" contraction in an IPv6 string literal.
{"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // No more than 2 consecutive ':'s.
+ // No more than 2 consecutive ':'s.
{"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // Non-IP addresses due to invalid characters.
+ // Non-IP addresses due to invalid characters.
{"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // If there are not enough components, the last one should fill them out.
+ // If there are not enough components, the last one should fill them out.
// ... omitted at this time ...
- // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses.
+ // Too many components means not an IP address. Similarly, with too few
+ // if using IPv4 compat or mapped addresses.
{"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
@@ -887,7 +907,7 @@
{"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
{"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true },
- // IE7 compatability: old versions allowed backslashes in usernames, but
+ // IE7 compatibility: old versions allowed backslashes in usernames, but
// IE7 does not. We disallow it as well.
{"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true},
};
@@ -943,7 +963,7 @@
// buffer. The parser unit tests will test scanning the number correctly.
//
// Note that the CanonicalizePort will always prepend a colon to the output
- // to separate it from the colon that it assumes preceeds it.
+ // to separate it from the colon that it assumes precedes it.
struct PortCase {
const char* input;
int default_port;
@@ -1059,6 +1079,21 @@
{"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
// @ should be passed through unchanged (escaped or unescaped).
{"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
+ // Nested escape sequences should result in escaping the leading '%' if
+ // unescaping would result in a new escape sequence.
+ {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true},
+ {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true},
+ {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true},
+ // Make sure truncated "nested" escapes don't result in reading off the
+ // string end.
+ {"/%%41", L"/%%41", "/%A", Component(0, 3), true},
+ // Don't unescape the leading '%' if unescaping doesn't result in a valid
+ // new escape sequence.
+ {"/%%470", L"/%%470", "/%G0", Component(0, 4), true},
+ {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true},
+ // Don't erroneously downcast a UTF-16 character in a way that makes it
+ // look like part of an escape sequence.
+ {NULL, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true},
// ----- encoding tests -----
// Basic conversions
@@ -1300,6 +1335,13 @@
{"wss://foo:81/", "wss://foo:81/", true},
{"wss://foo:443/", "wss://foo/", true},
{"wss://foo:815/", "wss://foo:815/", true},
+
+ // This particular code path ends up "backing up" to replace an invalid
+ // host ICU generated with an escaped version. Test that in the context
+ // of a full URL to make sure the backing up doesn't mess up the non-host
+ // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
+ // ICU will convert to an ASCII one, generating "%81".
+ {"ws:)W\x1eW\xef\xb9\xaa""81:80/", "ws://%29w%1ew%81/", false},
};
for (size_t i = 0; i < arraysize(cases); i++) {
@@ -1329,7 +1371,7 @@
{"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"},
// Replace nothing
{"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
- // Replace scheme with filesystem. The result is garbage, but you asked
+ // Replace scheme with filesystem. The result is garbage, but you asked
// for it.
{"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"},
};
@@ -1594,7 +1636,7 @@
{"file:", "file:///", true, Component(), Component(7, 1)},
{"file:UNChost/path", "file://unchost/path", true, Component(7, 7), Component(14, 5)},
// CanonicalizeFileURL supports absolute Windows style paths for IE
- // compatability. Note that the caller must decide that this is a file
+ // compatibility. Note that the caller must decide that this is a file
// URL itself so it can call the file canonicalizer. This is usually
// done automatically as part of relative URL resolving.
{"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
@@ -1605,7 +1647,7 @@
{"\\\\server\\file", "file://server/file", true, Component(7, 6), Component(13, 5)},
{"/\\server/file", "file://server/file", true, Component(7, 6), Component(13, 5)},
// We should preserve the number of slashes after the colon for IE
- // compatability, except when there is none, in which case we should
+ // compatibility, except when there is none, in which case we should
// add one.
{"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), Component(7, 16)},
{"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, Component(), Component(7, 19)},
@@ -1807,7 +1849,7 @@
TEST(URLCanonTest, _itoa_s) {
// We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
+ // null-terminated. We also allocate one byte more than what we tell
// _itoa_s about, and ensure that the extra byte is untouched.
char buf[6];
memset(buf, 0xff, sizeof(buf));
@@ -1846,7 +1888,7 @@
TEST(URLCanonTest, _itow_s) {
// We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
+ // null-terminated. We also allocate one byte more than what we tell
// _itoa_s about, and ensure that the extra byte is untouched.
base::char16 buf[6];
const char fill_mem = 0xff;
@@ -1956,6 +1998,8 @@
// Non-hierarchical base: absolute input should succeed.
{"data:foobar", false, false, "http://host/", true, false, false, NULL},
{"data:foobar", false, false, "http:host", true, false, false, NULL},
+ // Non-hierarchical base: empty URL should give error.
+ {"data:foobar", false, false, "", false, false, false, NULL},
// Invalid schemes should be treated as relative.
{"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
{"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
@@ -2022,7 +2066,7 @@
// which is what is required.
{"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"},
// Filesystem URL tests; filesystem URLs are only valid and relative if
- // they have no scheme, e.g. "./index.html". There's no valid equivalent
+ // they have no scheme, e.g. "./index.html". There's no valid equivalent
// to http:index.html.
{"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
{"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL},
@@ -2090,10 +2134,10 @@
}
}
-// It used to be when we did a replacement with a long buffer of UTF-16
-// characters, we would get invalid data in the URL. This is because the buffer
-// it used to hold the UTF-8 data was resized, while some pointers were still
-// kept to the old buffer that was removed.
+// It used to be the case that when we did a replacement with a long buffer of
+// UTF-16 characters, we would get invalid data in the URL. This is because the
+// buffer that it used to hold the UTF-8 data was resized, while some pointers
+// were still kept to the old buffer that was removed.
TEST(URLCanonTest, ReplacementOverflow) {
const char src[] = "file:///C:/foo/bar";
int src_len = static_cast<int>(strlen(src));
@@ -2101,7 +2145,7 @@
ParseFileURL(src, src_len, &parsed);
// Override two components, the path with something short, and the query with
- // sonething long enough to trigger the bug.
+ // something long enough to trigger the bug.
Replacements<base::char16> repl;
base::string16 new_query;
for (int i = 0; i < 4800; i++)
diff --git a/src/url/url_constants.cc b/src/url/url_constants.cc
index 2dc1478..0388fbc 100644
--- a/src/url/url_constants.cc
+++ b/src/url/url_constants.cc
@@ -25,4 +25,6 @@
const char kStandardSchemeSeparator[] = "://";
+const size_t kMaxURLChars = 2 * 1024 * 1024;
+
} // namespace url
diff --git a/src/url/url_constants.h b/src/url/url_constants.h
index c48dafc..fa71164 100644
--- a/src/url/url_constants.h
+++ b/src/url/url_constants.h
@@ -5,6 +5,8 @@
#ifndef URL_URL_CONSTANTS_H_
#define URL_URL_CONSTANTS_H_
+#include <stddef.h>
+
#include "url/url_export.h"
namespace url {
@@ -30,6 +32,8 @@
// Used to separate a standard scheme and the hostname: "://".
URL_EXPORT extern const char kStandardSchemeSeparator[];
+URL_EXPORT extern const size_t kMaxURLChars;
+
} // namespace url
#endif // URL_URL_CONSTANTS_H_
diff --git a/src/url/url_parse.h b/src/url/url_parse.h
deleted file mode 100644
index 3b9c546..0000000
--- a/src/url/url_parse.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright 2013 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef URL_URL_PARSE_H_
-#define URL_URL_PARSE_H_
-
-// TODO(tfarina): Remove this file when the callers are updated.
-#include "url/third_party/mozilla/url_parse.h"
-
-#endif // URL_URL_PARSE_H_
diff --git a/src/url/url_parse_file.cc b/src/url/url_parse_file.cc
index c08ddc6..fcbb12d 100644
--- a/src/url/url_parse_file.cc
+++ b/src/url/url_parse_file.cc
@@ -3,8 +3,8 @@
// found in the LICENSE file.
#include "base/logging.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_file.h"
-#include "url/url_parse.h"
#include "url/url_parse_internal.h"
// Interesting IE file:isms...
diff --git a/src/url/url_parse_internal.h b/src/url/url_parse_internal.h
index 4070b7e..7630878 100644
--- a/src/url/url_parse_internal.h
+++ b/src/url/url_parse_internal.h
@@ -7,11 +7,11 @@
// Contains common inline helper functions used by the URL parsing routines.
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
namespace url {
-// We treat slashes and backslashes the same for IE compatability.
+// We treat slashes and backslashes the same for IE compatibility.
inline bool IsURLSlash(base::char16 ch) {
return ch == '/' || ch == '\\';
}
diff --git a/src/url/url_parse_unittest.cc b/src/url/url_parse_unittest.cc
index dedd663..c0d5960 100644
--- a/src/url/url_parse_unittest.cc
+++ b/src/url/url_parse_unittest.cc
@@ -2,11 +2,13 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
+
+#include <stddef.h>
#include "base/macros.h"
#include "testing/base/public/gunit.h"
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
// Interesting IE file:isms...
//
@@ -90,13 +92,13 @@
bool ComponentMatches(const char* input,
const char* reference,
const Component& component) {
- // If the component is nonexistant (length == -1), it should begin at 0.
+ // If the component is nonexistent (length == -1), it should begin at 0.
EXPECT_TRUE(component.len >= 0 || component.len == -1);
// Begin should be valid.
EXPECT_LE(0, component.begin);
- // A NULL reference means the component should be nonexistant.
+ // A NULL reference means the component should be nonexistent.
if (!reference)
return component.len == -1;
if (component.len < 0)
@@ -345,7 +347,7 @@
TEST(URLParser, PathURL) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(path_cases); i++) {
const char* url = path_cases[i].input;
@@ -356,7 +358,7 @@
EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent()))
<< i;
- // The remaining components are never used for path urls.
+ // The remaining components are never used for path URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.host);
@@ -537,7 +539,7 @@
Component key, value;
if (!ExtractQueryKeyValue(url, &query, &key, &value)) {
if (parameter >= i && !expected_key)
- return true; // Expected nonexistant key, got one.
+ return true; // Expected nonexistent key, got one.
return false; // Not enough keys.
}
@@ -613,7 +615,7 @@
TEST(URLParser, MailtoUrl) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(mailto_cases); ++i) {
const char* url = mailto_cases[i].input;
@@ -625,7 +627,7 @@
EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query));
EXPECT_EQ(PORT_UNSPECIFIED, port);
- // The remaining components are never used for mailto urls.
+ // The remaining components are never used for mailto URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.port);
@@ -645,7 +647,7 @@
TEST(URLParser, FileSystemURL) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(filesystem_cases); i++) {
const FileSystemURLParseCase* parsecase = &filesystem_cases[i];
@@ -667,7 +669,7 @@
int port = ParsePort(url, parsed.inner_parsed()->port);
EXPECT_EQ(parsecase->inner_port, port);
- // The remaining components are never used for filesystem urls.
+ // The remaining components are never used for filesystem URLs.
ExpectInvalidComponent(parsed.inner_parsed()->query);
ExpectInvalidComponent(parsed.inner_parsed()->ref);
}
@@ -676,7 +678,7 @@
EXPECT_TRUE(ComponentMatches(url, parsecase->query, parsed.query));
EXPECT_TRUE(ComponentMatches(url, parsecase->ref, parsed.ref));
- // The remaining components are never used for filesystem urls.
+ // The remaining components are never used for filesystem URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.host);
diff --git a/src/url/url_test_utils.h b/src/url/url_test_utils.h
index 6e66e85..174e5e0 100644
--- a/src/url/url_test_utils.h
+++ b/src/url/url_test_utils.h
@@ -19,7 +19,7 @@
namespace test_utils {
// Converts a UTF-16 string from native wchar_t format to char16, by
-// truncating the high 32 bits. This is not meant to handle true UTF-32
+// truncating the high 32 bits. This is not meant to handle true UTF-32
// encoded strings.
inline base::string16 WStringToUTF16(const wchar_t* src) {
base::string16 str;
@@ -30,7 +30,7 @@
return str;
}
-// Converts a string from UTF-8 to UTF-16
+// Converts a string from UTF-8 to UTF-16.
inline base::string16 ConvertUTF8ToUTF16(const std::string& src) {
int length = static_cast<int>(src.length());
EXPECT_LT(length, 1024);
@@ -39,7 +39,7 @@
return base::string16(output.data(), output.length());
}
-// Converts a string from UTF-16 to UTF-8
+// Converts a string from UTF-16 to UTF-8.
inline std::string ConvertUTF16ToUTF8(const base::string16& src) {
std::string str;
StdStringCanonOutput output(&str);
diff --git a/src/url/url_util.cc b/src/url/url_util.cc
index f4246e9..bb43a4a 100644
--- a/src/url/url_util.cc
+++ b/src/url/url_util.cc
@@ -4,6 +4,7 @@
#include "url/url_util.h"
+#include <stddef.h>
#include <string.h>
#include <vector>
@@ -11,6 +12,7 @@
#include "base/heap-checker.h"
#endif
#include "base/logging.h"
+#include "base/strings/string_util.h"
#include "url/url_canon_internal.h"
#include "url/url_file.h"
#include "url/url_util_internal.h"
@@ -19,51 +21,69 @@
namespace {
-// ASCII-specific tolower. The standard library's tolower is locale sensitive,
-// so we don't want to use it here.
-template<class Char>
-inline Char ToLowerASCII(Char c) {
- return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
-}
-
-// Backend for LowerCaseEqualsASCII.
-template<typename Iter>
-inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
- for (Iter it = a_begin; it != a_end; ++it, ++b) {
- if (!*b || ToLowerASCII(*it) != *b)
- return false;
- }
- return *b == 0;
-}
-
const int kNumStandardURLSchemes = 8;
-const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
- kHttpScheme,
- kHttpsScheme,
- kFileScheme, // Yes, file urls can have a hostname!
- kFtpScheme,
- kGopherScheme,
- kWsScheme, // WebSocket.
- kWssScheme, // WebSocket secure.
- kFileSystemScheme,
+const SchemeWithType kStandardURLSchemes[kNumStandardURLSchemes] = {
+ {kHttpScheme, SCHEME_WITH_PORT},
+ {kHttpsScheme, SCHEME_WITH_PORT},
+ // Yes, file URLs can have a hostname, so file URLs should be handled as
+ // "standard". File URLs never have a port as specified by the SchemeType
+ // field.
+ {kFileScheme, SCHEME_WITHOUT_PORT},
+ {kFtpScheme, SCHEME_WITH_PORT},
+ {kGopherScheme, SCHEME_WITH_PORT},
+ {kWsScheme, SCHEME_WITH_PORT}, // WebSocket.
+ {kWssScheme, SCHEME_WITH_PORT}, // WebSocket secure.
+ {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY},
};
-// List of the currently installed standard schemes. This list is lazily
-// initialized by InitStandardSchemes and is leaked on shutdown to prevent
-// any destructors from being called that will slow us down or cause problems.
-std::vector<const char*>* standard_schemes = NULL;
+const int kNumReferrerURLSchemes = 2;
+const SchemeWithType kReferrerURLSchemes[kNumReferrerURLSchemes] = {
+ {kHttpScheme, SCHEME_WITH_PORT},
+ {kHttpsScheme, SCHEME_WITH_PORT},
+};
-// See the LockStandardSchemes declaration in the header.
-bool standard_schemes_locked = false;
+// Lists of the currently installed standard and referrer schemes. These lists
+// are lazily initialized by InitStandardSchemes and InitReferrerSchemes and are
+// leaked on shutdown to prevent any destructors from being called that will
+// slow us down or cause problems.
+std::vector<SchemeWithType>* standard_schemes = nullptr;
+std::vector<SchemeWithType>* referrer_schemes = nullptr;
-// Ensures that the standard_schemes list is initialized, does nothing if it
-// already has values.
-void InitStandardSchemes() {
- if (standard_schemes)
+// See the LockSchemeRegistries declaration in the header.
+bool scheme_registries_locked = false;
+
+// This template converts a given character type to the corresponding
+// StringPiece type.
+template<typename CHAR> struct CharToStringPiece {
+};
+template<> struct CharToStringPiece<char> {
+ typedef base::StringPiece Piece;
+};
+template<> struct CharToStringPiece<base::char16> {
+ typedef base::StringPiece16 Piece;
+};
+
+void InitSchemes(std::vector<SchemeWithType>** schemes,
+ const SchemeWithType* initial_schemes,
+ size_t size) {
+ if (*schemes)
return;
- standard_schemes = new std::vector<const char*>;
- for (int i = 0; i < kNumStandardURLSchemes; i++)
- standard_schemes->push_back(kStandardURLSchemes[i]);
+ // Build the list straight from the array: sizing the vector to |size| and
+ // then push_back-ing would prepend |size| empty, null-scheme entries.
+ *schemes = new std::vector<SchemeWithType>(initial_schemes,
+ initial_schemes + size);
+}
+
+// Ensures that the standard_schemes list is initialized, does nothing if
+// it already has values.
+void InitStandardSchemes() {
+ InitSchemes(&standard_schemes, kStandardURLSchemes, kNumStandardURLSchemes);
+}
+
+// Ensures that the referrer_schemes list is initialized, does nothing if
+// it already has values.
+void InitReferrerSchemes() {
+ InitSchemes(&referrer_schemes, kReferrerURLSchemes, kNumReferrerURLSchemes);
}
// Given a string and a range inside the string, compares it to the given
@@ -74,28 +94,41 @@
const char* compare_to) {
if (!component.is_nonempty())
return compare_to[0] == 0; // When component is empty, match empty scheme.
- return LowerCaseEqualsASCII(&spec[component.begin],
- &spec[component.end()],
- compare_to);
+ return base::LowerCaseEqualsASCII(
+ typename CharToStringPiece<CHAR>::Piece(
+ &spec[component.begin], component.len),
+ compare_to);
}
-// Returns true if the given scheme identified by |scheme| within |spec| is one
-// of the registered "standard" schemes.
+// Returns true and sets |type| to the SchemeType of the scheme identified by
+// |scheme| within |spec| if it is in |schemes|; returns false otherwise.
template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, const Component& scheme) {
+bool DoIsInSchemes(const CHAR* spec,
+ const Component& scheme,
+ SchemeType* type,
+ const std::vector<SchemeWithType>& schemes) {
if (!scheme.is_nonempty())
return false; // Empty or invalid schemes are non-standard.
- InitStandardSchemes();
- for (size_t i = 0; i < standard_schemes->size(); i++) {
- if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
- standard_schemes->at(i)))
+ for (const SchemeWithType& scheme_with_type : schemes) {
+ if (base::LowerCaseEqualsASCII(typename CharToStringPiece<CHAR>::Piece(
+ &spec[scheme.begin], scheme.len),
+ scheme_with_type.scheme)) {
+ *type = scheme_with_type.type; // |type| is written only on a match.
return true;
+ }
}
return false;
}
template<typename CHAR>
+bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) {
+ InitStandardSchemes();
+ return DoIsInSchemes(spec, scheme, type, *standard_schemes);
+}
+
+
+template<typename CHAR>
bool DoFindAndCompareScheme(const CHAR* str,
int str_len,
const char* compare,
@@ -136,7 +169,7 @@
Parsed parsed_input;
#ifdef WIN32
// For Windows, we allow things that look like absolute Windows paths to be
- // fixed up magically to file URLs. This is done for IE compatability. For
+ // fixed up magically to file URLs. This is done for IE compatibility. For
// example, this will change "c:/foo" into a file URL rather than treating
// it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
// There is similar logic in url_canon_relative.cc for
@@ -160,6 +193,7 @@
// This is the parsed version of the input URL, we have to canonicalize it
// before storing it in our object.
bool success;
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT;
if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
// File URLs are special.
ParseFileURL(spec, spec_len, &parsed_input);
@@ -172,20 +206,21 @@
charset_converter, output,
output_parsed);
- } else if (DoIsStandard(spec, scheme)) {
+ } else if (DoIsStandard(spec, scheme, &unused_scheme_type)) {
// All "normal" URLs.
ParseStandardURL(spec, spec_len, &parsed_input);
success = CanonicalizeStandardURL(spec, spec_len, parsed_input,
charset_converter, output, output_parsed);
} else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
- // Mailto are treated like a standard url with only a scheme, path, query
+ // Mailto URLs are treated like standard URLs, with only a scheme, path,
+ // and query.
ParseMailtoURL(spec, spec_len, &parsed_input);
success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
output_parsed);
} else {
- // "Weird" URLs like data: and javascript:
+ // "Weird" URLs like data: and javascript:.
ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
output_parsed);
@@ -220,9 +255,10 @@
base_is_hierarchical = num_slashes > 0;
}
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT;
bool standard_base_scheme =
base_parsed.scheme.is_nonempty() &&
- DoIsStandard(base_spec, base_parsed.scheme);
+ DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type);
bool is_relative;
Component relative_component;
@@ -275,7 +311,7 @@
CanonOutput* output,
Parsed* out_parsed) {
// If the scheme is overridden, just do a simple string substitution and
- // reparse the whole thing. There are lots of edge cases that we really don't
+ // re-parse the whole thing. There are lots of edge cases that we really don't
// want to deal with. Like what happens if I replace "http://e:8080/foo"
// with a file. Does it become "file:///E:/8080/foo" where the port number
// becomes part of the path? Parsing that string as a file URL says "yes"
@@ -322,7 +358,7 @@
// getting replaced here. If ReplaceComponents didn't re-check everything,
// we wouldn't know if something *not* getting replaced is a problem.
// If the scheme-specific replacers are made more intelligent so they don't
- // re-check everything, we should instead recanonicalize the whole thing
+ // re-check everything, we should instead re-canonicalize the whole thing
// after this call to check validity (this assumes replacing the scheme is
// much much less common than other types of replacements, like clearing the
// ref).
@@ -343,7 +379,8 @@
return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
output, out_parsed);
}
- if (DoIsStandard(spec, parsed.scheme)) {
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT;
+ if (DoIsStandard(spec, parsed.scheme, &unused_scheme_type)) {
return ReplaceStandardURL(spec, parsed, replacements, charset_converter,
output, out_parsed);
}
@@ -355,36 +392,25 @@
return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
}
-} // namespace
-
-void Initialize() {
- InitStandardSchemes();
-}
-
-void Shutdown() {
- if (standard_schemes) {
- delete standard_schemes;
- standard_schemes = NULL;
- }
-}
-
-void AddStandardScheme(const char* new_scheme) {
- // If this assert triggers, it means you've called AddStandardScheme after
- // LockStandardSchemes have been called (see the header file for
- // LockStandardSchemes for more).
+void DoAddScheme(const char* new_scheme,
+ SchemeType type,
+ std::vector<SchemeWithType>* schemes) {
+ DCHECK(schemes);
+ // If this assert triggers, it means you've called Add*Scheme after
+ // LockSchemeRegistries has been called (see the header file for
+ // LockSchemeRegistries for more).
//
- // This normally means you're trying to set up a new standard scheme too late
- // in your application's init process. Locate where your app does this
- // initialization and calls LockStandardScheme, and add your new standard
- // scheme there.
- DCHECK(!standard_schemes_locked) <<
- "Trying to add a standard scheme after the list has been locked.";
+ // This normally means you're trying to set up a new scheme too late in your
+ // application's init process. Locate where your app does this initialization
+ // and calls LockSchemeRegistries, and add your new scheme there.
+ DCHECK(!scheme_registries_locked)
+ << "Trying to add a scheme after the lists have been locked.";
size_t scheme_len = strlen(new_scheme);
if (scheme_len == 0)
return;
- // Dulicate the scheme into a new buffer and add it to the list of standard
+ // Duplicate the scheme into a new buffer and add it to the given list of
// schemes. This pointer will be leaked on shutdown.
char* dup_scheme = new char[scheme_len + 1];
#ifdef GOOGLEURL_IN_GOOGLE3
@@ -392,20 +418,64 @@
#endif
memcpy(dup_scheme, new_scheme, scheme_len + 1);
- InitStandardSchemes();
- standard_schemes->push_back(dup_scheme);
+ SchemeWithType scheme_with_type;
+ scheme_with_type.scheme = dup_scheme;
+ scheme_with_type.type = type;
+ schemes->push_back(scheme_with_type);
}
-void LockStandardSchemes() {
- standard_schemes_locked = true;
+} // namespace
+
+void Initialize() {
+ InitStandardSchemes();
+ InitReferrerSchemes();
+}
+
+void Shutdown() {
+ if (standard_schemes) {
+ delete standard_schemes;
+ standard_schemes = nullptr; // Lets InitStandardSchemes() rebuild the list.
+ }
+ if (referrer_schemes) {
+ delete referrer_schemes;
+ referrer_schemes = nullptr; // Lets InitReferrerSchemes() rebuild the list.
+ }
+}
+
+void AddStandardScheme(const char* new_scheme, SchemeType type) {
+ InitStandardSchemes();
+ DoAddScheme(new_scheme, type, standard_schemes);
+}
+
+void AddReferrerScheme(const char* new_scheme, SchemeType type) {
+ InitReferrerSchemes();
+ DoAddScheme(new_scheme, type, referrer_schemes);
+}
+
+void LockSchemeRegistries() {
+ scheme_registries_locked = true;
}
bool IsStandard(const char* spec, const Component& scheme) {
- return DoIsStandard(spec, scheme);
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT; // Out-param is discarded.
+ return DoIsStandard(spec, scheme, &unused_scheme_type);
+}
+
+bool GetStandardSchemeType(const char* spec,
+ const Component& scheme,
+ SchemeType* type) {
+ return DoIsStandard(spec, scheme, type);
}
bool IsStandard(const base::char16* spec, const Component& scheme) {
- return DoIsStandard(spec, scheme);
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT; // Out-param is discarded.
+ return DoIsStandard(spec, scheme, &unused_scheme_type);
+}
+
+bool IsReferrerScheme(const char* spec, const Component& scheme) {
+ InitReferrerSchemes();
+ SchemeType unused_scheme_type = SCHEME_WITH_PORT; // Out-param is discarded.
+ return DoIsInSchemes(spec, scheme, &unused_scheme_type, *referrer_schemes);
}
bool FindAndCompareScheme(const char* str,
@@ -490,31 +560,6 @@
charset_converter, output, out_parsed);
}
-// Front-ends for LowerCaseEqualsASCII.
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end) {
- while (a_begin != a_end && b_begin != b_end &&
- ToLowerASCII(*a_begin) == *b_begin) {
- a_begin++;
- b_begin++;
- }
- return a_begin == a_end && b_begin == b_end;
-}
-
-bool LowerCaseEqualsASCII(const base::char16* a_begin,
- const base::char16* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
void DecodeURLEscapeSequences(const char* input,
int length,
CanonOutputW* output) {
diff --git a/src/url/url_util.h b/src/url/url_util.h
index 458d1e8..a209a61 100644
--- a/src/url/url_util.h
+++ b/src/url/url_util.h
@@ -8,10 +8,10 @@
#include <string>
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_constants.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -20,14 +20,13 @@
// Initialization is NOT required, it will be implicitly initialized when first
// used. However, this implicit initialization is NOT threadsafe. If you are
// using this library in a threaded environment and don't have a consistent
-// "first call" (an example might be calling "AddStandardScheme" with your
-// special application-specific schemes) then you will want to call initialize
-// before spawning any threads.
+// "first call" (an example might be calling Add*Scheme with your special
+// application-specific schemes) then you will want to call initialize before
+// spawning any threads.
//
-// It is OK to call this function more than once, subsequent calls will simply
-// "noop", unless Shutdown() was called in the mean time. This will also be a
-// "noop" if other calls to the library have forced an initialization
-// beforehand.
+// It is OK to call this function more than once, subsequent calls will be
+// no-ops, unless Shutdown was called in the meantime. This will also be a
+// no-op if other calls to the library have forced an initialization beforehand.
URL_EXPORT void Initialize();
// Cleanup is not required, except some strings may leak. For most user
@@ -38,25 +37,57 @@
// Schemes --------------------------------------------------------------------
-// Adds an application-defined scheme to the internal list of "standard" URL
-// schemes. This function is not threadsafe and can not be called concurrently
-// with any other url_util function. It will assert if the list of standard
-// schemes has been locked (see LockStandardSchemes).
-URL_EXPORT void AddStandardScheme(const char* new_scheme);
+// Types of a scheme representing the requirements on the data represented by
+// the authority component of a URL with the scheme.
+enum SchemeType {
+ // The authority component of a URL with the scheme, if any, has the port
+ // (the default values may be omitted in a serialization).
+ SCHEME_WITH_PORT,
+ // The authority component of a URL with the scheme, if any, doesn't have a
+ // port.
+ SCHEME_WITHOUT_PORT,
+ // A URL with the scheme doesn't have the authority component.
+ SCHEME_WITHOUT_AUTHORITY,
+};
-// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
+// A pair for representing a standard scheme name and the SchemeType for it.
+struct URL_EXPORT SchemeWithType {
+ const char* scheme;
+ SchemeType type;
+};
+
+// Adds an application-defined scheme to the internal list of "standard-format"
+// URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic
+// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3).
+//
+// This function is not threadsafe and cannot be called concurrently with any
+// other url_util function. It will assert if the lists of schemes have
+// been locked (see LockSchemeRegistries).
+URL_EXPORT void AddStandardScheme(const char* new_scheme,
+ SchemeType scheme_type);
+
+// Adds an application-defined scheme to the internal list of schemes allowed
+// for referrers.
+//
+// This function is not threadsafe and cannot be called concurrently with any
+// other url_util function. It will assert if the lists of schemes have
+// been locked (see LockSchemeRegistries).
+URL_EXPORT void AddReferrerScheme(const char* new_scheme,
+ SchemeType scheme_type);
+
+// Sets a flag to prevent future calls to Add*Scheme from succeeding.
//
// This is designed to help prevent errors for multithreaded applications.
-// Normal usage would be to call AddStandardScheme for your custom schemes at
-// the beginning of program initialization, and then LockStandardSchemes. This
-// prevents future callers from mistakenly calling AddStandardScheme when the
+// Normal usage would be to call Add*Scheme for your custom schemes at
+// the beginning of program initialization, and then LockSchemeRegistries. This
+// prevents future callers from mistakenly calling Add*Scheme when the
// program is running with multiple threads, where such usage would be
// dangerous.
//
-// We could have had AddStandardScheme use a lock instead, but that would add
+// We could have had Add*Scheme use a lock instead, but that would add
// some platform-specific dependencies we don't otherwise have now, and is
// overkill considering the normal usage is so simple.
-URL_EXPORT void LockStandardSchemes();
+URL_EXPORT void LockSchemeRegistries();
// Locates the scheme in the given string and places it into |found_scheme|,
// which may be NULL to indicate the caller does not care about the range.
@@ -85,18 +116,21 @@
compare, found_scheme);
}
-// Returns true if the given string represents a standard URL. This means that
-// either the scheme is in the list of known standard schemes.
+// Returns true if the given scheme identified by |scheme| within |spec| is in
+// the list of known standard-format schemes (see AddStandardScheme).
URL_EXPORT bool IsStandard(const char* spec, const Component& scheme);
URL_EXPORT bool IsStandard(const base::char16* spec, const Component& scheme);
-// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
-// breaking the WebKit build when this version is synced via Chrome.
-inline bool IsStandard(const char* spec,
- int spec_len,
- const Component& scheme) {
- return IsStandard(spec, scheme);
-}
+// Returns true if the given scheme identified by |scheme| within |spec| is in
+// the list of allowed schemes for referrers (see AddReferrerScheme).
+URL_EXPORT bool IsReferrerScheme(const char* spec, const Component& scheme);
+
+// Returns true and sets |type| to the SchemeType of the given scheme
+// identified by |scheme| within |spec| if the scheme is in the list of known
+// standard-format schemes (see AddStandardScheme).
+URL_EXPORT bool GetStandardSchemeType(const char* spec,
+ const Component& scheme,
+ SchemeType* type);
// URL library wrappers -------------------------------------------------------
@@ -150,7 +184,7 @@
CanonOutput* output,
Parsed* output_parsed);
-// Replaces components in the given VALID input url. The new canonical URL info
+// Replaces components in the given VALID input URL. The new canonical URL info
// is written to output and out_parsed.
//
// Returns true if the resulting URL is valid.
@@ -172,29 +206,12 @@
// String helper functions ----------------------------------------------------
-// Compare the lower-case form of the given string against the given ASCII
-// string. This is useful for doing checking if an input string matches some
-// token, and it is optimized to avoid intermediate string copies.
-//
-// The versions of this function that don't take a b_end assume that the b
-// string is NULL terminated.
-URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b);
-URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end);
-URL_EXPORT bool LowerCaseEqualsASCII(const base::char16* a_begin,
- const base::char16* a_end,
- const char* b);
-
// Unescapes the given string using URL escaping rules.
URL_EXPORT void DecodeURLEscapeSequences(const char* input,
int length,
CanonOutputW* output);
-// Escapes the given string as defined by the JS method encodeURIComponent. See
+// Escapes the given string as defined by the JS method encodeURIComponent. See
// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
URL_EXPORT void EncodeURIComponent(const char* input,
int length,
diff --git a/src/url/url_util_internal.h b/src/url/url_util_internal.h
index c72598f..756c736 100644
--- a/src/url/url_util_internal.h
+++ b/src/url/url_util_internal.h
@@ -8,7 +8,7 @@
#include <string>
#include "base/strings/string16.h"
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
namespace url {
diff --git a/src/url/url_util_unittest.cc b/src/url/url_util_unittest.cc
index 2216252..74db9e5 100644
--- a/src/url/url_util_unittest.cc
+++ b/src/url/url_util_unittest.cc
@@ -2,11 +2,13 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <stddef.h>
+
#include "base/macros.h"
#include "testing/base/public/gunit.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
-#include "url/url_parse.h"
#include "url/url_test_utils.h"
#include "url/url_util.h"
@@ -44,7 +46,7 @@
EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme));
EXPECT_TRUE(found_scheme == Component());
- // When there is a whitespace char in scheme, it should canonicalize the url
+ // When there is a whitespace char in scheme, it should canonicalize the URL
// before comparison.
const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
EXPECT_TRUE(FindAndCompareScheme(whtspc_str,
@@ -61,6 +63,53 @@
EXPECT_TRUE(found_scheme == Component(1, 11));
}
+TEST(URLUtilTest, IsStandard) {
+ const char kHTTPScheme[] = "http";
+ EXPECT_TRUE(IsStandard(kHTTPScheme, Component(0, strlen(kHTTPScheme))));
+
+ const char kFooScheme[] = "foo";
+ EXPECT_FALSE(IsStandard(kFooScheme, Component(0, strlen(kFooScheme))));
+}
+
+TEST(URLUtilTest, IsReferrerScheme) {
+ const char kHTTPScheme[] = "http";
+ EXPECT_TRUE(IsReferrerScheme(kHTTPScheme, Component(0, strlen(kHTTPScheme))));
+
+ const char kFooScheme[] = "foo";
+ EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme))));
+}
+
+TEST(URLUtilTest, AddReferrerScheme) {
+ const char kFooScheme[] = "foo";
+ EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme))));
+ AddReferrerScheme(kFooScheme, url::SCHEME_WITHOUT_PORT);
+ EXPECT_TRUE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme))));
+}
+
+TEST(URLUtilTest, GetStandardSchemeType) {
+ url::SchemeType scheme_type;
+
+ const char kHTTPScheme[] = "http";
+ scheme_type = url::SCHEME_WITHOUT_AUTHORITY;
+ EXPECT_TRUE(GetStandardSchemeType(kHTTPScheme,
+ Component(0, strlen(kHTTPScheme)),
+ &scheme_type));
+ EXPECT_EQ(url::SCHEME_WITH_PORT, scheme_type);
+
+ const char kFilesystemScheme[] = "filesystem";
+ scheme_type = url::SCHEME_WITH_PORT;
+ EXPECT_TRUE(GetStandardSchemeType(kFilesystemScheme,
+ Component(0, strlen(kFilesystemScheme)),
+ &scheme_type));
+ EXPECT_EQ(url::SCHEME_WITHOUT_AUTHORITY, scheme_type);
+
+ const char kFooScheme[] = "foo";
+ scheme_type = url::SCHEME_WITH_PORT;
+ EXPECT_FALSE(GetStandardSchemeType(kFooScheme,
+ Component(0, strlen(kFooScheme)),
+ &scheme_type));
+}
+
TEST(URLUtilTest, ReplaceComponents) {
Parsed parsed;
RawCanonOutputT<char> output;
@@ -220,7 +269,7 @@
}
TEST(URLUtilTest, TestResolveRelativeWithNonStandardBase) {
- // This tests non-standard (in the sense that GIsStandard() == false)
+ // This tests non-standard (in the sense that IsStandard() == false)
// hierarchical schemes.
struct ResolveRelativeCase {
const char* base;
@@ -273,6 +322,15 @@
// any URL scheme is we might break javascript: URLs by doing so...
{"javascript:alert('foo#bar')", "#badfrag", true,
"javascript:alert('foo#badfrag" },
+ // In this case, the backslashes will not be canonicalized because it's a
+ // non-standard URL, but they will be treated as path separators,
+ // giving the base URL here a path of "\".
+ //
+ // The result here is somewhat arbitrary. One could argue it should be
+ // either "aaa://a\" or "aaa://a/" since the path is being replaced with
+ // the "current directory". But in the context of resolving on data URLs,
+ // adding the requested dot doesn't seem wrong either.
+ {"aaa://a\\", "aaa:.", true, "aaa://a\\." }
};
for (size_t i = 0; i < arraysize(resolve_non_standard_cases); i++) {
@@ -296,8 +354,8 @@
}
TEST(URLUtilTest, TestNoRefComponent) {
- // The hash-mark must be ignored when mailto: scheme is
- // parsed, even if the url has a base and relative part.
+ // The hash-mark must be ignored when mailto: scheme is parsed,
+ // even if the URL has a base and relative part.
const char* base = "mailto://to/";
const char* rel = "any#body";