Internal change
PiperOrigin-RevId: 96791655
Change-Id: I1232926a48b9fa8ba7ca739ba16294d17da1dd6a
diff --git a/LICENSE b/LICENSE
index ac40837..4917789 100644
--- a/LICENSE
+++ b/LICENSE
@@ -63,3 +63,40 @@
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the MPL, the GPL or the LGPL.
+
+-------------------------------------------------------------------------------
+
+The file icu_utf.cc is from IBM. This file is licensed separately as follows:
+
+ICU License - ICU 1.8.1 and later
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2009 International Business Machines Corporation and others
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
diff --git a/README.google b/README.google
index b7346d5..eab9cf3 100644
--- a/README.google
+++ b/README.google
@@ -1,32 +1,51 @@
-URL: http://google-url.googlecode.com/svn/trunk/
-Version: Snapshot of Subversion trunk, revision [139]
-License: BSD and MPL (one source file under MPL)
-License File: googleurl/LICENSE.txt
+URL: https://chromium.googlesource.com/chromium/src/+archive/6e0744b15b09421eac6634fb3fb7fe0a03427d56/url.tar.gz
+Version: 6e0744b15b09421eac6634fb3fb7fe0a03427d56 (matching Chromium 41.0.2272.118)
+License: BSD, MPL, ICU (one source file under MPL, one source file under ICU)
+License File: LICENSE
Description:
A small library for parsing and canonicalizing URLs
Local Modifications:
-We use our google3 versions of //base (which is a superset of the base/
-functionality in googleurl), and ICU. These are both injected through BUILD
-and don't require source modifications.
+1. src/base directory:
+- Remove BASE_EXPORT macros.
+- Wrap namespace base with namespace url to distinguish from google3 base.
+- src/base/strings/string16.*
+ * Include src/build/build_config.h to detect wchar_t size.
+ * PrintTo function and operator << are removed to eliminate dependency on
+ src/base/strings/utf_string_conversion.
+- src/base/strings/string_util.*
+ * Only one MatchPattern function is kept for src/url/origin.cc.
+ * Change the argument type from StringPiece to std::string to remove
+ dependency on google3 StringPiece.
+- src/base/third_party/icu/icu_utf.cc
+ * Add FALLTHROUGH_INTENDED for fall-through switch cases.
-We use //depot/google3/third_party/breakpad/import_to_p4_from_svn.py
-to ease synchronization of this Perforce mirror with the authoritative
-Subversion repository. When using this script, don't forget to update this
-file, README.google, to reflect the Subversion revision being imported.
+2. src/url directory:
+- Use google3 versions of //base, //util/gtl/lazy_static_ptr.h,
+ //third_party/icu and //testing/base/public:gunit_main. Some users don't want
+ googleurl to be dependent on google3 (e.g. geo/render/mirth/net:googleurl),
+ so we try our best to keep these dependencies to a minimum.
+- src/url/gurl.cc
+ * Replace scoped_ptr with std::unique_ptr to eliminate dependency on google3
+ scoped_ptr.
+- src/url/url_canon_icu.cc
+ * Replace LazyInstance with google3 LazyStaticPtr, modify initialization
+ and access methods accordingly.
+- src/url/url_util.cc
+ * Replace ANNOTATE_LEAKING_OBJECT_PTR() with google3
+ HeapLeakChecker::IgnoreObject(), and only use it when GOOGLEURL_IN_GOOGLE3
+ is defined.
+- src/url/url_canon_internal.h
+ * Expand NOT_REACHED() as DCHECK(false).
+- src/url/url_canon_icu.h and src/url/url_canon_stdstring.h
+ * Remove the include of src/base/compiler_specific.h.
+- src/url/third_party/mozilla/url_parse.cc
+ * Compile filesystemurl-related functions only when NO_FILESYSTEMURL_SUPPORT
+ is not defined, so that
+ wireless/android/icing/lib/core:liburl_parse_icing_static doesn't need to
+ depend on other googleurl srcs as well as third_party/icu.
-Because googleurl uses include paths like "googleurl/src/header.h", the source
-is located in a googleurl subdirectory under this directory. This allows
-the paths to work correctly without adding //third_party to the include path.
-
-2010-01-22: the upstream code uses an open-source version of gunit and the
-google3 code uses a google3 version of gunit. When importing, be careful
-to use the current google3 names: testing/base/public/googletest.h and
-testing/base/public/gunit.h . -- mec
-
+3. google3_additions directory:
2014-07-30: added google3_additions/googleurl_init.cc, which properly
initializes googleurl during InitGoogle().
-
-2014-09-29: Adjusted googleurl/src/url_canon_unittest.cc for C++11
-compatibility.
diff --git a/google3_additions/googleurl_init.cc b/google3_additions/googleurl_init.cc
index e0af8e6..03470f0 100644
--- a/google3_additions/googleurl_init.cc
+++ b/google3_additions/googleurl_init.cc
@@ -4,12 +4,12 @@
// InitGoogle() at startup.
#include "base/googleinit.h"
-#include "third_party/googleurl/googleurl/src/url_util.h"
+#include "third_party/googleurl/src/url/url_util.h"
namespace {
void InitGoogleUrl() {
- url_util::Initialize();
+ url::Initialize();
}
} // namespace
diff --git a/googleurl/LICENSE.txt b/googleurl/LICENSE.txt
deleted file mode 100644
index ac40837..0000000
--- a/googleurl/LICENSE.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-Copyright 2007, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
--------------------------------------------------------------------------------
-
-The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is
-licensed separately as follows:
-
-The contents of this file are subject to the Mozilla Public License Version
-1.1 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.mozilla.org/MPL/
-
-Software distributed under the License is distributed on an "AS IS" basis,
-WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-for the specific language governing rights and limitations under the
-License.
-
-The Original Code is mozilla.org code.
-
-The Initial Developer of the Original Code is
-Netscape Communications Corporation.
-Portions created by the Initial Developer are Copyright (C) 1998
-the Initial Developer. All Rights Reserved.
-
-Contributor(s):
- Darin Fisher (original author)
-
-Alternatively, the contents of this file may be used under the terms of
-either the GNU General Public License Version 2 or later (the "GPL"), or
-the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-in which case the provisions of the GPL or the LGPL are applicable instead
-of those above. If you wish to allow use of your version of this file only
-under the terms of either the GPL or the LGPL, and not to allow others to
-use your version of this file under the terms of the MPL, indicate your
-decision by deleting the provisions above and replace them with the notice
-and other provisions required by the GPL or the LGPL. If you do not delete
-the provisions above, a recipient may use your version of this file under
-the terms of any one of the MPL, the GPL or the LGPL.
diff --git a/googleurl/README.txt b/googleurl/README.txt
deleted file mode 100644
index b28fd04..0000000
--- a/googleurl/README.txt
+++ /dev/null
@@ -1,180 +0,0 @@
- ==============================
- The Google URL Parsing Library
- ==============================
-
-This is the Google URL Parsing Library which parses and canonicalizes URLs.
-Please see the LICENSE.txt file for licensing information.
-
-Features
-========
-
- * Easily embeddable: This library was written for a variety of client and
- server programs in mind, so unlike most implementations of URL parsing
- and canonicalization, it can be easily emdedded.
-
- * Fast: hundreds of thousands of typical URLs can be parsed and
- canonicalized per second on a modern CPU. It is much faster than, for
- example, calling WinInet's corresponding functions.
-
- * Compatible: When possible, this library has strived for IE7 compatability
- for both general web compatability, and so IE addons or other applications
- that communicate with or embed IE will work properly.
-
- It supports Unix-style file URLs, as well as the more complex rules for
- Window file URLs. Note that total compatability is not possible (for
- example, IE6 and IE7 disagree about how to parse certain IP addresses),
- and that this is more strict about certain illegal, rarely used, and
- potentially dangerous constructs such as escaped control characters in
- host names that IE will allow. It is typically a little less strict than
- Firefox.
-
-
-Example
-=======
-
-An example implementation of a URL object that uses this library is provided
-in src/gurl.*. This implementation uses the "application integration" layer
-discussed below to interface with the low-level parsing and canonicalization
-functions.
-
-
-Building
-========
-
-The canonicalization files require ICU for some UTF-8 and UTF-16 conversion
-macros. If your project does not use ICU, it should be straightforward to
-factor out the macros and functions used in ICU, there are only a few well-
-isolated things that are used.
-
-TODO(brettw) ADD INSTRUCTIONS FOR GETTING ICU HERE!
-
-logging.h and logging.cc are Windows-only because the corresponding Unix
-logging system has many dependencies. This library uses few of the logging
-macros, and a dummy header can easily be written that defines the
-appropriate things for Unix.
-
-
-Definitions
-===========
-
-"Standard URL": A URL with an "authority", which is a hostname and optionally
- a port, username, and password. Most URLs are standard such as HTTP and FTP.
-
-"File URL": A URL that references a file on disk. There are special rules for
- this type of URL. Note that it may have a hostname! "localhost" is allowed,
- for example "file://localhost/foo" is the same as "file:///foo".
-
-"Path URL": This is everything else. There is no standard on how to treat these
- URLs, or even what they are called. This library decomposes them into a
- scheme and a path. The path is everything following the scheme. This type of
- URL includes "javascript", "data", and even "mailto" (although "mailto"
- might look like a standard scheme in some respects, it is not).
-
-
-Design
-======
-
-The library is divided into four layers. They are listed here from the lowest
-to the highest; you can use any portion of the library as long as you embed the
-layers below it.
-
-1. Parsing
-----------
-At the lowest level is the parsing code. The files encompasing this are
-url_parse.* and the main include file is src/url_parse.h. This code will, given
-an input string, parse it into the most likely form of a URL.
-
-Parsing can not fail and does no validation. The exception is the port number,
-which it currently validates, but this is a bug. Given crazy input, the parser
-will do its best to find the various URL components according to its rules (see
-url_parse_unittest.cc for some examples).
-
-To use this, an application will typically use ExtractScheme to determine the
-type of a given input URL, and then call one of the initialization functions:
-"ParseStandardURL", "ParsePathURL", or "ParseFileURL". This will result in
-a "Parsed" structure which identifies the substrings of each identified
-component.
-
-2. Canonicalization
--------------------
-At the next highest level is canonicalization. The files encompasing this are
-url_canon.* and the main include file is src/url_canon.h. This code will
-validate an already-parsed URL, and will convert it to a canonical form. For
-example, this will convert host names to lowercase, convert IP addresses
-into dotted-decimal notation, handle encoding issues, etc.
-
-This layer will always do its best to produce a reasonable output string, but
-it may return that the string is invalid. For example, if there are invalid
-characters in the host name, it will escape them or replace them with the
-Unicode "invalid character" character, but will fail. This way, the program can
-display error messages to the user with the output, log it, etc. and the
-string will have some meaning.
-
-Canonicalized output is written to a CanonOutput object which is a simple
-wrapper around an expanding buffer. An implementation called RawCanonOutput is
-proivided that writes to a raw buffer with a fixed amount statically allocated
-(for performance). Applications using STL can use StdStringCanonOutput defined
-in url_canon_stdstring.h which writes into a std::string.
-
-A normal application would call one of the three high-level functions
-"CanonicalizeStandardURL", "CanonicalizeFileURL", and CanonicalizePathURL"
-depending on the type of URL in question. Lower-level functions are also
-provided which will canonicalize individual parts of a URL (for example,
-"CanonicalizeHost").
-
-Part of this layer is the integration with the host system for IDN and encoding
-conversion. An implementation that provides integration with the ICU
-(http://www-306.ibm.com/software/globalization/icu/index.jsp) is provided in
-src/url_canon_icu.cc. The embedder may wish to replace this file with
-implementations of the functions for their own IDN library if they do not use
-ICU.
-
-3. Application integration
---------------------------
-The canonicalization and parsing layers do not know anything about the URI
-schemes supported by your application. The parsing and canonicalization
-functions are very low-level, and you must call the correct function to do the
-work (for example, "CanonicalizeFileURL").
-
-The application integration in url_util.* provides wrappers around the
-low-level parsing and canonicalization to call the correct versions for
-different identified schemes. Embedders will want to modify this file if
-necessary to suit the needs of their application.
-
-4. URL object
--------------
-The highest level is the "URL" object that a C++ application would use to
-to encapsulate a URL. Embedders will typically want to provide their own URL
-object that meets the requirements of their system. A reasonably complete
-example implemnetation is provided in src/gurl.*. You may wish to use this
-object, extend or modify it, or write your own.
-
-Whitespace
-----------
-Sometimes, you may want to remove linefeeds and tabs from the content of a URL.
-Some web pages, for example, expect that a URL spanning two lines should be
-treated as one with the newline removed. Depending on the source of the URLs
-you are canonicalizing, these newlines may or may not be trimmed off.
-
-If you want this behavior, call RemoveURLWhitespace before parsing. This will
-remove CR, LF and TAB from the input. Note that it preserves spaces. On typical
-URLs, this function produces a 10-15% speed reduction, so it is optional and
-not done automatically. The example GURL object and the url_util wrapper does
-this for you.
-
-Tests
-=====
-
-There are a number of *_unittest.cc and *_perftest.cc files. These files are
-not currently compilable as they rely on a not-included unit testing framework
-Tests are declared like this:
- TEST(TestCaseName, TestName) {
- ASSERT_TRUE(a);
- EXPECT_EQ(a, b);
- }
-If you would like to compile them, it should be straightforward to define
-the TEST macro (which would declare a function by combining the two arguments)
-and the other macros whose behavior should be self-explanatory (EXPECT is like
-an ASSERT, but does not stop the test, if you are doing this, you probably
-don't care about this difference). Then you would define a .cc file that
-calls all of these functions.
diff --git a/googleurl/base/README.txt b/googleurl/base/README.txt
deleted file mode 100644
index 311faa0..0000000
--- a/googleurl/base/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-These files contain some shared code. You can define your own assertion macros
-to eliminate the dependency on logging.h.
diff --git a/googleurl/base/string16.cc b/googleurl/base/string16.cc
deleted file mode 100644
index fc25809..0000000
--- a/googleurl/base/string16.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "base/string16.h"
-
-#ifdef WIN32
-
-#error This file should not be used on 2-byte wchar_t systems
-// If this winds up being needed on 2-byte wchar_t systems, either the
-// definitions below can be used, or the host system's wide character
-// functions like wmemcmp can be wrapped.
-
-#else // !WIN32
-
-namespace base {
-
-int c16memcmp(const char16* s1, const char16* s2, size_t n) {
- // We cannot call memcmp because that changes the semantics.
- while (n-- > 0) {
- if (*s1 != *s2) {
- // We cannot use (*s1 - *s2) because char16 is unsigned.
- return ((*s1 < *s2) ? -1 : 1);
- }
- ++s1;
- ++s2;
- }
- return 0;
-}
-
-size_t c16len(const char16* s) {
- const char16 *s_orig = s;
- while (*s) {
- ++s;
- }
- return s - s_orig;
-}
-
-const char16* c16memchr(const char16* s, char16 c, size_t n) {
- while (n-- > 0) {
- if (*s == c) {
- return s;
- }
- ++s;
- }
- return 0;
-}
-
-char16* c16memmove(char16* s1, const char16* s2, size_t n) {
- return reinterpret_cast<char16*>(memmove(s1, s2, n * sizeof(char16)));
-}
-
-char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
- return reinterpret_cast<char16*>(memcpy(s1, s2, n * sizeof(char16)));
-}
-
-char16* c16memset(char16* s, char16 c, size_t n) {
- char16 *s_orig = s;
- while (n-- > 0) {
- *s = c;
- ++s;
- }
- return s_orig;
-}
-
-} // namespace base
-
-template class std::basic_string<char16, base::string16_char_traits>;
-
-#endif // WIN32
diff --git a/googleurl/build/README.txt b/googleurl/build/README.txt
deleted file mode 100644
index eab011a..0000000
--- a/googleurl/build/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-This directory includes solution and project files for compiling with
-Visual Studio 2005 on Windows.
-
-The base checkout directory must be named 'googleurl'.
diff --git a/googleurl/build/base.vcproj b/googleurl/build/base.vcproj
deleted file mode 100644
index 0e923cf..0000000
--- a/googleurl/build/base.vcproj
+++ /dev/null
@@ -1,151 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="8.00"
- Name="base"
- ProjectGUID="{ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}"
- RootNamespace="base"
- Keyword="Win32Proj"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- </Platforms>
- <ToolFiles>
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- ConfigurationType="4"
- InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLibrarianTool"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- ConfigurationType="4"
- InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLibrarianTool"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <File
- RelativePath="..\base\basictypes.h"
- >
- </File>
- <File
- RelativePath="..\base\logging.cc"
- >
- </File>
- <File
- RelativePath="..\base\logging.h"
- >
- </File>
- <File
- RelativePath="..\base\README.txt"
- >
- </File>
- <File
- RelativePath="..\base\scoped_ptr.h"
- >
- </File>
- <File
- RelativePath="..\base\string16.h"
- >
- </File>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/googleurl/build/common.vsprops b/googleurl/build/common.vsprops
deleted file mode 100644
index ede28e9..0000000
--- a/googleurl/build/common.vsprops
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioPropertySheet
- ProjectType="Visual C++"
- Version="8.00"
- Name="common"
- OutputDirectory="$(SolutionDir)$(ConfigurationName)"
- IntermediateDirectory="$(SolutionDir)$(ConfigurationName)\obj\$(ProjectName)"
- CharacterSet="1"
- >
- <Tool
- Name="VCCLCompilerTool"
- AdditionalIncludeDirectories="$(SolutionDir)..\..;$(SolutionDir).."
- PreprocessorDefinitions="_WIN32_WINNT=0x0501;WINVER=0x0501;WIN32;_WINDOWS"
- MinimalRebuild="false"
- BufferSecurityCheck="true"
- EnableFunctionLevelLinking="true"
- WarningLevel="3"
- WarnAsError="true"
- Detect64BitPortabilityProblems="true"
- DebugInformationFormat="3"
- />
-</VisualStudioPropertySheet>
diff --git a/googleurl/build/debug.vsprops b/googleurl/build/debug.vsprops
deleted file mode 100644
index d2aa43f..0000000
--- a/googleurl/build/debug.vsprops
+++ /dev/null
@@ -1,18 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioPropertySheet
- ProjectType="Visual C++"
- Version="8.00"
- Name="debug"
- >
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- PreprocessorDefinitions="_DEBUG"
- BasicRuntimeChecks="3"
- RuntimeLibrary="1"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="2"
- />
-</VisualStudioPropertySheet>
diff --git a/googleurl/build/googleurl.sln b/googleurl/build/googleurl.sln
deleted file mode 100644
index 347810d..0000000
--- a/googleurl/build/googleurl.sln
+++ /dev/null
@@ -1,32 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual Studio 2005
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "googleurl", "googleurl.vcproj", "{EF5E94AB-B646-4E5B-A058-52EF07B8351C}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "base", "base.vcproj", "{ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{D8E84C85-89D3-4B8D-9A3A-C44B63C3383A}"
- ProjectSection(SolutionItems) = preProject
- ..\LICENSE.txt = ..\LICENSE.txt
- ..\README.txt = ..\README.txt
- EndProjectSection
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Debug|Win32.ActiveCfg = Debug|Win32
- {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Debug|Win32.Build.0 = Debug|Win32
- {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Release|Win32.ActiveCfg = Release|Win32
- {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Release|Win32.Build.0 = Release|Win32
- {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Debug|Win32.ActiveCfg = Debug|Win32
- {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Debug|Win32.Build.0 = Debug|Win32
- {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Release|Win32.ActiveCfg = Release|Win32
- {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
-EndGlobal
diff --git a/googleurl/build/googleurl.vcproj b/googleurl/build/googleurl.vcproj
deleted file mode 100644
index 71b3123..0000000
--- a/googleurl/build/googleurl.vcproj
+++ /dev/null
@@ -1,239 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="8.00"
- Name="googleurl"
- ProjectGUID="{EF5E94AB-B646-4E5B-A058-52EF07B8351C}"
- RootNamespace="googleurl"
- Keyword="Win32Proj"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- </Platforms>
- <ToolFiles>
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- ConfigurationType="4"
- InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLibrarianTool"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- ConfigurationType="4"
- InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLibrarianTool"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <File
- RelativePath="..\src\gurl.cc"
- >
- </File>
- <File
- RelativePath="..\src\gurl.h"
- >
- </File>
- <File
- RelativePath=".\README.txt"
- >
- </File>
- <File
- RelativePath="..\src\url_canon.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_etc.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_fileurl.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_host.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_icu.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_icu.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_internal.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_internal.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_internal_file.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_ip.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_ip.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_mailtourl.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_path.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_pathurl.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_query.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_relative.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_stdstring.h"
- >
- </File>
- <File
- RelativePath="..\src\url_canon_stdurl.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_file.h"
- >
- </File>
- <File
- RelativePath="..\src\url_parse.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_parse.h"
- >
- </File>
- <File
- RelativePath="..\src\url_parse_file.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_parse_internal.h"
- >
- </File>
- <File
- RelativePath="..\src\url_util.cc"
- >
- </File>
- <File
- RelativePath="..\src\url_util.h"
- >
- </File>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/googleurl/build/release.vsprops b/googleurl/build/release.vsprops
deleted file mode 100644
index 2e59356..0000000
--- a/googleurl/build/release.vsprops
+++ /dev/null
@@ -1,23 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioPropertySheet
- ProjectType="Visual C++"
- Version="8.00"
- Name="release"
- >
- <Tool
- Name="VCCLCompilerTool"
- WholeProgramOptimization="true"
- PreprocessorDefinitions="NDEBUG"
- />
- <Tool
- Name="VCLibrarianTool"
- AdditionalOptions="/ltcg"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="1"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- LinkTimeCodeGeneration="1"
- />
-</VisualStudioPropertySheet>
diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc
deleted file mode 100644
index a0bfd26..0000000
--- a/googleurl/src/gurl.cc
+++ /dev/null
@@ -1,449 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifdef WIN32
-#include <windows.h>
-#else
-#include <pthread.h>
-#endif
-
-#include <algorithm>
-
-#include "googleurl/src/gurl.h"
-
-#include "base/logging.h"
-#include "googleurl/src/url_canon_stdstring.h"
-#include "googleurl/src/url_util.h"
-
-namespace {
-
-// External template that can handle initialization of either character type.
-// The input spec is given, and the canonical version will be placed in
-// |*canonical|, along with the parsing of the canonical spec in |*parsed|.
-template<typename STR>
-bool InitCanonical(const STR& input_spec,
- std::string* canonical,
- url_parse::Parsed* parsed) {
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- canonical->reserve(input_spec.size() + 32);
- url_canon::StdStringCanonOutput output(canonical);
- bool success = url_util::Canonicalize(
- input_spec.data(), static_cast<int>(input_spec.length()),
- NULL, &output, parsed);
-
- output.Complete(); // Must be done before using string.
- return success;
-}
-
-static std::string* empty_string = NULL;
-static GURL* empty_gurl = NULL;
-
-#ifdef WIN32
-
-// Returns a static reference to an empty string for returning a reference
-// when there is no underlying string.
-const std::string& EmptyStringForGURL() {
- // Avoid static object construction/destruction on startup/shutdown.
- if (!empty_string) {
- // Create the string. Be careful that we don't break in the case that this
- // is being called from multiple threads. Statics are not threadsafe.
- std::string* new_empty_string = new std::string;
- if (InterlockedCompareExchangePointer(
- reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
- // The old value was non-NULL, so no replacement was done. Another
- // thread did the initialization out from under us.
- delete new_empty_string;
- }
- }
- return *empty_string;
-}
-
-#else
-
-static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
-static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
-
-void EmptyStringForGURLOnce(void) {
- empty_string = new std::string;
-}
-
-const std::string& EmptyStringForGURL() {
- // Avoid static object construction/destruction on startup/shutdown.
- pthread_once(&empty_string_once, EmptyStringForGURLOnce);
- return *empty_string;
-}
-
-#endif // WIN32
-
-} // namespace
-
-GURL::GURL() : is_valid_(false) {
-}
-
-GURL::GURL(const GURL& other)
- : spec_(other.spec_),
- is_valid_(other.is_valid_),
- parsed_(other.parsed_) {
-}
-
-GURL::GURL(const std::string& url_string) {
- is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-}
-
-GURL::GURL(const string16& url_string) {
- is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-}
-
-GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
- const url_parse::Parsed& parsed, bool is_valid)
- : spec_(canonical_spec, canonical_spec_len),
- is_valid_(is_valid),
- parsed_(parsed) {
-#ifndef NDEBUG
- // For testing purposes, check that the parsed canonical URL is identical to
- // what we would have produced. Skip checking for invalid URLs have no meaning
- // and we can't always canonicalize then reproducabely.
- if (is_valid_) {
- GURL test_url(spec_);
-
- DCHECK(test_url.is_valid_ == is_valid_);
- DCHECK(test_url.spec_ == spec_);
-
- DCHECK(test_url.parsed_.scheme == parsed_.scheme);
- DCHECK(test_url.parsed_.username == parsed_.username);
- DCHECK(test_url.parsed_.password == parsed_.password);
- DCHECK(test_url.parsed_.host == parsed_.host);
- DCHECK(test_url.parsed_.port == parsed_.port);
- DCHECK(test_url.parsed_.path == parsed_.path);
- DCHECK(test_url.parsed_.query == parsed_.query);
- DCHECK(test_url.parsed_.ref == parsed_.ref);
- }
-#endif
-}
-
-const std::string& GURL::spec() const {
- if (is_valid_ || spec_.empty())
- return spec_;
-
- DCHECK(false) << "Trying to get the spec of an invalid URL!";
- return EmptyStringForGURL();
-}
-
-GURL GURL::Resolve(const std::string& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-GURL GURL::Resolve(const string16& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-
-// Note: code duplicated below (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const std::string& relative,
- url_canon::CharsetConverter* charset_converter) const {
- // Not allowed for invalid URLs.
- if (!is_valid_)
- return GURL();
-
- GURL result;
-
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- result.spec_.reserve(spec_.size() + 32);
- url_canon::StdStringCanonOutput output(&result.spec_);
-
- if (!url_util::ResolveRelative(
- spec_.data(), static_cast<int>(spec_.length()), parsed_,
- relative.data(), static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
- // Error resolving, return an empty URL.
- return GURL();
- }
-
- output.Complete();
- result.is_valid_ = true;
- return result;
-}
-
-// Note: code duplicated above (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const string16& relative,
- url_canon::CharsetConverter* charset_converter) const {
- // Not allowed for invalid URLs.
- if (!is_valid_)
- return GURL();
-
- GURL result;
-
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- result.spec_.reserve(spec_.size() + 32);
- url_canon::StdStringCanonOutput output(&result.spec_);
-
- if (!url_util::ResolveRelative(
- spec_.data(), static_cast<int>(spec_.length()), parsed_,
- relative.data(), static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
- // Error resolving, return an empty URL.
- return GURL();
- }
-
- output.Complete();
- result.is_valid_ = true;
- return result;
-}
-
-// Note: code duplicated below (it's inconvenient to use a template here).
-GURL GURL::ReplaceComponents(
- const url_canon::Replacements<char>& replacements) const {
- GURL result;
-
- // Not allowed for invalid URLs.
- if (!is_valid_)
- return GURL();
-
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- result.spec_.reserve(spec_.size() + 32);
- url_canon::StdStringCanonOutput output(&result.spec_);
-
- result.is_valid_ = url_util::ReplaceComponents(
- spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
- NULL, &output, &result.parsed_);
-
- output.Complete();
- return result;
-}
-
-// Note: code duplicated above (it's inconvenient to use a template here).
-GURL GURL::ReplaceComponents(
- const url_canon::Replacements<char16>& replacements) const {
- GURL result;
-
- // Not allowed for invalid URLs.
- if (!is_valid_)
- return GURL();
-
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- result.spec_.reserve(spec_.size() + 32);
- url_canon::StdStringCanonOutput output(&result.spec_);
-
- result.is_valid_ = url_util::ReplaceComponents(
- spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
- NULL, &output, &result.parsed_);
-
- output.Complete();
- return result;
-}
-
-GURL GURL::GetOrigin() const {
- // This doesn't make sense for invalid or nonstandard URLs, so return
- // the empty URL
- if (!is_valid_ || !IsStandard())
- return GURL();
-
- url_canon::Replacements<char> replacements;
- replacements.ClearUsername();
- replacements.ClearPassword();
- replacements.ClearPath();
- replacements.ClearQuery();
- replacements.ClearRef();
-
- return ReplaceComponents(replacements);
-}
-
-GURL GURL::GetWithEmptyPath() const {
- // This doesn't make sense for invalid or nonstandard URLs, so return
- // the empty URL.
- if (!is_valid_ || !IsStandard())
- return GURL();
-
- // We could optimize this since we know that the URL is canonical, and we are
- // appending a canonical path, so avoiding re-parsing.
- GURL other(*this);
- if (parsed_.path.len == 0)
- return other;
-
- // Clear everything after the path.
- other.parsed_.query.reset();
- other.parsed_.ref.reset();
-
- // Set the path, since the path is longer than one, we can just set the
- // first character and resize.
- other.spec_[other.parsed_.path.begin] = '/';
- other.parsed_.path.len = 1;
- other.spec_.resize(other.parsed_.path.begin + 1);
- return other;
-}
-
-bool GURL::IsStandard() const {
- return url_util::IsStandard(spec_.data(), parsed_.scheme);
-}
-
-bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
- if (parsed_.scheme.len <= 0)
- return lower_ascii_scheme == NULL;
- return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
- spec_.data() + parsed_.scheme.end(),
- lower_ascii_scheme);
-}
-
-int GURL::IntPort() const {
- if (parsed_.port.is_nonempty())
- return url_parse::ParsePort(spec_.data(), parsed_.port);
- return url_parse::PORT_UNSPECIFIED;
-}
-
-int GURL::EffectiveIntPort() const {
- int int_port = IntPort();
- if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
- return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
- parsed_.scheme.len);
- return int_port;
-}
-
-std::string GURL::ExtractFileName() const {
- url_parse::Component file_component;
- url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
- return ComponentString(file_component);
-}
-
-std::string GURL::PathForRequest() const {
- DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
- if (parsed_.ref.len >= 0) {
- // Clip off the reference when it exists. The reference starts after the #
- // sign, so we have to subtract one to also remove it.
- return std::string(spec_, parsed_.path.begin,
- parsed_.ref.begin - parsed_.path.begin - 1);
- }
-
- // Use everything form the path to the end.
- return std::string(spec_, parsed_.path.begin);
-}
-
-std::string GURL::HostNoBrackets() const {
- // If host looks like an IPv6 literal, strip the square brackets.
- url_parse::Component h(parsed_.host);
- if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
- h.begin++;
- h.len -= 2;
- }
- return ComponentString(h);
-}
-
-bool GURL::HostIsIPAddress() const {
- if (!is_valid_ || spec_.empty())
- return false;
-
- url_canon::RawCanonOutputT<char, 128> ignored_output;
- url_canon::CanonHostInfo host_info;
- url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
- &ignored_output, &host_info);
- return host_info.IsIPAddress();
-}
-
-#ifdef WIN32
-
-const GURL& GURL::EmptyGURL() {
- // Avoid static object construction/destruction on startup/shutdown.
- if (!empty_gurl) {
- // Create the string. Be careful that we don't break in the case that this
- // is being called from multiple threads.
- GURL* new_empty_gurl = new GURL;
- if (InterlockedCompareExchangePointer(
- reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
- // The old value was non-NULL, so no replacement was done. Another
- // thread did the initialization out from under us.
- delete new_empty_gurl;
- }
- }
- return *empty_gurl;
-}
-
-#else
-
-void EmptyGURLOnce(void) {
- empty_gurl = new GURL;
-}
-
-const GURL& GURL::EmptyGURL() {
- // Avoid static object construction/destruction on startup/shutdown.
- pthread_once(&empty_gurl_once, EmptyGURLOnce);
- return *empty_gurl;
-}
-
-#endif // WIN32
-
-bool GURL::DomainIs(const char* lower_ascii_domain,
- int domain_len) const {
- // Return false if this URL is not valid or domain is empty.
- if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len)
- return false;
-
- // Check whether the host name is end with a dot. If yes, treat it
- // the same as no-dot unless the input comparison domain is end
- // with dot.
- const char* last_pos = spec_.data() + parsed_.host.end() - 1;
- int host_len = parsed_.host.len;
- if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
- last_pos--;
- host_len--;
- }
-
- // Return false if host's length is less than domain's length.
- if (host_len < domain_len)
- return false;
-
- // Compare this url whether belong specific domain.
- const char* start_pos = spec_.data() + parsed_.host.begin +
- host_len - domain_len;
-
- if (!url_util::LowerCaseEqualsASCII(start_pos,
- last_pos + 1,
- lower_ascii_domain,
- lower_ascii_domain + domain_len))
- return false;
-
- // Check whether host has right domain start with dot, make sure we got
- // right domain range. For example www.google.com has domain
- // "google.com" but www.iamnotgoogle.com does not.
- if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
- '.' != *(start_pos - 1))
- return false;
-
- return true;
-}
-
-void GURL::Swap(GURL* other) {
- spec_.swap(other->spec_);
- std::swap(is_valid_, other->is_valid_);
- std::swap(parsed_, other->parsed_);
-}
-
diff --git a/googleurl/src/gurl_test_main.cc b/googleurl/src/gurl_test_main.cc
deleted file mode 100644
index 9a7c9f4..0000000
--- a/googleurl/src/gurl_test_main.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "build/build_config.h"
-
-#if defined(OS_WIN)
-#include <windows.h>
-#endif
-
-#include <string>
-
-#include "testing/gtest/include/gtest/gtest.h"
-#include "unicode/putil.h"
-#include "unicode/udata.h"
-
-#define ICU_UTIL_DATA_SHARED 1
-#define ICU_UTIL_DATA_STATIC 2
-
-#ifndef ICU_UTIL_DATA_IMPL
-
-#if defined(OS_WIN)
-#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_SHARED
-#elif defined(OS_MACOSX)
-#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_STATIC
-#elif defined(OS_LINUX)
-#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_FILE
-#endif
-
-#endif // ICU_UTIL_DATA_IMPL
-
-#if defined(OS_WIN)
-#define ICU_UTIL_DATA_SYMBOL "icudt" U_ICU_VERSION_SHORT "_dat"
-#define ICU_UTIL_DATA_SHARED_MODULE_NAME "icudt" U_ICU_VERSION_SHORT ".dll"
-#endif
-
-bool InitializeICU() {
-#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_SHARED)
- // We expect to find the ICU data module alongside the current module.
- // Because the module name is ASCII-only, "A" API should be safe.
- HMODULE module = LoadLibraryA(ICU_UTIL_DATA_SHARED_MODULE_NAME);
- if (!module)
- return false;
-
- FARPROC addr = GetProcAddress(module, ICU_UTIL_DATA_SYMBOL);
- if (!addr)
- return false;
-
- UErrorCode err = U_ZERO_ERROR;
- udata_setCommonData(reinterpret_cast<void*>(addr), &err);
- return err == U_ZERO_ERROR;
-#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC)
- // Mac bundles the ICU data in.
- return true;
-#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
- // We expect to find the ICU data module alongside the current module.
- u_setDataDirectory(".");
- // Only look for the packaged data file;
- // the default behavior is to look for individual files.
- UErrorCode err = U_ZERO_ERROR;
- udata_setFileAccess(UDATA_ONLY_PACKAGES, &err);
- return err == U_ZERO_ERROR;
-#endif
-}
-
-int main(int argc, char **argv) {
- ::testing::InitGoogleTest(&argc, argv);
-
- InitializeICU();
-
- return RUN_ALL_TESTS();
-}
diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc
deleted file mode 100644
index b548cc2..0000000
--- a/googleurl/src/gurl_unittest.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-// Copyright 2007 Google Inc. All Rights Reserved.
-// Author: brettw@google.com (Brett Wilson)
-
-#include "googleurl/src/gurl.h"
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_test_utils.h"
-#include "testing/base/public/gunit.h"
-
-// Some implementations of base/basictypes.h may define ARRAYSIZE.
-// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
-// which is in our version of basictypes.h.
-#ifndef ARRAYSIZE
-#define ARRAYSIZE ARRAYSIZE_UNSAFE
-#endif
-
-using url_test_utils::WStringToUTF16;
-using url_test_utils::ConvertUTF8ToUTF16;
-
-namespace {
-
-template<typename CHAR>
-void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*,
- const url_parse::Component&),
- url_canon::Replacements<CHAR>* replacements,
- const CHAR* str) {
- if (str) {
- url_parse::Component comp;
- if (str[0])
- comp.len = static_cast<int>(strlen(str));
- (replacements->*func)(str, comp);
- }
-}
-
-// Returns the canonicalized string for the given URL string for the
-// GURLTest.Types test.
-std::string TypesTestCase(const char* src) {
- GURL gurl(src);
- return gurl.possibly_invalid_spec();
-}
-
-} // namespace
-
-// Different types of URLs should be handled differently by url_util, and
-// handed off to different canonicalizers.
-TEST(GURLTest, Types) {
- // URLs with unknown schemes should be treated as path URLs, even when they
- // have things like "://".
- EXPECT_EQ("something:///HOSTNAME.com/",
- TypesTestCase("something:///HOSTNAME.com/"));
-
- // In the reverse, known schemes should always trigger standard URL handling.
- EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
- EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
- EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
- EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
-
-#ifdef WIN32
- // URLs that look like absolute Windows drive specs.
- EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
- EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
- EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
- EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
-#endif
-}
-
-// Test the basic creation and querying of components in a GURL. We assume
-// the parser is already tested and works, so we are mostly interested if the
-// object does the right thing with the results.
-TEST(GURLTest, Components) {
- GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
- EXPECT_TRUE(url.is_valid());
- EXPECT_TRUE(url.SchemeIs("http"));
- EXPECT_FALSE(url.SchemeIsFile());
-
- // This is the narrow version of the URL, which should match the wide input.
- EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec());
-
- EXPECT_EQ("http", url.scheme());
- EXPECT_EQ("user", url.username());
- EXPECT_EQ("pass", url.password());
- EXPECT_EQ("google.com", url.host());
- EXPECT_EQ("99", url.port());
- EXPECT_EQ(99, url.IntPort());
- EXPECT_EQ("/foo;bar", url.path());
- EXPECT_EQ("q=a", url.query());
- EXPECT_EQ("ref", url.ref());
-}
-
-TEST(GURLTest, Empty) {
- GURL url;
- EXPECT_FALSE(url.is_valid());
- EXPECT_EQ("", url.spec());
-
- EXPECT_EQ("", url.scheme());
- EXPECT_EQ("", url.username());
- EXPECT_EQ("", url.password());
- EXPECT_EQ("", url.host());
- EXPECT_EQ("", url.port());
- EXPECT_EQ(url_parse::PORT_UNSPECIFIED, url.IntPort());
- EXPECT_EQ("", url.path());
- EXPECT_EQ("", url.query());
- EXPECT_EQ("", url.ref());
-}
-
-TEST(GURLTest, Copy) {
- GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
-
- GURL url2(url);
- EXPECT_TRUE(url2.is_valid());
-
- EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
- EXPECT_EQ("http", url2.scheme());
- EXPECT_EQ("user", url2.username());
- EXPECT_EQ("pass", url2.password());
- EXPECT_EQ("google.com", url2.host());
- EXPECT_EQ("99", url2.port());
- EXPECT_EQ(99, url2.IntPort());
- EXPECT_EQ("/foo;bar", url2.path());
- EXPECT_EQ("q=a", url2.query());
- EXPECT_EQ("ref", url2.ref());
-
- // Copying of invalid URL should be invalid
- GURL invalid;
- GURL invalid2(invalid);
- EXPECT_FALSE(invalid2.is_valid());
- EXPECT_EQ("", invalid2.spec());
- EXPECT_EQ("", invalid2.scheme());
- EXPECT_EQ("", invalid2.username());
- EXPECT_EQ("", invalid2.password());
- EXPECT_EQ("", invalid2.host());
- EXPECT_EQ("", invalid2.port());
- EXPECT_EQ(url_parse::PORT_UNSPECIFIED, invalid2.IntPort());
- EXPECT_EQ("", invalid2.path());
- EXPECT_EQ("", invalid2.query());
- EXPECT_EQ("", invalid2.ref());
-}
-
-// Given an invalid URL, we should still get most of the components.
-TEST(GURLTest, Invalid) {
- GURL url("http:google.com:foo");
- EXPECT_FALSE(url.is_valid());
- EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec());
-
- EXPECT_EQ("http", url.scheme());
- EXPECT_EQ("", url.username());
- EXPECT_EQ("", url.password());
- EXPECT_EQ("google.com", url.host());
- EXPECT_EQ("foo", url.port());
- EXPECT_EQ(url_parse::PORT_INVALID, url.IntPort());
- EXPECT_EQ("/", url.path());
- EXPECT_EQ("", url.query());
- EXPECT_EQ("", url.ref());
-}
-
-TEST(GURLTest, Resolve) {
- // The tricky cases for relative URL resolving are tested in the
- // canonicalizer unit test. Here, we just test that the GURL integration
- // works properly.
- struct ResolveCase {
- const char* base;
- const char* relative;
- bool expected_valid;
- const char* expected;
- } resolve_cases[] = {
- {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"},
- {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"},
- {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
- {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
- {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
- // Unknown schemes are not standard.
- {"data:blahblah", "http://google.com/", true, "http://google.com/"},
- {"data:blahblah", "http:google.com", true, "http://google.com/"},
- {"data:/blahblah", "file.html", false, ""},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(resolve_cases); i++) {
- // 8-bit code path.
- GURL input(resolve_cases[i].base);
- GURL output = input.Resolve(resolve_cases[i].relative);
- EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
- EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
-
- // Wide code path.
- GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base));
- GURL outputw =
- input.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative));
- EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
- EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
- }
-}
-
-TEST(GURLTest, GetOrigin) {
- struct TestCase {
- const char* input;
- const char* expected;
- } cases[] = {
- {"http://www.google.com", "http://www.google.com/"},
- {"javascript:window.alert(\"hello,world\");", ""},
- {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"},
- {"http://user@www.google.com", "http://www.google.com/"},
- {"http://:pass@www.google.com", "http://www.google.com/"},
- {"http://:@www.google.com", "http://www.google.com/"},
- };
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- GURL url(cases[i].input);
- GURL origin = url.GetOrigin();
- EXPECT_EQ(cases[i].expected, origin.spec());
- }
-}
-
-TEST(GURLTest, GetWithEmptyPath) {
- struct TestCase {
- const char* input;
- const char* expected;
- } cases[] = {
- {"http://www.google.com", "http://www.google.com/"},
- {"javascript:window.alert(\"hello, world\");", ""},
- {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- GURL url(cases[i].input);
- GURL empty_path = url.GetWithEmptyPath();
- EXPECT_EQ(cases[i].expected, empty_path.spec());
- }
-}
-
-TEST(GURLTest, Replacements) {
- // The url canonicalizer replacement test will handle most of these case.
- // The most important thing to do here is to check that the proper
- // canonicalizer gets called based on the scheme of the input.
- struct ReplaceCase {
- const char* base;
- const char* scheme;
- const char* username;
- const char* password;
- const char* host;
- const char* port;
- const char* path;
- const char* query;
- const char* ref;
- const char* expected;
- } replace_cases[] = {
- {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"},
- {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
- {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"},
-#ifdef WIN32
- {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"},
-#endif
- };
-
- for (size_t i = 0; i < ARRAYSIZE(replace_cases); i++) {
- const ReplaceCase& cur = replace_cases[i];
- GURL url(cur.base);
- GURL::Replacements repl;
- SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme);
- SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username);
- SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password);
- SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host);
- SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port);
- SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path);
- SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query);
- SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref);
- GURL output = url.ReplaceComponents(repl);
-
- EXPECT_EQ(replace_cases[i].expected, output.spec());
- }
-}
-
-TEST(GURLTest, PathForRequest) {
- struct TestCase {
- const char* input;
- const char* expected;
- } cases[] = {
- {"http://www.google.com", "/"},
- {"http://www.google.com/", "/"},
- {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22"},
- {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html"},
- {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- GURL url(cases[i].input);
- std::string path_request = url.PathForRequest();
- EXPECT_EQ(cases[i].expected, path_request);
- }
-}
-
-TEST(GURLTest, EffectiveIntPort) {
- struct PortTest {
- const char* spec;
- int expected_int_port;
- } port_tests[] = {
- // http
- {"http://www.google.com/", 80},
- {"http://www.google.com:80/", 80},
- {"http://www.google.com:443/", 443},
-
- // https
- {"https://www.google.com/", 443},
- {"https://www.google.com:443/", 443},
- {"https://www.google.com:80/", 80},
-
- // ftp
- {"ftp://www.google.com/", 21},
- {"ftp://www.google.com:21/", 21},
- {"ftp://www.google.com:80/", 80},
-
- // gopher
- {"gopher://www.google.com/", 70},
- {"gopher://www.google.com:70/", 70},
- {"gopher://www.google.com:80/", 80},
-
- // file - no port
- {"file://www.google.com/", url_parse::PORT_UNSPECIFIED},
- {"file://www.google.com:443/", url_parse::PORT_UNSPECIFIED},
-
- // data - no port
- {"data:www.google.com:90", url_parse::PORT_UNSPECIFIED},
- {"data:www.google.com", url_parse::PORT_UNSPECIFIED},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(port_tests); i++) {
- GURL url(port_tests[i].spec);
- EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort());
- }
-}
-
-TEST(GURLTest, IPAddress) {
- struct IPTest {
- const char* spec;
- bool expected_ip;
- } ip_tests[] = {
- {"http://www.google.com/", false},
- {"http://192.168.9.1/", true},
- {"http://192.168.9.1.2/", false},
- {"http://192.168.m.1/", false},
- {"http://2001:db8::1/", false},
- {"http://[2001:db8::1]/", true},
- {"", false},
- {"some random input!", false},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(ip_tests); i++) {
- GURL url(ip_tests[i].spec);
- EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress());
- }
-}
-
-TEST(GURLTest, HostNoBrackets) {
- struct TestCase {
- const char* input;
- const char* expected_host;
- const char* expected_plainhost;
- } cases[] = {
- {"http://www.google.com", "www.google.com", "www.google.com"},
- {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"},
- {"http://[::]/", "[::]", "::"},
-
- // Don't require a valid URL, but don't crash either.
- {"http://[]/", "[]", ""},
- {"http://[x]/", "[x]", "x"},
- {"http://[x/", "[x", "[x"},
- {"http://x]/", "x]", "x]"},
- {"http://[/", "[", "["},
- {"http://]/", "]", "]"},
- {"", "", ""},
- };
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- GURL url(cases[i].input);
- EXPECT_EQ(cases[i].expected_host, url.host());
- EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets());
- }
-}
-
-TEST(GURLTest, DomainIs) {
- const char google_domain[] = "google.com";
-
- GURL url_1("http://www.google.com:99/foo");
- EXPECT_TRUE(url_1.DomainIs(google_domain));
-
- GURL url_2("http://google.com:99/foo");
- EXPECT_TRUE(url_2.DomainIs(google_domain));
-
- GURL url_3("http://google.com./foo");
- EXPECT_TRUE(url_3.DomainIs(google_domain));
-
- GURL url_4("http://google.com/foo");
- EXPECT_FALSE(url_4.DomainIs("google.com."));
-
- GURL url_5("http://google.com./foo");
- EXPECT_TRUE(url_5.DomainIs("google.com."));
-
- GURL url_6("http://www.google.com./foo");
- EXPECT_TRUE(url_6.DomainIs(".com."));
-
- GURL url_7("http://www.balabala.com/foo");
- EXPECT_FALSE(url_7.DomainIs(google_domain));
-
- GURL url_8("http://www.google.com.cn/foo");
- EXPECT_FALSE(url_8.DomainIs(google_domain));
-
- GURL url_9("http://www.iamnotgoogle.com/foo");
- EXPECT_FALSE(url_9.DomainIs(google_domain));
-
- GURL url_10("http://www.iamnotgoogle.com../foo");
- EXPECT_FALSE(url_10.DomainIs(".com"));
-}
-
-// Newlines should be stripped from inputs.
-TEST(GURLTest, Newlines) {
- // Constructor.
- GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
- EXPECT_EQ("http://www.google.com/asdf", url_1.spec());
-
- // Relative path resolver.
- GURL url_2 = url_1.Resolve(" \n /fo\to\r ");
- EXPECT_EQ("http://www.google.com/foo", url_2.spec());
-
- // Note that newlines are NOT stripped from ReplaceComponents.
-}
-
-TEST(GURLTest, IsStandard) {
- GURL a("http:foo/bar");
- EXPECT_TRUE(a.IsStandard());
-
- GURL b("foo:bar/baz");
- EXPECT_FALSE(b.IsStandard());
-
- GURL c("foo://bar/baz");
- EXPECT_FALSE(c.IsStandard());
-}
diff --git a/googleurl/src/url_canon_fileurl.cc b/googleurl/src/url_canon_fileurl.cc
deleted file mode 100644
index 97023eb..0000000
--- a/googleurl/src/url_canon_fileurl.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Functions for canonicalizing "file:" URLs.
-
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_internal.h"
-#include "googleurl/src/url_file.h"
-#include "googleurl/src/url_parse_internal.h"
-
-namespace url_canon {
-
-namespace {
-
-#ifdef WIN32
-
-// Given a pointer into the spec, this copies and canonicalizes the drive
-// letter and colon to the output, if one is found. If there is not a drive
-// spec, it won't do anything. The index of the next character in the input
-// spec is returned (after the colon when a drive spec is found, the begin
-// offset if one is not).
-template<typename CHAR>
-int FileDoDriveSpec(const CHAR* spec, int begin, int end,
- CanonOutput* output) {
- // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
- // (with backslashes instead of slashes as well).
- int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end);
- int after_slashes = begin + num_slashes;
-
- if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end))
- return begin; // Haven't consumed any characters
-
- // A drive spec is the start of a path, so we need to add a slash for the
- // authority terminator (typically the third slash).
- output->push_back('/');
-
- // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
- // and that it is followed by a colon/pipe.
-
- // Normalize Windows drive letters to uppercase
- if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
- output->push_back(spec[after_slashes] - 'a' + 'A');
- else
- output->push_back(static_cast<char>(spec[after_slashes]));
-
- // Normalize the character following it to a colon rather than pipe.
- output->push_back(':');
- return after_slashes + 2;
-}
-
-#endif // WIN32
-
-template<typename CHAR, typename UCHAR>
-bool DoFileCanonicalizePath(const CHAR* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path) {
- // Copies and normalizes the "c:" at the beginning, if present.
- out_path->begin = output->length();
- int after_drive;
-#ifdef WIN32
- after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
-#else
- after_drive = path.begin;
-#endif
-
- // Copies the rest of the path, starting from the slash following the
- // drive colon (if any, Windows only), or the first slash of the path.
- bool success = true;
- if (after_drive < path.end()) {
- // Use the regular path canonicalizer to canonicalize the rest of the
- // path. Give it a fake output component to write into. DoCanonicalizeFile
- // will compute the full path component.
- url_parse::Component sub_path =
- url_parse::MakeRange(after_drive, path.end());
- url_parse::Component fake_output_path;
- success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
- } else {
- // No input path, canonicalize to a slash.
- output->push_back('/');
- }
-
- out_path->len = output->length() - out_path->begin;
- return success;
-}
-
-template<typename CHAR, typename UCHAR>
-bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- // Things we don't set in file: URLs.
- new_parsed->username = url_parse::Component();
- new_parsed->password = url_parse::Component();
- new_parsed->port = url_parse::Component();
-
- // Scheme (known, so we don't bother running it through the more
- // complicated scheme canonicalizer).
- new_parsed->scheme.begin = output->length();
- output->Append("file://", 7);
- new_parsed->scheme.len = 4;
-
- // Append the host. For many file URLs, this will be empty. For UNC, this
- // will be present.
- // TODO(brettw) This doesn't do any checking for host name validity. We
- // should probably handle validity checking of UNC hosts differently than
- // for regular IP hosts.
- bool success = CanonicalizeHost(source.host, parsed.host,
- output, &new_parsed->host);
- success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
- output, &new_parsed->path);
- CanonicalizeQuery(source.query, parsed.query, query_converter,
- output, &new_parsed->query);
-
- // Ignore failure for refs since the URL can probably still be loaded.
- CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
-
- return success;
-}
-
-} // namespace
-
-bool CanonicalizeFileURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizeFileURL<char, unsigned char>(
- URLComponentSource<char>(spec), parsed, query_converter,
- output, new_parsed);
-}
-
-bool CanonicalizeFileURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizeFileURL<char16, char16>(
- URLComponentSource<char16>(spec), parsed, query_converter,
- output, new_parsed);
-}
-
-bool FileCanonicalizePath(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path) {
- return DoFileCanonicalizePath<char, unsigned char>(spec, path,
- output, out_path);
-}
-
-bool FileCanonicalizePath(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path) {
- return DoFileCanonicalizePath<char16, char16>(spec, path,
- output, out_path);
-}
-
-bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupOverrideComponents(base, replacements, &source, &parsed);
- return DoCanonicalizeFileURL<char, unsigned char>(
- source, parsed, query_converter, output, new_parsed);
-}
-
-bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- RawCanonOutput<1024> utf8;
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
- return DoCanonicalizeFileURL<char, unsigned char>(
- source, parsed, query_converter, output, new_parsed);
-}
-
-} // namespace url_canon
diff --git a/googleurl/src/url_canon_icu.cc b/googleurl/src/url_canon_icu.cc
deleted file mode 100644
index b06808c..0000000
--- a/googleurl/src/url_canon_icu.cc
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ICU integration functions.
-
-#include <stdlib.h>
-#include <string.h>
-#include <unicode/ucnv.h>
-#include <unicode/ucnv_cb.h>
-#include <unicode/uidna.h>
-
-#include "googleurl/src/url_canon_icu.h"
-#include "googleurl/src/url_canon_internal.h" // for _itoa_s
-
-#include "base/logging.h"
-
-namespace url_canon {
-
-namespace {
-
-// Called when converting a character that can not be represented, this will
-// append an escaped version of the numerical character reference for that code
-// point. It is of the form "Ӓ" and we will escape the non-digits to
-// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
-void appendURLEscapedChar(const void* context,
- UConverterFromUnicodeArgs* from_args,
- const UChar* code_units,
- int32_t length,
- UChar32 code_point,
- UConverterCallbackReason reason,
- UErrorCode* err) {
- if (reason == UCNV_UNASSIGNED) {
- *err = U_ZERO_ERROR;
-
- const static int prefix_len = 6;
- const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped
- ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
-
- DCHECK(code_point < 0x110000);
- char number[8]; // Max Unicode code point is 7 digits.
- _itoa_s(code_point, number, 10);
- int number_len = static_cast<int>(strlen(number));
- ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
-
- const static int postfix_len = 3;
- const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped
- ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
- }
-}
-
-// A class for scoping the installation of the invalid character callback.
-class AppendHandlerInstaller {
- public:
- // The owner of this object must ensure that the converter is alive for the
- // duration of this object's lifetime.
- AppendHandlerInstaller(UConverter* converter) : converter_(converter) {
- UErrorCode err = U_ZERO_ERROR;
- ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
- &old_callback_, &old_context_, &err);
- }
-
- ~AppendHandlerInstaller() {
- UErrorCode err = U_ZERO_ERROR;
- ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
- }
-
- private:
- UConverter* converter_;
-
- UConverterFromUCallback old_callback_;
- const void* old_context_;
-};
-
-} // namespace
-
-ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
- : converter_(converter) {
-}
-
-void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
- int input_len,
- CanonOutput* output) {
- // Install our error handler. It will be called for character that can not
- // be represented in the destination character set.
- AppendHandlerInstaller handler(converter_);
-
- int begin_offset = output->length();
- int dest_capacity = output->capacity() - begin_offset;
- output->set_length(output->length());
-
- do {
- UErrorCode err = U_ZERO_ERROR;
- char* dest = &output->data()[begin_offset];
- int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
- input, input_len, &err);
- if (err != U_BUFFER_OVERFLOW_ERROR) {
- output->set_length(begin_offset + required_capacity);
- return;
- }
-
- // Output didn't fit, expand
- dest_capacity = required_capacity;
- output->Resize(begin_offset + dest_capacity);
- } while (true);
-}
-
-// Converts the Unicode input representing a hostname to ASCII using IDN rules.
-// The output must be ASCII, but is represented as wide characters.
-//
-// On success, the output will be filled with the ASCII host name and it will
-// return true. Unlike most other canonicalization functions, this assumes that
-// the output is empty. The beginning of the host will be at offset 0, and
-// the length of the output will be set to the length of the new host name.
-//
-// On error, this will return false. The output in this case is undefined.
-bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
- DCHECK(output->length() == 0); // Output buffer is assumed empty.
- while (true) {
- // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
- // the spec (which do exist). This does not present any risk and is a
- // little more future proof.
- UErrorCode err = U_ZERO_ERROR;
- int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
- output->capacity(),
- UIDNA_ALLOW_UNASSIGNED, NULL, &err);
- if (err == U_ZERO_ERROR) {
- output->set_length(num_converted);
- return true;
- }
- if (err != U_BUFFER_OVERFLOW_ERROR)
- return false; // Unknown error, give up.
-
- // Not enough room in our buffer, expand.
- output->Resize(output->capacity() * 2);
- }
-}
-
-bool ReadUTFChar(const char* str, int* begin, int length,
- unsigned* code_point_out) {
- int code_point; // Avoids warning when U8_NEXT writes -1 to it.
- U8_NEXT(str, *begin, length, code_point);
- *code_point_out = static_cast<unsigned>(code_point);
-
- // The ICU macro above moves to the next char, we want to point to the last
- // char consumed.
- (*begin)--;
-
- // Validate the decoded value.
- if (U_IS_UNICODE_CHAR(code_point))
- return true;
- *code_point_out = kUnicodeReplacementCharacter;
- return false;
-}
-
-bool ReadUTFChar(const char16* str, int* begin, int length,
- unsigned* code_point) {
- if (U16_IS_SURROGATE(str[*begin])) {
- if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
- !U16_IS_TRAIL(str[*begin + 1])) {
- // Invalid surrogate pair.
- *code_point = kUnicodeReplacementCharacter;
- return false;
- } else {
- // Valid surrogate pair.
- *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
- (*begin)++;
- }
- } else {
- // Not a surrogate, just one 16-bit word.
- *code_point = str[*begin];
- }
-
- if (U_IS_UNICODE_CHAR(*code_point))
- return true;
-
- // Invalid code point.
- *code_point = kUnicodeReplacementCharacter;
- return false;
-}
-
-} // namespace url_canon
diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h
deleted file mode 100644
index 6bc52c3..0000000
--- a/googleurl/src/url_canon_icu.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ICU integration functions.
-
-#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__
-#define GOOGLEURL_SRC_URL_CANON_ICU_H__
-
-#include "googleurl/src/url_canon.h"
-
-typedef struct UConverter UConverter;
-
-namespace url_canon {
-
-// An implementation of CharsetConverter that implementations can use to
-// interface the canonicalizer with ICU's conversion routines.
-class ICUCharsetConverter : public CharsetConverter {
- public:
- // Constructs a converter using an already-existing ICU character set
- // converter. This converter is NOT owned by this object; the lifetime must
- // be managed by the creator such that it is alive as long as this is.
- GURL_API ICUCharsetConverter(UConverter* converter);
-
- GURL_API virtual ~ICUCharsetConverter() {}
-
- GURL_API virtual void ConvertFromUTF16(const char16* input,
- int input_len,
- CanonOutput* output);
-
- private:
- // The ICU converter, not owned by this class.
- UConverter* converter_;
-};
-
-} // namespace url_canon
-
-#endif // GOOGLEURL_SRC_URL_CANON_ICU_H__
diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h
deleted file mode 100644
index 0a01c9f..0000000
--- a/googleurl/src/url_canon_ip.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__
-#define GOOGLEURL_SRC_URL_CANON_IP_H__
-
-#include "base/string16.h"
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_common.h"
-#include "googleurl/src/url_parse.h"
-
-namespace url_canon {
-
-// Searches the host name for the portions of the IPv4 address. On success,
-// each component will be placed into |components| and it will return true.
-// It will return false if the host can not be separated as an IPv4 address
-// or if there are any non-7-bit characters or other characters that can not
-// be in an IP address. (This is important so we fail as early as possible for
-// common non-IP hostnames.)
-//
-// Not all components may exist. If there are only 3 components, for example,
-// the last one will have a length of -1 or 0 to indicate it does not exist.
-//
-// Note that many platform's inet_addr will ignore everything after a space
-// in certain curcumstances if the stuff before the space looks like an IP
-// address. IE6 is included in this. We do NOT handle this case. In many cases,
-// the browser's canonicalization will get run before this which converts
-// spaces to %20 (in the case of IE7) or rejects them (in the case of
-// Mozilla), so this code path never gets hit. Our host canonicalization will
-// notice these spaces and escape them, which will make IP address finding
-// fail. This seems like better behavior than stripping after a space.
-GURL_API bool FindIPv4Components(const char* spec,
- const url_parse::Component& host,
- url_parse::Component components[4]);
-GURL_API bool FindIPv4Components(const char16* spec,
- const url_parse::Component& host,
- url_parse::Component components[4]);
-
-// Converts an IPv4 address to a 32-bit number (network byte order).
-//
-// Possible return values:
-// IPV4 - IPv4 address was successfully parsed.
-// BROKEN - Input was formatted like an IPv4 address, but overflow occurred
-// during parsing.
-// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
-// It might be an IPv6 address, or a hostname.
-//
-// On success, |num_ipv4_components| will be populated with the number of
-// components in the IPv4 address.
-GURL_API CanonHostInfo::Family IPv4AddressToNumber(
- const char* spec,
- const url_parse::Component& host,
- unsigned char address[4],
- int* num_ipv4_components);
-GURL_API CanonHostInfo::Family IPv4AddressToNumber(
- const char16* spec,
- const url_parse::Component& host,
- unsigned char address[4],
- int* num_ipv4_components);
-
-// Converts an IPv6 address to a 128-bit number (network byte order), returning
-// true on success. False means that the input was not a valid IPv6 address.
-//
-// NOTE that |host| is expected to be surrounded by square brackets.
-// i.e. "[::1]" rather than "::1".
-GURL_API bool IPv6AddressToNumber(const char* spec,
- const url_parse::Component& host,
- unsigned char address[16]);
-GURL_API bool IPv6AddressToNumber(const char16* spec,
- const url_parse::Component& host,
- unsigned char address[16]);
-
-} // namespace url_canon
-
-#endif // GOOGLEURL_SRC_URL_CANON_IP_H__
diff --git a/googleurl/src/url_canon_mailtourl.cc b/googleurl/src/url_canon_mailtourl.cc
deleted file mode 100644
index 97868b8..0000000
--- a/googleurl/src/url_canon_mailtourl.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Functions for canonicalizing "mailto:" URLs.
-
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_internal.h"
-#include "googleurl/src/url_file.h"
-#include "googleurl/src/url_parse_internal.h"
-
-namespace url_canon {
-
-namespace {
-
-
-template<typename CHAR, typename UCHAR>
-bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
-
- // mailto: only uses {scheme, path, query} -- clear the rest.
- new_parsed->username = url_parse::Component();
- new_parsed->password = url_parse::Component();
- new_parsed->host = url_parse::Component();
- new_parsed->port = url_parse::Component();
- new_parsed->ref = url_parse::Component();
-
- // Scheme (known, so we don't bother running it through the more
- // complicated scheme canonicalizer).
- new_parsed->scheme.begin = output->length();
- output->Append("mailto:", 7);
- new_parsed->scheme.len = 6;
-
- bool success = true;
-
- // Path
- if (parsed.path.is_valid()) {
- new_parsed->path.begin = output->length();
-
- // Copy the path using path URL's more lax escaping rules.
- // We convert to UTF-8 and escape non-ASCII, but leave all
- // ASCII characters alone.
- int end = parsed.path.end();
- for (int i = parsed.path.begin; i < end; ++i) {
- UCHAR uch = static_cast<UCHAR>(source.path[i]);
- if (uch < 0x20 || uch >= 0x80)
- success &= AppendUTF8EscapedChar(source.path, &i, end, output);
- else
- output->push_back(static_cast<char>(uch));
- }
-
- new_parsed->path.len = output->length() - new_parsed->path.begin;
- } else {
- // No path at all
- new_parsed->path.reset();
- }
-
- // Query -- always use the default utf8 charset converter.
- CanonicalizeQuery(source.query, parsed.query, NULL,
- output, &new_parsed->query);
-
- return success;
-}
-
-} // namespace
-
-bool CanonicalizeMailtoURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizeMailtoURL<char, unsigned char>(
- URLComponentSource<char>(spec), parsed, output, new_parsed);
-}
-
-bool CanonicalizeMailtoURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizeMailtoURL<char16, char16>(
- URLComponentSource<char16>(spec), parsed, output, new_parsed);
-}
-
-bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupOverrideComponents(base, replacements, &source, &parsed);
- return DoCanonicalizeMailtoURL<char, unsigned char>(
- source, parsed, output, new_parsed);
-}
-
-bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- RawCanonOutput<1024> utf8;
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
- return DoCanonicalizeMailtoURL<char, unsigned char>(
- source, parsed, output, new_parsed);
-}
-
-} // namespace url_canon
diff --git a/googleurl/src/url_canon_pathurl.cc b/googleurl/src/url_canon_pathurl.cc
deleted file mode 100644
index 4a990c7..0000000
--- a/googleurl/src/url_canon_pathurl.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Functions for canonicalizing "path" URLs. Not to be confused with the path
-// of a URL, these are URLs that have no authority section, only a path. For
-// example, "javascript:" and "data:".
-
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_internal.h"
-
-namespace url_canon {
-
-namespace {
-
-template<typename CHAR, typename UCHAR>
-bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- // Scheme: this will append the colon.
- bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
- output, &new_parsed->scheme);
-
- // We assume there's no authority for path URLs. Note that hosts should never
- // have -1 length.
- new_parsed->username.reset();
- new_parsed->password.reset();
- new_parsed->host.reset();
- new_parsed->port.reset();
-
- if (parsed.path.is_valid()) {
- // Copy the path using path URL's more lax escaping rules (think for
- // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
- // ASCII characters alone. This helps readability of JavaStript.
- new_parsed->path.begin = output->length();
- int end = parsed.path.end();
- for (int i = parsed.path.begin; i < end; i++) {
- UCHAR uch = static_cast<UCHAR>(source.path[i]);
- if (uch < 0x20 || uch >= 0x80)
- success &= AppendUTF8EscapedChar(source.path, &i, end, output);
- else
- output->push_back(static_cast<char>(uch));
- }
- new_parsed->path.len = output->length() - new_parsed->path.begin;
- } else {
- // Empty path.
- new_parsed->path.reset();
- }
-
- // Assume there's no query or ref.
- new_parsed->query.reset();
- new_parsed->ref.reset();
-
- return success;
-}
-
-} // namespace
-
-bool CanonicalizePathURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizePathURL<char, unsigned char>(
- URLComponentSource<char>(spec), parsed, output, new_parsed);
-}
-
-bool CanonicalizePathURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- return DoCanonicalizePathURL<char16, char16>(
- URLComponentSource<char16>(spec), parsed, output, new_parsed);
-}
-
-bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupOverrideComponents(base, replacements, &source, &parsed);
- return DoCanonicalizePathURL<char, unsigned char>(
- source, parsed, output, new_parsed);
-}
-
-bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed) {
- RawCanonOutput<1024> utf8;
- URLComponentSource<char> source(base);
- url_parse::Parsed parsed(base_parsed);
- SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
- return DoCanonicalizePathURL<char, unsigned char>(
- source, parsed, output, new_parsed);
-}
-
-} // namespace url_canon
diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h
deleted file mode 100644
index c43b777..0000000
--- a/googleurl/src/url_canon_stdstring.h
+++ /dev/null
@@ -1,134 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This header file defines a canonicalizer output method class for STL
-// strings. Because the canonicalizer tries not to be dependent on the STL,
-// we have segregated it here.
-
-#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
-#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
-
-#include <string>
-#include "googleurl/src/url_canon.h"
-
-namespace url_canon {
-
-// Write into a std::string given in the constructor. This object does not own
-// the string itself, and the user must ensure that the string stays alive
-// throughout the lifetime of this object.
-//
-// The given string will be appended to; any existing data in the string will
-// be preserved. The caller should reserve() the amount of data in the string
-// they expect to be written. We will resize if necessary, but that's slow.
-//
-// Note that when canonicalization is complete, the string will likely have
-// unused space at the end because we make the string very big to start out
-// with (by |initial_size|). This ends up being important because resize
-// operations are slow, and because the base class needs to write directly
-// into the buffer.
-//
-// Therefore, the user should call Complete() before using the string that
-// this class wrote into.
-class StdStringCanonOutput : public CanonOutput {
- public:
- StdStringCanonOutput(std::string* str)
- : CanonOutput(),
- str_(str) {
- cur_len_ = static_cast<int>(str_->size()); // Append to existing data.
- str_->resize(str_->capacity());
- buffer_ = &(*str_)[0];
- buffer_len_ = static_cast<int>(str_->size());
- }
- virtual ~StdStringCanonOutput() {
- // Nothing to do, we don't own the string.
- }
-
- // Must be called after writing has completed but before the string is used.
- void Complete() {
- str_->resize(cur_len_);
- buffer_len_ = cur_len_;
- }
-
- virtual void Resize(int sz) {
- str_->resize(sz);
- buffer_ = &(*str_)[0];
- buffer_len_ = sz;
- }
-
- protected:
- std::string* str_;
-};
-
-// An extension of the Replacements class that allows the setters to use
-// standard strings.
-//
-// The strings passed as arguments are not copied and must remain valid until
-// this class goes out of scope.
-template<typename STR>
-class StdStringReplacements :
- public url_canon::Replacements<typename STR::value_type> {
- public:
- void SetSchemeStr(const STR& s) {
- this->SetScheme(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetUsernameStr(const STR& s) {
- this->SetUsername(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetPasswordStr(const STR& s) {
- this->SetPassword(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetHostStr(const STR& s) {
- this->SetHost(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetPortStr(const STR& s) {
- this->SetPort(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetPathStr(const STR& s) {
- this->SetPath(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetQueryStr(const STR& s) {
- this->SetQuery(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
- void SetRefStr(const STR& s) {
- this->SetRef(s.data(),
- url_parse::Component(0, static_cast<int>(s.length())));
- }
-};
-
-} // namespace url_canon
-
-#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
-
diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc
deleted file mode 100644
index 731d82f..0000000
--- a/googleurl/src/url_canon_unittest.cc
+++ /dev/null
@@ -1,1955 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <errno.h>
-#include <unicode/ucnv.h>
-
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_icu.h"
-#include "googleurl/src/url_canon_internal.h"
-#include "googleurl/src/url_canon_stdstring.h"
-#include "googleurl/src/url_parse.h"
-#include "googleurl/src/url_test_utils.h"
-#include "testing/base/public/gunit.h"
-
-// Some implementations of base/basictypes.h may define ARRAYSIZE.
-// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
-// which is in our version of basictypes.h.
-#ifndef ARRAYSIZE
-#define ARRAYSIZE ARRAYSIZE_UNSAFE
-#endif
-
-using url_test_utils::WStringToUTF16;
-using url_test_utils::ConvertUTF8ToUTF16;
-using url_test_utils::ConvertUTF16ToUTF8;
-using url_canon::CanonHostInfo;
-
-namespace {
-
-struct ComponentCase {
- const char* input;
- const char* expected;
- url_parse::Component expected_component;
- bool expected_success;
-};
-
-// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
-// treat each input as optional, and will only try processing if non-NULL.
-// The output is always 8-bit.
-struct DualComponentCase {
- const char* input8;
- const wchar_t* input16;
- const char* expected;
- url_parse::Component expected_component;
- bool expected_success;
-};
-
-// Test cases for CanonicalizeIPAddress(). The inputs are identical to
-// DualComponentCase, but the output has extra CanonHostInfo fields.
-struct IPAddressCase {
- const char* input8;
- const wchar_t* input16;
- const char* expected;
- url_parse::Component expected_component;
-
- // CanonHostInfo fields, for verbose output.
- CanonHostInfo::Family expected_family;
- int expected_num_ipv4_components;
-};
-
-struct ReplaceCase {
- const char* base;
- const char* scheme;
- const char* username;
- const char* password;
- const char* host;
- const char* port;
- const char* path;
- const char* query;
- const char* ref;
- const char* expected;
-};
-
-// Wrapper around a UConverter object that managers creation and destruction.
-class UConvScoper {
- public:
- explicit UConvScoper(const char* charset_name) {
- UErrorCode err = U_ZERO_ERROR;
- converter_ = ucnv_open(charset_name, &err);
- }
-
- ~UConvScoper() {
- if (converter_)
- ucnv_close(converter_);
- }
-
- // Returns the converter object, may be NULL.
- UConverter* converter() const { return converter_; }
-
- private:
- UConverter* converter_;
-};
-
-// Magic string used in the replacements code that tells SetupReplComp to
-// call the clear function.
-const char kDeleteComp[] = "|";
-
-// Sets up a replacement for a single component. This is given pointers to
-// the set and clear function for the component being replaced, and will
-// either set the component (if it exists) or clear it (if the replacement
-// string matches kDeleteComp).
-//
-// This template is currently used only for the 8-bit case, and the strlen
-// causes it to fail in other cases. It is left a template in case we have
-// tests for wide replacements.
-template<typename CHAR>
-void SetupReplComp(
- void (url_canon::Replacements<CHAR>::*set)(const CHAR*,
- const url_parse::Component&),
- void (url_canon::Replacements<CHAR>::*clear)(),
- url_canon::Replacements<CHAR>* rep,
- const CHAR* str) {
- if (str && str[0] == kDeleteComp[0]) {
- (rep->*clear)();
- } else if (str) {
- (rep->*set)(str, url_parse::Component(0, static_cast<int>(strlen(str))));
- }
-}
-
-} // namespace
-
-TEST(URLCanonTest, UTF) {
- // Low-level test that we handle reading, canonicalization, and writing
- // UTF-8/UTF-16 strings properly.
- struct UTFCase {
- const char* input8;
- const wchar_t* input16;
- bool expected_success;
- const char* output;
- } utf_cases[] = {
- // Valid canonical input should get passed through & escaped.
- {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
- // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
- {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
- // Non-shortest-form UTF-8 are invalid. The bad char should be replaced
- // with the invalid character (EF BF DB in UTF-8).
- {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"},
- // Invalid UTF-8 sequences should be marked as invalid (the first
- // sequence is truncated).
- {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
- // Character going off the end.
- {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
- // ...same with low surrogates with no high surrogate.
- {"\xed\xb0\x80", L"\xdc00", false, "%EF%BF%BD"},
- // Test a UTF-8 encoded surrogate value is marked as invalid.
- // ED A0 80 = U+D800
- {"\xed\xa0\x80", NULL, false, "%EF%BF%BD"},
- };
-
- std::string out_str;
- for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
- if (utf_cases[i].input8) {
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
-
- int input_len = static_cast<int>(strlen(utf_cases[i].input8));
- bool success = true;
- for (int ch = 0; ch < input_len; ch++) {
- success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len,
- &output);
- }
- output.Complete();
- EXPECT_EQ(utf_cases[i].expected_success, success);
- EXPECT_EQ(std::string(utf_cases[i].output), out_str);
- }
- if (utf_cases[i].input16) {
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
-
- string16 input_str(WStringToUTF16(utf_cases[i].input16));
- int input_len = static_cast<int>(input_str.length());
- bool success = true;
- for (int ch = 0; ch < input_len; ch++) {
- success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
- &output);
- }
- output.Complete();
- EXPECT_EQ(utf_cases[i].expected_success, success);
- EXPECT_EQ(std::string(utf_cases[i].output), out_str);
- }
-
- if (utf_cases[i].input8 && utf_cases[i].input16 &&
- utf_cases[i].expected_success) {
- // Check that the UTF-8 and UTF-16 inputs are equivalent.
-
- // UTF-16 -> UTF-8
- std::string input8_str(utf_cases[i].input8);
- string16 input16_str(WStringToUTF16(utf_cases[i].input16));
- EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str));
-
- // UTF-8 -> UTF-16
- EXPECT_EQ(input16_str, ConvertUTF8ToUTF16(input8_str));
- }
- }
-}
-
-TEST(URLCanonTest, ICUCharsetConverter) {
- struct ICUCase {
- const wchar_t* input;
- const char* encoding;
- const char* expected;
- } icu_cases[] = {
- // UTF-8.
- {L"Hello, world", "utf-8", "Hello, world"},
- {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
- // Non-BMP UTF-8.
- {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
- // Big5
- {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
- // Unrepresentable character in the destination set.
- {L"hello\x4f60\x06de\x597dworld", "big5", "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
- UConvScoper conv(icu_cases[i].encoding);
- ASSERT_TRUE(conv.converter() != NULL);
- url_canon::ICUCharsetConverter converter(conv.converter());
-
- std::string str;
- url_canon::StdStringCanonOutput output(&str);
-
- string16 input_str(WStringToUTF16(icu_cases[i].input));
- int input_len = static_cast<int>(input_str.length());
- converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
- output.Complete();
-
- EXPECT_STREQ(icu_cases[i].expected, str.c_str());
- }
-
- // Test string sizes around the resize boundary for the output to make sure
- // the converter resizes as needed.
- const int static_size = 16;
- UConvScoper conv("utf-8");
- ASSERT_TRUE(conv.converter());
- url_canon::ICUCharsetConverter converter(conv.converter());
- for (int i = static_size - 2; i <= static_size + 2; i++) {
- // Make a string with the appropriate length.
- string16 input;
- for (int ch = 0; ch < i; ch++)
- input.push_back('a');
-
- url_canon::RawCanonOutput<static_size> output;
- converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
- &output);
- EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
- }
-}
-
-TEST(URLCanonTest, Scheme) {
- // Here, we're mostly testing that unusual characters are handled properly.
- // The canonicalizer doesn't do any parsing or whitespace detection. It will
- // also do its best on error, and will escape funny sequences (these won't be
- // valid schemes and it will return error).
- //
- // Note that the canonicalizer will append a colon to the output to separate
- // out the rest of the URL, which is not present in the input. We check,
- // however, that the output range includes everything but the colon.
- ComponentCase scheme_cases[] = {
- {"http", "http:", url_parse::Component(0, 4), true},
- {"HTTP", "http:", url_parse::Component(0, 4), true},
- {" HTTP ", "%20http%20:", url_parse::Component(0, 10),false},
- {"htt: ", "htt%3A%20:", url_parse::Component(0, 9), false},
- {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", url_parse::Component(0, 22), false},
- // Don't re-escape something already escaped. Note that it will
- // "canonicalize" the 'A' to 'a', but that's OK.
- {"ht%3Atp", "ht%3atp:", url_parse::Component(0, 7), false},
- };
-
- std::string out_str;
-
- for (size_t i = 0; i < arraysize(scheme_cases); i++) {
- int url_len = static_cast<int>(strlen(scheme_cases[i].input));
- url_parse::Component in_comp(0, url_len);
- url_parse::Component out_comp;
-
- out_str.clear();
- url_canon::StdStringCanonOutput output1(&out_str);
- bool success = url_canon::CanonicalizeScheme(scheme_cases[i].input,
- in_comp, &output1, &out_comp);
- output1.Complete();
-
- EXPECT_EQ(scheme_cases[i].expected_success, success);
- EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
- EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
-
- // Now try the wide version
- out_str.clear();
- url_canon::StdStringCanonOutput output2(&out_str);
-
- string16 wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input));
- in_comp.len = static_cast<int>(wide_input.length());
- success = url_canon::CanonicalizeScheme(wide_input.c_str(), in_comp,
- &output2, &out_comp);
- output2.Complete();
-
- EXPECT_EQ(scheme_cases[i].expected_success, success);
- EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
- EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
- }
-
- // Test the case where the scheme is declared nonexistant, it should be
- // converted into an empty scheme.
- url_parse::Component out_comp;
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
-
- EXPECT_TRUE(url_canon::CanonicalizeScheme("", url_parse::Component(0, -1),
- &output, &out_comp));
- output.Complete();
-
- EXPECT_EQ(std::string(":"), out_str);
- EXPECT_EQ(0, out_comp.begin);
- EXPECT_EQ(0, out_comp.len);
-}
-
-TEST(URLCanonTest, Host) {
- IPAddressCase host_cases[] = {
- // Basic canonicalization, uppercase should be converted to lowercase.
- {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1},
- // Spaces and some other characters should be escaped.
- {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", url_parse::Component(0, 22), CanonHostInfo::NEUTRAL, -1},
- // Exciting different types of spaces!
- {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), CanonHostInfo::NEUTRAL, -1},
- // Other types of space (no-break, zero-width, zero-width-no-break) are
- // name-prepped away to nothing.
- {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1},
- // Ideographic full stop (full-width period for Chinese, etc.) should be
- // treated as a dot.
- {NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), CanonHostInfo::NEUTRAL, -1},
- // Invalid unicode characters should fail...
- // ...In wide input, ICU will barf and we'll end up with the input as
- // escaped UTF-8 (the invalid character should be replaced with the
- // replacement character).
- {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1},
- // ...This is the same as previous but with with escaped.
- {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1},
- // Test name prepping, fullwidth input should be converted to ASCII and NOT
- // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
- {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), CanonHostInfo::NEUTRAL, -1},
- // Test that fullwidth escaped values are properly name-prepped,
- // then converted or rejected.
- // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
- {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1},
- {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1},
- // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
- {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1},
- {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1},
- // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
- {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1},
- // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
- // UTF-8 (wide case). The output should be equivalent to the true wide
- // character input above).
- {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1},
- // Invalid escaped characters should fail and the percents should be
- // escaped.
- {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), CanonHostInfo::BROKEN, -1},
- // If we get an invalid character that has been escaped.
- {"%25", L"%25", "%25", url_parse::Component(0, 3), CanonHostInfo::BROKEN, -1},
- {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), CanonHostInfo::BROKEN, -1},
- // Escaped numbers should be treated like IP addresses if they are.
- {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- // Invalid escaping should trigger the regular host error handling.
- {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), CanonHostInfo::BROKEN, -1},
- // Something that isn't exactly an IP should get treated as a host and
- // spaces escaped.
- {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1},
- // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
- // These are "0Xc0.0250.01" in fullwidth.
- {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- // Broken IP addresses get marked as such.
- {"192.168.0.257", L"192.168.0.257", "192.168.0.257", url_parse::Component(0, 13), CanonHostInfo::BROKEN, -1},
- {"[google.com]", L"[google.com]", "[google.com]", url_parse::Component(0, 12), CanonHostInfo::BROKEN, -1},
- // Cyrillic letter followed buy ( should return punicode for ( escaped before punicode string was created. I.e.
- // if ( is escaped after punicode is created we would get xn--%28-8tb (incorrect).
- {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", url_parse::Component(0, 11), CanonHostInfo::NEUTRAL, -1},
- };
-
- // CanonicalizeHost() non-verbose.
- std::string out_str;
- for (size_t i = 0; i < arraysize(host_cases); i++) {
- // Narrow version.
- if (host_cases[i].input8) {
- int host_len = static_cast<int>(strlen(host_cases[i].input8));
- url_parse::Component in_comp(0, host_len);
- url_parse::Component out_comp;
-
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
-
- bool success = url_canon::CanonicalizeHost(host_cases[i].input8, in_comp,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
- success);
- EXPECT_EQ(std::string(host_cases[i].expected), out_str);
- EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
- }
-
- // Wide version.
- if (host_cases[i].input16) {
- string16 input16(WStringToUTF16(host_cases[i].input16));
- int host_len = static_cast<int>(input16.length());
- url_parse::Component in_comp(0, host_len);
- url_parse::Component out_comp;
-
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
-
- bool success = url_canon::CanonicalizeHost(input16.c_str(), in_comp,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
- success);
- EXPECT_EQ(std::string(host_cases[i].expected), out_str);
- EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
- }
- }
-
- // CanonicalizeHostVerbose()
- for (size_t i = 0; i < arraysize(host_cases); i++) {
- // Narrow version.
- if (host_cases[i].input8) {
- int host_len = static_cast<int>(strlen(host_cases[i].input8));
- url_parse::Component in_comp(0, host_len);
-
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
- CanonHostInfo host_info;
-
- url_canon::CanonicalizeHostVerbose(host_cases[i].input8, in_comp,
- &output, &host_info);
- output.Complete();
-
- EXPECT_EQ(host_cases[i].expected_family, host_info.family);
- EXPECT_EQ(std::string(host_cases[i].expected), out_str);
- EXPECT_EQ(host_cases[i].expected_component.begin,
- host_info.out_host.begin);
- EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
- if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
- EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
- host_info.num_ipv4_components);
- }
- }
-
- // Wide version.
- if (host_cases[i].input16) {
- string16 input16(WStringToUTF16(host_cases[i].input16));
- int host_len = static_cast<int>(input16.length());
- url_parse::Component in_comp(0, host_len);
-
- out_str.clear();
- url_canon::StdStringCanonOutput output(&out_str);
- CanonHostInfo host_info;
-
- url_canon::CanonicalizeHostVerbose(input16.c_str(), in_comp,
- &output, &host_info);
- output.Complete();
-
- EXPECT_EQ(host_cases[i].expected_family, host_info.family);
- EXPECT_EQ(std::string(host_cases[i].expected), out_str);
- EXPECT_EQ(host_cases[i].expected_component.begin,
- host_info.out_host.begin);
- EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
- if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
- EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
- host_info.num_ipv4_components);
- }
- }
- }
-}
-
-TEST(URLCanonTest, IPv4) {
- IPAddressCase cases[] = {
- // Empty is not an IP address.
- {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- {".", L".", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Regular IP addresses in different bases.
- {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
- {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
- {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
- // Non-IP addresses due to invalid characters.
- {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Invalid characters for the base should be rejected.
- {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // If there are not enough components, the last one should fill them out.
- {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), CanonHostInfo::IPV4, 1},
- {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
- {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
- {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
- {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
- {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
- {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
- {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- // Too many components means not an IP address.
- {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // We allow a single trailing dot.
- {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
- {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Two dots in a row means not an IP address.
- {"192.168..1", L"192.168..1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Any numerical overflow should be marked as BROKEN.
- {"0x100.0", L"0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0x100.0.0", L"0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0x100.0.0.0", L"0x100.0.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0.0x100.0.0", L"0.0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0.0.0x100.0", L"0.0.0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0.0.0.0x100", L"0.0.0.0x100", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0.0.0x10000", L"0.0.0x10000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0.0x1000000", L"0.0x1000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0x100000000", L"0x100000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Repeat the previous tests, minus 1, to verify boundaries.
- {"0xFF.0", L"0xFF.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 2},
- {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 3},
- {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
- {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
- {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
- {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
- {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", url_parse::Component(0, 13), CanonHostInfo::IPV4, 2},
- {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", url_parse::Component(0, 15), CanonHostInfo::IPV4, 1},
- // Old trunctations tests. They're all "BROKEN" now.
- {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"192.168.0.257", L"192.168.0.257", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"192.168.0xa20001", L"192.168.0xa20001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"192.015052000001", L"192.015052000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"0X12C0a80001", L"0X12C0a80001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"276.1.2", L"276.1.2", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Spaces should be rejected.
- {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Very large numbers.
- {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
- {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", url_parse::Component(0, 11), CanonHostInfo::BROKEN, -1},
- // A number has no length limit, but long numbers can still overflow.
- {"00000000000000000001", L"00000000000000000001", "0.0.0.1", url_parse::Component(0, 7), CanonHostInfo::IPV4, 1},
- {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // If a long component is non-numeric, it's a hostname, *not* a broken IP.
- {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Truncation of all zeros should still result in 0.
- {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", url_parse::Component(0, 7), CanonHostInfo::IPV4, 4},
- };
-
- for (size_t i = 0; i < arraysize(cases); i++) {
- // 8-bit version.
- url_parse::Component component(0,
- static_cast<int>(strlen(cases[i].input8)));
-
- std::string out_str1;
- url_canon::StdStringCanonOutput output1(&out_str1);
- url_canon::CanonHostInfo host_info;
- url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1,
- &host_info);
- output1.Complete();
-
- EXPECT_EQ(cases[i].expected_family, host_info.family);
- if (host_info.family == CanonHostInfo::IPV4) {
- EXPECT_STREQ(cases[i].expected, out_str1.c_str());
- EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
- EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
- EXPECT_EQ(cases[i].expected_num_ipv4_components,
- host_info.num_ipv4_components);
- }
-
- // 16-bit version.
- string16 input16(WStringToUTF16(cases[i].input16));
- component = url_parse::Component(0, static_cast<int>(input16.length()));
-
- std::string out_str2;
- url_canon::StdStringCanonOutput output2(&out_str2);
- url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2,
- &host_info);
- output2.Complete();
-
- EXPECT_EQ(cases[i].expected_family, host_info.family);
- if (host_info.family == CanonHostInfo::IPV4) {
- EXPECT_STREQ(cases[i].expected, out_str2.c_str());
- EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
- EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
- EXPECT_EQ(cases[i].expected_num_ipv4_components,
- host_info.num_ipv4_components);
- }
- }
-}
-
-TEST(URLCanonTest, IPv6) {
- IPAddressCase cases[] = {
- // Empty is not an IP address.
- {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
- // Non-IPs with [:] characters are marked BROKEN.
- {":", L":", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[", L"[", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[:", L"[:", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"]", L"]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {":]", L":]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[]", L"[]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[:]", L"[:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Regular IP address is invalid without bounding '[' and ']'.
- {"2001:db8::1", L"2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[2001:db8::1", L"[2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"2001:db8::1]", L"2001:db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Regular IP addresses.
- {"[::]", L"[::]", "[::]", url_parse::Component(0,4), CanonHostInfo::IPV6, -1},
- {"[::1]", L"[::1]", "[::1]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
- {"[1::]", L"[1::]", "[1::]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
- {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", url_parse::Component(0,10), CanonHostInfo::IPV6, -1},
- {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
-
- // Leading zeros should be stripped.
- {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1},
-
- // Upper case letters should be lowercased.
- {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", url_parse::Component(0,20), CanonHostInfo::IPV6, -1},
-
- // The same address can be written with different contractions, but should
- // get canonicalized to the same thing.
- {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1},
- {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1},
-
- // IPv4 addresses
- // Only mapped and compat addresses can have IPv4 syntax embedded.
- {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // IPv4 with last component missing.
- {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
-
- // IPv4 using hex.
- // TODO(eroman): Should this format be disallowed?
- {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
-
- // There may be zeros surrounding the "::" contraction.
- {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
-
- {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", url_parse::Component(0,13), CanonHostInfo::IPV6, -1},
-
- // Can only have one "::" contraction in an IPv6 string literal.
- {"[2001::db8::1]", L"[2001::db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // No more than 2 consecutive ':'s.
- {"[2001:db8:::1]", L"[2001:db8:::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[:::]", L"[:::]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Non-IP addresses due to invalid characters.
- {"[2001::.com]", L"[2001::.com]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // If there are not enough components, the last one should fill them out.
- // ... omitted at this time ...
- // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses.
- {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Too many bits (even though 8 comonents, the last one holds 32 bits).
- {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // Too many bits specified -- the contraction would have to be zero-length
- // to not exceed 128 bits.
- {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // The contraction is for 16 bits of zero.
- {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1},
-
- // Cannot have a trailing colon.
- {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // Cannot have negative numbers.
- {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // Scope ID -- the URL may contain an optional ["%" <scope_id>] section.
- // The scope_id should be included in the canonicalized URL, and is an
- // unsigned decimal number.
-
- // Invalid because no ID was given after the percent.
-
- // Don't allow scope-id
- {"[1::%1]", L"[1::%1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[1::%eth0]", L"[1::%eth0]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[1::%]", L"[1::%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[%]", L"[%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[::%:]", L"[::%:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // Don't allow leading or trailing colons.
- {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
-
- // We allow a single trailing dot.
- // ... omitted at this time ...
- // Two dots in a row means not an IP address.
- {"[::192.168..1]", L"[::192.168..1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- // Any non-first components get truncated to one byte.
- // ... omitted at this time ...
- // Spaces should be rejected.
- {"[::1 hello]", L"[::1 hello]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
- };
-
- for (size_t i = 0; i < arraysize(cases); i++) {
- // 8-bit version.
- url_parse::Component component(0,
- static_cast<int>(strlen(cases[i].input8)));
-
- std::string out_str1;
- url_canon::StdStringCanonOutput output1(&out_str1);
- url_canon::CanonHostInfo host_info;
- url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1,
- &host_info);
- output1.Complete();
-
- EXPECT_EQ(cases[i].expected_family, host_info.family);
- if (host_info.family == CanonHostInfo::IPV6) {
- EXPECT_STREQ(cases[i].expected, out_str1.c_str());
- EXPECT_EQ(cases[i].expected_component.begin,
- host_info.out_host.begin);
- EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
- }
-
- // 16-bit version.
- string16 input16(WStringToUTF16(cases[i].input16));
- component = url_parse::Component(0, static_cast<int>(input16.length()));
-
- std::string out_str2;
- url_canon::StdStringCanonOutput output2(&out_str2);
- url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2,
- &host_info);
- output2.Complete();
-
- EXPECT_EQ(cases[i].expected_family, host_info.family);
- if (host_info.family == CanonHostInfo::IPV6) {
- EXPECT_STREQ(cases[i].expected, out_str2.c_str());
- EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
- EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
- }
- }
-}
-
-TEST(URLCanonTest, IPEmpty) {
- std::string out_str1;
- url_canon::StdStringCanonOutput output1(&out_str1);
- url_canon::CanonHostInfo host_info;
-
- // This tests tests.
- const char spec[] = "192.168.0.1";
- url_canon::CanonicalizeIPAddress(spec, url_parse::Component(),
- &output1, &host_info);
- EXPECT_FALSE(host_info.IsIPAddress());
-
- url_canon::CanonicalizeIPAddress(spec, url_parse::Component(0, 0),
- &output1, &host_info);
- EXPECT_FALSE(host_info.IsIPAddress());
-}
-
-TEST(URLCanonTest, UserInfo) {
- // Note that the canonicalizer should escape and treat empty components as
- // not being there.
-
- // We actually parse a full input URL so we can get the initial components.
- struct UserComponentCase {
- const char* input;
- const char* expected;
- url_parse::Component expected_username;
- url_parse::Component expected_password;
- bool expected_success;
- } user_info_cases[] = {
- {"http://user:pass@host.com/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
- {"http://@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
- {"http://:@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
- {"http://foo:@host.com/", "foo@", url_parse::Component(0, 3), url_parse::Component(0, -1), true},
- {"http://:foo@host.com/", ":foo@", url_parse::Component(0, 0), url_parse::Component(1, 3), true},
- {"http://^ :$\t@host.com/", "%5E%20:$%09@", url_parse::Component(0, 6), url_parse::Component(7, 4), true},
- {"http://user:pass@/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
- {"http://%2540:bar@domain.com/", "%2540:bar@", url_parse::Component(0, 5), url_parse::Component(6, 3), true },
-
- // IE7 compatability: old versions allowed backslashes in usernames, but
- // IE7 does not. We disallow it as well.
- {"ftp://me\\mydomain:pass@foo.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) {
- int url_len = static_cast<int>(strlen(user_info_cases[i].input));
- url_parse::Parsed parsed;
- url_parse::ParseStandardURL(user_info_cases[i].input, url_len, &parsed);
- url_parse::Component out_user, out_pass;
- std::string out_str;
- url_canon::StdStringCanonOutput output1(&out_str);
-
- bool success = url_canon::CanonicalizeUserInfo(user_info_cases[i].input,
- parsed.username,
- user_info_cases[i].input,
- parsed.password,
- &output1, &out_user,
- &out_pass);
- output1.Complete();
-
- EXPECT_EQ(user_info_cases[i].expected_success, success);
- EXPECT_EQ(std::string(user_info_cases[i].expected), out_str);
- EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin);
- EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len);
- EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin);
- EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len);
-
- // Now try the wide version
- out_str.clear();
- url_canon::StdStringCanonOutput output2(&out_str);
- string16 wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input));
- success = url_canon::CanonicalizeUserInfo(wide_input.c_str(),
- parsed.username,
- wide_input.c_str(),
- parsed.password,
- &output2, &out_user, &out_pass);
- output2.Complete();
-
- EXPECT_EQ(user_info_cases[i].expected_success, success);
- EXPECT_EQ(std::string(user_info_cases[i].expected), out_str);
- EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin);
- EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len);
- EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin);
- EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len);
- }
-}
-
-TEST(URLCanonTest, Port) {
- // We only need to test that the number gets properly put into the output
- // buffer. The parser unit tests will test scanning the number correctly.
- //
- // Note that the CanonicalizePort will always prepend a colon to the output
- // to separate it from the colon that it assumes preceeds it.
- struct PortCase {
- const char* input;
- int default_port;
- const char* expected;
- url_parse::Component expected_component;
- bool expected_success;
- } port_cases[] = {
- // Invalid input should be copied w/ failure.
- {"as df", 80, ":as%20df", url_parse::Component(1, 7), false},
- {"-2", 80, ":-2", url_parse::Component(1, 2), false},
- // Default port should be omitted.
- {"80", 80, "", url_parse::Component(0, -1), true},
- {"8080", 80, ":8080", url_parse::Component(1, 4), true},
- // PORT_UNSPECIFIED should mean always keep the port.
- {"80", url_parse::PORT_UNSPECIFIED, ":80", url_parse::Component(1, 2), true},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) {
- int url_len = static_cast<int>(strlen(port_cases[i].input));
- url_parse::Component in_comp(0, url_len);
- url_parse::Component out_comp;
- std::string out_str;
- url_canon::StdStringCanonOutput output1(&out_str);
- bool success = url_canon::CanonicalizePort(port_cases[i].input, in_comp,
- port_cases[i].default_port,
- &output1, &out_comp);
- output1.Complete();
-
- EXPECT_EQ(port_cases[i].expected_success, success);
- EXPECT_EQ(std::string(port_cases[i].expected), out_str);
- EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len);
-
- // Now try the wide version
- out_str.clear();
- url_canon::StdStringCanonOutput output2(&out_str);
- string16 wide_input(ConvertUTF8ToUTF16(port_cases[i].input));
- success = url_canon::CanonicalizePort(wide_input.c_str(), in_comp,
- port_cases[i].default_port,
- &output2, &out_comp);
- output2.Complete();
-
- EXPECT_EQ(port_cases[i].expected_success, success);
- EXPECT_EQ(std::string(port_cases[i].expected), out_str);
- EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len);
- }
-}
-
-TEST(URLCanonTest, Path) {
- DualComponentCase path_cases[] = {
- // ----- path collapsing tests -----
- {"/././foo", L"/././foo", "/foo", url_parse::Component(0, 4), true},
- {"/./.foo", L"/./.foo", "/.foo", url_parse::Component(0, 5), true},
- {"/foo/.", L"/foo/.", "/foo/", url_parse::Component(0, 5), true},
- {"/foo/./", L"/foo/./", "/foo/", url_parse::Component(0, 5), true},
- // double dots followed by a slash or the end of the string count
- {"/foo/bar/..", L"/foo/bar/..", "/foo/", url_parse::Component(0, 5), true},
- {"/foo/bar/../", L"/foo/bar/../", "/foo/", url_parse::Component(0, 5), true},
- // don't count double dots when they aren't followed by a slash
- {"/foo/..bar", L"/foo/..bar", "/foo/..bar", url_parse::Component(0, 10), true},
- // some in the middle
- {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", url_parse::Component(0, 8), true},
- {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", url_parse::Component(0, 2), true},
- // we should not be able to go above the root
- {"/foo/../../..", L"/foo/../../..", "/", url_parse::Component(0, 1), true},
- {"/foo/../../../ton", L"/foo/../../../ton", "/ton", url_parse::Component(0, 4), true},
- // escaped dots should be unescaped and treated the same as dots
- {"/foo/%2e", L"/foo/%2e", "/foo/", url_parse::Component(0, 5), true},
- {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", url_parse::Component(0, 8), true},
- {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", url_parse::Component(0, 6), true},
- // Multiple slashes in a row should be preserved and treated like empty
- // directory names.
- {"////../..", L"////../..", "//", url_parse::Component(0, 2), true},
-
- // ----- escaping tests -----
- {"/foo", L"/foo", "/foo", url_parse::Component(0, 4), true},
- // Valid escape sequence
- {"/%20foo", L"/%20foo", "/%20foo", url_parse::Component(0, 7), true},
- // Invalid escape sequence we should pass through unchanged.
- {"/foo%", L"/foo%", "/foo%", url_parse::Component(0, 5), true},
- {"/foo%2", L"/foo%2", "/foo%2", url_parse::Component(0, 6), true},
- // Invalid escape sequence: bad characters should be treated the same as
- // the sourrounding text, not as escaped (in this case, UTF-8).
- {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", url_parse::Component(0, 10), true},
- {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", url_parse::Component(0, 16), true},
- {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", url_parse::Component(0, 22), true},
- // Regular characters that are escaped should be unescaped
- {"/foo%41%7a", L"/foo%41%7a", "/fooAz", url_parse::Component(0, 6), true},
- // Funny characters that are unescaped should be escaped
- {"/foo\x09\x91%91", NULL, "/foo%09%91%91", url_parse::Component(0, 13), true},
- {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", url_parse::Component(0, 16), true},
- // Invalid characters that are escaped should cause a failure.
- {"/foo%00%51", L"/foo%00%51", "/foo%00Q", url_parse::Component(0, 8), false},
- // Some characters should be passed through unchanged regardless of esc.
- {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", url_parse::Component(0, 13), true},
- // Characters that are properly escaped should not have the case changed
- // of hex letters.
- {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", url_parse::Component(0, 13), true},
- // Funny characters that are unescaped should be escaped
- {"/foo\tbar", L"/foo\tbar", "/foo%09bar", url_parse::Component(0, 10), true},
- // Backslashes should get converted to forward slashes
- {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", url_parse::Component(0, 8), true},
- // Hashes found in paths (possibly only when the caller explicitly sets
- // the path on an already-parsed URL) should be escaped.
- {"/foo#bar", L"/foo#bar", "/foo%23bar", url_parse::Component(0, 10), true},
- // %7f should be allowed and %3D should not be unescaped (these were wrong
- // in a previous version).
- {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true},
- // @ should be passed through unchanged (escaped or unescaped).
- {"/@asdf%40", L"/@asdf%40", "/@asdf%40", url_parse::Component(0, 9), true},
-
- // ----- encoding tests -----
- // Basic conversions
- {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", url_parse::Component(0, 37), true},
- // Invalid unicode characters should fail. We only do validation on
- // UTF-16 input, so this doesn't happen on 8-bit.
- {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", url_parse::Component(0, 13), true},
- {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", url_parse::Component(0, 13), false},
- };
-
- for (size_t i = 0; i < arraysize(path_cases); i++) {
- if (path_cases[i].input8) {
- int len = static_cast<int>(strlen(path_cases[i].input8));
- url_parse::Component in_comp(0, len);
- url_parse::Component out_comp;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizePath(path_cases[i].input8, in_comp,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(path_cases[i].expected_success, success);
- EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
- EXPECT_EQ(path_cases[i].expected, out_str);
- }
-
- if (path_cases[i].input16) {
- string16 input16(WStringToUTF16(path_cases[i].input16));
- int len = static_cast<int>(input16.length());
- url_parse::Component in_comp(0, len);
- url_parse::Component out_comp;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
-
- bool success = url_canon::CanonicalizePath(input16.c_str(), in_comp,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(path_cases[i].expected_success, success);
- EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
- EXPECT_EQ(path_cases[i].expected, out_str);
- }
- }
-
- // Manual test: embedded NULLs should be escaped and the URL should be marked
- // as invalid.
- const char path_with_null[] = "/ab\0c";
- url_parse::Component in_comp(0, 5);
- url_parse::Component out_comp;
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizePath(path_with_null, in_comp,
- &output, &out_comp);
- output.Complete();
- EXPECT_FALSE(success);
- EXPECT_EQ("/ab%00c", out_str);
-}
-
-TEST(URLCanonTest, Query) {
- struct QueryCase {
- const char* input8;
- const wchar_t* input16;
- const char* encoding;
- const char* expected;
- } query_cases[] = {
- // Regular ASCII case in some different encodings.
- {"foo=bar", L"foo=bar", NULL, "?foo=bar"},
- {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
- {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
- {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
- // Allow question marks in the query without escaping
- {"as?df", L"as?df", NULL, "?as?df"},
- // Always escape '#' since it would mark the ref.
- {"as#df", L"as#df", NULL, "?as%23df"},
- // Escape some questionable 8-bit characters, but never unescape.
- {"\x02hello\x7f bye", L"\x02hello\x7f bye", NULL, "?%02hello%7F%20bye"},
- {"%40%41123", L"%40%41123", NULL, "?%40%41123"},
- // Chinese input/output
- {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", NULL, "?q=%E4%BD%A0%E5%A5%BD"},
- {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", "?q=%C4%E3%BA%C3"},
- {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
- // Unencodable character in the destination character set should be
- // escaped. The escape sequence unescapes to be the entity name:
- // "?q=你"
- {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", "?q=Chinese%26%2365319%3B"},
- // Invalid UTF-8/16 input should be replaced with invalid characters.
- {"q=\xed\xed", L"q=\xd800\xd800", NULL, "?q=%EF%BF%BD%EF%BF%BD"},
- // Don't allow < or > because sometimes they are used for XSS if the
- // URL is echoed in content. Firefox does this, IE doesn't.
- {"q=<asdf>", L"q=<asdf>", NULL, "?q=%3Casdf%3E"},
- // Escape double quotemarks in the query.
- {"q=\"asdf\"", L"q=\"asdf\"", NULL, "?q=%22asdf%22"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
- url_parse::Component out_comp;
-
- UConvScoper conv(query_cases[i].encoding);
- ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
- url_canon::ICUCharsetConverter converter(conv.converter());
-
- // Map NULL to a NULL converter pointer.
- url_canon::ICUCharsetConverter* conv_pointer = &converter;
- if (!query_cases[i].encoding)
- conv_pointer = NULL;
-
- if (query_cases[i].input8) {
- int len = static_cast<int>(strlen(query_cases[i].input8));
- url_parse::Component in_comp(0, len);
- std::string out_str;
-
- url_canon::StdStringCanonOutput output(&out_str);
- url_canon::CanonicalizeQuery(query_cases[i].input8, in_comp,
- conv_pointer, &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(query_cases[i].expected, out_str);
- }
-
- if (query_cases[i].input16) {
- string16 input16(WStringToUTF16(query_cases[i].input16));
- int len = static_cast<int>(input16.length());
- url_parse::Component in_comp(0, len);
- std::string out_str;
-
- url_canon::StdStringCanonOutput output(&out_str);
- url_canon::CanonicalizeQuery(input16.c_str(), in_comp,
- conv_pointer, &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(query_cases[i].expected, out_str);
- }
- }
-
- // Extra test for input with embedded NULL;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_parse::Component out_comp;
- url_canon::CanonicalizeQuery("a \x00z\x01", url_parse::Component(0, 5), NULL,
- &output, &out_comp);
- output.Complete();
- EXPECT_EQ("?a%20%00z%01", out_str);
-}
-
-TEST(URLCanonTest, Ref) {
- // Refs are trivial, it just checks the encoding.
- DualComponentCase ref_cases[] = {
- // Regular one, we shouldn't escape spaces, et al.
- {"hello, world", L"hello, world", "#hello, world", url_parse::Component(1, 12), true},
- // UTF-8/wide input should be preserved
- {"\xc2\xa9", L"\xa9", "#\xc2\xa9", url_parse::Component(1, 2), true},
- // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
- {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#\xF0\x90\x8C\x80ss", url_parse::Component(1, 6), true},
- // Escaping should be preserved unchanged, even invalid ones
- {"%41%a", L"%41%a", "#%41%a", url_parse::Component(1, 5), true},
- // Invalid UTF-8/16 input should be flagged and the input made valid
- {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), true},
- {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), true},
- // Test a Unicode invalid character.
- {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", url_parse::Component(1, 4), true},
- // Refs can have # signs and we should preserve them.
- {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", url_parse::Component(1, 9), true},
- {"#asdf", L"#asdf", "##asdf", url_parse::Component(1, 5), true},
- };
-
- for (size_t i = 0; i < arraysize(ref_cases); i++) {
- // 8-bit input
- if (ref_cases[i].input8) {
- int len = static_cast<int>(strlen(ref_cases[i].input8));
- url_parse::Component in_comp(0, len);
- url_parse::Component out_comp;
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_canon::CanonicalizeRef(ref_cases[i].input8, in_comp,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len);
- EXPECT_EQ(ref_cases[i].expected, out_str);
- }
-
- // 16-bit input
- if (ref_cases[i].input16) {
- string16 input16(WStringToUTF16(ref_cases[i].input16));
- int len = static_cast<int>(input16.length());
- url_parse::Component in_comp(0, len);
- url_parse::Component out_comp;
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_canon::CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin);
- EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len);
- EXPECT_EQ(ref_cases[i].expected, out_str);
- }
- }
-
- // Try one with an embedded NULL. It should be stripped.
- const char null_input[5] = "ab\x00z";
- url_parse::Component null_input_component(0, 4);
- url_parse::Component out_comp;
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_canon::CanonicalizeRef(null_input, null_input_component,
- &output, &out_comp);
- output.Complete();
-
- EXPECT_EQ(1, out_comp.begin);
- EXPECT_EQ(3, out_comp.len);
- EXPECT_EQ("#abz", out_str);
-}
-
-TEST(URLCanonTest, CanonicalizeStandardURL) {
- // The individual component canonicalize tests should have caught the cases
- // for each of those components. Here, we just need to test that the various
- // parts are included or excluded properly, and have the correct separators.
- struct URLCase {
- const char* input;
- const char* expected;
- bool expected_success;
- } cases[] = {
- {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true},
- {"http://[www.google.com]/", "http://[www.google.com]/", false},
- {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false},
- {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true},
- {"www.google.com", ":www.google.com/", true},
- {"http://192.0x00A80001", "http://192.168.0.1/", true},
- {"http://www/foo%2Ehtml", "http://www/foo.html", true},
- {"http://user:pass@/", "http://user:pass@/", false},
- {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true},
-
- // Backslashes should get converted to forward slashes.
- {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
-
- // Busted refs shouldn't make the whole thing fail.
- {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true},
-
- // Basic port tests.
- {"http://foo:80/", "http://foo/", true},
- {"http://foo:81/", "http://foo:81/", true},
- {"httpa://foo:80/", "httpa://foo:80/", true},
- {"http://foo:-80/", "http://foo:-80/", false},
-
- {"https://foo:443/", "https://foo/", true},
- {"https://foo:80/", "https://foo:80/", true},
- {"ftp://foo:21/", "ftp://foo/", true},
- {"ftp://foo:80/", "ftp://foo:80/", true},
- {"gopher://foo:70/", "gopher://foo/", true},
- {"gopher://foo:443/", "gopher://foo:443/", true},
- {"ws://foo:80/", "ws://foo/", true},
- {"ws://foo:81/", "ws://foo:81/", true},
- {"ws://foo:443/", "ws://foo:443/", true},
- {"ws://foo:815/", "ws://foo:815/", true},
- {"wss://foo:80/", "wss://foo:80/", true},
- {"wss://foo:81/", "wss://foo:81/", true},
- {"wss://foo:443/", "wss://foo/", true},
- {"wss://foo:815/", "wss://foo:815/", true},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- int url_len = static_cast<int>(strlen(cases[i].input));
- url_parse::Parsed parsed;
- url_parse::ParseStandardURL(cases[i].input, url_len, &parsed);
-
- url_parse::Parsed out_parsed;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizeStandardURL(
- cases[i].input, url_len, parsed, NULL, &output, &out_parsed);
- output.Complete();
-
- EXPECT_EQ(cases[i].expected_success, success);
- EXPECT_EQ(cases[i].expected, out_str);
- }
-}
-
-// The codepath here is the same as for regular canonicalization, so we just
-// need to test that things are replaced or not correctly.
-TEST(URLCanonTest, ReplaceStandardURL) {
- ReplaceCase replace_cases[] = {
- // Common case of truncating the path.
- {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"},
- // Replace everything
- {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"},
- // Replace nothing
- {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
- };
-
- for (size_t i = 0; i < arraysize(replace_cases); i++) {
- const ReplaceCase& cur = replace_cases[i];
- int base_len = static_cast<int>(strlen(cur.base));
- url_parse::Parsed parsed;
- url_parse::ParseStandardURL(cur.base, base_len, &parsed);
-
- url_canon::Replacements<char> r;
- typedef url_canon::Replacements<char> R; // Clean up syntax.
-
- // Note that for the scheme we pass in a different clear function since
- // there is no function to clear the scheme.
- SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
- SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
- SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
- SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
- SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
- SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
- SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
- SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_parse::Parsed out_parsed;
- url_canon::ReplaceStandardURL(replace_cases[i].base, parsed,
- r, NULL, &output, &out_parsed);
- output.Complete();
-
- EXPECT_EQ(replace_cases[i].expected, out_str);
- }
-
- // The path pointer should be ignored if the address is invalid.
- {
- const char src[] = "http://www.google.com/here_is_the_path";
- int src_len = static_cast<int>(strlen(src));
-
- url_parse::Parsed parsed;
- url_parse::ParseStandardURL(src, src_len, &parsed);
-
- // Replace the path to 0 length string. By using 1 as the string address,
- // the test should get an access violation if it tries to dereference it.
- url_canon::Replacements<char> r;
- r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component(0, 0));
- std::string out_str1;
- url_canon::StdStringCanonOutput output1(&out_str1);
- url_parse::Parsed new_parsed;
- url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed);
- output1.Complete();
- EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
-
- // Same with an "invalid" path.
- r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component());
- std::string out_str2;
- url_canon::StdStringCanonOutput output2(&out_str2);
- url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed);
- output2.Complete();
- EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
- }
-}
-
-TEST(URLCanonTest, ReplaceFileURL) {
- ReplaceCase replace_cases[] = {
- // Replace everything
- {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
- // Replace nothing
- {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"},
- // Clear non-path components (common)
- {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
- // Replace path with something that doesn't begin with a slash and make
- // sure it get added properly.
- {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
- {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
- {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"},
- {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"},
- {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
- };
-
- for (size_t i = 0; i < arraysize(replace_cases); i++) {
- const ReplaceCase& cur = replace_cases[i];
- int base_len = static_cast<int>(strlen(cur.base));
- url_parse::Parsed parsed;
- url_parse::ParseFileURL(cur.base, base_len, &parsed);
-
- url_canon::Replacements<char> r;
- typedef url_canon::Replacements<char> R; // Clean up syntax.
- SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
- SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
- SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
- SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
- SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
- SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
- SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
- SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_parse::Parsed out_parsed;
- url_canon::ReplaceFileURL(cur.base, parsed,
- r, NULL, &output, &out_parsed);
- output.Complete();
-
- EXPECT_EQ(replace_cases[i].expected, out_str);
- }
-}
-
-TEST(URLCanonTest, ReplacePathURL) {
- ReplaceCase replace_cases[] = {
- // Replace everything
- {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"},
- // Replace nothing
- {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"},
- // Replace one or the other
- {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"},
- {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"},
- {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"},
- };
-
- for (size_t i = 0; i < arraysize(replace_cases); i++) {
- const ReplaceCase& cur = replace_cases[i];
- int base_len = static_cast<int>(strlen(cur.base));
- url_parse::Parsed parsed;
- url_parse::ParsePathURL(cur.base, base_len, &parsed);
-
- url_canon::Replacements<char> r;
- typedef url_canon::Replacements<char> R; // Clean up syntax.
- SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
- SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
- SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
- SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
- SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
- SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
- SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
- SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_parse::Parsed out_parsed;
- url_canon::ReplacePathURL(cur.base, parsed,
- r, &output, &out_parsed);
- output.Complete();
-
- EXPECT_EQ(replace_cases[i].expected, out_str);
- }
-}
-
-TEST(URLCanonTest, ReplaceMailtoURL) {
- ReplaceCase replace_cases[] = {
- // Replace everything
- {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"},
- // Replace nothing
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"},
- // Replace the path
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"},
- // Replace the query
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"},
- // Replace the path and query
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"},
- // Set the query to empty (should leave trailing question mark)
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"},
- // Clear the query
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"},
- // Clear the path
- {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"},
- // Clear the path + query
- {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"},
- // Setting the ref should have no effect
- {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"},
- };
-
- for (size_t i = 0; i < arraysize(replace_cases); i++) {
- const ReplaceCase& cur = replace_cases[i];
- int base_len = static_cast<int>(strlen(cur.base));
- url_parse::Parsed parsed;
- url_parse::ParseMailtoURL(cur.base, base_len, &parsed);
-
- url_canon::Replacements<char> r;
- typedef url_canon::Replacements<char> R;
- SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
- SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
- SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
- SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
- SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
- SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
- SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
- SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- url_parse::Parsed out_parsed;
- url_canon::ReplaceMailtoURL(cur.base, parsed,
- r, &output, &out_parsed);
- output.Complete();
-
- EXPECT_EQ(replace_cases[i].expected, out_str);
- }
-}
-
-TEST(URLCanonTest, CanonicalizeFileURL) {
- struct URLCase {
- const char* input;
- const char* expected;
- bool expected_success;
- url_parse::Component expected_host;
- url_parse::Component expected_path;
- } cases[] = {
-#ifdef _WIN32
- // Windows-style paths
- {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
- {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
- {"file:", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
- {"file:UNChost/path", "file://unchost/path", true, url_parse::Component(7, 7), url_parse::Component(14, 5)},
- // CanonicalizeFileURL supports absolute Windows style paths for IE
- // compatability. Note that the caller must decide that this is a file
- // URL itself so it can call the file canonicalizer. This is usually
- // done automatically as part of relative URL resolving.
- {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
- {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
- {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
- {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
- {"//server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
- {"\\\\server\\file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
- {"/\\server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
- // We should preserve the number of slashes after the colon for IE
- // compatability, except when there is none, in which case we should
- // add one.
- {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
- {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
- // Three slashes should be non-UNC, even if there is no drive spec (IE
- // does this, which makes the resulting request invalid).
- {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(), url_parse::Component(7, 12)},
- // TODO(brettw) we should probably fail for invalid host names, which
- // would change the expected result on this test. We also currently allow
- // colon even though it's probably invalid, because its currently the
- // "natural" result of the way the canonicalizer is written. There doesn't
- // seem to be a strong argument for why allowing it here would be bad, so
- // we just tolerate it and the load will fail later.
- {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, url_parse::Component(7, 2), url_parse::Component(9, 16)},
- {"file:filer/home\\me", "file://filer/home/me", true, url_parse::Component(7, 5), url_parse::Component(12, 8)},
- // Make sure relative paths can't go above the "C:"
- {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(), url_parse::Component(7, 12)},
- // Busted refs shouldn't make the whole thing fail.
- {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(), url_parse::Component(7, 8)},
-#else
- // Unix-style paths
- {"file:///home/me", "file:///home/me", true, url_parse::Component(), url_parse::Component(7, 8)},
- // Windowsy ones should get still treated as Unix-style.
- {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
- {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
- // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
- {"//", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
- {"///", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
- {"///test", "file:///test", true, url_parse::Component(), url_parse::Component(7, 5)},
- {"file://test", "file://test/", true, url_parse::Component(7, 4), url_parse::Component(11, 1)},
- {"file://localhost", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
- {"file://localhost/", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
- {"file://localhost/test", "file://localhost/test", true, url_parse::Component(7, 9), url_parse::Component(16, 5)},
-#endif // _WIN32
- };
-
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- int url_len = static_cast<int>(strlen(cases[i].input));
- url_parse::Parsed parsed;
- url_parse::ParseFileURL(cases[i].input, url_len, &parsed);
-
- url_parse::Parsed out_parsed;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizeFileURL(cases[i].input, url_len,
- parsed, NULL, &output,
- &out_parsed);
- output.Complete();
-
- EXPECT_EQ(cases[i].expected_success, success);
- EXPECT_EQ(cases[i].expected, out_str);
-
- // Make sure the spec was properly identified, the file canonicalizer has
- // different code for writing the spec.
- EXPECT_EQ(0, out_parsed.scheme.begin);
- EXPECT_EQ(4, out_parsed.scheme.len);
-
- EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin);
- EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len);
-
- EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
- EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
- }
-}
-
-TEST(URLCanonTest, CanonicalizePathURL) {
- // Path URLs should get canonicalized schemes but nothing else.
- struct PathCase {
- const char* input;
- const char* expected;
- } path_cases[] = {
- {"javascript:", "javascript:"},
- {"JavaScript:Foo", "javascript:Foo"},
- {":\":This /is interesting;?#", ":\":This /is interesting;?#"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) {
- int url_len = static_cast<int>(strlen(path_cases[i].input));
- url_parse::Parsed parsed;
- url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed);
-
- url_parse::Parsed out_parsed;
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizePathURL(path_cases[i].input, url_len,
- parsed, &output,
- &out_parsed);
- output.Complete();
-
- EXPECT_TRUE(success);
- EXPECT_EQ(path_cases[i].expected, out_str);
-
- EXPECT_EQ(0, out_parsed.host.begin);
- EXPECT_EQ(-1, out_parsed.host.len);
-
- // When we end with a colon at the end, there should be no path.
- if (path_cases[i].input[url_len - 1] == ':') {
- EXPECT_EQ(0, out_parsed.path.begin);
- EXPECT_EQ(-1, out_parsed.path.len);
- }
- }
-}
-
-TEST(URLCanonTest, CanonicalizeMailtoURL) {
- struct URLCase {
- const char* input;
- const char* expected;
- bool expected_success;
- url_parse::Component expected_path;
- url_parse::Component expected_query;
- } cases[] = {
- {"mailto:addr1", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
- {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, url_parse::Component(7, 13), url_parse::Component()},
- // Trailing whitespace is stripped.
- {"MaIlTo:addr1 \t ", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
- {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, url_parse::Component(7, 5), url_parse::Component(13,6)},
- {"mailto:addr1,addr2", "mailto:addr1,addr2", true, url_parse::Component(7, 11), url_parse::Component()},
- {"mailto:addr1, addr2", "mailto:addr1, addr2", true, url_parse::Component(7, 12), url_parse::Component()},
- {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, url_parse::Component(7, 13), url_parse::Component()},
- {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, url_parse::Component(7, 12), url_parse::Component()},
- // Null character should be escaped to %00
- {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, url_parse::Component(7, 13), url_parse::Component(21, 3)},
- // Invalid -- UTF-8 encoded surrogate value.
- {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, url_parse::Component(7, 9), url_parse::Component()},
- {"mailto:addr1?", "mailto:addr1?", true, url_parse::Component(7, 5), url_parse::Component(13, 0)},
- };
-
- // Define outside of loop to catch bugs where components aren't reset
- url_parse::Parsed parsed;
- url_parse::Parsed out_parsed;
-
- for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
- int url_len = static_cast<int>(strlen(cases[i].input));
- if (i == 8) {
- // The 9th test case purposely has a '\0' in it -- don't count it
- // as the string terminator.
- url_len = 22;
- }
- url_parse::ParseMailtoURL(cases[i].input, url_len, &parsed);
-
- std::string out_str;
- url_canon::StdStringCanonOutput output(&out_str);
- bool success = url_canon::CanonicalizeMailtoURL(cases[i].input, url_len,
- parsed, &output,
- &out_parsed);
- output.Complete();
-
- EXPECT_EQ(cases[i].expected_success, success);
- EXPECT_EQ(cases[i].expected, out_str);
-
- // Make sure the spec was properly identified
- EXPECT_EQ(0, out_parsed.scheme.begin);
- EXPECT_EQ(6, out_parsed.scheme.len);
-
- EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
- EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
-
- EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
- EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
- }
-}
-
-#ifndef WIN32
-
-TEST(URLCanonTest, _itoa_s) {
- // We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
- // _itoa_s about, and ensure that the extra byte is untouched.
- char buf[6];
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itoa_s(12, buf, sizeof(buf) - 1, 10));
- EXPECT_STREQ("12", buf);
- EXPECT_EQ('\xFF', buf[3]);
-
- // Test the edge cases - exactly the buffer size and one over
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 10));
- EXPECT_STREQ("1234", buf);
- EXPECT_EQ('\xFF', buf[5]);
-
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(EINVAL, url_canon::_itoa_s(12345, buf, sizeof(buf) - 1, 10));
- EXPECT_EQ('\xFF', buf[5]); // should never write to this location
-
- // Test the template overload (note that this will see the full buffer)
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itoa_s(12, buf, 10));
- EXPECT_STREQ("12", buf);
- EXPECT_EQ('\xFF', buf[3]);
-
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itoa_s(12345, buf, 10));
- EXPECT_STREQ("12345", buf);
-
- EXPECT_EQ(EINVAL, url_canon::_itoa_s(123456, buf, 10));
-
- // Test that radix 16 is supported.
- memset(buf, 0xff, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 16));
- EXPECT_STREQ("4d2", buf);
- EXPECT_EQ('\xFF', buf[5]);
-}
-
-TEST(URLCanonTest, _itow_s) {
- // We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
- // _itoa_s about, and ensure that the extra byte is untouched.
- char16 buf[6];
- const char fill_mem = 0xff;
- const char16 fill_char = 0xffff;
- memset(buf, fill_mem, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itow_s(12, buf, sizeof(buf) / 2 - 1, 10));
- EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
- EXPECT_EQ(fill_char, buf[3]);
-
- // Test the edge cases - exactly the buffer size and one over
- EXPECT_EQ(0, url_canon::_itow_s(1234, buf, sizeof(buf) / 2 - 1, 10));
- EXPECT_EQ(WStringToUTF16(L"1234"), string16(buf));
- EXPECT_EQ(fill_char, buf[5]);
-
- memset(buf, fill_mem, sizeof(buf));
- EXPECT_EQ(EINVAL, url_canon::_itow_s(12345, buf, sizeof(buf) / 2 - 1, 10));
- EXPECT_EQ(fill_char, buf[5]); // should never write to this location
-
- // Test the template overload (note that this will see the full buffer)
- memset(buf, fill_mem, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itow_s(12, buf, 10));
- EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
- EXPECT_EQ(fill_char, buf[3]);
-
- memset(buf, fill_mem, sizeof(buf));
- EXPECT_EQ(0, url_canon::_itow_s(12345, buf, 10));
- EXPECT_EQ(WStringToUTF16(L"12345"), string16(buf));
-
- EXPECT_EQ(EINVAL, url_canon::_itow_s(123456, buf, 10));
-}
-
-#endif // !WIN32
-
-// Returns true if the given two structures are the same.
-static bool ParsedIsEqual(const url_parse::Parsed& a,
- const url_parse::Parsed& b) {
- return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
- a.username.begin == b.username.begin && a.username.len == b.username.len &&
- a.password.begin == b.password.begin && a.password.len == b.password.len &&
- a.host.begin == b.host.begin && a.host.len == b.host.len &&
- a.port.begin == b.port.begin && a.port.len == b.port.len &&
- a.path.begin == b.path.begin && a.path.len == b.path.len &&
- a.query.begin == b.query.begin && a.query.len == b.query.len &&
- a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
-}
-
-TEST(URLCanonTest, ResolveRelativeURL) {
- struct RelativeCase {
- const char* base; // Input base URL: MUST BE CANONICAL
- bool is_base_hier; // Is the base URL hierarchical
- bool is_base_file; // Tells us if the base is a file URL.
- const char* test; // Input URL to test against.
- bool succeed_relative; // Whether we expect IsRelativeURL to succeed
- bool is_rel; // Whether we expect |test| to be relative or not.
- bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed.
- const char* resolved; // What we expect in the result when resolving.
- } rel_cases[] = {
- // Basic absolute input.
- {"http://host/a", true, false, "http://another/", true, false, false, NULL},
- {"http://host/a", true, false, "http:////another/", true, false, false, NULL},
- // Empty relative URLs should only remove the ref part of the URL,
- // leaving the rest unchanged.
- {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
- {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
- {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
- // Spaces at the ends of the relative path should be ignored.
- {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"},
- {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
- {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"},
- // Matching schemes without two slashes are treated as relative.
- {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"},
- {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"},
- {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"},
- {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"},
- // Nonmatching schemes are absolute.
- {"http://host/a", true, false, "https:host2", true, false, false, NULL},
- {"http://host/a", true, false, "htto:/host2", true, false, false, NULL},
- // Absolute path input
- {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"},
- {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"},
- {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"},
- {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"},
- {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"},
- {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"},
- // Relative path input
- {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
- {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"},
- {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"},
- {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
- {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
- {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
- {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
- {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"},
- {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"},
- // Query input
- {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"},
- {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"},
- {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"},
- // Ref input
- {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"},
- {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
- {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"},
- // Non-hierarchical base: no relative handling. Relative input should
- // error, and if a scheme is present, it should be treated as absolute.
- {"data:foobar", false, false, "baz.html", false, false, false, NULL},
- {"data:foobar", false, false, "data:baz", true, false, false, NULL},
- {"data:foobar", false, false, "data:/base", true, false, false, NULL},
- // Non-hierarchical base: absolute input should succeed.
- {"data:foobar", false, false, "http://host/", true, false, false, NULL},
- {"data:foobar", false, false, "http:host", true, false, false, NULL},
- // Invalid schemes should be treated as relative.
- {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
- {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
- {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"},
- {"data:asdf", false, false, ":foo", false, false, false, NULL},
- // We should treat semicolons like any other character in URL resolving
- {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"},
- {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"},
- {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"},
- // Relative URLs can also be written as "//foo/bar" which is relative to
- // the scheme. In this case, it would take the old scheme, so for http
- // the example would resolve to "http://foo/bar".
- {"http://host/a", true, false, "//another", true, true, true, "http://another/"},
- {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"},
- {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"},
- {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"},
- {"http://host/a", true, false, "//", true, true, false, "http:"},
- // IE will also allow one or the other to be a backslash to get the same
- // behavior.
- {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"},
- {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"},
-#ifdef WIN32
- // Resolving against Windows file base URLs.
- {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL},
- {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
- {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"},
- {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"},
- // But two backslashes on Windows should be UNC so should be treated
- // as absolute.
- {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL},
- // IE doesn't support drive specs starting with two slashes. It fails
- // immediately and doesn't even try to load. We fix it up to either
- // an absolute path or UNC depending on what it looks like.
- {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"},
- {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"},
- // Windows drive specs should be allowed and treated as absolute.
- {"file:///C:/foo", true, true, "c:", true, false, false, NULL},
- {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL},
- {"http://host/a", true, false, "c:\\foo", true, false, false, NULL},
- // Relative paths with drive letters should be allowed when the base is
- // also a file.
- {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"},
- // Treat absolute paths as being off of the drive.
- {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"},
- {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"},
- {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"},
- // On Windows, two slashes without a drive letter when the base is a file
- // means that the path is UNC.
- {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"},
- {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"},
-#else
- // On Unix we fall back to relative behavior since there's nothing else
- // reasonable to do.
- {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"},
-#endif
- // Even on Windows, we don't allow relative drive specs when the base
- // is not file.
- {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"},
- {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) {
- const RelativeCase& cur_case = rel_cases[i];
-
- url_parse::Parsed parsed;
- int base_len = static_cast<int>(strlen(cur_case.base));
- if (cur_case.is_base_file)
- url_parse::ParseFileURL(cur_case.base, base_len, &parsed);
- else if (cur_case.is_base_hier)
- url_parse::ParseStandardURL(cur_case.base, base_len, &parsed);
- else
- url_parse::ParsePathURL(cur_case.base, base_len, &parsed);
-
- // First see if it is relative.
- int test_len = static_cast<int>(strlen(cur_case.test));
- bool is_relative;
- url_parse::Component relative_component;
- bool succeed_is_rel = url_canon::IsRelativeURL(
- cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
- &is_relative, &relative_component);
-
- EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
- "succeed is rel failure on " << cur_case.test;
- EXPECT_EQ(cur_case.is_rel, is_relative) <<
- "is rel failure on " << cur_case.test;
- // Now resolve it.
- if (succeed_is_rel && is_relative && cur_case.is_rel) {
- std::string resolved;
- url_canon::StdStringCanonOutput output(&resolved);
- url_parse::Parsed resolved_parsed;
-
- bool succeed_resolve = url_canon::ResolveRelativeURL(
- cur_case.base, parsed, cur_case.is_base_file,
- cur_case.test, relative_component, NULL, &output, &resolved_parsed);
- output.Complete();
-
- EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
- EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
-
- // Verify that the output parsed structure is the same as parsing a
- // the URL freshly.
- url_parse::Parsed ref_parsed;
- int resolved_len = static_cast<int>(resolved.size());
- if (cur_case.is_base_file)
- url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
- else if (cur_case.is_base_hier)
- url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
- else
- url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed);
- EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
- }
- }
-}
-
-// It used to be when we did a replacement with a long buffer of UTF-16
-// characters, we would get invalid data in the URL. This is because the buffer
-// it used to hold the UTF-8 data was resized, while some pointers were still
-// kept to the old buffer that was removed.
-TEST(URLCanonTest, ReplacementOverflow) {
- const char src[] = "file:///C:/foo/bar";
- int src_len = static_cast<int>(strlen(src));
- url_parse::Parsed parsed;
- url_parse::ParseFileURL(src, src_len, &parsed);
-
- // Override two components, the path with something short, and the query with
- // sonething long enough to trigger the bug.
- url_canon::Replacements<char16> repl;
- string16 new_query;
- for (int i = 0; i < 4800; i++)
- new_query.push_back('a');
-
- string16 new_path(WStringToUTF16(L"/foo"));
- repl.SetPath(new_path.c_str(), url_parse::Component(0, 4));
- repl.SetQuery(new_query.c_str(),
- url_parse::Component(0, static_cast<int>(new_query.length())));
-
- // Call ReplaceComponents on the string. It doesn't matter if we call it for
- // standard URLs, file URLs, etc, since they will go to the same replacement
- // function that was buggy.
- url_parse::Parsed repl_parsed;
- std::string repl_str;
- url_canon::StdStringCanonOutput repl_output(&repl_str);
- url_canon::ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed);
- repl_output.Complete();
-
- // Generate the expected string and check.
- std::string expected("file:///foo?");
- for (size_t i = 0; i < new_query.length(); i++)
- expected.push_back('a');
- EXPECT_TRUE(expected == repl_str);
-}
diff --git a/googleurl/src/url_common.h b/googleurl/src/url_common.h
deleted file mode 100644
index 7e7e27a..0000000
--- a/googleurl/src/url_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2010, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef GOOGLEURL_SRC_URL_COMMON_H__
-#define GOOGLEURL_SRC_URL_COMMON_H__
-
-#if !defined(GURL_IMPLEMENTATION)
-#define GURL_IMPLEMENTATION 0
-#endif
-
-#if defined(WIN32) && defined(GURL_DLL)
-#if GURL_IMPLEMENTATION
-#define GURL_API __declspec(dllexport)
-#else
-#define GURL_API __declspec(dllimport)
-#endif
-#else
-#define GURL_API
-#endif
-
-#endif // GOOGLEURL_SRC_URL_COMMON_H__
-
diff --git a/googleurl/src/url_file.h b/googleurl/src/url_file.h
deleted file mode 100644
index c1b8ac9..0000000
--- a/googleurl/src/url_file.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Provides shared functions used by the internals of the parser and
-// canonicalizer for file URLs. Do not use outside of these modules.
-
-#ifndef GOOGLEURL_SRC_URL_FILE_H__
-#define GOOGLEURL_SRC_URL_FILE_H__
-
-#include "googleurl/src/url_parse_internal.h"
-
-namespace url_parse {
-
-#ifdef WIN32
-
-// We allow both "c:" and "c|" as drive identifiers.
-inline bool IsWindowsDriveSeparator(char16 ch) {
- return ch == ':' || ch == '|';
-}
-inline bool IsWindowsDriveLetter(char16 ch) {
- return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-}
-
-#endif // WIN32
-
-// Returns the index of the next slash in the input after the given index, or
-// spec_len if the end of the input is reached.
-template<typename CHAR>
-inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) {
- int idx = begin_index;
- while (idx < spec_len && !IsURLSlash(spec[idx]))
- idx++;
- return idx;
-}
-
-#ifdef WIN32
-
-// Returns true if the start_offset in the given spec looks like it begins a
-// drive spec, for example "c:". This function explicitly handles start_offset
-// values that are equal to or larger than the spec_len to simplify callers.
-//
-// If this returns true, the spec is guaranteed to have a valid drive letter
-// plus a colon starting at |start_offset|.
-template<typename CHAR>
-inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset,
- int spec_len) {
- int remaining_len = spec_len - start_offset;
- if (remaining_len < 2)
- return false; // Not enough room.
- if (!IsWindowsDriveLetter(spec[start_offset]))
- return false; // Doesn't start with a valid drive letter.
- if (!IsWindowsDriveSeparator(spec[start_offset + 1]))
- return false; // Isn't followed with a drive separator.
- return true;
-}
-
-// Returns true if the start_offset in the given text looks like it begins a
-// UNC path, for example "\\". This function explicitly handles start_offset
-// values that are equal to or larger than the spec_len to simplify callers.
-//
-// When strict_slashes is set, this function will only accept backslashes as is
-// standard for Windows. Otherwise, it will accept forward slashes as well
-// which we use for a lot of URL handling.
-template<typename CHAR>
-inline bool DoesBeginUNCPath(const CHAR* text,
- int start_offset,
- int len,
- bool strict_slashes) {
- int remaining_len = len - start_offset;
- if (remaining_len < 2)
- return false;
-
- if (strict_slashes)
- return text[start_offset] == '\\' && text[start_offset + 1] == '\\';
- return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]);
-}
-
-#endif // WIN32
-
-} // namespace url_parse
-
-#endif // GOOGLEURL_SRC_URL_FILE_H__
diff --git a/googleurl/src/url_parse_internal.h b/googleurl/src/url_parse_internal.h
deleted file mode 100644
index 61bd068..0000000
--- a/googleurl/src/url_parse_internal.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Contains common inline helper functions used by the URL parsing routines.
-
-#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
-#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
-
-#include "googleurl/src/url_parse.h"
-
-namespace url_parse {
-
-// We treat slashes and backslashes the same for IE compatability.
-inline bool IsURLSlash(char16 ch) {
- return ch == '/' || ch == '\\';
-}
-
-// Returns true if we should trim this character from the URL because it is a
-// space or a control character.
-inline bool ShouldTrimFromURL(char16 ch) {
- return ch <= ' ';
-}
-
-// Given an already-initialized begin index and length, this shrinks the range
-// to eliminate "should-be-trimmed" characters. Note that the length does *not*
-// indicate the length of untrimmed data from |*begin|, but rather the position
-// in the input string (so the string starts at character |*begin| in the spec,
-// and goes until |*len|).
-template<typename CHAR>
-inline void TrimURL(const CHAR* spec, int* begin, int* len) {
- // Strip leading whitespace and control characters.
- while (*begin < *len && ShouldTrimFromURL(spec[*begin]))
- (*begin)++;
-
- // Strip trailing whitespace and control characters. We need the >i test for
- // when the input string is all blanks; we don't want to back past the input.
- while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
- (*len)--;
-}
-
-// Counts the number of consecutive slashes starting at the given offset
-// in the given string of the given length.
-template<typename CHAR>
-inline int CountConsecutiveSlashes(const CHAR *str,
- int begin_offset, int str_len) {
- int count = 0;
- while (begin_offset + count < str_len &&
- IsURLSlash(str[begin_offset + count]))
- ++count;
- return count;
-}
-
-// Internal functions in url_parse.cc that parse the path, that is, everything
-// following the authority section. The input is the range of everything
-// following the authority section, and the output is the identified ranges.
-//
-// This is designed for the file URL parser or other consumers who may do
-// special stuff at the beginning, but want regular path parsing, it just
-// maps to the internal parsing function for paths.
-void ParsePathInternal(const char* spec,
- const Component& path,
- Component* filepath,
- Component* query,
- Component* ref);
-void ParsePathInternal(const char16* spec,
- const Component& path,
- Component* filepath,
- Component* query,
- Component* ref);
-
-
-// Given a spec and a pointer to the character after the colon following the
-// scheme, this parses it and fills in the structure, Every item in the parsed
-// structure is filled EXCEPT for the scheme, which is untouched.
-void ParseAfterScheme(const char* spec,
- int spec_len,
- int after_scheme,
- Parsed* parsed);
-void ParseAfterScheme(const char16* spec,
- int spec_len,
- int after_scheme,
- Parsed* parsed);
-
-} // namespace url_parse
-
-#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
diff --git a/googleurl/src/url_test_utils.h b/googleurl/src/url_test_utils.h
deleted file mode 100644
index fdadf7f..0000000
--- a/googleurl/src/url_test_utils.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2007 Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Convenience functions for string conversions.
-// These are mostly intended for use in unit tests.
-
-#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__
-#define GOOGLEURL_SRC_URL_TEST_UTILS_H__
-
-#include <string>
-
-#include "base/string16.h"
-#include "googleurl/src/url_canon_internal.h"
-#include "testing/base/public/gunit.h"
-
-namespace url_test_utils {
-
-// Converts a UTF-16 string from native wchar_t format to char16, by
-// truncating the high 32 bits. This is not meant to handle true UTF-32
-// encoded strings.
-inline string16 WStringToUTF16(const wchar_t* src) {
- string16 str;
- int length = static_cast<int>(wcslen(src));
- for (int i = 0; i < length; ++i) {
- str.push_back(static_cast<char16>(src[i]));
- }
- return str;
-}
-
-// Converts a string from UTF-8 to UTF-16
-inline string16 ConvertUTF8ToUTF16(const std::string& src) {
- int length = static_cast<int>(src.length());
- EXPECT_LT(length, 1024);
- url_canon::RawCanonOutputW<1024> output;
- EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output));
- return string16(output.data(), output.length());
-}
-
-// Converts a string from UTF-16 to UTF-8
-inline std::string ConvertUTF16ToUTF8(const string16& src) {
- std::string str;
- url_canon::StdStringCanonOutput output(&str);
- EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(),
- static_cast<int>(src.length()),
- &output));
- output.Complete();
- return str;
-}
-
-} // namespace url_test_utils
-
-// This operator allows EXPECT_EQ(astring16, anotherstring16); to work.
-inline std::ostream& operator<<(std::ostream& os,
- const string16& str) {
- // Convert to UTF-8 and print the string
- return os << url_test_utils::ConvertUTF16ToUTF8(str);
-}
-
-#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__
diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc
deleted file mode 100644
index 7e100aa..0000000
--- a/googleurl/src/url_util.cc
+++ /dev/null
@@ -1,553 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <string.h>
-#include <vector>
-
-#include "googleurl/src/url_util.h"
-
-#include "base/logging.h"
-#include "googleurl/src/url_canon_internal.h"
-#include "googleurl/src/url_file.h"
-
-namespace url_util {
-
-namespace {
-
-// ASCII-specific tolower. The standard library's tolower is locale sensitive,
-// so we don't want to use it here.
-template <class Char> inline Char ToLowerASCII(Char c) {
- return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
-}
-
-// Backend for LowerCaseEqualsASCII.
-template<typename Iter>
-inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
- for (Iter it = a_begin; it != a_end; ++it, ++b) {
- if (!*b || ToLowerASCII(*it) != *b)
- return false;
- }
- return *b == 0;
-}
-
-const char kFileScheme[] = "file"; // Used in a number of places.
-const char kMailtoScheme[] = "mailto";
-
-const int kNumStandardURLSchemes = 7;
-const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
- "http",
- "https",
- kFileScheme, // Yes, file urls can have a hostname!
- "ftp",
- "gopher",
- "ws", // WebSocket.
- "wss", // WebSocket secure.
-};
-
-// List of the currently installed standard schemes. This list is lazily
-// initialized by InitStandardSchemes and is leaked on shutdown to prevent
-// any destructors from being called that will slow us down or cause problems.
-std::vector<const char*>* standard_schemes = NULL;
-
-// See the LockStandardSchemes declaration in the header.
-bool standard_schemes_locked = false;
-
-// Ensures that the standard_schemes list is initialized, does nothing if it
-// already has values.
-void InitStandardSchemes() {
- if (standard_schemes)
- return;
- standard_schemes = new std::vector<const char*>;
- for (int i = 0; i < kNumStandardURLSchemes; i++)
- standard_schemes->push_back(kStandardURLSchemes[i]);
-}
-
-// Given a string and a range inside the string, compares it to the given
-// lower-case |compare_to| buffer.
-template<typename CHAR>
-inline bool CompareSchemeComponent(const CHAR* spec,
- const url_parse::Component& component,
- const char* compare_to) {
- if (!component.is_nonempty())
- return compare_to[0] == 0; // When component is empty, match empty scheme.
- return LowerCaseEqualsASCII(&spec[component.begin],
- &spec[component.end()],
- compare_to);
-}
-
-// Returns true if the given scheme identified by |scheme| within |spec| is one
-// of the registered "standard" schemes.
-template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
- if (!scheme.is_nonempty())
- return false; // Empty or invalid schemes are non-standard.
-
- InitStandardSchemes();
- for (size_t i = 0; i < standard_schemes->size(); i++) {
- if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
- standard_schemes->at(i)))
- return true;
- }
- return false;
-}
-
-template<typename CHAR>
-bool DoFindAndCompareScheme(const CHAR* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme) {
- // Before extracting scheme, canonicalize the URL to remove any whitespace.
- // This matches the canonicalization done in DoCanonicalize function.
- url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
- int spec_len;
- const CHAR* spec = RemoveURLWhitespace(str, str_len,
- &whitespace_buffer, &spec_len);
-
- url_parse::Component our_scheme;
- if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
- // No scheme.
- if (found_scheme)
- *found_scheme = url_parse::Component();
- return false;
- }
- if (found_scheme)
- *found_scheme = our_scheme;
- return CompareSchemeComponent(spec, our_scheme, compare);
-}
-
-template<typename CHAR>
-bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- // Remove any whitespace from the middle of the relative URL, possibly
- // copying to the new buffer.
- url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
- int spec_len;
- const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
- &whitespace_buffer, &spec_len);
-
- url_parse::Parsed parsed_input;
-#ifdef WIN32
- // For Windows, we allow things that look like absolute Windows paths to be
- // fixed up magically to file URLs. This is done for IE compatability. For
- // example, this will change "c:/foo" into a file URL rather than treating
- // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
- // There is similar logic in url_canon_relative.cc for
- //
- // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
- // has no meaning as an absolute path name. This is because browsers on Mac
- // & Unix don't generally do this, so there is no compatibility reason for
- // doing so.
- if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
- url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
- url_parse::ParseFileURL(spec, spec_len, &parsed_input);
- return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
- charset_converter,
- output, output_parsed);
- }
-#endif
-
- url_parse::Component scheme;
- if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
- return false;
-
- // This is the parsed version of the input URL, we have to canonicalize it
- // before storing it in our object.
- bool success;
- if (CompareSchemeComponent(spec, scheme, kFileScheme)) {
- // File URLs are special.
- url_parse::ParseFileURL(spec, spec_len, &parsed_input);
- success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
- charset_converter,
- output, output_parsed);
-
- } else if (DoIsStandard(spec, scheme)) {
- // All "normal" URLs.
- url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
- success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
- charset_converter,
- output, output_parsed);
-
- } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {
- // Mailto are treated like a standard url with only a scheme, path, query
- url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
- success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
- output, output_parsed);
-
- } else {
- // "Weird" URLs like data: and javascript:
- url_parse::ParsePathURL(spec, spec_len, &parsed_input);
- success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
- output, output_parsed);
- }
- return success;
-}
-
-template<typename CHAR>
-bool DoResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const CHAR* in_relative,
- int in_relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- // Remove any whitespace from the middle of the relative URL, possibly
- // copying to the new buffer.
- url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
- int relative_length;
- const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
- &whitespace_buffer,
- &relative_length);
-
- // See if our base URL should be treated as "standard".
- bool standard_base_scheme =
- base_parsed.scheme.is_nonempty() &&
- DoIsStandard(base_spec, base_parsed.scheme);
-
- bool is_relative;
- url_parse::Component relative_component;
- if (!url_canon::IsRelativeURL(base_spec, base_parsed,
- relative, relative_length,
- standard_base_scheme,
- &is_relative,
- &relative_component)) {
- // Error resolving.
- return false;
- }
-
- if (is_relative) {
- // Relative, resolve and canonicalize.
- bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
- CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
- return url_canon::ResolveRelativeURL(base_spec, base_parsed,
- file_base_scheme, relative,
- relative_component, charset_converter,
- output, output_parsed);
- }
-
- // Not relative, canonicalize the input.
- return DoCanonicalize(relative, relative_length, charset_converter,
- output, output_parsed);
-}
-
-template<typename CHAR>
-bool DoReplaceComponents(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<CHAR>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed) {
- // If the scheme is overridden, just do a simple string substitution and
- // reparse the whole thing. There are lots of edge cases that we really don't
- // want to deal with. Like what happens if I replace "http://e:8080/foo"
- // with a file. Does it become "file:///E:/8080/foo" where the port number
- // becomes part of the path? Parsing that string as a file URL says "yes"
- // but almost no sane rule for dealing with the components individually would
- // come up with that.
- //
- // Why allow these crazy cases at all? Programatically, there is almost no
- // case for replacing the scheme. The most common case for hitting this is
- // in JS when building up a URL using the location object. In this case, the
- // JS code expects the string substitution behavior:
- // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
- if (replacements.IsSchemeOverridden()) {
- // Canonicalize the new scheme so it is 8-bit and can be concatenated with
- // the existing spec.
- url_canon::RawCanonOutput<128> scheme_replaced;
- url_parse::Component scheme_replaced_parsed;
- url_canon::CanonicalizeScheme(
- replacements.sources().scheme,
- replacements.components().scheme,
- &scheme_replaced, &scheme_replaced_parsed);
-
- // We can assume that the input is canonicalized, which means it always has
- // a colon after the scheme (or where the scheme would be).
- int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
- : 1;
- if (spec_len - spec_after_colon > 0) {
- scheme_replaced.Append(&spec[spec_after_colon],
- spec_len - spec_after_colon);
- }
-
- // We now need to completely re-parse the resulting string since its meaning
- // may have changed with the different scheme.
- url_canon::RawCanonOutput<128> recanonicalized;
- url_parse::Parsed recanonicalized_parsed;
- DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
- charset_converter,
- &recanonicalized, &recanonicalized_parsed);
-
- // Recurse using the version with the scheme already replaced. This will now
- // use the replacement rules for the new scheme.
- //
- // Warning: this code assumes that ReplaceComponents will re-check all
- // components for validity. This is because we can't fail if DoCanonicalize
- // failed above since theoretically the thing making it fail could be
- // getting replaced here. If ReplaceComponents didn't re-check everything,
- // we wouldn't know if something *not* getting replaced is a problem.
- // If the scheme-specific replacers are made more intelligent so they don't
- // re-check everything, we should instead recanonicalize the whole thing
- // after this call to check validity (this assumes replacing the scheme is
- // much much less common than other types of replacements, like clearing the
- // ref).
- url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
- replacements_no_scheme.SetScheme(NULL, url_parse::Component());
- return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
- recanonicalized_parsed, replacements_no_scheme,
- charset_converter, output, out_parsed);
- }
-
- // If we get here, then we know the scheme doesn't need to be replaced, so can
- // just key off the scheme in the spec to know how to do the replacements.
- if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
- return url_canon::ReplaceFileURL(spec, parsed, replacements,
- charset_converter, output, out_parsed);
- }
- if (DoIsStandard(spec, parsed.scheme)) {
- return url_canon::ReplaceStandardURL(spec, parsed, replacements,
- charset_converter, output, out_parsed);
- }
- if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
- return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
- output, out_parsed);
- }
-
- // Default is a path URL.
- return url_canon::ReplacePathURL(spec, parsed, replacements,
- output, out_parsed);
-}
-
-} // namespace
-
-void Initialize() {
- InitStandardSchemes();
-}
-
-void Shutdown() {
- if (standard_schemes) {
- delete standard_schemes;
- standard_schemes = NULL;
- }
-}
-
-void AddStandardScheme(const char* new_scheme) {
- // If this assert triggers, it means you've called AddStandardScheme after
- // LockStandardSchemes have been called (see the header file for
- // LockStandardSchemes for more).
- //
- // This normally means you're trying to set up a new standard scheme too late
- // in your application's init process. Locate where your app does this
- // initialization and calls LockStandardScheme, and add your new standard
- // scheme there.
- DCHECK(!standard_schemes_locked) <<
- "Trying to add a standard scheme after the list has been locked.";
-
- size_t scheme_len = strlen(new_scheme);
- if (scheme_len == 0)
- return;
-
- // Dulicate the scheme into a new buffer and add it to the list of standard
- // schemes. This pointer will be leaked on shutdown.
- char* dup_scheme = new char[scheme_len + 1];
- memcpy(dup_scheme, new_scheme, scheme_len + 1);
-
- InitStandardSchemes();
- standard_schemes->push_back(dup_scheme);
-}
-
-void LockStandardSchemes() {
- standard_schemes_locked = true;
-}
-
-bool IsStandard(const char* spec, const url_parse::Component& scheme) {
- return DoIsStandard(spec, scheme);
-}
-
-bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
- return DoIsStandard(spec, scheme);
-}
-
-bool FindAndCompareScheme(const char* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme) {
- return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
-}
-
-bool FindAndCompareScheme(const char16* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme) {
- return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
-}
-
-bool Canonicalize(const char* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- return DoCanonicalize(spec, spec_len, charset_converter,
- output, output_parsed);
-}
-
-bool Canonicalize(const char16* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- return DoCanonicalize(spec, spec_len, charset_converter,
- output, output_parsed);
-}
-
-bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- return DoResolveRelative(base_spec, base_spec_len, base_parsed,
- relative, relative_length,
- charset_converter, output, output_parsed);
-}
-
-bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char16* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed) {
- return DoResolveRelative(base_spec, base_spec_len, base_parsed,
- relative, relative_length,
- charset_converter, output, output_parsed);
-}
-
-bool ReplaceComponents(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed) {
- return DoReplaceComponents(spec, spec_len, parsed, replacements,
- charset_converter, output, out_parsed);
-}
-
-bool ReplaceComponents(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char16>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed) {
- return DoReplaceComponents(spec, spec_len, parsed, replacements,
- charset_converter, output, out_parsed);
-}
-
-// Front-ends for LowerCaseEqualsASCII.
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end) {
- while (a_begin != a_end && b_begin != b_end &&
- ToLowerASCII(*a_begin) == *b_begin) {
- a_begin++;
- b_begin++;
- }
- return a_begin == a_end && b_begin == b_end;
-}
-
-bool LowerCaseEqualsASCII(const char16* a_begin,
- const char16* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
-void DecodeURLEscapeSequences(const char* input, int length,
- url_canon::CanonOutputW* output) {
- url_canon::RawCanonOutputT<char> unescaped_chars;
- for (int i = 0; i < length; i++) {
- if (input[i] == '%') {
- unsigned char ch;
- if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
- unescaped_chars.push_back(ch);
- } else {
- // Invalid escape sequence, copy the percent literal.
- unescaped_chars.push_back('%');
- }
- } else {
- // Regular non-escaped 8-bit character.
- unescaped_chars.push_back(input[i]);
- }
- }
-
- // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
- // JavaScript URLs, but Firefox and Safari do.
- for (int i = 0; i < unescaped_chars.length(); i++) {
- unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
- if (uch < 0x80) {
- // Non-UTF-8, just append directly
- output->push_back(uch);
- } else {
- // next_ch will point to the last character of the decoded
- // character.
- int next_character = i;
- unsigned code_point;
- if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
- unescaped_chars.length(), &code_point)) {
- // Valid UTF-8 character, convert to UTF-16.
- url_canon::AppendUTF16Value(code_point, output);
- i = next_character;
- } else {
- // If there are any sequences that are not valid UTF-8, we keep
- // invalid code points and promote to UTF-16. We copy all characters
- // from the current position to the end of the identified sequence.
- while (i < next_character) {
- output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
- i++;
- }
- output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
- }
- }
- }
-}
-
-} // namespace url_util
diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h
deleted file mode 100644
index ec4cf9e..0000000
--- a/googleurl/src/url_util.h
+++ /dev/null
@@ -1,222 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef GOOGLEURL_SRC_URL_UTIL_H__
-#define GOOGLEURL_SRC_URL_UTIL_H__
-
-#include <string>
-
-#include "base/string16.h"
-#include "googleurl/src/url_common.h"
-#include "googleurl/src/url_parse.h"
-#include "googleurl/src/url_canon.h"
-
-namespace url_util {
-
-// Init ------------------------------------------------------------------------
-
-// Initialization is NOT required, it will be implicitly initialized when first
-// used. However, this implicit initialization is NOT threadsafe. If you are
-// using this library in a threaded environment and don't have a consistent
-// "first call" (an example might be calling "AddStandardScheme" with your
-// special application-specific schemes) then you will want to call initialize
-// before spawning any threads.
-//
-// It is OK to call this function more than once, subsequent calls will simply
-// "noop", unless Shutdown() was called in the mean time. This will also be a
-// "noop" if other calls to the library have forced an initialization
-// beforehand.
-GURL_API void Initialize();
-
-// Cleanup is not required, except some strings may leak. For most user
-// applications, this is fine. If you're using it in a library that may get
-// loaded and unloaded, you'll want to unload to properly clean up your
-// library.
-GURL_API void Shutdown();
-
-// Schemes --------------------------------------------------------------------
-
-// Adds an application-defined scheme to the internal list of "standard" URL
-// schemes. This function is not threadsafe and can not be called concurrently
-// with any other url_util function. It will assert if the list of standard
-// schemes has been locked (see LockStandardSchemes).
-GURL_API void AddStandardScheme(const char* new_scheme);
-
-// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
-//
-// This is designed to help prevent errors for multithreaded applications.
-// Normal usage would be to call AddStandardScheme for your custom schemes at
-// the beginning of program initialization, and then LockStandardSchemes. This
-// prevents future callers from mistakenly calling AddStandardScheme when the
-// program is running with multiple threads, where such usage would be
-// dangerous.
-//
-// We could have had AddStandardScheme use a lock instead, but that would add
-// some platform-specific dependencies we don't otherwise have now, and is
-// overkill considering the normal usage is so simple.
-GURL_API void LockStandardSchemes();
-
-// Locates the scheme in the given string and places it into |found_scheme|,
-// which may be NULL to indicate the caller does not care about the range.
-//
-// Returns whether the given |compare| scheme matches the scheme found in the
-// input (if any). The |compare| scheme must be a valid canonical scheme or
-// the result of the comparison is undefined.
-GURL_API bool FindAndCompareScheme(const char* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme);
-GURL_API bool FindAndCompareScheme(const char16* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme);
-inline bool FindAndCompareScheme(const std::string& str,
- const char* compare,
- url_parse::Component* found_scheme) {
- return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
- compare, found_scheme);
-}
-inline bool FindAndCompareScheme(const string16& str,
- const char* compare,
- url_parse::Component* found_scheme) {
- return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
- compare, found_scheme);
-}
-
-// Returns true if the given string represents a standard URL. This means that
-// either the scheme is in the list of known standard schemes.
-GURL_API bool IsStandard(const char* spec,
- const url_parse::Component& scheme);
-GURL_API bool IsStandard(const char16* spec,
- const url_parse::Component& scheme);
-
-// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
-// breaking the WebKit build when this version is synced via Chrome.
-inline bool IsStandard(const char* spec, int spec_len,
- const url_parse::Component& scheme) {
- return IsStandard(spec, scheme);
-}
-
-// URL library wrappers -------------------------------------------------------
-
-// Parses the given spec according to the extracted scheme type. Normal users
-// should use the URL object, although this may be useful if performance is
-// critical and you don't want to do the heap allocation for the std::string.
-//
-// As with the url_canon::Canonicalize* functions, the charset converter can
-// be NULL to use UTF-8 (it will be faster in this case).
-//
-// Returns true if a valid URL was produced, false if not. On failure, the
-// output and parsed structures will still be filled and will be consistent,
-// but they will not represent a loadable URL.
-GURL_API bool Canonicalize(const char* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-GURL_API bool Canonicalize(const char16* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-
-// Resolves a potentially relative URL relative to the given parsed base URL.
-// The base MUST be valid. The resulting canonical URL and parsed information
-// will be placed in to the given out variables.
-//
-// The relative need not be relative. If we discover that it's absolute, this
-// will produce a canonical version of that URL. See Canonicalize() for more
-// about the charset_converter.
-//
-// Returns true if the output is valid, false if the input could not produce
-// a valid URL.
-GURL_API bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-GURL_API bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char16* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-
-// Replaces components in the given VALID input url. The new canonical URL info
-// is written to output and out_parsed.
-//
-// Returns true if the resulting URL is valid.
-GURL_API bool ReplaceComponents(
- const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed);
-GURL_API bool ReplaceComponents(
- const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char16>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed);
-
-// String helper functions ----------------------------------------------------
-
-// Compare the lower-case form of the given string against the given ASCII
-// string. This is useful for doing checking if an input string matches some
-// token, and it is optimized to avoid intermediate string copies.
-//
-// The versions of this function that don't take a b_end assume that the b
-// string is NULL terminated.
-GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b);
-GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end);
-GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
- const char16* a_end,
- const char* b);
-
-// Unescapes the given string using URL escaping rules.
-GURL_API void DecodeURLEscapeSequences(const char* input, int length,
- url_canon::CanonOutputW* output);
-
-} // namespace url_util
-
-#endif // GOOGLEURL_SRC_URL_UTIL_H__
diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc
deleted file mode 100644
index bb04905..0000000
--- a/googleurl/src/url_util_unittest.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_stdstring.h"
-#include "googleurl/src/url_parse.h"
-#include "googleurl/src/url_test_utils.h"
-#include "googleurl/src/url_util.h"
-#include "testing/base/public/gunit.h"
-
-// From googleurl/base/basictypes.h
-#define ARRAYSIZE_UNSAFE(a) \
- ((sizeof(a) / sizeof(*(a))) / \
- static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-
-TEST(URLUtilTest, FindAndCompareScheme) {
- url_parse::Component found_scheme;
-
- // Simple case where the scheme is found and matches.
- const char kStr1[] = "http://www.com/";
- EXPECT_TRUE(url_util::FindAndCompareScheme(
- kStr1, static_cast<int>(strlen(kStr1)), "http", NULL));
- EXPECT_TRUE(url_util::FindAndCompareScheme(
- kStr1, static_cast<int>(strlen(kStr1)), "http", &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component(0, 4));
-
- // A case where the scheme is found and doesn't match.
- EXPECT_FALSE(url_util::FindAndCompareScheme(
- kStr1, static_cast<int>(strlen(kStr1)), "https", &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component(0, 4));
-
- // A case where there is no scheme.
- const char kStr2[] = "httpfoobar";
- EXPECT_FALSE(url_util::FindAndCompareScheme(
- kStr2, static_cast<int>(strlen(kStr2)), "http", &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component());
-
- // When there is an empty scheme, it should match the empty scheme.
- const char kStr3[] = ":foo.com/";
- EXPECT_TRUE(url_util::FindAndCompareScheme(
- kStr3, static_cast<int>(strlen(kStr3)), "", &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component(0, 0));
-
- // But when there is no scheme, it should fail.
- EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component());
-
- // When there is a whitespace char in scheme, it should canonicalize the url
- // before comparison.
- const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
- EXPECT_TRUE(url_util::FindAndCompareScheme(
- whtspc_str, static_cast<int>(strlen(whtspc_str)), "javascript",
- &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component(1, 10));
-
- // Control characters should be stripped out on the ends, and kept in the
- // middle.
- const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)";
- EXPECT_FALSE(url_util::FindAndCompareScheme(
- ctrl_str, static_cast<int>(strlen(ctrl_str)), "javascript",
- &found_scheme));
- EXPECT_TRUE(found_scheme == url_parse::Component(1, 11));
-}
-
-TEST(URLUtilTest, ReplaceComponents) {
- url_parse::Parsed parsed;
- url_canon::RawCanonOutputT<char> output;
- url_parse::Parsed new_parsed;
-
- // Check that the following calls do not cause crash
- url_canon::Replacements<char> replacements;
- replacements.SetRef("test", url_parse::Component(0, 4));
- url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
- &new_parsed);
- url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
- &new_parsed);
- replacements.ClearRef();
- replacements.SetHost("test", url_parse::Component(0, 4));
- url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
- &new_parsed);
- url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
- &new_parsed);
-
- replacements.ClearHost();
- url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
- &new_parsed);
- url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
- &new_parsed);
- url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
- &new_parsed);
- url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
- &new_parsed);
-}
-
-static std::string CheckReplaceScheme(const char* base_url,
- const char* scheme) {
- // Make sure the input is canonicalized.
- url_canon::RawCanonOutput<32> original;
- url_parse::Parsed original_parsed;
- url_util::Canonicalize(base_url, strlen(base_url), NULL,
- &original, &original_parsed);
-
- url_canon::Replacements<char> replacements;
- replacements.SetScheme(scheme, url_parse::Component(0, strlen(scheme)));
-
- std::string output_string;
- url_canon::StdStringCanonOutput output(&output_string);
- url_parse::Parsed output_parsed;
- url_util::ReplaceComponents(original.data(), original.length(),
- original_parsed, replacements, NULL,
- &output, &output_parsed);
-
- output.Complete();
- return output_string;
-}
-
-TEST(URLUtilTest, ReplaceScheme) {
- EXPECT_EQ("https://google.com/",
- CheckReplaceScheme("http://google.com/", "https"));
- EXPECT_EQ("file://google.com/",
- CheckReplaceScheme("http://google.com/", "file"));
- EXPECT_EQ("http://home/Build",
- CheckReplaceScheme("file:///Home/Build", "http"));
- EXPECT_EQ("javascript:foo",
- CheckReplaceScheme("about:foo", "javascript"));
- EXPECT_EQ("://google.com/",
- CheckReplaceScheme("http://google.com/", ""));
- EXPECT_EQ("http://google.com/",
- CheckReplaceScheme("about:google.com", "http"));
- EXPECT_EQ("http:", CheckReplaceScheme("", "http"));
-
-#ifdef WIN32
- // Magic Windows drive letter behavior when converting to a file URL.
- EXPECT_EQ("file:///E:/foo/",
- CheckReplaceScheme("http://localhost/e:foo/", "file"));
-#endif
-
- // This will probably change to "about://google.com/" when we fix
- // http://crbug.com/160 which should also be an acceptable result.
- EXPECT_EQ("about://google.com/",
- CheckReplaceScheme("http://google.com/", "about"));
-}
-
-TEST(URLUtilTest, DecodeURLEscapeSequences) {
- struct DecodeCase {
- const char* input;
- const char* output;
- } decode_cases[] = {
- {"hello, world", "hello, world"},
- {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/",
- "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"},
- {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/",
- "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"},
- {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/",
- " !\"#$%&'()*+,-.//"},
- {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/",
- "0123456789:;<=>?/"},
- {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/",
- "@ABCDEFGHIJKLMNO/"},
- {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/",
- "PQRSTUVWXYZ[\\]^_/"},
- {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/",
- "`abcdefghijklmno/"},
- {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/",
- "pqrstuvwxyz{|}~\x7f/"},
- // Test un-UTF-8-ization.
- {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"},
- };
-
- for (size_t i = 0; i < ARRAYSIZE_UNSAFE(decode_cases); i++) {
- const char* input = decode_cases[i].input;
- url_canon::RawCanonOutputT<char16> output;
- url_util::DecodeURLEscapeSequences(input, strlen(input), &output);
- EXPECT_EQ(decode_cases[i].output,
- url_test_utils::ConvertUTF16ToUTF8(
- string16(output.data(), output.length())));
- }
-
- // Our decode should decode %00
- const char zero_input[] = "%00";
- url_canon::RawCanonOutputT<char16> zero_output;
- url_util::DecodeURLEscapeSequences(zero_input, strlen(zero_input),
- &zero_output);
- EXPECT_NE("%00",
- url_test_utils::ConvertUTF16ToUTF8(
- string16(zero_output.data(), zero_output.length())));
-
- // Test the error behavior for invalid UTF-8.
- const char invalid_input[] = "%e4%a0%e5%a5%bd";
- const char16 invalid_expected[4] = {0x00e4, 0x00a0, 0x597d, 0};
- url_canon::RawCanonOutputT<char16> invalid_output;
- url_util::DecodeURLEscapeSequences(invalid_input, strlen(invalid_input),
- &invalid_output);
- EXPECT_EQ(string16(invalid_expected),
- string16(invalid_output.data(), invalid_output.length()));
-}
diff --git a/googleurl/third_party/icu/build/using_icu.vsprops b/googleurl/third_party/icu/build/using_icu.vsprops
deleted file mode 100644
index a3989ef..0000000
--- a/googleurl/third_party/icu/build/using_icu.vsprops
+++ /dev/null
@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioPropertySheet
- ProjectType="Visual C++"
- Version="8.00"
- Name="using_icu"
- >
- <Tool
- Name="VCCLCompilerTool"
- AdditionalIncludeDirectories=""$(SolutionDir)..\..\third_party\icu\public\common";"$(SolutionDir)..\..\third_party\icu\public\i18n""
- />
-</VisualStudioPropertySheet>
diff --git a/src/base/strings/string16.cc b/src/base/strings/string16.cc
new file mode 100644
index 0000000..2e749a3
--- /dev/null
+++ b/src/base/strings/string16.cc
@@ -0,0 +1,74 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string16.h"
+
+#if defined(WCHAR_T_IS_UTF16)
+
+#error This file should not be used on 2-byte wchar_t systems
+// If this winds up being needed on 2-byte wchar_t systems, either the
+// definitions below can be used, or the host system's wide character
+// functions like wmemcmp can be wrapped.
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+#include <ostream>
+
+namespace url {
+namespace base {
+
+int c16memcmp(const char16* s1, const char16* s2, size_t n) {
+ // We cannot call memcmp because that changes the semantics.
+ while (n-- > 0) {
+ if (*s1 != *s2) {
+ // We cannot use (*s1 - *s2) because char16 is unsigned.
+ return ((*s1 < *s2) ? -1 : 1);
+ }
+ ++s1;
+ ++s2;
+ }
+ return 0;
+}
+
+size_t c16len(const char16* s) {
+ const char16 *s_orig = s;
+ while (*s) {
+ ++s;
+ }
+ return s - s_orig;
+}
+
+const char16* c16memchr(const char16* s, char16 c, size_t n) {
+ while (n-- > 0) {
+ if (*s == c) {
+ return s;
+ }
+ ++s;
+ }
+ return 0;
+}
+
+char16* c16memmove(char16* s1, const char16* s2, size_t n) {
+ return static_cast<char16*>(memmove(s1, s2, n * sizeof(char16)));
+}
+
+char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
+ return static_cast<char16*>(memcpy(s1, s2, n * sizeof(char16)));
+}
+
+char16* c16memset(char16* s, char16 c, size_t n) {
+ char16 *s_orig = s;
+ while (n-- > 0) {
+ *s = c;
+ ++s;
+ }
+ return s_orig;
+}
+
+} // namespace base
+} // namespace url
+
+template class std::basic_string<url::base::char16, url::base::string16_char_traits>;
+
+#endif // WCHAR_T_IS_UTF32
diff --git a/googleurl/base/string16.h b/src/base/strings/string16.h
similarity index 74%
rename from googleurl/base/string16.h
rename to src/base/strings/string16.h
index deedaf6..be488c3 100644
--- a/googleurl/base/string16.h
+++ b/src/base/strings/string16.h
@@ -1,33 +1,9 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#ifndef BASE_STRING16_H_
-#define BASE_STRING16_H_
+#ifndef BASE_STRINGS_STRING16_H_
+#define BASE_STRINGS_STRING16_H_
// WHAT:
// A version of std::basic_string that provides 2-byte characters even when
@@ -54,18 +30,27 @@
#include <string>
#include "base/basictypes.h"
+#include "build/build_config.h"
-#ifdef WIN32
+#if defined(WCHAR_T_IS_UTF16)
+
+namespace url {
+namespace base {
typedef wchar_t char16;
typedef std::wstring string16;
+typedef std::char_traits<wchar_t> string16_char_traits;
-#else // !WIN32
+} // namespace base
+} // namespace url
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+namespace url {
+namespace base {
typedef uint16 char16;
-namespace base {
-
// char16 versions of the functions required by string16_char_traits; these
// are based on the wide character functions of similar names ("w" or "wcs"
// instead of "c16").
@@ -80,6 +65,10 @@
typedef char16 char_type;
typedef int int_type;
+ // int_type needs to be able to hold each possible value of char_type, and in
+ // addition, the distinct value of eof().
+ COMPILE_ASSERT(sizeof(int_type) > sizeof(char_type), unexpected_type_width);
+
typedef std::streamoff off_type;
typedef mbstate_t state_type;
typedef std::fpos<state_type> pos_type;
@@ -141,7 +130,10 @@
}
};
+typedef std::basic_string<char16, url::base::string16_char_traits> string16;
+
} // namespace base
+} // namespace url
// The string class will be explicitly instantiated only once, in string16.cc.
//
@@ -182,12 +174,9 @@
//
// TODO(mark): File this bug with Apple and update this note with a bug number.
-extern template class std::basic_string<char16, base::string16_char_traits>;
+extern template
+class std::basic_string<url::base::char16, url::base::string16_char_traits>;
-typedef std::basic_string<char16, base::string16_char_traits> string16;
+#endif // WCHAR_T_IS_UTF32
-extern std::ostream& operator<<(std::ostream& out, const string16& str);
-
-#endif // !WIN32
-
-#endif // BASE_STRING16_H_
+#endif // BASE_STRINGS_STRING16_H_
diff --git a/src/base/strings/string_util.cc b/src/base/strings/string_util.cc
new file mode 100644
index 0000000..8b2e068
--- /dev/null
+++ b/src/base/strings/string_util.cc
@@ -0,0 +1,147 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+
+#include "base/basictypes.h"
+#include "base/third_party/icu/icu_utf.h"
+
+static bool IsWildcard(base_icu::UChar32 character) {
+ return character == '*' || character == '?';
+}
+
+// Move the pattern and string pointers to the point where they start to differ.
+template <typename CHAR, typename NEXT>
+static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
+ const CHAR** string, const CHAR* string_end,
+ NEXT next) {
+ const CHAR* escape = NULL;
+ while (*pattern != pattern_end && *string != string_end) {
+ if (!escape && IsWildcard(**pattern)) {
+ // We don't want to match wildcard here, except if it's escaped.
+ return;
+ }
+
+ // Check if the escapement char is found. If so, skip it and move to the
+ // next character.
+ if (!escape && **pattern == '\\') {
+ escape = *pattern;
+ next(pattern, pattern_end);
+ continue;
+ }
+
+ // Check if the chars match, if so, increment the ptrs.
+ const CHAR* pattern_next = *pattern;
+ const CHAR* string_next = *string;
+ base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
+ if (pattern_char == next(&string_next, string_end) &&
+ pattern_char != CBU_SENTINEL) {
+ *pattern = pattern_next;
+ *string = string_next;
+ } else {
+ // Uh oh, it did not match, we are done. If the last char was an
+ // escapement, that means that it was an error to advance the ptr here,
+ // let's put it back where it was. This also means that the MatchPattern
+ // function will return false because if we can't match an escape char
+ // here, then no one will.
+ if (escape) {
+ *pattern = escape;
+ }
+ return;
+ }
+
+ escape = NULL;
+ }
+}
+
+template <typename CHAR, typename NEXT>
+static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
+ while (*pattern != end) {
+ if (!IsWildcard(**pattern))
+ return;
+ next(pattern, end);
+ }
+}
+
+template <typename CHAR, typename NEXT>
+static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
+ const CHAR* pattern, const CHAR* pattern_end,
+ int depth,
+ NEXT next) {
+ const int kMaxDepth = 16;
+ if (depth > kMaxDepth)
+ return false;
+
+ // Eat all the matching chars.
+ EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
+
+ // If the string is empty, then the pattern must be empty too, or contain
+ // only wildcards.
+ if (eval == eval_end) {
+ EatWildcard(&pattern, pattern_end, next);
+ return pattern == pattern_end;
+ }
+
+ // Pattern is empty but not string, this is not a match.
+ if (pattern == pattern_end)
+ return false;
+
+ // If this is a question mark, then we need to compare the rest with
+ // the current string or the string with one character eaten.
+ const CHAR* next_pattern = pattern;
+ next(&next_pattern, pattern_end);
+ if (pattern[0] == '?') {
+ if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+ depth + 1, next))
+ return true;
+ const CHAR* next_eval = eval;
+ next(&next_eval, eval_end);
+ if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
+ depth + 1, next))
+ return true;
+ }
+
+ // This is a *, try to match all the possible substrings with the remainder
+ // of the pattern.
+ if (pattern[0] == '*') {
+ // Collapse duplicate wild cards (********** into *) so that the
+ // method does not recurse unnecessarily. http://crbug.com/52839
+ EatWildcard(&next_pattern, pattern_end, next);
+
+ while (eval != eval_end) {
+ if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+ depth + 1, next))
+ return true;
+ eval++;
+ }
+
+ // We reached the end of the string, let's see if the pattern contains only
+ // wildcards.
+ if (eval == eval_end) {
+ EatWildcard(&pattern, pattern_end, next);
+ if (pattern != pattern_end)
+ return false;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+struct NextCharUTF8 {
+ base_icu::UChar32 operator()(const char** p, const char* end) {
+ base_icu::UChar32 c;
+ int offset = 0;
+ CBU8_NEXT(*p, offset, end - *p, c);
+ *p += offset;
+ return c;
+ }
+};
+
+bool MatchPattern(const std::string& eval,
+ const std::string& pattern) {
+ return MatchPatternT(eval.data(), eval.data() + eval.size(),
+ pattern.data(), pattern.data() + pattern.size(),
+ 0, NextCharUTF8());
+}
diff --git a/src/base/strings/string_util.h b/src/base/strings/string_util.h
new file mode 100644
index 0000000..ffc1579
--- /dev/null
+++ b/src/base/strings/string_util.h
@@ -0,0 +1,20 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This file defines utility functions for working with strings.
+
+#ifndef BASE_STRINGS_STRING_UTIL_H_
+#define BASE_STRINGS_STRING_UTIL_H_
+
+#include "base/basictypes.h"
+
+// Returns true if the string passed in matches the pattern. The pattern
+// string can contain wildcards like * and ?
+// The backslash character (\) is an escape character for * and ?
+// We limit the patterns to having a max of 16 * or ? characters.
+// ? matches 0 or 1 character, while * matches 0 or more characters.
+bool MatchPattern(const std::string& string,
+ const std::string& pattern);
+
+#endif // BASE_STRINGS_STRING_UTIL_H_
diff --git a/src/base/strings/utf_string_conversion_utils.cc b/src/base/strings/utf_string_conversion_utils.cc
new file mode 100644
index 0000000..e71605b
--- /dev/null
+++ b/src/base/strings/utf_string_conversion_utils.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace url {
+namespace base {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+bool ReadUnicodeCharacter(const char* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point_out) {
+ // U8_NEXT expects to be able to use -1 to signal an error, so we must
+ // use a signed type for code_point. But this function returns false
+ // on error anyway, so code_point_out is unsigned.
+ int32 code_point;
+ CBU8_NEXT(src, *char_index, src_len, code_point);
+ *code_point_out = static_cast<uint32>(code_point);
+
+ // The ICU macro above moves to the next char, we want to point to the last
+ // char consumed.
+ (*char_index)--;
+
+ // Validate the decoded value.
+ return IsValidCodepoint(code_point);
+}
+
+bool ReadUnicodeCharacter(const char16* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point) {
+ if (CBU16_IS_SURROGATE(src[*char_index])) {
+ if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
+ *char_index + 1 >= src_len ||
+ !CBU16_IS_TRAIL(src[*char_index + 1])) {
+ // Invalid surrogate pair.
+ return false;
+ }
+
+ // Valid surrogate pair.
+ *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
+ src[*char_index + 1]);
+ (*char_index)++;
+ } else {
+ // Not a surrogate, just one 16-bit word.
+ *code_point = src[*char_index];
+ }
+
+ return IsValidCodepoint(*code_point);
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+bool ReadUnicodeCharacter(const wchar_t* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point) {
+ // Conversion is easy since the source is 32-bit.
+ *code_point = src[*char_index];
+
+ // Validate the value.
+ return IsValidCodepoint(*code_point);
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+ if (code_point <= 0x7f) {
+ // Fast path the common case of one byte.
+ output->push_back(static_cast<char>(code_point));
+ return 1;
+ }
+
+
+ // CBU8_APPEND_UNSAFE can append up to 4 bytes.
+ size_t char_offset = output->length();
+ size_t original_char_offset = char_offset;
+ output->resize(char_offset + CBU8_MAX_LENGTH);
+
+ CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+
+ // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
+ // it will represent the new length of the string.
+ output->resize(char_offset);
+ return char_offset - original_char_offset;
+}
+
+size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
+ if (CBU16_LENGTH(code_point) == 1) {
+ // This code point is in the Basic Multilingual Plane (BMP).
+ output->push_back(static_cast<char16>(code_point));
+ return 1;
+ }
+ // Non-BMP characters use a double-character encoding.
+ size_t char_offset = output->length();
+ output->resize(char_offset + CBU16_MAX_LENGTH);
+ CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ return CBU16_MAX_LENGTH;
+}
+
+// Generalized Unicode converter -----------------------------------------------
+
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src,
+ size_t src_len,
+ std::string* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (src[0] < 0x80) {
+ // Assume that the entire input will be ASCII.
+ output->reserve(src_len);
+ } else {
+ // Assume that the entire input is non-ASCII and will have 3 bytes per char.
+ output->reserve(src_len * 3);
+ }
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
+template void PrepareForUTF8Output(const char16*, size_t, std::string*);
+
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src,
+ size_t src_len,
+ STRING* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (static_cast<unsigned char>(src[0]) < 0x80) {
+ // Assume the input is all ASCII, which means 1:1 correspondence.
+ output->reserve(src_len);
+ } else {
+ // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
+ // character.
+ output->reserve(src_len / 2);
+ }
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
+template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
+
+} // namespace base
+} // namespace url
diff --git a/src/base/strings/utf_string_conversion_utils.h b/src/base/strings/utf_string_conversion_utils.h
new file mode 100644
index 0000000..b24f03b
--- /dev/null
+++ b/src/base/strings/utf_string_conversion_utils.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+
+// This should only be used by the various UTF string conversion files.
+
+#include "base/strings/string16.h"
+
+namespace url {
+namespace base {
+
+inline bool IsValidCodepoint(uint32 code_point) {
+ // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
+ // codepoints larger than 0x10FFFF (the highest codepoint allowed).
+ // Non-characters and unassigned codepoints are allowed.
+ return code_point < 0xD800u ||
+ (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
+inline bool IsValidCharacter(uint32 code_point) {
+ // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
+ // 0xFFFE or 0xFFFF) from the set of valid code points.
+ return code_point < 0xD800u || (code_point >= 0xE000u &&
+ code_point < 0xFDD0u) || (code_point > 0xFDEFu &&
+ code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
+}
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Reads a UTF-8 stream, placing the next code point into the given output
+// |*code_point|. |src| represents the entire string to read, and |*char_index|
+// is the character offset within the string to start reading at. |*char_index|
+// will be updated to index the last character read, such that incrementing it
+// (as in a for loop) will take the reader to the next character.
+//
+// Returns true on success. On false, |*code_point| will be invalid.
+bool ReadUnicodeCharacter(const char* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point_out);
+
+// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
+bool ReadUnicodeCharacter(const char16* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Reads UTF-32 character. The usage is the same as the 8-bit version above.
+bool ReadUnicodeCharacter(const wchar_t* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point);
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends a UTF-8 character to the given 8-bit string. Returns the number of
+// bytes written.
+// TODO(brettw) Bug 79631: This function should not be exposed.
+size_t WriteUnicodeCharacter(uint32 code_point,
+ std::string* output);
+
+// Appends the given code point as a UTF-16 character to the given 16-bit
+// string. Returns the number of 16-bit values written.
+size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Appends the given UTF-32 character to the given 32-bit string. Returns the
+// number of 32-bit values written.
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+ // This is the easy case, just append the character.
+ output->push_back(code_point);
+ return 1;
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Guesses the length of the output in UTF-8 in bytes, clears that output
+// string, and reserves that amount of space. We assume that the input
+// character types are unsigned, which will be true for UTF-16 and -32 on our
+// systems.
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
+
+// Prepares an output buffer (containing either UTF-16 or -32 data) given some
+// UTF-8 input that will be converted to it. See PrepareForUTF8Output().
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
+
+} // namespace base
+} // namespace url
+
+#endif // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
diff --git a/src/base/third_party/icu/LICENSE b/src/base/third_party/icu/LICENSE
new file mode 100644
index 0000000..40282f4
--- /dev/null
+++ b/src/base/third_party/icu/LICENSE
@@ -0,0 +1,32 @@
+ICU License - ICU 1.8.1 and later
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2009 International Business Machines Corporation and others
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
diff --git a/src/base/third_party/icu/icu_utf.cc b/src/base/third_party/icu/icu_utf.cc
new file mode 100644
index 0000000..55edce1
--- /dev/null
+++ b/src/base/third_party/icu/icu_utf.cc
@@ -0,0 +1,230 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 1999-2006, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: utf_impl.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep13
+* created by: Markus W. Scherer
+*
+* This file provides implementation functions for macros in the utfXX.h
+* that would otherwise be too long as macros.
+*/
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base_icu {
+
+/**
+ * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
+ * which need 1 or 2 bytes in UTF-8:
+ * \code
+ * U+0015 = NAK = Negative Acknowledge, C0 control character
+ * U+009f = highest C1 control character
+ * \endcode
+ *
+ * These are used by UTF8_..._SAFE macros so that they can return an error value
+ * that needs the same number of code units (bytes) as were seen by
+ * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF8_ERROR_VALUE_1 0x15
+
+/**
+ * See documentation on UTF8_ERROR_VALUE_1 for details.
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF8_ERROR_VALUE_2 0x9f
+
+
+/**
+ * Error value for all UTFs. This code point value will be set by macros with error
+ * checking if an error is detected.
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF_ERROR_VALUE 0xffff
+
+/*
+ * This table could be replaced on many machines by
+ * a few lines of assembler code using an
+ * "index of first 0-bit from msb" instruction and
+ * one or two more integer instructions.
+ *
+ * For example, on an i386, do something like
+ * - MOV AL, leadByte
+ * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
+ * - MOV AH, 0
+ * - BSR BX, AX (16-bit)
+ * - MOV AX, 6 (result)
+ * - JZ finish (ZF==1 if leadByte==0xff)
+ * - SUB AX, BX (result)
+ * -finish:
+ * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
+ *
+ * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
+ * lead bytes above 0xf4 are illegal.
+ * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
+ */
+const uint8
+utf8_countTrailBytes[256]={
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3,
+ 3, 3, 3, /* illegal in Unicode */
+ 4, 4, 4, 4, /* illegal in Unicode */
+ 5, 5, /* illegal in Unicode */
+ 0, 0 /* illegal bytes 0xfe and 0xff */
+};
+
+static const UChar32
+utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
+
+static const UChar32
+utf8_errorValue[6]={
+ CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff,
+ 0x3ffffff, 0x7fffffff
+};
+
+/*
+ * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
+ * UTF8_NEXT_CHAR_SAFE().
+ *
+ * The "strict" parameter controls the error behavior:
+ * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
+ * code point result.
+ * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
+ * All illegal byte sequences yield a positive code point such that this
+ * result code point would be encoded with the same number of bytes as
+ * the illegal sequence.
+ * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
+ * Same as the obsolete "safe" behavior, but non-characters are also treated
+ * like illegal sequences.
+ *
+ * The special negative (<0) value -2 is used for lenient treatment of surrogate
+ * code points as legal. Some implementations use this for roundtripping of
+ * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
+ * contain unpaired surrogates.
+ *
+ * Note that a UBool is the same as an int8_t.
+ */
+UChar32
+utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict) {
+ int32 i=*pi;
+ uint8 count=CBU8_COUNT_TRAIL_BYTES(c);
+ if((i)+count<=(length)) {
+ uint8 trail, illegal=0;
+
+ CBU8_MASK_LEAD_BYTE((c), count);
+ /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+ switch(count) {
+ /* each branch falls through to the next one */
+ case 5:
+ case 4:
+ /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+ illegal=1;
+ break;
+ case 3:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ if(c<0x110) {
+ illegal|=(trail&0xc0)^0x80;
+ } else {
+ /* code point>0x10ffff, outside Unicode */
+ illegal=1;
+ break;
+ }
+ FALLTHROUGH_INTENDED;
+ case 2:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ illegal|=(trail&0xc0)^0x80;
+ FALLTHROUGH_INTENDED;
+ case 1:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ illegal|=(trail&0xc0)^0x80;
+ break;
+ case 0:
+ if(strict>=0) {
+ return CBUTF8_ERROR_VALUE_1;
+ } else {
+ return CBU_SENTINEL;
+ }
+ /* no default branch to optimize switch() - all values are covered */
+ }
+
+ /*
+ * All the error handling should return a value
+ * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
+ *
+ * Starting with Unicode 3.0.1, non-shortest forms are illegal.
+ * Starting with Unicode 3.2, surrogate code points must not be
+ * encoded in UTF-8, and there are no irregular sequences any more.
+ *
+ * U8_ macros (new in ICU 2.4) return negative values for error conditions.
+ */
+
+ /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+ /* illegal is also set if count>=4 */
+ if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict!=-2)) {
+ /* error handling */
+ uint8 errorCount=count;
+ /* don't go beyond this sequence */
+ i=*pi;
+ while(count>0 && CBU8_IS_TRAIL(s[i])) {
+ ++(i);
+ --count;
+ }
+ if(strict>=0) {
+ c=utf8_errorValue[errorCount-count];
+ } else {
+ c=CBU_SENTINEL;
+ }
+ } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) {
+ /* strict: forbid non-characters like U+fffe */
+ c=utf8_errorValue[count];
+ }
+ } else /* too few bytes left */ {
+ /* error handling */
+ int32 i0=i;
+ /* don't just set (i)=(length) in case there is an illegal sequence */
+ while((i)<(length) && CBU8_IS_TRAIL(s[i])) {
+ ++(i);
+ }
+ if(strict>=0) {
+ c=utf8_errorValue[i-i0];
+ } else {
+ c=CBU_SENTINEL;
+ }
+ }
+ *pi=i;
+ return c;
+}
+
+} // namespace base_icu
diff --git a/src/base/third_party/icu/icu_utf.h b/src/base/third_party/icu/icu_utf.h
new file mode 100644
index 0000000..2b993b0
--- /dev/null
+++ b/src/base/third_party/icu/icu_utf.h
@@ -0,0 +1,391 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: utf.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep09
+* created by: Markus W. Scherer
+*/
+
+#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+
+#include "base/basictypes.h"
+
+namespace base_icu {
+
+typedef int32 UChar32;
+typedef uint16 UChar;
+typedef int8 UBool;
+
+// General ---------------------------------------------------------------------
+// from utf.h
+
+/**
+ * This value is intended for sentinel values for APIs that
+ * (take or) return single code points (UChar32).
+ * It is outside of the Unicode code point range 0..0x10ffff.
+ *
+ * For example, a "done" or "error" value in a new API
+ * could be indicated with CBU_SENTINEL.
+ *
+ * ICU APIs designed before ICU 2.4 usually define service-specific "done"
+ * values, mostly 0xffff.
+ * Those may need to be distinguished from
+ * actual U+ffff text contents by calling functions like
+ * CharacterIterator::hasNext() or UnicodeString::length().
+ *
+ * @return -1
+ * @see UChar32
+ * @stable ICU 2.4
+ */
+#define CBU_SENTINEL (-1)
+
+/**
+ * Is this code point a Unicode noncharacter?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_NONCHAR(c) \
+ ((c)>=0xfdd0 && \
+ ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
+ (uint32)(c)<=0x10ffff)
+
+/**
+ * Is c a Unicode code point value (0..U+10ffff)
+ * that can be assigned a character?
+ *
+ * Code points that are not characters include:
+ * - single surrogate code points (U+d800..U+dfff, 2048 code points)
+ * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
+ * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
+ * - the highest Unicode code point value is U+10ffff
+ *
+ * This means that all code points below U+d800 are character code points,
+ * and that boundary is tested first for performance.
+ *
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_CHAR(c) \
+ ((uint32)(c)<0xd800 || \
+ ((uint32)(c)>0xdfff && \
+ (uint32)(c)<=0x10ffff && \
+ !CBU_IS_UNICODE_NONCHAR(c)))
+
+/**
+ * Is this code point a surrogate (U+d800..U+dfff)?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
+
+/**
+ * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+
+// UTF-8 macros ----------------------------------------------------------------
+// from utf8.h
+
+extern const uint8 utf8_countTrailBytes[256];
+
+/**
+ * Count the trail bytes for a UTF-8 lead byte.
+ * @internal
+ */
+#define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)leadByte])
+
+/**
+ * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
+ * @internal
+ */
+#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+
+/**
+ * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_SINGLE(c) (((c)&0x80)==0)
+
+/**
+ * Is this code unit (byte) a UTF-8 lead byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e)
+
+/**
+ * Is this code unit (byte) a UTF-8 trail byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+
+/**
+ * How many code units (bytes) are used for the UTF-8 encoding
+ * of this Unicode code point?
+ * @param c 32-bit code point
+ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+ * @stable ICU 2.4
+ */
+#define CBU8_LENGTH(c) \
+ ((uint32)(c)<=0x7f ? 1 : \
+ ((uint32)(c)<=0x7ff ? 2 : \
+ ((uint32)(c)<=0xd7ff ? 3 : \
+ ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \
+ ((uint32)(c)<=0xffff ? 3 : 4)\
+ ) \
+ ) \
+ ) \
+ )
+
+/**
+ * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+ * @return 4
+ * @stable ICU 2.4
+ */
+#define CBU8_MAX_LENGTH 4
+
+/**
+ * Function for handling "next code point" with error-checking.
+ * @internal
+ */
+UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict);
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to a negative value.
+ *
+ * @param s const uint8 * string
+ * @param i string offset, i<length
+ * @param length string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see CBU8_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU8_NEXT(s, i, length, c) { \
+ (c)=(s)[(i)++]; \
+ if(((uint8)(c))>=0x80) { \
+ if(CBU8_IS_LEAD(c)) { \
+ (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)s, &(i), (int32)(length), c, -1); \
+ } else { \
+ (c)=CBU_SENTINEL; \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const uint8 * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see CBU8_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU8_APPEND_UNSAFE(s, i, c) { \
+ if((uint32)(c)<=0x7f) { \
+ (s)[(i)++]=(uint8)(c); \
+ } else { \
+ if((uint32)(c)<=0x7ff) { \
+ (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \
+ } else { \
+ if((uint32)(c)<=0xffff) { \
+ (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \
+ } else { \
+ (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \
+ (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \
+ } \
+}
+
+// UTF-16 macros ---------------------------------------------------------------
+// from utf16.h
+
+/**
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c)
+
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+
+/**
+ * Is this code unit a surrogate (U+d800..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
+
+/**
+ * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+/**
+ * Helper constant for CBU16_GET_SUPPLEMENTARY.
+ * @internal
+ */
+#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+
+/**
+ * Get a supplementary code point value (U+10000..U+10ffff)
+ * from its lead and trail surrogates.
+ * The result is undefined if the input values are not
+ * lead and trail surrogates.
+ *
+ * @param lead lead surrogate (U+d800..U+dbff)
+ * @param trail trail surrogate (U+dc00..U+dfff)
+ * @return supplementary code point (U+10000..U+10ffff)
+ * @stable ICU 2.4
+ */
+#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
+ (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET)
+
+
+/**
+ * Get the lead surrogate (0xd800..0xdbff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return lead surrogate (U+d800..U+dbff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_LEAD(supplementary) \
+ (base_icu::UChar)(((supplementary)>>10)+0xd7c0)
+
+/**
+ * Get the trail surrogate (0xdc00..0xdfff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return trail surrogate (U+dc00..U+dfff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_TRAIL(supplementary) \
+ (base_icu::UChar)(((supplementary)&0x3ff)|0xdc00)
+
+/**
+ * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
+ * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
+ * @param c 32-bit code point
+ * @return 1 or 2
+ * @stable ICU 2.4
+ */
+#define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2)
+
+/**
+ * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+ * @return 2
+ * @stable ICU 2.4
+ */
+#define CBU16_MAX_LENGTH 2
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate or
+ * to a single, unpaired lead surrogate, then that itself
+ * will be returned as the code point.
+ *
+ * @param s const UChar * string
+ * @param i string offset, i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @stable ICU 2.4
+ */
+#define CBU16_NEXT(s, i, length, c) { \
+ (c)=(s)[(i)++]; \
+ if(CBU16_IS_LEAD(c)) { \
+ uint16 __c2; \
+ if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 or 2 code units.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const UChar * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see CBU16_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU16_APPEND_UNSAFE(s, i, c) { \
+ if((uint32)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16)(c); \
+ } else { \
+ (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \
+ } \
+}
+
+} // namespace base_icu
+
+#endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
diff --git a/src/build/build_config.h b/src/build/build_config.h
new file mode 100644
index 0000000..b07660d
--- /dev/null
+++ b/src/build/build_config.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This file adds defines about the platform we're currently building on.
+// Operating System:
+// OS_WIN / OS_MACOSX / OS_LINUX / OS_POSIX (MACOSX or LINUX) /
+// OS_NACL (NACL_SFI or NACL_NONSFI) / OS_NACL_SFI / OS_NACL_NONSFI
+// Compiler:
+// COMPILER_MSVC / COMPILER_GCC
+// Processor:
+// ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64)
+// ARCH_CPU_32_BITS / ARCH_CPU_64_BITS
+
+#ifndef BUILD_BUILD_CONFIG_H_
+#define BUILD_BUILD_CONFIG_H_
+
+// A set of macros to use for platform detection.
+#if defined(__native_client__)
+// __native_client__ must be first, so that other OS_ defines are not set.
+#define OS_NACL 1
+// OS_NACL comes in two sandboxing technology flavors, SFI or Non-SFI.
+// PNaCl toolchain defines __native_client_nonsfi__ macro in Non-SFI build
+// mode, while it does not in SFI build mode.
+#if defined(__native_client_nonsfi__)
+#define OS_NACL_NONSFI
+#else
+#define OS_NACL_SFI
+#endif
+#elif defined(ANDROID)
+#define OS_ANDROID 1
+#elif defined(__APPLE__)
+// Only include TargetConditionals.h after testing ANDROID, as some Android
+// builds on Mac don't have this header available and it's not needed unless
+// the target is really mac/ios.
+#include <TargetConditionals.h>
+#define OS_MACOSX 1
+#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#define OS_IOS 1
+#endif // defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#elif defined(__linux__)
+#define OS_LINUX 1
+// include a system header to pull in features.h for glibc/uclibc macros.
+#include <unistd.h>
+#if defined(__GLIBC__) && !defined(__UCLIBC__)
+// we really are using glibc, not uClibc pretending to be glibc
+#define LIBC_GLIBC 1
+#endif
+#elif defined(_WIN32)
+#define OS_WIN 1
+#define TOOLKIT_VIEWS 1
+#elif defined(__FreeBSD__)
+#define OS_FREEBSD 1
+#elif defined(__OpenBSD__)
+#define OS_OPENBSD 1
+#elif defined(__sun)
+#define OS_SOLARIS 1
+#elif defined(__QNXNTO__)
+#define OS_QNX 1
+#else
+#error Please add support for your platform in build/build_config.h
+#endif
+
+#if defined(USE_OPENSSL) && defined(USE_NSS)
+#error Cannot use both OpenSSL and NSS
+#endif
+
+// For access to standard BSD features, use OS_BSD instead of a
+// more specific macro.
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD)
+#define OS_BSD 1
+#endif
+
+// For access to standard POSIXish features, use OS_POSIX instead of a
+// more specific macro.
+#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD) || \
+ defined(OS_OPENBSD) || defined(OS_SOLARIS) || defined(OS_ANDROID) || \
+ defined(OS_NACL) || defined(OS_QNX)
+#define OS_POSIX 1
+#endif
+
+// Use tcmalloc
+#if (defined(OS_WIN) || defined(OS_LINUX) || defined(OS_ANDROID)) && \
+ !defined(NO_TCMALLOC)
+#define USE_TCMALLOC 1
+#endif
+
+// Compiler detection.
+#if defined(__GNUC__)
+#define COMPILER_GCC 1
+#elif defined(_MSC_VER)
+#define COMPILER_MSVC 1
+#else
+#error Please add support for your compiler in build/build_config.h
+#endif
+
+// Processor architecture detection. For more info on what's defined, see:
+// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+// http://www.agner.org/optimize/calling_conventions.pdf
+// or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(_M_IX86) || defined(__i386__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__aarch64__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARM64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__pnacl__)
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__MIPSEL__)
+#if defined(__LP64__)
+#define ARCH_CPU_MIPS64_FAMILY 1
+#define ARCH_CPU_MIPS64EL 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#else
+#define ARCH_CPU_MIPS_FAMILY 1
+#define ARCH_CPU_MIPSEL 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#endif
+#else
+#error Please add support for your architecture in build/build_config.h
+#endif
+
+// Type detection for wchar_t.
+#if defined(OS_WIN)
+#define WCHAR_T_IS_UTF16
+#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
+ defined(__WCHAR_MAX__) && \
+ (__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff)
+#define WCHAR_T_IS_UTF32
+#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
+ defined(__WCHAR_MAX__) && \
+ (__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff)
+// On Posix, we'll detect short wchar_t, but projects aren't guaranteed to
+// compile in this mode (in particular, Chrome doesn't). This is intended for
+// other projects using base who manage their own dependencies and make sure
+// short wchar works for them.
+#define WCHAR_T_IS_UTF16
+#else
+#error Please add support for your compiler in build/build_config.h
+#endif
+
+#if defined(OS_ANDROID)
+// The compiler thinks std::string::const_iterator and "const char*" are
+// equivalent types.
+#define STD_STRING_ITERATOR_IS_CHAR_POINTER
+// The compiler thinks base::string16::const_iterator and "char16*" are
+// equivalent types.
+#define BASE_STRING16_ITERATOR_IS_CHAR16_POINTER
+#endif
+
+#endif // BUILD_BUILD_CONFIG_H_
diff --git a/src/url/gurl.cc b/src/url/gurl.cc
new file mode 100644
index 0000000..6801dda
--- /dev/null
+++ b/src/url/gurl.cc
@@ -0,0 +1,549 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+#include <algorithm>
+#include <ostream>
+
+#include "url/gurl.h"
+
+#include "base/logging.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_util.h"
+
+namespace {
+
+static std::string* empty_string = NULL;
+static GURL* empty_gurl = NULL;
+
+#ifdef WIN32
+
+// Returns a static reference to an empty string for returning a reference
+// when there is no underlying string.
+const std::string& EmptyStringForGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ if (!empty_string) {
+ // Create the string. Be careful that we don't break in the case that this
+ // is being called from multiple threads. Statics are not threadsafe.
+ std::string* new_empty_string = new std::string;
+ if (InterlockedCompareExchangePointer(
+ reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
+ // The old value was non-NULL, so no replacement was done. Another
+ // thread did the initialization out from under us.
+ delete new_empty_string;
+ }
+ }
+ return *empty_string;
+}
+
+#else
+
+static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
+static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
+
+void EmptyStringForGURLOnce(void) {
+ empty_string = new std::string;
+}
+
+const std::string& EmptyStringForGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ pthread_once(&empty_string_once, EmptyStringForGURLOnce);
+ return *empty_string;
+}
+
+#endif // WIN32
+
+} // namespace
+
+GURL::GURL() : is_valid_(false) {
+}
+
+GURL::GURL(const GURL& other)
+ : spec_(other.spec_),
+ is_valid_(other.is_valid_),
+ parsed_(other.parsed_) {
+ if (other.inner_url_)
+ inner_url_.reset(new GURL(*other.inner_url_));
+ // Valid filesystem urls should always have an inner_url_.
+ DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+}
+
+GURL::GURL(const std::string& url_string) {
+ InitCanonical(url_string, true);
+}
+
+GURL::GURL(const url::base::string16& url_string) {
+ InitCanonical(url_string, true);
+}
+
+GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
+ InitCanonical(url_string, false);
+}
+
+GURL::GURL(const char* canonical_spec,
+ size_t canonical_spec_len,
+ const url::Parsed& parsed,
+ bool is_valid)
+ : spec_(canonical_spec, canonical_spec_len),
+ is_valid_(is_valid),
+ parsed_(parsed) {
+ InitializeFromCanonicalSpec();
+}
+
+GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid)
+ : is_valid_(is_valid),
+ parsed_(parsed) {
+ spec_.swap(canonical_spec);
+ InitializeFromCanonicalSpec();
+}
+
+template<typename STR>
+void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
+ // Reserve enough room in the output for the input, plus some extra so that
+ // we have room if we have to escape a few things without reallocating.
+ spec_.reserve(input_spec.size() + 32);
+ url::StdStringCanonOutput output(&spec_);
+ is_valid_ = url::Canonicalize(
+ input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
+ NULL, &output, &parsed_);
+
+ output.Complete(); // Must be done before using string.
+ if (is_valid_ && SchemeIsFileSystem()) {
+ inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
+ *parsed_.inner_parsed(), true));
+ }
+}
+
+void GURL::InitializeFromCanonicalSpec() {
+ if (is_valid_ && SchemeIsFileSystem()) {
+ inner_url_.reset(
+ new GURL(spec_.data(), parsed_.Length(),
+ *parsed_.inner_parsed(), true));
+ }
+
+#ifndef NDEBUG
+ // For testing purposes, check that the parsed canonical URL is identical to
+ // what we would have produced. Skip the check for invalid URLs: they have no
+ // meaning, and we can't always canonicalize them reproducibly.
+ if (is_valid_) {
+ url::Component scheme;
+ // We can't do this check on the inner_url of a filesystem URL, as
+ // canonical_spec actually points to the start of the outer URL, so we'd
+ // end up with infinite recursion in this constructor.
+ if (!url::FindAndCompareScheme(spec_.data(), spec_.length(),
+ url::kFileSystemScheme, &scheme) ||
+ scheme.begin == parsed_.scheme.begin) {
+ // We need to retain trailing whitespace on path URLs, as the |parsed_|
+ // spec we originally received may legitimately contain trailing white-
+ // space on the path or components e.g. if the #ref has been
+ // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
+ GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
+
+ DCHECK(test_url.is_valid_ == is_valid_);
+ DCHECK(test_url.spec_ == spec_);
+
+ DCHECK(test_url.parsed_.scheme == parsed_.scheme);
+ DCHECK(test_url.parsed_.username == parsed_.username);
+ DCHECK(test_url.parsed_.password == parsed_.password);
+ DCHECK(test_url.parsed_.host == parsed_.host);
+ DCHECK(test_url.parsed_.port == parsed_.port);
+ DCHECK(test_url.parsed_.path == parsed_.path);
+ DCHECK(test_url.parsed_.query == parsed_.query);
+ DCHECK(test_url.parsed_.ref == parsed_.ref);
+ }
+ }
+#endif
+}
+
+GURL::~GURL() {
+}
+
+GURL& GURL::operator=(GURL other) {
+ Swap(&other);
+ return *this;
+}
+
+const std::string& GURL::spec() const {
+ if (is_valid_ || spec_.empty())
+ return spec_;
+
+ DCHECK(false) << "Trying to get the spec of an invalid URL!";
+ return EmptyStringForGURL();
+}
+
+bool GURL::operator==(const GURL& other) const {
+ return spec_ == other.spec_;
+}
+
+bool GURL::operator!=(const GURL& other) const {
+ return spec_ != other.spec_;
+}
+
+bool GURL::operator<(const GURL& other) const {
+ return spec_ < other.spec_;
+}
+
+bool GURL::operator>(const GURL& other) const {
+ return spec_ > other.spec_;
+}
+
+GURL GURL::Resolve(const std::string& relative) const {
+ return ResolveWithCharsetConverter(relative, NULL);
+}
+GURL GURL::Resolve(const url::base::string16& relative) const {
+ return ResolveWithCharsetConverter(relative, NULL);
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::ResolveWithCharsetConverter(
+ const std::string& relative,
+ url::CharsetConverter* charset_converter) const {
+ // Not allowed for invalid URLs.
+ if (!is_valid_)
+ return GURL();
+
+ GURL result;
+
+ // Reserve enough room in the output for the input, plus some extra so that
+ // we have room if we have to escape a few things without reallocating.
+ result.spec_.reserve(spec_.size() + 32);
+ url::StdStringCanonOutput output(&result.spec_);
+
+ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
+ parsed_, relative.data(),
+ static_cast<int>(relative.length()),
+ charset_converter, &output, &result.parsed_)) {
+ // Error resolving, return an empty URL.
+ return GURL();
+ }
+
+ output.Complete();
+ result.is_valid_ = true;
+ if (result.SchemeIsFileSystem()) {
+ result.inner_url_.reset(
+ new GURL(result.spec_.data(), result.parsed_.Length(),
+ *result.parsed_.inner_parsed(), true));
+ }
+ return result;
+}
+
+// Note: code duplicated above (it's inconvenient to use a template here).
+GURL GURL::ResolveWithCharsetConverter(
+ const url::base::string16& relative,
+ url::CharsetConverter* charset_converter) const {
+ // Not allowed for invalid URLs.
+ if (!is_valid_)
+ return GURL();
+
+ GURL result;
+
+ // Reserve enough room in the output for the input, plus some extra so that
+ // we have room if we have to escape a few things without reallocating.
+ result.spec_.reserve(spec_.size() + 32);
+ url::StdStringCanonOutput output(&result.spec_);
+
+ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
+ parsed_, relative.data(),
+ static_cast<int>(relative.length()),
+ charset_converter, &output, &result.parsed_)) {
+ // Error resolving, return an empty URL.
+ return GURL();
+ }
+
+ output.Complete();
+ result.is_valid_ = true;
+ if (result.SchemeIsFileSystem()) {
+ result.inner_url_.reset(
+ new GURL(result.spec_.data(), result.parsed_.Length(),
+ *result.parsed_.inner_parsed(), true));
+ }
+ return result;
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::ReplaceComponents(
+    const url::Replacements<char>& replacements) const {
+  // Not allowed for invalid URLs; the result is the empty URL.
+  if (!is_valid_)
+    return GURL();
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  GURL result;
+  result.spec_.reserve(spec_.size() + 32);
+  url::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();  // Must be done before using result.spec_.
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // result.parsed_ indexes the new spec, so build the inner URL from it.
+    result.inner_url_.reset(new GURL(result.spec_.data(),
+        result.parsed_.Length(), *result.parsed_.inner_parsed(), true));
+  }
+  return result;
+}
+
+// Note: code duplicated above (it's inconvenient to use a template here).
+GURL GURL::ReplaceComponents(
+    const url::Replacements<url::base::char16>& replacements) const {
+  // Not allowed for invalid URLs; the result is the empty URL.
+  if (!is_valid_)
+    return GURL();
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  GURL result;
+  result.spec_.reserve(spec_.size() + 32);
+  url::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();  // Must be done before using result.spec_.
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // result.parsed_ indexes the new spec, so build the inner URL from it.
+    result.inner_url_.reset(new GURL(result.spec_.data(),
+        result.parsed_.Length(), *result.parsed_.inner_parsed(), true));
+  }
+  return result;
+}
+
+GURL GURL::GetOrigin() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ if (SchemeIsFileSystem())
+ return inner_url_->GetOrigin();
+
+ url::Replacements<char> replacements;
+ replacements.ClearUsername();
+ replacements.ClearPassword();
+ replacements.ClearPath();
+ replacements.ClearQuery();
+ replacements.ClearRef();
+
+ return ReplaceComponents(replacements);
+}
+
+GURL GURL::GetAsReferrer() const {
+ if (!is_valid_ || !SchemeIsHTTPOrHTTPS())
+ return GURL();
+
+ if (!has_ref() && !has_username() && !has_password())
+ return GURL(*this);
+
+ url::Replacements<char> replacements;
+ replacements.ClearRef();
+ replacements.ClearUsername();
+ replacements.ClearPassword();
+ return ReplaceComponents(replacements);
+}
+
+GURL GURL::GetWithEmptyPath() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL.
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ // We could optimize this since we know that the URL is canonical, and we are
+ // appending a canonical path, so avoiding re-parsing.
+ GURL other(*this);
+ if (parsed_.path.len == 0)
+ return other;
+
+ // Clear everything after the path.
+ other.parsed_.query.reset();
+ other.parsed_.ref.reset();
+
+ // Set the path, since the path is longer than one, we can just set the
+ // first character and resize.
+ other.spec_[other.parsed_.path.begin] = '/';
+ other.parsed_.path.len = 1;
+ other.spec_.resize(other.parsed_.path.begin + 1);
+ return other;
+}
+
+bool GURL::IsStandard() const {
+ return url::IsStandard(spec_.data(), parsed_.scheme);
+}
+
+bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
+ if (parsed_.scheme.len <= 0)  // No scheme: matches only a NULL argument.
+ return lower_ascii_scheme == NULL;
+ return url::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
+ spec_.data() + parsed_.scheme.end(),
+ lower_ascii_scheme);
+}
+
+bool GURL::SchemeIsHTTPOrHTTPS() const {
+ return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
+}
+
+bool GURL::SchemeIsWSOrWSS() const {
+ return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
+}
+
+int GURL::IntPort() const {
+ if (parsed_.port.is_nonempty())
+ return url::ParsePort(spec_.data(), parsed_.port);
+ return url::PORT_UNSPECIFIED;
+}
+
+int GURL::EffectiveIntPort() const {
+ int int_port = IntPort();
+ if (int_port == url::PORT_UNSPECIFIED && IsStandard())
+ return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
+ parsed_.scheme.len);
+ return int_port;
+}
+
+std::string GURL::ExtractFileName() const {
+ url::Component file_component;
+ url::ExtractFileName(spec_.data(), parsed_.path, &file_component);
+ return ComponentString(file_component);
+}
+
+std::string GURL::PathForRequest() const {
+ DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
+ if (parsed_.ref.len >= 0) {
+ // Clip off the reference when it exists. The reference starts after the #
+ // sign, so we have to subtract one to also remove it.
+ return std::string(spec_, parsed_.path.begin,
+ parsed_.ref.begin - parsed_.path.begin - 1);
+ }
+ // Compute the actual path length, rather than depending on the spec's
+ // terminator. If we're an inner_url, our spec continues on into our outer
+ // url's path/query/ref.
+ int path_len = parsed_.path.len;
+ if (parsed_.query.is_valid())
+ path_len = parsed_.query.end() - parsed_.path.begin;
+
+ return std::string(spec_, parsed_.path.begin, path_len);
+}
+
+std::string GURL::HostNoBrackets() const {
+ // If host looks like an IPv6 literal, strip the square brackets.
+ url::Component h(parsed_.host);
+ if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
+ h.begin++;
+ h.len -= 2;
+ }
+ return ComponentString(h);
+}
+
+std::string GURL::GetContent() const {
+ return is_valid_ ? ComponentString(parsed_.GetContent()) : std::string();
+}
+
+bool GURL::HostIsIPAddress() const {
+ if (!is_valid_ || spec_.empty())
+ return false;
+
+ url::RawCanonOutputT<char, 128> ignored_output;
+ url::CanonHostInfo host_info;
+ url::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, &ignored_output,
+ &host_info);
+ return host_info.IsIPAddress();
+}
+
+#ifdef WIN32
+
+const GURL& GURL::EmptyGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ if (!empty_gurl) {
+ // Create the GURL. Be careful that we don't break in the case that this
+ // is being called from multiple threads.
+ GURL* new_empty_gurl = new GURL;
+ if (InterlockedCompareExchangePointer(
+ reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
+ // The old value was non-NULL, so no replacement was done. Another
+ // thread did the initialization out from under us.
+ delete new_empty_gurl;
+ }
+ }
+ return *empty_gurl;
+}
+
+#else
+
+void EmptyGURLOnce(void) {
+ empty_gurl = new GURL;
+}
+
+const GURL& GURL::EmptyGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ pthread_once(&empty_gurl_once, EmptyGURLOnce);
+ return *empty_gurl;
+}
+
+#endif // WIN32
+
+bool GURL::DomainIs(const char* lower_ascii_domain,
+                    int domain_len) const {
+  // Return false if this URL is not valid or the domain is empty/negative.
+  if (!is_valid_ || domain_len <= 0)
+    return false;
+
+  // FileSystem URLs have empty parsed_.host, so check this first.
+  if (SchemeIsFileSystem() && inner_url_)
+    return inner_url_->DomainIs(lower_ascii_domain, domain_len);
+
+  if (!parsed_.host.is_nonempty())
+    return false;
+
+  // Check whether the host name ends with a dot. If so, treat it the same
+  // as the dot-less form, unless the domain being compared against also
+  // ends with a dot.
+  const char* last_pos = spec_.data() + parsed_.host.end() - 1;
+  int host_len = parsed_.host.len;
+  if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
+    last_pos--;
+    host_len--;
+  }
+
+  // Return false if the host is shorter than the domain.
+  if (host_len < domain_len)
+    return false;
+
+  // Compare the trailing |domain_len| characters of the host to the domain.
+  const char* start_pos = spec_.data() + parsed_.host.begin +
+                          host_len - domain_len;
+
+  if (!url::LowerCaseEqualsASCII(start_pos,
+                                 last_pos + 1,
+                                 lower_ascii_domain,
+                                 lower_ascii_domain + domain_len))
+    return false;
+
+  // If the domain does not start with a dot, make sure the match begins at a
+  // label boundary. For example www.google.com has domain "google.com" but
+  // www.iamnotgoogle.com does not.
+  if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
+      '.' != *(start_pos - 1))
+    return false;
+
+  return true;
+}
+
+void GURL::Swap(GURL* other) {
+ spec_.swap(other->spec_);
+ std::swap(is_valid_, other->is_valid_);
+ std::swap(parsed_, other->parsed_);
+ inner_url_.swap(other->inner_url_);
+}
+
+std::ostream& operator<<(std::ostream& out, const GURL& url) {
+ return out << url.possibly_invalid_spec();
+}
diff --git a/googleurl/src/gurl.h b/src/url/gurl.h
similarity index 69%
rename from googleurl/src/gurl.h
rename to src/url/gurl.h
index 29fea81..16d9a2a 100644
--- a/googleurl/src/gurl.h
+++ b/src/url/gurl.h
@@ -1,55 +1,32 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#ifndef GOOGLEURL_SRC_GURL_H__
-#define GOOGLEURL_SRC_GURL_H__
+#ifndef URL_GURL_H_
+#define URL_GURL_H_
-#include <iostream>
+#include <iosfwd>
+#include <memory>
#include <string>
-#include "base/string16.h"
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_stdstring.h"
-#include "googleurl/src/url_common.h"
-#include "googleurl/src/url_parse.h"
+#include "base/strings/string16.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_export.h"
+#include "url/url_parse.h"
-class GURL {
+class URL_EXPORT GURL {
public:
- typedef url_canon::StdStringReplacements<std::string> Replacements;
- typedef url_canon::StdStringReplacements<string16> ReplacementsW;
+ typedef url::StdStringReplacements<std::string> Replacements;
+ typedef url::StdStringReplacements<url::base::string16> ReplacementsW;
// Creates an empty, invalid URL.
- GURL_API GURL();
+ GURL();
// Copy construction is relatively inexpensive, with most of the time going
// to reallocating the string. It does not re-parse.
- GURL_API GURL(const GURL& other);
+ GURL(const GURL& other);
// The narrow version requires the input be UTF-8. Invalid UTF-8 input will
// result in an invalid URL.
@@ -58,16 +35,27 @@
// encode the query parameters. It is probably sufficient for the narrow
// version to assume the query parameter encoding should be the same as the
// input encoding.
- GURL_API explicit GURL(const std::string& url_string
- /*, output_param_encoding*/);
- GURL_API explicit GURL(const string16& url_string
- /*, output_param_encoding*/);
+ explicit GURL(const std::string& url_string /*, output_param_encoding*/);
+ explicit GURL(const url::base::string16& url_string /*, output_param_encoding*/);
// Constructor for URLs that have already been parsed and canonicalized. This
// is used for conversions from KURL, for example. The caller must supply all
// information associated with the URL, which must be correct and consistent.
- GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len,
- const url_parse::Parsed& parsed, bool is_valid);
+ GURL(const char* canonical_spec,
+ size_t canonical_spec_len,
+ const url::Parsed& parsed,
+ bool is_valid);
+ // Notice that we take the canonical_spec by value so that we can convert
+ // from WebURL without copying the string. When we call this constructor
+ // we pass in a temporary std::string, which lets the compiler skip the
+ // copy and just move the std::string into the function argument. In the
+ // implementation, we use swap to move the data into the GURL itself,
+ // which means we end up with zero copies.
+ GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid);
+
+ ~GURL();
+
+ GURL& operator=(GURL other);
// Returns true when this object represents a valid parsed URL. When not
// valid, other functions will still succeed, but you will not get canonical
@@ -99,7 +87,7 @@
// Used invalid_spec() below to get the unusable spec of an invalid URL. This
// separation is designed to prevent errors that may cause security problems
// that could result from the mistaken use of an invalid URL.
- GURL_API const std::string& spec() const;
+ const std::string& spec() const;
// Returns the potentially invalid spec for a the URL. This spec MUST NOT be
// modified or sent over the network. It is designed to be displayed in error
@@ -119,22 +107,17 @@
// or may not be valid. If you are using this to index into the spec, BE
// SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
// don't do anything "important" with invalid specs.
- const url_parse::Parsed& parsed_for_possibly_invalid_spec() const {
+ const url::Parsed& parsed_for_possibly_invalid_spec() const {
return parsed_;
}
// Defiant equality operator!
- bool operator==(const GURL& other) const {
- return spec_ == other.spec_;
- }
- bool operator!=(const GURL& other) const {
- return spec_ != other.spec_;
- }
+ bool operator==(const GURL& other) const;
+ bool operator!=(const GURL& other) const;
// Allows GURL to used as a key in STL (for example, a std::set or std::map).
- bool operator<(const GURL& other) const {
- return spec_ < other.spec_;
- }
+ bool operator<(const GURL& other) const;
+ bool operator>(const GURL& other) const;
// Resolves a URL that's possibly relative to this object's URL, and returns
// it. Absolute URLs are also handled according to the rules of URLs on web
@@ -151,8 +134,8 @@
//
// It is an error to resolve a URL relative to an invalid URL. The result
// will be the empty URL.
- GURL_API GURL Resolve(const std::string& relative) const;
- GURL_API GURL Resolve(const string16& relative) const;
+ GURL Resolve(const std::string& relative) const;
+ GURL Resolve(const url::base::string16& relative) const;
// Like Resolve() above but takes a character set encoder which will be used
// for any query text specified in the input. The charset converter parameter
@@ -161,12 +144,12 @@
// TODO(brettw): These should be replaced with versions that take something
// more friendly than a raw CharsetConverter (maybe like an ICU character set
// name).
- GURL_API GURL ResolveWithCharsetConverter(
+ GURL ResolveWithCharsetConverter(
const std::string& relative,
- url_canon::CharsetConverter* charset_converter) const;
- GURL_API GURL ResolveWithCharsetConverter(
- const string16& relative,
- url_canon::CharsetConverter* charset_converter) const;
+ url::CharsetConverter* charset_converter) const;
+ GURL ResolveWithCharsetConverter(
+ const url::base::string16& relative,
+ url::CharsetConverter* charset_converter) const;
// Creates a new GURL by replacing the current URL's components with the
// supplied versions. See the Replacements class in url_canon.h for more.
@@ -177,12 +160,11 @@
// It is an error to replace components of an invalid URL. The result will
// be the empty URL.
//
- // Note that we use the more general url_canon::Replacements type to give
+ // Note that we use the more general url::Replacements type to give
// callers extra flexibility rather than our override.
- GURL_API GURL ReplaceComponents(
- const url_canon::Replacements<char>& replacements) const;
- GURL_API GURL ReplaceComponents(
- const url_canon::Replacements<char16>& replacements) const;
+ GURL ReplaceComponents(const url::Replacements<char>& replacements) const;
+ GURL ReplaceComponents(
+ const url::Replacements<url::base::char16>& replacements) const;
// A helper function that is equivalent to replacing the path with a slash
// and clearing out everything after that. We sometimes need to know just the
@@ -193,7 +175,7 @@
//
// It is an error to get an empty path on an invalid URL. The result
// will be the empty URL.
- GURL_API GURL GetWithEmptyPath() const;
+ GURL GetWithEmptyPath() const;
// A helper function to return a GURL containing just the scheme, host,
// and port from a URL. Equivalent to clearing any username and password,
@@ -204,35 +186,63 @@
//
// It is an error to get the origin of an invalid URL. The result
// will be the empty URL.
- GURL_API GURL GetOrigin() const;
+ GURL GetOrigin() const;
+
+ // A helper function to return a GURL stripped from the elements that are not
+ // supposed to be sent as HTTP referrer: username, password and ref fragment.
+ // For invalid URLs or URLs that have no valid referrer, an empty URL will be
+ // returned.
+ GURL GetAsReferrer() const;
// Returns true if the scheme for the current URL is a known "standard"
// scheme. Standard schemes have an authority and a path section. This
- // includes file:, which some callers may want to filter out explicitly by
- // calling SchemeIsFile.
- GURL_API bool IsStandard() const;
+ // includes file: and filesystem:, which some callers may want to filter out
+ // explicitly by calling SchemeIsFile[System].
+ bool IsStandard() const;
// Returns true if the given parameter (should be lower-case ASCII to match
// the canonicalized scheme) is the scheme for this URL. This call is more
// efficient than getting the scheme and comparing it because no copies or
// object constructions are done.
- GURL_API bool SchemeIs(const char* lower_ascii_scheme) const;
+ bool SchemeIs(const char* lower_ascii_scheme) const;
+
+ // Returns true if the scheme is "http" or "https".
+ bool SchemeIsHTTPOrHTTPS() const;
+
+ // Returns true if the scheme is "ws" or "wss".
+ bool SchemeIsWSOrWSS() const;
// We often need to know if this is a file URL. File URLs are "standard", but
// are often treated separately by some programs.
bool SchemeIsFile() const {
- return SchemeIs("file");
+ return SchemeIs(url::kFileScheme);
+ }
+
+ // FileSystem URLs need to be treated differently in some cases.
+ bool SchemeIsFileSystem() const {
+ return SchemeIs(url::kFileSystemScheme);
}
// If the scheme indicates a secure connection
bool SchemeIsSecure() const {
- return SchemeIs("https");
+ return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme) ||
+ (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure());
}
+ // Returns true if the scheme is "blob".
+ bool SchemeIsBlob() const {
+ return SchemeIs(url::kBlobScheme);
+ }
+
+ // The "content" of the URL is everything after the scheme (skipping the
+ // scheme delimiting colon). It is an error to get the content of an invalid
+ // URL. The result will be an empty string.
+ std::string GetContent() const;
+
// Returns true if the hostname is an IP address. Note: this function isn't
// as cheap as a simple getter because it re-parses the hostname to verify.
// This currently identifies only IPv4 addresses (bug 822685).
- GURL_API bool HostIsIPAddress() const;
+ bool HostIsIPAddress() const;
// Getters for various components of the URL. The returned string will be
// empty if the component is empty or is not present.
@@ -298,24 +308,24 @@
// Returns a parsed version of the port. Can also be any of the special
// values defined in Parsed for ExtractPort.
- GURL_API int IntPort() const;
+ int IntPort() const;
// Returns the port number of the url, or the default port number.
// If the scheme has no concept of port (or unknown default) returns
// PORT_UNSPECIFIED.
- GURL_API int EffectiveIntPort() const;
+ int EffectiveIntPort() const;
// Extracts the filename portion of the path and returns it. The filename
// is everything after the last slash in the path. This may be empty.
- GURL_API std::string ExtractFileName() const;
+ std::string ExtractFileName() const;
// Returns the path that should be sent to the server. This is the path,
// parameter, and query portions of the URL. It is guaranteed to be ASCII.
- GURL_API std::string PathForRequest() const;
+ std::string PathForRequest() const;
// Returns the host, excluding the square brackets surrounding IPv6 address
// literals. This can be useful for passing to getaddrinfo().
- GURL_API std::string HostNoBrackets() const;
+ std::string HostNoBrackets() const;
// Returns true if this URL's host matches or is in the same domain as
// the given input string. For example if this URL was "www.google.com",
@@ -327,7 +337,7 @@
//
// If function DomainIs has parameter domain_len, which means the parameter
// lower_ascii_domain does not gurantee to terminate with NULL character.
- GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+ bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
// If function DomainIs only has parameter lower_ascii_domain, which means
// domain string should be terminate with NULL character.
@@ -338,16 +348,35 @@
// Swaps the contents of this GURL object with the argument without doing
// any memory allocations.
- GURL_API void Swap(GURL* other);
+ void Swap(GURL* other);
// Returns a reference to a singleton empty GURL. This object is for callers
// who return references but don't have anything to return in some cases.
// This function may be called from any thread.
- GURL_API static const GURL& EmptyGURL();
+ static const GURL& EmptyGURL();
+
+ // Returns the inner URL of a nested URL [currently only non-null for
+ // filesystem: URLs].
+ const GURL* inner_url() const {
+ return inner_url_.get();
+ }
private:
+ // Variant of the string parsing constructor that allows the caller to elect
+ // to retain trailing whitespace, if any, on the passed URL spec, but only if
+ // the scheme is one that allows trailing whitespace. The primary use-case is
+ // for data: URLs. In most cases, you want to use the single parameter
+ // constructor above.
+ enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+ GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+ template<typename STR>
+ void InitCanonical(const STR& input_spec, bool trim_path_end);
+
+ void InitializeFromCanonicalSpec();
+
// Returns the substring of the input identified by the given component.
- std::string ComponentString(const url_parse::Component& comp) const {
+ std::string ComponentString(const url::Component& comp) const {
if (comp.len <= 0)
return std::string();
return std::string(spec_, comp.begin, comp.len);
@@ -362,14 +391,15 @@
bool is_valid_;
// Identified components of the canonical spec.
- url_parse::Parsed parsed_;
+ url::Parsed parsed_;
+
+ // Used for nested schemes [currently only filesystem:].
+ std::unique_ptr<GURL> inner_url_;
// TODO bug 684583: Add encoding for query params.
};
// Stream operator so GURL can be used in assertion statements.
-inline std::ostream& operator<<(std::ostream& out, const GURL& url) {
- return out << url.possibly_invalid_spec();
-}
+URL_EXPORT std::ostream& operator<<(std::ostream& out, const GURL& url);
-#endif // GOOGLEURL_SRC_GURL_H__
+#endif // URL_GURL_H_
diff --git a/src/url/gurl_unittest.cc b/src/url/gurl_unittest.cc
new file mode 100644
index 0000000..112ee5f
--- /dev/null
+++ b/src/url/gurl_unittest.cc
@@ -0,0 +1,642 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/macros.h"
+#include "testing/base/public/gunit.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+using test_utils::WStringToUTF16;
+using test_utils::ConvertUTF8ToUTF16;
+
+namespace {
+
+template<typename CHAR>
+void SetupReplacement(
+ void (Replacements<CHAR>::*func)(const CHAR*, const Component&),
+ Replacements<CHAR>* replacements,
+ const CHAR* str) {
+ if (str) {
+ Component comp;
+ if (str[0])
+ comp.len = static_cast<int>(strlen(str));
+ (replacements->*func)(str, comp);
+ }
+}
+
+// Returns the canonicalized string for the given URL string for the
+// GURLTest.Types test.
+std::string TypesTestCase(const char* src) {
+ GURL gurl(src);
+ return gurl.possibly_invalid_spec();
+}
+
+} // namespace
+
+// Different types of URLs should be handled differently, and handed off to
+// different canonicalizers.
+TEST(GURLTest, Types) {
+ // URLs with unknown schemes should be treated as path URLs, even when they
+ // have things like "://".
+ EXPECT_EQ("something:///HOSTNAME.com/",
+ TypesTestCase("something:///HOSTNAME.com/"));
+
+ // In the reverse, known schemes should always trigger standard URL handling.
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
+
+#ifdef WIN32
+ // URLs that look like absolute Windows drive specs.
+ EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
+ EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
+#endif
+}
+
+// Test the basic creation and querying of components in a GURL. We assume
+// the parser is already tested and works, so we are mostly interested if the
+// object does the right thing with the results.
+TEST(GURLTest, Components) {
+ GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+ EXPECT_TRUE(url.is_valid());
+ EXPECT_TRUE(url.SchemeIs("http"));
+ EXPECT_FALSE(url.SchemeIsFile());
+
+ // This is the narrow version of the URL, which should match the wide input.
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec());
+
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("user", url.username());
+ EXPECT_EQ("pass", url.password());
+ EXPECT_EQ("google.com", url.host());
+ EXPECT_EQ("99", url.port());
+ EXPECT_EQ(99, url.IntPort());
+ EXPECT_EQ("/foo;bar", url.path());
+ EXPECT_EQ("q=a", url.query());
+ EXPECT_EQ("ref", url.ref());
+
+ // Test parsing userinfo with special characters.
+ GURL url_special_pass("http://user:%40!$&'()*+,;=:@google.com:12345");
+ EXPECT_TRUE(url_special_pass.is_valid());
+ // GURL canonicalizes some delimiters.
+ EXPECT_EQ("%40!$&%27()*+,%3B%3D%3A", url_special_pass.password());
+ EXPECT_EQ("google.com", url_special_pass.host());
+ EXPECT_EQ("12345", url_special_pass.port());
+}
+
+TEST(GURLTest, Empty) {
+ GURL url;
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("", url.spec());
+
+ EXPECT_EQ("", url.scheme());
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("", url.host());
+ EXPECT_EQ("", url.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, url.IntPort());
+ EXPECT_EQ("", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+}
+
+TEST(GURLTest, Copy) {
+ GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+
+ GURL url2(url);
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("http", url2.scheme());
+ EXPECT_EQ("user", url2.username());
+ EXPECT_EQ("pass", url2.password());
+ EXPECT_EQ("google.com", url2.host());
+ EXPECT_EQ("99", url2.port());
+ EXPECT_EQ(99, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // Copying of invalid URL should be invalid
+ GURL invalid;
+ GURL invalid2(invalid);
+ EXPECT_FALSE(invalid2.is_valid());
+ EXPECT_EQ("", invalid2.spec());
+ EXPECT_EQ("", invalid2.scheme());
+ EXPECT_EQ("", invalid2.username());
+ EXPECT_EQ("", invalid2.password());
+ EXPECT_EQ("", invalid2.host());
+ EXPECT_EQ("", invalid2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort());
+ EXPECT_EQ("", invalid2.path());
+ EXPECT_EQ("", invalid2.query());
+ EXPECT_EQ("", invalid2.ref());
+}
+
+TEST(GURLTest, Assign) {
+ GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+
+ GURL url2;
+ url2 = url;
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("http", url2.scheme());
+ EXPECT_EQ("user", url2.username());
+ EXPECT_EQ("pass", url2.password());
+ EXPECT_EQ("google.com", url2.host());
+ EXPECT_EQ("99", url2.port());
+ EXPECT_EQ(99, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // Assignment of invalid URL should be invalid
+ GURL invalid;
+ GURL invalid2;
+ invalid2 = invalid;
+ EXPECT_FALSE(invalid2.is_valid());
+ EXPECT_EQ("", invalid2.spec());
+ EXPECT_EQ("", invalid2.scheme());
+ EXPECT_EQ("", invalid2.username());
+ EXPECT_EQ("", invalid2.password());
+ EXPECT_EQ("", invalid2.host());
+ EXPECT_EQ("", invalid2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort());
+ EXPECT_EQ("", invalid2.path());
+ EXPECT_EQ("", invalid2.query());
+ EXPECT_EQ("", invalid2.ref());
+}
+
+// This is a regression test for http://crbug.com/309975 .
+TEST(GURLTest, SelfAssign) {
+ GURL a("filesystem:http://example.com/temporary/");
+ // This should not crash.
+ a = a;
+}
+
+TEST(GURLTest, CopyFileSystem) {
+ GURL url(WStringToUTF16(L"filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref"));
+
+ GURL url2(url);
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("filesystem", url2.scheme());
+ EXPECT_EQ("", url2.username());
+ EXPECT_EQ("", url2.password());
+ EXPECT_EQ("", url2.host());
+ EXPECT_EQ("", url2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ const GURL* inner = url2.inner_url();
+ ASSERT_TRUE(inner);
+ EXPECT_EQ("https", inner->scheme());
+ EXPECT_EQ("user", inner->username());
+ EXPECT_EQ("pass", inner->password());
+ EXPECT_EQ("google.com", inner->host());
+ EXPECT_EQ("99", inner->port());
+ EXPECT_EQ(99, inner->IntPort());
+ EXPECT_EQ("/t", inner->path());
+ EXPECT_EQ("", inner->query());
+ EXPECT_EQ("", inner->ref());
+}
+
+TEST(GURLTest, IsValid) {
+ const char* valid_cases[] = {
+ "http://google.com",
+ "unknown://google.com",
+ "http://user:pass@google.com",
+ "http://google.com:12345",
+ "http://google.com/path",
+ "http://google.com//path",
+ "http://google.com?k=v#fragment",
+ "http://user:pass@google.com:12345/path?k=v#fragment",
+ "http:/path",
+ "http:path",
+ "://google.com",
+ };
+ for (size_t i = 0; i < arraysize(valid_cases); i++) {
+ EXPECT_TRUE(GURL(valid_cases[i]).is_valid())
+ << "Case: " << valid_cases[i];
+ }
+
+ const char* invalid_cases[] = {
+ "http://?k=v",
+ "http:://google.com",
+ "http//google.com",
+ "http://google.com:12three45",
+ "path",
+ };
+ for (size_t i = 0; i < arraysize(invalid_cases); i++) {
+ EXPECT_FALSE(GURL(invalid_cases[i]).is_valid())
+ << "Case: " << invalid_cases[i];
+ }
+}
+
+TEST(GURLTest, ExtraSlashesBeforeAuthority) {
+ // According to RFC 3986, the hier-part of a URI with an authority must use
+ // only two slashes. GURL intentionally ignores any additional slashes and
+ // parses the following part as an authority.
+ GURL url("http:///host");
+ EXPECT_EQ("host", url.host());
+ EXPECT_EQ("/", url.path());
+}
+
+// Given an invalid URL, we should still get most of the components.
+TEST(GURLTest, ComponentGettersWorkEvenForInvalidURL) {
+ GURL url("http:google.com:foo");
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec());
+
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("google.com", url.host());
+ EXPECT_EQ("foo", url.port());
+ EXPECT_EQ(PORT_INVALID, url.IntPort());
+ EXPECT_EQ("/", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+}
+
+TEST(GURLTest, Resolve) {
+  // The tricky cases for relative URL resolving are covered by the
+  // canonicalizer unit tests. Here we only check that GURL::Resolve works
+  // through both the 8-bit and the UTF-16 entry points.
+  struct ResolveCase {
+    const char* base;
+    const char* relative;
+    bool expected_valid;
+    const char* expected;
+  } resolve_cases[] = {
+    {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"},
+    {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"},
+    {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
+    {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
+    {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
+    // A non-standard base can be replaced with a standard absolute URL.
+    {"data:blahblah", "http://google.com/", true, "http://google.com/"},
+    {"data:blahblah", "http:google.com", true, "http://google.com/"},
+    // Filesystem URLs have different paths to test.
+    {"filesystem:http://www.google.com/type/", "foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+    {"filesystem:http://www.google.com/type/", "../foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+  };
+
+  for (size_t i = 0; i < arraysize(resolve_cases); i++) {
+    // 8-bit code path: 8-bit base resolved against an 8-bit relative string.
+    GURL input(resolve_cases[i].base);
+    GURL output = input.Resolve(resolve_cases[i].relative);
+    EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
+    EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL) << i;
+
+    // Wide code path: resolve against the base built from UTF-16 input.
+    GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base));
+    GURL outputw =
+        inputw.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative));
+    EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
+    EXPECT_EQ(outputw.SchemeIsFileSystem(), outputw.inner_url() != NULL) << i;
+  }
+}
+
+TEST(GURLTest, GetOrigin) {
+ struct TestCase {
+ const char* input;
+ const char* expected;
+ } cases[] = {
+ {"http://www.google.com", "http://www.google.com/"},
+ {"javascript:window.alert(\"hello,world\");", ""},
+ {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"},
+ {"http://user@www.google.com", "http://www.google.com/"},
+ {"http://:pass@www.google.com", "http://www.google.com/"},
+ {"http://:@www.google.com", "http://www.google.com/"},
+ {"filesystem:http://www.google.com/temp/foo?q#b", "http://www.google.com/"},
+ {"filesystem:http://user:pass@google.com:21/blah#baz", "http://google.com:21/"},
+ };
+ for (size_t i = 0; i < arraysize(cases); i++) {
+ GURL url(cases[i].input);
+ GURL origin = url.GetOrigin();
+ EXPECT_EQ(cases[i].expected, origin.spec());
+ }
+}
+
+TEST(GURLTest, GetAsReferrer) {
+ struct TestCase {
+ const char* input;
+ const char* expected;
+ } cases[] = {
+ {"http://www.google.com", "http://www.google.com/"},
+ {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/blah"},
+ {"http://user@www.google.com", "http://www.google.com/"},
+ {"http://:pass@www.google.com", "http://www.google.com/"},
+ {"http://:@www.google.com", "http://www.google.com/"},
+ {"http://www.google.com/temp/foo?q#b", "http://www.google.com/temp/foo?q"},
+ {"not a url", ""},
+ {"unknown-scheme://foo.html", ""},
+ {"file:///tmp/test.html", ""},
+ {"https://www.google.com", "https://www.google.com/"},
+ };
+ for (size_t i = 0; i < arraysize(cases); i++) {
+ GURL url(cases[i].input);
+ GURL origin = url.GetAsReferrer();
+ EXPECT_EQ(cases[i].expected, origin.spec());
+ }
+}
+
+TEST(GURLTest, GetWithEmptyPath) {
+ struct TestCase {
+ const char* input;
+ const char* expected;
+ } cases[] = {
+ {"http://www.google.com", "http://www.google.com/"},
+ {"javascript:window.alert(\"hello, world\");", ""},
+ {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"},
+ {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"},
+ {"filesystem:file:///temporary/bar.html?baz=22", "filesystem:file:///temporary/"},
+ };
+
+ for (size_t i = 0; i < arraysize(cases); i++) {
+ GURL url(cases[i].input);
+ GURL empty_path = url.GetWithEmptyPath();
+ EXPECT_EQ(cases[i].expected, empty_path.spec());
+ }
+}
+
+TEST(GURLTest, Replacements) {
+ // The url canonicalizer replacement test will handle most of these cases.
+ // The most important thing to do here is to check that the proper
+ // canonicalizer gets called based on the scheme of the input.
+ struct ReplaceCase {
+ const char* base;
+ const char* scheme;
+ const char* username;
+ const char* password;
+ const char* host;
+ const char* port;
+ const char* path;
+ const char* query;
+ const char* ref;
+ const char* expected;
+ } replace_cases[] = {
+ {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"},
+ {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
+ {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"},
+#ifdef WIN32
+ {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"},
+#endif
+ {"filesystem:http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "filesystem:http://www.google.com/foo/"},
+ };
+
+ for (size_t i = 0; i < arraysize(replace_cases); i++) {
+ const ReplaceCase& cur = replace_cases[i];
+ GURL url(cur.base);
+ GURL::Replacements repl;
+ SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme);
+ SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username);
+ SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password);
+ SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host);
+ SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port);
+ SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path);
+ SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query);
+ SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref);
+ GURL output = url.ReplaceComponents(repl);
+
+ EXPECT_EQ(replace_cases[i].expected, output.spec());
+ EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL);
+ }
+}
+
+TEST(GURLTest, ClearFragmentOnDataUrl) {
+ // http://crbug.com/291747 - a data URL may legitimately have trailing
+ // whitespace in the spec after the ref is cleared. Test this does not trigger
+ // the Parsed importing validation DCHECK in GURL.
+ GURL url(" data: one ? two # three ");
+
+ // By default the trailing whitespace will have been stripped.
+ EXPECT_EQ("data: one ? two # three", url.spec());
+ GURL::Replacements repl;
+ repl.ClearRef();
+ GURL url_no_ref = url.ReplaceComponents(repl);
+
+ EXPECT_EQ("data: one ? two ", url_no_ref.spec());
+
+ // Importing a parsed url via this constructor overload will retain trailing
+ // whitespace.
+ GURL import_url(url_no_ref.spec(),
+ url_no_ref.parsed_for_possibly_invalid_spec(),
+ url_no_ref.is_valid());
+ EXPECT_EQ(url_no_ref, import_url);
+ EXPECT_EQ(import_url.query(), " two ");
+}
+
+TEST(GURLTest, PathForRequest) {
+ struct TestCase {
+ const char* input;
+ const char* expected;
+ const char* inner_expected;
+ } cases[] = {
+ {"http://www.google.com", "/", NULL},
+ {"http://www.google.com/", "/", NULL},
+ {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22", NULL},
+ {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html", NULL},
+ {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query", NULL},
+ {"filesystem:http://www.google.com/temporary/foo/bar.html?query#ref", "/foo/bar.html?query", "/temporary"},
+ {"filesystem:http://www.google.com/temporary/foo/bar.html?query", "/foo/bar.html?query", "/temporary"},
+ };
+
+ for (size_t i = 0; i < arraysize(cases); i++) {
+ GURL url(cases[i].input);
+ std::string path_request = url.PathForRequest();
+ EXPECT_EQ(cases[i].expected, path_request);
+ EXPECT_EQ(cases[i].inner_expected == NULL, url.inner_url() == NULL);
+ if (url.inner_url() && cases[i].inner_expected)
+ EXPECT_EQ(cases[i].inner_expected, url.inner_url()->PathForRequest());
+ }
+}
+
+TEST(GURLTest, EffectiveIntPort) {
+ struct PortTest {
+ const char* spec;
+ int expected_int_port;
+ } port_tests[] = {
+ // http
+ {"http://www.google.com/", 80},
+ {"http://www.google.com:80/", 80},
+ {"http://www.google.com:443/", 443},
+
+ // https
+ {"https://www.google.com/", 443},
+ {"https://www.google.com:443/", 443},
+ {"https://www.google.com:80/", 80},
+
+ // ftp
+ {"ftp://www.google.com/", 21},
+ {"ftp://www.google.com:21/", 21},
+ {"ftp://www.google.com:80/", 80},
+
+ // gopher
+ {"gopher://www.google.com/", 70},
+ {"gopher://www.google.com:70/", 70},
+ {"gopher://www.google.com:80/", 80},
+
+ // file - no port
+ {"file://www.google.com/", PORT_UNSPECIFIED},
+ {"file://www.google.com:443/", PORT_UNSPECIFIED},
+
+ // data - no port
+ {"data:www.google.com:90", PORT_UNSPECIFIED},
+ {"data:www.google.com", PORT_UNSPECIFIED},
+
+ // filesystem - no port
+ {"filesystem:http://www.google.com:90/t/foo", PORT_UNSPECIFIED},
+ {"filesystem:file:///t/foo", PORT_UNSPECIFIED},
+ };
+
+ for (size_t i = 0; i < arraysize(port_tests); i++) {
+ GURL url(port_tests[i].spec);
+ EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort());
+ }
+}
+
+TEST(GURLTest, IPAddress) {
+ struct IPTest {
+ const char* spec;
+ bool expected_ip;
+ } ip_tests[] = {
+ {"http://www.google.com/", false},
+ {"http://192.168.9.1/", true},
+ {"http://192.168.9.1.2/", false},
+ {"http://192.168.m.1/", false},
+ {"http://2001:db8::1/", false},
+ {"http://[2001:db8::1]/", true},
+ {"", false},
+ {"some random input!", false},
+ };
+
+ for (size_t i = 0; i < arraysize(ip_tests); i++) {
+ GURL url(ip_tests[i].spec);
+ EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress());
+ }
+}
+
+TEST(GURLTest, HostNoBrackets) {
+ struct TestCase {
+ const char* input;
+ const char* expected_host;
+ const char* expected_plainhost;
+ } cases[] = {
+ {"http://www.google.com", "www.google.com", "www.google.com"},
+ {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"},
+ {"http://[::]/", "[::]", "::"},
+
+ // Don't require a valid URL, but don't crash either.
+ {"http://[]/", "[]", ""},
+ {"http://[x]/", "[x]", "x"},
+ {"http://[x/", "[x", "[x"},
+ {"http://x]/", "x]", "x]"},
+ {"http://[/", "[", "["},
+ {"http://]/", "]", "]"},
+ {"", "", ""},
+ };
+ for (size_t i = 0; i < arraysize(cases); i++) {
+ GURL url(cases[i].input);
+ EXPECT_EQ(cases[i].expected_host, url.host());
+ EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets());
+ }
+}
+
+TEST(GURLTest, DomainIs) {
+ const char google_domain[] = "google.com";
+
+ GURL url_1("http://www.google.com:99/foo");
+ EXPECT_TRUE(url_1.DomainIs(google_domain));
+
+ GURL url_2("http://google.com:99/foo");
+ EXPECT_TRUE(url_2.DomainIs(google_domain));
+
+ GURL url_3("http://google.com./foo");
+ EXPECT_TRUE(url_3.DomainIs(google_domain));
+
+ GURL url_4("http://google.com/foo");
+ EXPECT_FALSE(url_4.DomainIs("google.com."));
+
+ GURL url_5("http://google.com./foo");
+ EXPECT_TRUE(url_5.DomainIs("google.com."));
+
+ GURL url_6("http://www.google.com./foo");
+ EXPECT_TRUE(url_6.DomainIs(".com."));
+
+ GURL url_7("http://www.balabala.com/foo");
+ EXPECT_FALSE(url_7.DomainIs(google_domain));
+
+ GURL url_8("http://www.google.com.cn/foo");
+ EXPECT_FALSE(url_8.DomainIs(google_domain));
+
+ GURL url_9("http://www.iamnotgoogle.com/foo");
+ EXPECT_FALSE(url_9.DomainIs(google_domain));
+
+ GURL url_10("http://www.iamnotgoogle.com../foo");
+ EXPECT_FALSE(url_10.DomainIs(".com"));
+
+ GURL url_11("filesystem:http://www.google.com:99/foo/");
+ EXPECT_TRUE(url_11.DomainIs(google_domain));
+
+ GURL url_12("filesystem:http://www.iamnotgoogle.com/foo/");
+ EXPECT_FALSE(url_12.DomainIs(google_domain));
+}
+
+// Newlines should be stripped from inputs.
+TEST(GURLTest, Newlines) {
+ // Constructor.
+ GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
+ EXPECT_EQ("http://www.google.com/asdf", url_1.spec());
+
+ // Relative path resolver.
+ GURL url_2 = url_1.Resolve(" \n /fo\to\r ");
+ EXPECT_EQ("http://www.google.com/foo", url_2.spec());
+
+ // Note that newlines are NOT stripped from ReplaceComponents.
+}
+
+TEST(GURLTest, IsStandard) {
+ GURL a("http:foo/bar");
+ EXPECT_TRUE(a.IsStandard());
+
+ GURL b("foo:bar/baz");
+ EXPECT_FALSE(b.IsStandard());
+
+ GURL c("foo://bar/baz");
+ EXPECT_FALSE(c.IsStandard());
+}
+
+TEST(GURLTest, SchemeIsHTTPOrHTTPS) {
+ EXPECT_TRUE(GURL("http://bar/").SchemeIsHTTPOrHTTPS());
+ EXPECT_TRUE(GURL("HTTPS://BAR").SchemeIsHTTPOrHTTPS());
+ EXPECT_FALSE(GURL("ftp://bar/").SchemeIsHTTPOrHTTPS());
+}
+
+TEST(GURLTest, SchemeIsWSOrWSS) {
+ EXPECT_TRUE(GURL("WS://BAR/").SchemeIsWSOrWSS());
+ EXPECT_TRUE(GURL("wss://bar/").SchemeIsWSOrWSS());
+ EXPECT_FALSE(GURL("http://bar/").SchemeIsWSOrWSS());
+}
+
+TEST(GURLTest, SchemeIsBlob) {
+ EXPECT_TRUE(GURL("BLOB://BAR/").SchemeIsBlob());
+ EXPECT_TRUE(GURL("blob://bar/").SchemeIsBlob());
+ EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob());
+}
+
+} // namespace url
diff --git a/src/url/origin.cc b/src/url/origin.cc
new file mode 100644
index 0000000..fdb8913
--- /dev/null
+++ b/src/url/origin.cc
@@ -0,0 +1,20 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/origin.h"
+
+#include "base/logging.h"
+#include "base/strings/string_util.h"
+
+namespace url {
+
+Origin::Origin() : string_("null") {}
+
+Origin::Origin(const std::string& origin) : string_(origin) {
+ DCHECK(origin == "null" || MatchPattern(origin, "?*://?*"));
+ DCHECK_GT(origin.size(), 0u);
+ DCHECK(origin == "file://" || origin[origin.size() - 1] != '/');
+}
+
+} // namespace url
diff --git a/src/url/origin.h b/src/url/origin.h
new file mode 100644
index 0000000..777e4e1
--- /dev/null
+++ b/src/url/origin.h
@@ -0,0 +1,33 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_ORIGIN_H_
+#define URL_ORIGIN_H_
+
+#include <string>
+
+#include "url/url_export.h"
+
+namespace url {
+
+// Origin represents a Web Origin serialized to a string.
+// See RFC6454 for details.
+class URL_EXPORT Origin {
+ public:
+ Origin();
+ explicit Origin(const std::string& origin);
+
+ const std::string& string() const { return string_; }
+
+ bool IsSameAs(const Origin& that) const {
+ return string_ == that.string_;
+ }
+
+ private:
+ std::string string_;
+};
+
+} // namespace url
+
+#endif // URL_ORIGIN_H_
diff --git a/src/url/origin_unittest.cc b/src/url/origin_unittest.cc
new file mode 100644
index 0000000..910a1cf
--- /dev/null
+++ b/src/url/origin_unittest.cc
@@ -0,0 +1,41 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "testing/base/public/gunit.h"
+#include "url/origin.h"
+
+namespace url {
+
+namespace {
+
+// Each test checks that the Origin is constructed correctly without
+// violating DCHECKs.
+TEST(OriginTest, constructEmpty) {
+ Origin origin;
+ EXPECT_EQ("null", origin.string());
+}
+
+TEST(OriginTest, constructNull) {
+ Origin origin("null");
+ EXPECT_EQ("null", origin.string());
+}
+
+TEST(OriginTest, constructValidOrigin) {
+ Origin origin("http://example.com:8080");
+ EXPECT_EQ("http://example.com:8080", origin.string());
+}
+
+TEST(OriginTest, constructValidFileOrigin) {
+ Origin origin("file://");
+ EXPECT_EQ("file://", origin.string());
+}
+
+TEST(OriginTest, constructValidOriginWithoutPort) {
+ Origin origin("wss://example2.com");
+ EXPECT_EQ("wss://example2.com", origin.string());
+}
+
+} // namespace
+
+} // namespace url
diff --git a/googleurl/src/url_parse.cc b/src/url/third_party/mozilla/url_parse.cc
similarity index 73%
rename from googleurl/src/url_parse.cc
rename to src/url/third_party/mozilla/url_parse.cc
index a08c4da..211043c 100644
--- a/googleurl/src/url_parse.cc
+++ b/src/url/third_party/mozilla/url_parse.cc
@@ -34,19 +34,21 @@
*
* ***** END LICENSE BLOCK ***** */
-#include "googleurl/src/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
#include <stdlib.h>
#include "base/logging.h"
-#include "googleurl/src/url_parse_internal.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
-namespace url_parse {
+namespace url {
namespace {
// Returns true if the given character is a valid digit to use in a port.
-inline bool IsPortDigit(char16 ch) {
+inline bool IsPortDigit(base::char16 ch) {
return ch >= '0' && ch <= '9';
}
@@ -324,7 +326,7 @@
if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
after_scheme = parsed->scheme.end() + 1; // Skip past the colon.
} else {
- // Say there's no scheme when there is a colon. We could also say that
+ // Say there's no scheme when there is no colon. We could also say that
// everything is the scheme. Both would produce an invalid URL, but this way
// seems less wrong in more cases.
parsed->scheme.reset();
@@ -333,18 +335,20 @@
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
}
-// Initializes a path URL which is merely a scheme followed by a path. Examples
-// include "about:foo" and "javascript:alert('bar');"
+#ifndef NO_FILESYSTEMURL_SUPPORT
template<typename CHAR>
-void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
- // Get the non-path and non-scheme parts of the URL out of the way, we never
- // use them.
+void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+ DCHECK(spec_len >= 0);
+
+ // Get the unused parts of the URL out of the way.
parsed->username.reset();
parsed->password.reset();
parsed->host.reset();
parsed->port.reset();
- parsed->query.reset();
- parsed->ref.reset();
+ parsed->path.reset(); // May use this; reset for convenience.
+ parsed->ref.reset(); // May use this; reset for convenience.
+ parsed->query.reset(); // May use this; reset for convenience.
+ parsed->clear_inner_parsed(); // May use this; reset for convenience.
// Strip leading & trailing spaces and control characters.
int begin = 0;
@@ -353,28 +357,151 @@
// Handle empty specs or ones that contain only whitespace or control chars.
if (begin == spec_len) {
parsed->scheme.reset();
+ return;
+ }
+
+ int inner_start = -1;
+
+ // Extract the scheme. We also handle the case where there is no scheme.
+ if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+ // Offset the results since we gave ExtractScheme a substring.
+ parsed->scheme.begin += begin;
+
+ if (parsed->scheme.end() == spec_len - 1)
+ return;
+
+ inner_start = parsed->scheme.end() + 1;
+ } else {
+ // No scheme found; that's not valid for filesystem URLs.
+ parsed->scheme.reset();
+ return;
+ }
+
+ Component inner_scheme;
+ const CHAR* inner_spec = &spec[inner_start];
+ int inner_spec_len = spec_len - inner_start;
+
+ if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
+ // Offset the results since we gave ExtractScheme a substring.
+ inner_scheme.begin += inner_start;
+
+ if (inner_scheme.end() == spec_len - 1)
+ return;
+ } else {
+ // No scheme found; that's not valid for filesystem URLs.
+ // The best we can do is return "filesystem://".
+ return;
+ }
+
+ Parsed inner_parsed;
+
+ if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) {
+ // File URLs are special.
+ ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
+ } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) {
+ // Filesystem URLs don't nest.
+ return;
+ } else if (IsStandard(spec, inner_scheme)) {
+ // All "normal" URLs.
+ DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
+ } else {
+ return;
+ }
+
+ // All members of inner_parsed need to be offset by inner_start.
+ // If we had any scheme that supported nesting more than one level deep,
+ // we'd have to recurse into the inner_parsed's inner_parsed when
+ // adjusting by inner_start.
+ inner_parsed.scheme.begin += inner_start;
+ inner_parsed.username.begin += inner_start;
+ inner_parsed.password.begin += inner_start;
+ inner_parsed.host.begin += inner_start;
+ inner_parsed.port.begin += inner_start;
+ inner_parsed.query.begin += inner_start;
+ inner_parsed.ref.begin += inner_start;
+ inner_parsed.path.begin += inner_start;
+
+ // Query and ref move from inner_parsed to parsed.
+ parsed->query = inner_parsed.query;
+ inner_parsed.query.reset();
+ parsed->ref = inner_parsed.ref;
+ inner_parsed.ref.reset();
+
+ parsed->set_inner_parsed(inner_parsed);
+ if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
+ inner_parsed.inner_parsed()) {
+ return;
+ }
+
+ // The path in inner_parsed should start with a slash, then have a filesystem
+ // type followed by a slash. From the first slash up to but excluding the
+ // second should be what it keeps; the rest goes to parsed. If the path ends
+ // before the second slash, it's still pretty clear what the user meant, so
+ // we'll let that through.
+ if (!IsURLSlash(spec[inner_parsed.path.begin])) {
+ return;
+ }
+ int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash
+ while (inner_path_end < spec_len &&
+ !IsURLSlash(spec[inner_path_end]))
+ ++inner_path_end;
+ parsed->path.begin = inner_path_end;
+ int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
+ parsed->path.len = inner_parsed.path.len - new_inner_path_length;
+ parsed->inner_parsed()->path.len = new_inner_path_length;
+}
+#endif
+
+// Initializes a path URL which is merely a scheme followed by a path. Examples
+// include "about:foo" and "javascript:alert('bar');"
+template<typename CHAR>
+void DoParsePathURL(const CHAR* spec, int spec_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ // Get the non-path and non-scheme parts of the URL out of the way, we never
+ // use them.
+ parsed->username.reset();
+ parsed->password.reset();
+ parsed->host.reset();
+ parsed->port.reset();
+ parsed->path.reset();
+ parsed->query.reset();
+ parsed->ref.reset();
+
+ // Strip leading & trailing spaces and control characters.
+ int scheme_begin = 0;
+ TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
+
+ // Handle empty specs or ones that contain only whitespace or control chars.
+ if (scheme_begin == spec_len) {
+ parsed->scheme.reset();
parsed->path.reset();
return;
}
+ int path_begin;
// Extract the scheme, with the path being everything following. We also
// handle the case where there is no scheme.
- if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+ if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
+ &parsed->scheme)) {
// Offset the results since we gave ExtractScheme a substring.
- parsed->scheme.begin += begin;
-
- // For compatability with the standard URL parser, we treat no path as
- // -1, rather than having a length of 0 (we normally wouldn't care so
- // much for these non-standard URLs).
- if (parsed->scheme.end() == spec_len - 1)
- parsed->path.reset();
- else
- parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
+ parsed->scheme.begin += scheme_begin;
+ path_begin = parsed->scheme.end() + 1;
} else {
- // No scheme found, just path.
+ // No scheme case.
parsed->scheme.reset();
- parsed->path = MakeRange(begin, spec_len);
+ path_begin = scheme_begin;
}
+
+ if (path_begin == spec_len)
+ return;
+ DCHECK_LT(path_begin, spec_len);
+
+ ParsePath(spec,
+ MakeRange(path_begin, spec_len),
+ &parsed->path,
+ &parsed->query,
+ &parsed->ref);
}
template<typename CHAR>
@@ -496,23 +623,13 @@
return;
}
- // Search backwards for a parameter, which is a normally unused field in a
- // URL delimited by a semicolon. We parse the parameter as part of the
- // path, but here, we don't want to count it. The last semicolon is the
- // parameter. The path should start with a slash, so we don't need to check
- // the first one.
+ // Extract the filename range from the path which is between
+ // the last slash and the following semicolon.
int file_end = path.end();
- for (int i = path.end() - 1; i > path.begin; i--) {
+ for (int i = path.end() - 1; i >= path.begin; i--) {
if (spec[i] == ';') {
file_end = i;
- break;
- }
- }
-
- // Now search backwards from the filename end to the previous slash
- // to find the beginning of the filename.
- for (int i = file_end - 1; i >= path.begin; i--) {
- if (IsURLSlash(spec[i])) {
+ } else if (IsURLSlash(spec[i])) {
// File name is everything following this character to the end
*file_name = MakeRange(i + 1, file_end);
return;
@@ -559,12 +676,51 @@
cur++;
// Save the new query
- *query = url_parse::MakeRange(cur, end);
+ *query = MakeRange(cur, end);
return true;
}
} // namespace
+Parsed::Parsed() : inner_parsed_(NULL) {
+}
+
+Parsed::Parsed(const Parsed& other) :
+ scheme(other.scheme),
+ username(other.username),
+ password(other.password),
+ host(other.host),
+ port(other.port),
+ path(other.path),
+ query(other.query),
+ ref(other.ref),
+ inner_parsed_(NULL) {
+ if (other.inner_parsed_)
+ set_inner_parsed(*other.inner_parsed_);
+}
+
+Parsed& Parsed::operator=(const Parsed& other) {
+ if (this != &other) {
+ scheme = other.scheme;
+ username = other.username;
+ password = other.password;
+ host = other.host;
+ port = other.port;
+ path = other.path;
+ query = other.query;
+ ref = other.ref;
+ if (other.inner_parsed_)
+ set_inner_parsed(*other.inner_parsed_);
+ else
+ clear_inner_parsed();
+ }
+ return *this;
+}
+
+Parsed::~Parsed() {
+ delete inner_parsed_;
+}
+
int Parsed::Length() const {
if (ref.is_valid())
return ref.end();
@@ -634,18 +790,27 @@
return cur;
}
+Component Parsed::GetContent() const {
+ const int begin = CountCharactersBefore(USERNAME, false);
+ const int len = Length() - begin;
+ // For compatibility with the standard URL parser, we treat no content as
+ // -1, rather than having a length of 0 (we normally wouldn't care so
+ // much for these non-standard URLs).
+ return len ? Component(begin, len) : Component();
+}
+
bool ExtractScheme(const char* url, int url_len, Component* scheme) {
return DoExtractScheme(url, url_len, scheme);
}
-bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
+bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) {
return DoExtractScheme(url, url_len, scheme);
}
// This handles everything that may be an authority terminator, including
// backslash. For special backslash handling see DoParseAfterScheme.
-bool IsAuthorityTerminator(char16 ch) {
- return IsURLSlash(ch) || ch == '?' || ch == '#' || ch == ';';
+bool IsAuthorityTerminator(base::char16 ch) {
+ return IsURLSlash(ch) || ch == '?' || ch == '#';
}
void ExtractFileName(const char* url,
@@ -654,7 +819,7 @@
DoExtractFileName(url, path, file_name);
}
-void ExtractFileName(const char16* url,
+void ExtractFileName(const base::char16* url,
const Component& path,
Component* file_name) {
DoExtractFileName(url, path, file_name);
@@ -667,7 +832,7 @@
return DoExtractQueryKeyValue(url, query, key, value);
}
-bool ExtractQueryKeyValue(const char16* url,
+bool ExtractQueryKeyValue(const base::char16* url,
Component* query,
Component* key,
Component* value) {
@@ -683,7 +848,7 @@
DoParseAuthority(spec, auth, username, password, hostname, port_num);
}
-void ParseAuthority(const char16* spec,
+void ParseAuthority(const base::char16* spec,
const Component& auth,
Component* username,
Component* password,
@@ -696,7 +861,7 @@
return DoParsePort(url, port);
}
-int ParsePort(const char16* url, const Component& port) {
+int ParsePort(const base::char16* url, const Component& port) {
return DoParsePort(url, port);
}
@@ -704,23 +869,49 @@
DoParseStandardURL(url, url_len, parsed);
}
-void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) {
+void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) {
DoParseStandardURL(url, url_len, parsed);
}
-void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
- DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed);
}
-void ParsePathURL(const char16* url, int url_len, Parsed* parsed) {
- DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const base::char16* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed);
+}
+
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
+#ifndef NO_FILESYSTEMURL_SUPPORT
+ DoParseFileSystemURL(url, url_len, parsed);
+#else
+ // Should not reach here if the client doesn't want to support file system
+ // URL.
+ DCHECK(false);
+#endif
+}
+
+void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) {
+#ifndef NO_FILESYSTEMURL_SUPPORT
+ DoParseFileSystemURL(url, url_len, parsed);
+#else
+ // Should not reach here if the client doesn't want to support file system
+ // URL.
+ DCHECK(false);
+#endif
}
void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
DoParseMailtoURL(url, url_len, parsed);
}
-void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) {
+void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) {
DoParseMailtoURL(url, url_len, parsed);
}
@@ -732,7 +923,7 @@
ParsePath(spec, path, filepath, query, ref);
}
-void ParsePathInternal(const char16* spec,
+void ParsePathInternal(const base::char16* spec,
const Component& path,
Component* filepath,
Component* query,
@@ -747,11 +938,11 @@
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
}
-void ParseAfterScheme(const char16* spec,
+void ParseAfterScheme(const base::char16* spec,
int spec_len,
int after_scheme,
Parsed* parsed) {
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
}
-} // namespace url_parse
+} // namespace url
diff --git a/googleurl/src/url_parse.h b/src/url/third_party/mozilla/url_parse.h
similarity index 61%
rename from googleurl/src/url_parse.h
rename to src/url/third_party/mozilla/url_parse.h
index 134b445..71dbb78 100644
--- a/googleurl/src/url_parse.h
+++ b/src/url/third_party/mozilla/url_parse.h
@@ -1,46 +1,21 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#ifndef GOOGLEURL_SRC_URL_PARSE_H__
-#define GOOGLEURL_SRC_URL_PARSE_H__
+#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
+#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
#include <string>
#include "base/basictypes.h"
-#include "base/string16.h"
-#include "googleurl/src/url_common.h"
+#include "base/strings/string16.h"
+#include "url/url_export.h"
-namespace url_parse {
+namespace url {
// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
// KURLGoogle.cpp still rely on this type.
-typedef char16 UTF16Char;
+typedef base::char16 UTF16Char;
// Component ------------------------------------------------------------------
@@ -94,19 +69,19 @@
//
// Typical usage would be:
//
-// url_parse::Parsed parsed;
-// url_parse::Component scheme;
-// if (!url_parse::ExtractScheme(url, url_len, &scheme))
+// Parsed parsed;
+// Component scheme;
+// if (!ExtractScheme(url, url_len, &scheme))
// return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
//
// if (IsStandardScheme(url, scheme)) // Not provided by this component
-// url_parseParseStandardURL(url, url_len, &parsed);
+// ParseStandardURL(url, url_len, &parsed);
// else if (IsFileURL(url, scheme)) // Not provided by this component
-// url_parse::ParseFileURL(url, url_len, &parsed);
+// ParseFileURL(url, url_len, &parsed);
// else
-// url_parse::ParsePathURL(url, url_len, &parsed);
+// ParsePathURL(url, url_len, &parsed);
//
-struct Parsed {
+struct URL_EXPORT Parsed {
// Identifies different components.
enum ComponentType {
SCHEME,
@@ -119,8 +94,12 @@
REF,
};
- // The default constructor is sufficient for the components.
- Parsed() {}
+ // The default constructor is sufficient for the components, but inner_parsed_
+ // requires special handling.
+ Parsed();
+ Parsed(const Parsed&);
+ Parsed& operator=(const Parsed&);
+ ~Parsed();
// Returns the length of the URL (the end of the last component).
//
@@ -128,7 +107,7 @@
// of the string. For example "http://": the parsed structure will only
// contain an entry for the four-character scheme, and it doesn't know about
// the "://". For all other last-components, it will return the real length.
- GURL_API int Length() const;
+ int Length() const;
// Returns the number of characters before the given component if it exists,
// or where the component would be if it did exist. This will return the
@@ -156,8 +135,7 @@
// *QUERY: 14 15 <-
// *REF: 20 20
//
- GURL_API int CountCharactersBefore(ComponentType type,
- bool include_delimiter) const;
+ int CountCharactersBefore(ComponentType type, bool include_delimiter) const;
// Scheme without the colon: "http://foo"/ would have a scheme of "http".
// The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
@@ -181,10 +159,11 @@
// Port number.
Component port;
- // Path, this is everything following the host name. Length will be -1 if
- // unspecified. This includes the preceeding slash, so the path on
- // http://www.google.com/asdf" is "/asdf". As a result, it is impossible to
- // have a 0 length path, it will be -1 in cases like "http://host?foo".
+ // Path, this is everything following the host name, stopping at the query or
+ // ref delimiter (if any). Length will be -1 if unspecified. This includes
+ // the preceding slash, so the path on "http://www.google.com/asdf" is
+ // "/asdf". As a result, it is impossible to have a 0 length path, it will
+ // be -1 in cases like "http://host?foo".
// Note that we treat backslashes the same as slashes.
Component path;
@@ -198,6 +177,37 @@
// Length will be -1 if there is no hash sign, or 0 if there is one but
// nothing follows it.
Component ref;
+
+ // The URL spec from the character after the scheme: until the end of the
+ // URL, regardless of the scheme. This is mostly useful for 'opaque' non-
+ // hierarchical schemes like data: and javascript: as a convenient way to get
+ // the string with the scheme stripped off.
+ Component GetContent() const;
+
+ // This is used for nested URL types, currently only filesystem. If you
+ // parse a filesystem URL, the resulting Parsed will have a nested
+ // inner_parsed_ to hold the parsed inner URL's component information.
+ // For all other url types [including the inner URL], it will be NULL.
+ Parsed* inner_parsed() const {
+ return inner_parsed_;
+ }
+
+ void set_inner_parsed(const Parsed& inner_parsed) {
+ if (!inner_parsed_)
+ inner_parsed_ = new Parsed(inner_parsed);
+ else
+ *inner_parsed_ = inner_parsed;
+ }
+
+ void clear_inner_parsed() {
+ if (inner_parsed_) {
+ delete inner_parsed_;
+ inner_parsed_ = NULL;
+ }
+ }
+
+ private:
+ Parsed* inner_parsed_; // This object is owned and managed by this struct.
};
// Initialization functions ---------------------------------------------------
@@ -217,24 +227,46 @@
// StandardURL is for when the scheme is known to be one that has an
// authority (host) like "http". This function will not handle weird ones
// like "about:" and "javascript:", or do the right thing for "file:" URLs.
-GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
-GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParseStandardURL(const char* url,
+ int url_len,
+ Parsed* parsed);
+URL_EXPORT void ParseStandardURL(const base::char16* url,
+ int url_len,
+ Parsed* parsed);
// PathURL is for when the scheme is known not to have an authority (host)
// section but that aren't file URLs either. The scheme is parsed, and
// everything after the scheme is considered as the path. This is used for
// things like "about:" and "javascript:"
-GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
-GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed);
+URL_EXPORT void ParsePathURL(const base::char16* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed);
// FileURL is for file URLs. There are some special rules for interpreting
// these.
-GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
-GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParseFileURL(const base::char16* url,
+ int url_len,
+ Parsed* parsed);
+
+// Filesystem URLs are structured differently than other URLs.
+URL_EXPORT void ParseFileSystemURL(const char* url,
+ int url_len,
+ Parsed* parsed);
+URL_EXPORT void ParseFileSystemURL(const base::char16* url,
+ int url_len,
+ Parsed* parsed);
// MailtoURL is for mailto: urls. They are made up scheme,path,query
-GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
-GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParseMailtoURL(const base::char16* url,
+ int url_len,
+ Parsed* parsed);
// Helper functions -----------------------------------------------------------
@@ -258,27 +290,31 @@
// end of the string).
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
-GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
+URL_EXPORT bool ExtractScheme(const char* url,
+ int url_len,
+ Component* scheme);
+URL_EXPORT bool ExtractScheme(const base::char16* url,
+ int url_len,
+ Component* scheme);
// Returns true if ch is a character that terminates the authority segment
// of a URL.
-GURL_API bool IsAuthorityTerminator(char16 ch);
+URL_EXPORT bool IsAuthorityTerminator(base::char16 ch);
// Does a best effort parse of input |spec|, in range |auth|. If a particular
// component is not found, it will be set to invalid.
-GURL_API void ParseAuthority(const char* spec,
- const Component& auth,
- Component* username,
- Component* password,
- Component* hostname,
- Component* port_num);
-GURL_API void ParseAuthority(const char16* spec,
- const Component& auth,
- Component* username,
- Component* password,
- Component* hostname,
- Component* port_num);
+URL_EXPORT void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+URL_EXPORT void ParseAuthority(const base::char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
// Computes the integer port value from the given port component. The port
// component should have been identified by one of the init functions on
@@ -287,8 +323,8 @@
// The return value will be a positive integer between 0 and 64K, or one of
// the two special values below.
enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
-GURL_API int ParsePort(const char* url, const Component& port);
-GURL_API int ParsePort(const char16* url, const Component& port);
+URL_EXPORT int ParsePort(const char* url, const Component& port);
+URL_EXPORT int ParsePort(const base::char16* url, const Component& port);
// Extracts the range of the file name in the given url. The path must
// already have been computed by the parse function, and the matching URL
@@ -300,12 +336,12 @@
// following the last slash.
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API void ExtractFileName(const char* url,
- const Component& path,
- Component* file_name);
-GURL_API void ExtractFileName(const char16* url,
- const Component& path,
- Component* file_name);
+URL_EXPORT void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name);
+URL_EXPORT void ExtractFileName(const base::char16* url,
+ const Component& path,
+ Component* file_name);
// Extract the first key/value from the range defined by |*query|. Updates
// |*query| to start at the end of the extracted key/value pair. This is
@@ -322,15 +358,15 @@
//
// If no key/value are found |*key| and |*value| will be unchanged and it will
// return false.
-GURL_API bool ExtractQueryKeyValue(const char* url,
- Component* query,
- Component* key,
- Component* value);
-GURL_API bool ExtractQueryKeyValue(const char16* url,
- Component* query,
- Component* key,
- Component* value);
+URL_EXPORT bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value);
+URL_EXPORT bool ExtractQueryKeyValue(const base::char16* url,
+ Component* query,
+ Component* key,
+ Component* value);
-} // namespace url_parse
+} // namespace url
-#endif // GOOGLEURL_SRC_URL_PARSE_H__
+#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
diff --git a/googleurl/src/url_canon.h b/src/url/url_canon.h
similarity index 62%
rename from googleurl/src/url_canon.h
rename to src/url/url_canon.h
index e2cfb55..89e3509 100644
--- a/googleurl/src/url_canon.h
+++ b/src/url/url_canon.h
@@ -1,42 +1,18 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#ifndef GOOGLEURL_SRC_URL_CANON_H__
-#define GOOGLEURL_SRC_URL_CANON_H__
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#include <memory.h>
+#ifndef URL_URL_CANON_H_
+#define URL_URL_CANON_H_
+
#include <stdlib.h>
+#include <string.h>
-#include "base/string16.h"
-#include "googleurl/src/url_common.h"
-#include "googleurl/src/url_parse.h"
+#include "base/strings/string16.h"
+#include "url/url_export.h"
+#include "url/url_parse.h"
-namespace url_canon {
+namespace url {
// Canonicalizer output -------------------------------------------------------
@@ -65,13 +41,13 @@
// Accessor for returning a character at a given position. The input offset
// must be in the valid range.
- inline char at(int offset) const {
+ inline T at(int offset) const {
return buffer_[offset];
}
// Sets the character at the given position. The given position MUST be less
// than the length().
- inline void set(int offset, int ch) {
+ inline void set(int offset, T ch) {
buffer_[offset] = ch;
}
@@ -178,7 +154,7 @@
delete[] this->buffer_;
}
- virtual void Resize(int sz) {
+ void Resize(int sz) override {
T* new_buf = new T[sz];
memcpy(new_buf, this->buffer_,
sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));
@@ -196,12 +172,12 @@
// the templates so it can also be used internally if a wide buffer is
// required.
typedef CanonOutputT<char> CanonOutput;
-typedef CanonOutputT<char16> CanonOutputW;
+typedef CanonOutputT<base::char16> CanonOutputW;
template<int fixed_capacity>
class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
template<int fixed_capacity>
-class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
+class RawCanonOutputW : public RawCanonOutputT<base::char16, fixed_capacity> {};
// Character set converter ----------------------------------------------------
//
@@ -211,7 +187,7 @@
//
// Embedders will want to see the unit test for the ICU version.
-class CharsetConverter {
+class URL_EXPORT CharsetConverter {
public:
CharsetConverter() {}
virtual ~CharsetConverter() {}
@@ -227,7 +203,7 @@
// decimal, (such as "你") with escaping of the ampersand, number
// sign, and semicolon (in the previous example it would be
// "%26%2320320%3B"). This rule is based on what IE does in this situation.
- virtual void ConvertFromUTF16(const char16* input,
+ virtual void ConvertFromUTF16(const base::char16* input,
int input_len,
CanonOutput* output) = 0;
};
@@ -247,14 +223,16 @@
// required, the given |buffer| will be used and the returned pointer will
// point to the beginning of the buffer.
//
-// Therefore, callers should not use the buffer, since it may actuall be empty,
+// Therefore, callers should not use the buffer, since it may actually be empty,
// use the computed pointer and |*output_len| instead.
-GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
- CanonOutputT<char>* buffer,
- int* output_len);
-GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
- CanonOutputT<char16>* buffer,
+URL_EXPORT const char* RemoveURLWhitespace(const char* input, int input_len,
+ CanonOutputT<char>* buffer,
int* output_len);
+URL_EXPORT const base::char16* RemoveURLWhitespace(
+ const base::char16* input,
+ int input_len,
+ CanonOutputT<base::char16>* buffer,
+ int* output_len);
// IDN ------------------------------------------------------------------------
@@ -267,7 +245,9 @@
// the length of the output will be set to the length of the new host name.
//
// On error, returns false. The output in this case is undefined.
-GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
+URL_EXPORT bool IDNToASCII(const base::char16* src,
+ int src_len,
+ CanonOutputW* output);
// Piece-by-piece canonicalizers ----------------------------------------------
//
@@ -293,14 +273,14 @@
// URLs.
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API bool CanonicalizeScheme(const char* spec,
- const url_parse::Component& scheme,
- CanonOutput* output,
- url_parse::Component* out_scheme);
-GURL_API bool CanonicalizeScheme(const char16* spec,
- const url_parse::Component& scheme,
- CanonOutput* output,
- url_parse::Component* out_scheme);
+URL_EXPORT bool CanonicalizeScheme(const char* spec,
+ const Component& scheme,
+ CanonOutput* output,
+ Component* out_scheme);
+URL_EXPORT bool CanonicalizeScheme(const base::char16* spec,
+ const Component& scheme,
+ CanonOutput* output,
+ Component* out_scheme);
// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
@@ -312,21 +292,20 @@
// is legal as long as the two components don't overlap.
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API bool CanonicalizeUserInfo(const char* username_source,
- const url_parse::Component& username,
- const char* password_source,
- const url_parse::Component& password,
- CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password);
-GURL_API bool CanonicalizeUserInfo(const char16* username_source,
- const url_parse::Component& username,
- const char16* password_source,
- const url_parse::Component& password,
- CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password);
-
+URL_EXPORT bool CanonicalizeUserInfo(const char* username_source,
+ const Component& username,
+ const char* password_source,
+ const Component& password,
+ CanonOutput* output,
+ Component* out_username,
+ Component* out_password);
+URL_EXPORT bool CanonicalizeUserInfo(const base::char16* username_source,
+ const Component& username,
+ const base::char16* password_source,
+ const Component& password,
+ CanonOutput* output,
+ Component* out_username,
+ Component* out_password);
// This structure holds detailed state exported from the IP/Host canonicalizers.
// Additional fields may be added as callers require them.
@@ -359,7 +338,18 @@
// Location of host within the canonicalized output.
// CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
// CanonicalizeHostVerbose() always sets it.
- url_parse::Component out_host;
+ Component out_host;
+
+ // |address| contains the parsed IP Address (if any) in its first
+ // AddressLength() bytes, in network order. If IsIPAddress() is false,
+ // AddressLength() will return zero and the content of |address| is undefined.
+ unsigned char address[16];
+
+ // Convenience function to calculate the length of an IP address corresponding
+ // to the current IP version in |family|, if any. For use with |address|.
+ int AddressLength() const {
+ return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0);
+ }
};
@@ -367,28 +357,27 @@
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
-GURL_API bool CanonicalizeHost(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- url_parse::Component* out_host);
-GURL_API bool CanonicalizeHost(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- url_parse::Component* out_host);
+URL_EXPORT bool CanonicalizeHost(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ Component* out_host);
+URL_EXPORT bool CanonicalizeHost(const base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ Component* out_host);
// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
-GURL_API void CanonicalizeHostVerbose(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
-GURL_API void CanonicalizeHostVerbose(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
-
+URL_EXPORT void CanonicalizeHostVerbose(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+URL_EXPORT void CanonicalizeHostVerbose(const base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
// IP addresses.
//
@@ -400,34 +389,34 @@
// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
// the input is unescaped and name-prepped, etc. It should not normally be
// necessary or wise to call this directly.
-GURL_API void CanonicalizeIPAddress(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
-GURL_API void CanonicalizeIPAddress(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
+URL_EXPORT void CanonicalizeIPAddress(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+URL_EXPORT void CanonicalizeIPAddress(const base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
// Port: this function will add the colon for the port if a port is present.
-// The caller can pass url_parse::PORT_UNSPECIFIED as the
+// The caller can pass PORT_UNSPECIFIED as the
// default_port_for_scheme argument if there is no default port.
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API bool CanonicalizePort(const char* spec,
- const url_parse::Component& port,
- int default_port_for_scheme,
- CanonOutput* output,
- url_parse::Component* out_port);
-GURL_API bool CanonicalizePort(const char16* spec,
- const url_parse::Component& port,
- int default_port_for_scheme,
- CanonOutput* output,
- url_parse::Component* out_port);
+URL_EXPORT bool CanonicalizePort(const char* spec,
+ const Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ Component* out_port);
+URL_EXPORT bool CanonicalizePort(const base::char16* spec,
+ const Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ Component* out_port);
// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
// if the scheme is unknown.
-GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
+URL_EXPORT int DefaultPortForScheme(const char* scheme, int scheme_len);
// Path. If the input does not begin in a slash (including if the input is
// empty), we'll prepend a slash to the path to make it canonical.
@@ -438,14 +427,14 @@
// an issue. Somebody giving us an 8-bit path is responsible for generating
// the path that the server expects (we'll escape high-bit characters), so
// if something is invalid, it's their problem.
-GURL_API bool CanonicalizePath(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-GURL_API bool CanonicalizePath(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+URL_EXPORT bool CanonicalizePath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+URL_EXPORT bool CanonicalizePath(const base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
// Canonicalizes the input as a file path. This is like CanonicalizePath except
// that it also handles Windows drive specs. For example, the path can begin
@@ -453,14 +442,14 @@
// The string will be appended to |*output| and |*out_path| will be updated.
//
// The 8-bit version requires UTF-8 encoding.
-GURL_API bool FileCanonicalizePath(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-GURL_API bool FileCanonicalizePath(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+URL_EXPORT bool FileCanonicalizePath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+URL_EXPORT bool FileCanonicalizePath(const base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
// Query: Prepends the ? if needed.
//
@@ -474,16 +463,16 @@
// if necessary, for ASCII input, no conversions are necessary.
//
// The converter can be NULL. In this case, the output encoding will be UTF-8.
-GURL_API void CanonicalizeQuery(const char* spec,
- const url_parse::Component& query,
- CharsetConverter* converter,
- CanonOutput* output,
- url_parse::Component* out_query);
-GURL_API void CanonicalizeQuery(const char16* spec,
- const url_parse::Component& query,
- CharsetConverter* converter,
- CanonOutput* output,
- url_parse::Component* out_query);
+URL_EXPORT void CanonicalizeQuery(const char* spec,
+ const Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ Component* out_query);
+URL_EXPORT void CanonicalizeQuery(const base::char16* spec,
+ const Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ Component* out_query);
// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
// canonicalizer that does not produce ASCII output). The output is
@@ -491,14 +480,14 @@
//
// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
// the "Unicode replacement character" for the confusing bits and copy the rest.
-GURL_API void CanonicalizeRef(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-GURL_API void CanonicalizeRef(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+URL_EXPORT void CanonicalizeRef(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+URL_EXPORT void CanonicalizeRef(const base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
// Full canonicalizer ---------------------------------------------------------
//
@@ -511,71 +500,85 @@
// The 8-bit versions require UTF-8 encoding.
// Use for standard URLs with authorities and paths.
-GURL_API bool CanonicalizeStandardURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool CanonicalizeStandardURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeStandardURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeStandardURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Use for file URLs.
-GURL_API bool CanonicalizeFileURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool CanonicalizeFileURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeFileURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeFileURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Use for filesystem URLs.
+URL_EXPORT bool CanonicalizeFileSystemURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeFileSystemURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Use for path URLs such as javascript. This does not modify the path in any
// way, for example, by escaping it.
-GURL_API bool CanonicalizePathURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool CanonicalizePathURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool CanonicalizePathURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool CanonicalizePathURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Use for mailto URLs. This "canonicalizes" the url into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
// really intended for an external mail program, and the encoding of a page,
// etc. which would influence a query encoding normally are irrelevant.
-GURL_API bool CanonicalizeMailtoURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool CanonicalizeMailtoURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeMailtoURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool CanonicalizeMailtoURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Part replacer --------------------------------------------------------------
// Internal structure used for storing separate strings for each component.
// The basic canonicalization functions use this structure internally so that
-// component remplacement (different strings for different components) can be
+// component replacement (different strings for different components) can be
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
-// A url_parse::Parsed structure usually goes along with this. Those
+// A Parsed structure usually goes along with this. Those
// components identify offsets within these strings, so that they can all be
// in the same string, or spread arbitrarily across different ones.
//
@@ -638,7 +641,7 @@
}
// Scheme
- void SetScheme(const CHAR* s, const url_parse::Component& comp) {
+ void SetScheme(const CHAR* s, const Component& comp) {
sources_.scheme = s;
components_.scheme = comp;
}
@@ -646,86 +649,86 @@
bool IsSchemeOverridden() const { return sources_.scheme != NULL; }
// Username
- void SetUsername(const CHAR* s, const url_parse::Component& comp) {
+ void SetUsername(const CHAR* s, const Component& comp) {
sources_.username = s;
components_.username = comp;
}
void ClearUsername() {
sources_.username = Placeholder();
- components_.username = url_parse::Component();
+ components_.username = Component();
}
bool IsUsernameOverridden() const { return sources_.username != NULL; }
// Password
- void SetPassword(const CHAR* s, const url_parse::Component& comp) {
+ void SetPassword(const CHAR* s, const Component& comp) {
sources_.password = s;
components_.password = comp;
}
void ClearPassword() {
sources_.password = Placeholder();
- components_.password = url_parse::Component();
+ components_.password = Component();
}
bool IsPasswordOverridden() const { return sources_.password != NULL; }
// Host
- void SetHost(const CHAR* s, const url_parse::Component& comp) {
+ void SetHost(const CHAR* s, const Component& comp) {
sources_.host = s;
components_.host = comp;
}
void ClearHost() {
sources_.host = Placeholder();
- components_.host = url_parse::Component();
+ components_.host = Component();
}
bool IsHostOverridden() const { return sources_.host != NULL; }
// Port
- void SetPort(const CHAR* s, const url_parse::Component& comp) {
+ void SetPort(const CHAR* s, const Component& comp) {
sources_.port = s;
components_.port = comp;
}
void ClearPort() {
sources_.port = Placeholder();
- components_.port = url_parse::Component();
+ components_.port = Component();
}
bool IsPortOverridden() const { return sources_.port != NULL; }
// Path
- void SetPath(const CHAR* s, const url_parse::Component& comp) {
+ void SetPath(const CHAR* s, const Component& comp) {
sources_.path = s;
components_.path = comp;
}
void ClearPath() {
sources_.path = Placeholder();
- components_.path = url_parse::Component();
+ components_.path = Component();
}
bool IsPathOverridden() const { return sources_.path != NULL; }
// Query
- void SetQuery(const CHAR* s, const url_parse::Component& comp) {
+ void SetQuery(const CHAR* s, const Component& comp) {
sources_.query = s;
components_.query = comp;
}
void ClearQuery() {
sources_.query = Placeholder();
- components_.query = url_parse::Component();
+ components_.query = Component();
}
bool IsQueryOverridden() const { return sources_.query != NULL; }
// Ref
- void SetRef(const CHAR* s, const url_parse::Component& comp) {
+ void SetRef(const CHAR* s, const Component& comp) {
sources_.ref = s;
components_.ref = comp;
}
void ClearRef() {
sources_.ref = Placeholder();
- components_.ref = url_parse::Component();
+ components_.ref = Component();
}
bool IsRefOverridden() const { return sources_.ref != NULL; }
// Getters for the itnernal data. See the variables below for how the
// information is encoded.
const URLComponentSource<CHAR>& sources() const { return sources_; }
- const url_parse::Parsed& components() const { return components_; }
+ const Parsed& components() const { return components_; }
private:
// Returns a pointer to a static empty string that is used as a placeholder
@@ -746,63 +749,80 @@
// We use a pointer to the empty string for the source when the component
// should be deleted.
URLComponentSource<CHAR> sources_;
- url_parse::Parsed components_;
+ Parsed components_;
};
// The base must be an 8-bit canonical URL.
-GURL_API bool ReplaceStandardURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool ReplaceStandardURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool ReplaceStandardURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool ReplaceStandardURL(
+ const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Filesystem URLs can only have the path, query, or ref replaced.
+// All other components will be ignored.
+URL_EXPORT bool ReplaceFileSystemURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool ReplaceFileSystemURL(
+ const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Replacing some parts of a file URL is not permitted. Everything except
// the host, path, query, and ref will be ignored.
-GURL_API bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Path URLs can only have the scheme and path replaced. All other components
// will be ignored.
-GURL_API bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool ReplacePathURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool ReplacePathURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Mailto URLs can only have the scheme, path, and query replaced.
// All other components will be ignored.
-GURL_API bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-GURL_API bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+URL_EXPORT bool ReplaceMailtoURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+URL_EXPORT bool ReplaceMailtoURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
// Relative URL ---------------------------------------------------------------
@@ -811,26 +831,26 @@
// relative, the relevant portion of the URL will be placed into
// |*relative_component| (there may have been trimmed whitespace, for example).
// This value is passed to ResolveRelativeURL. If the input is not relative,
-// this value is UNDEFINED (it may be changed by the functin).
+// this value is UNDEFINED (it may be changed by the function).
//
// Returns true on success (we successfully determined the URL is relative or
// not). Failure means that the combination of URLs doesn't make any sense.
//
// The base URL should always be canonical, therefore is ASCII.
-GURL_API bool IsRelativeURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const char* fragment,
- int fragment_len,
- bool is_base_hierarchical,
- bool* is_relative,
- url_parse::Component* relative_component);
-GURL_API bool IsRelativeURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const char16* fragment,
- int fragment_len,
- bool is_base_hierarchical,
- bool* is_relative,
- url_parse::Component* relative_component);
+URL_EXPORT bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const char* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component);
+URL_EXPORT bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const base::char16* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component);
// Given a canonical parsed source URL, a URL fragment known to be relative,
// and the identified relevant portion of the relative URL (computed by
@@ -850,23 +870,23 @@
// Returns true on success. On failure, the output will be "something
// reasonable" that will be consistent and valid, just probably not what
// was intended by the web page author or caller.
-GURL_API bool ResolveRelativeURL(const char* base_url,
- const url_parse::Parsed& base_parsed,
- bool base_is_file,
- const char* relative_url,
- const url_parse::Component& relative_component,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* out_parsed);
-GURL_API bool ResolveRelativeURL(const char* base_url,
- const url_parse::Parsed& base_parsed,
- bool base_is_file,
- const char16* relative_url,
- const url_parse::Component& relative_component,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* out_parsed);
+URL_EXPORT bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const char* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
+URL_EXPORT bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const base::char16* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
-} // namespace url_canon
+} // namespace url
-#endif // GOOGLEURL_SRC_URL_CANON_H__
+#endif // URL_URL_CANON_H_
diff --git a/googleurl/src/url_canon_etc.cc b/src/url/url_canon_etc.cc
similarity index 68%
rename from googleurl/src/url_canon_etc.cc
rename to src/url/url_canon_etc.cc
index aea181a..7409efd 100644
--- a/googleurl/src/url_canon_etc.cc
+++ b/src/url/url_canon_etc.cc
@@ -1,40 +1,15 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
// Canonicalizers for random bits that aren't big enough for their own files.
#include <string.h>
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_internal.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
-namespace url_canon {
+namespace url {
namespace {
@@ -107,12 +82,12 @@
template<typename CHAR, typename UCHAR>
bool DoScheme(const CHAR* spec,
- const url_parse::Component& scheme,
+ const Component& scheme,
CanonOutput* output,
- url_parse::Component* out_scheme) {
+ Component* out_scheme) {
if (scheme.len <= 0) {
// Scheme is unspecified or empty, convert to empty by appending a colon.
- *out_scheme = url_parse::Component(output->length(), 0);
+ *out_scheme = Component(output->length(), 0);
output->push_back(':');
return true;
}
@@ -123,7 +98,7 @@
// Danger: it's important that this code does not strip any characters: it
// only emits the canonical version (be it valid or escaped) of each of
// the input characters. Stripping would put it out of sync with
- // url_util::FindAndCompareScheme, which could cause some security checks on
+ // FindAndCompareScheme, which could cause some security checks on
// schemes to be incorrect.
bool success = true;
int end = scheme.end();
@@ -171,16 +146,16 @@
// replacing components.
template<typename CHAR, typename UCHAR>
bool DoUserInfo(const CHAR* username_spec,
- const url_parse::Component& username,
+ const Component& username,
const CHAR* password_spec,
- const url_parse::Component& password,
+ const Component& password,
CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password) {
+ Component* out_username,
+ Component* out_password) {
if (username.len <= 0 && password.len <= 0) {
// Common case: no user info. We strip empty username/passwords.
- *out_username = url_parse::Component();
- *out_password = url_parse::Component();
+ *out_username = Component();
+ *out_password = Component();
return true;
}
@@ -202,7 +177,7 @@
CHAR_USERINFO, output);
out_password->len = output->length() - out_password->begin;
} else {
- *out_password = url_parse::Component();
+ *out_password = Component();
}
output->push_back('@');
@@ -213,25 +188,21 @@
inline void WritePortInt(char* output, int output_len, int port) {
_itoa_s(port, output, output_len, 10);
}
-inline void WritePortInt(char16* output, int output_len, int port) {
- _itow_s(port, output, output_len, 10);
-}
// This function will prepend the colon if there will be a port.
template<typename CHAR, typename UCHAR>
bool DoPort(const CHAR* spec,
- const url_parse::Component& port,
+ const Component& port,
int default_port_for_scheme,
CanonOutput* output,
- url_parse::Component* out_port) {
- int port_num = url_parse::ParsePort(spec, port);
- if (port_num == url_parse::PORT_UNSPECIFIED ||
- port_num == default_port_for_scheme) {
- *out_port = url_parse::Component();
+ Component* out_port) {
+ int port_num = ParsePort(spec, port);
+ if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
+ *out_port = Component();
return true; // Leave port empty.
}
- if (port_num == url_parse::PORT_INVALID) {
+ if (port_num == PORT_INVALID) {
// Invalid port: We'll copy the text from the input so the user can see
// what the error was, and mark the URL as invalid by returning false.
output->push_back(':');
@@ -259,12 +230,12 @@
template<typename CHAR, typename UCHAR>
void DoCanonicalizeRef(const CHAR* spec,
- const url_parse::Component& ref,
+ const Component& ref,
CanonOutput* output,
- url_parse::Component* out_ref) {
+ Component* out_ref) {
if (ref.len < 0) {
// Common case of no ref.
- *out_ref = url_parse::Component();
+ *out_ref = Component();
return;
}
@@ -290,12 +261,11 @@
} else {
// Non-ASCII characters are appended unescaped, but only when they are
// valid. Invalid Unicode characters are replaced with the "invalid
- // character" as IE seems to.
+ // character" as IE seems to (ReadUTFChar puts the Unicode replacement
+ // character in the output on failure for us).
unsigned code_point;
- if (!ReadUTFChar(spec, &i, end, &code_point))
- AppendUTF8Value(kUnicodeReplacementCharacter, output);
- else
- AppendUTF8Value(code_point, output);
+ ReadUTFChar(spec, &i, end, &code_point);
+ AppendUTF8Value(code_point, output);
}
}
@@ -310,87 +280,88 @@
return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
-const char16* RemoveURLWhitespace(const char16* input, int input_len,
- CanonOutputT<char16>* buffer,
- int* output_len) {
+const base::char16* RemoveURLWhitespace(const base::char16* input,
+ int input_len,
+ CanonOutputT<base::char16>* buffer,
+ int* output_len) {
return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
-char CanonicalSchemeChar(char16 ch) {
+char CanonicalSchemeChar(base::char16 ch) {
if (ch >= 0x80)
return 0; // Non-ASCII is not supported by schemes.
return kSchemeCanonical[ch];
}
bool CanonicalizeScheme(const char* spec,
- const url_parse::Component& scheme,
+ const Component& scheme,
CanonOutput* output,
- url_parse::Component* out_scheme) {
+ Component* out_scheme) {
return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
}
-bool CanonicalizeScheme(const char16* spec,
- const url_parse::Component& scheme,
+bool CanonicalizeScheme(const base::char16* spec,
+ const Component& scheme,
CanonOutput* output,
- url_parse::Component* out_scheme) {
- return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
+ Component* out_scheme) {
+ return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
}
bool CanonicalizeUserInfo(const char* username_source,
- const url_parse::Component& username,
+ const Component& username,
const char* password_source,
- const url_parse::Component& password,
+ const Component& password,
CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password) {
+ Component* out_username,
+ Component* out_password) {
return DoUserInfo<char, unsigned char>(
username_source, username, password_source, password,
output, out_username, out_password);
}
-bool CanonicalizeUserInfo(const char16* username_source,
- const url_parse::Component& username,
- const char16* password_source,
- const url_parse::Component& password,
+bool CanonicalizeUserInfo(const base::char16* username_source,
+ const Component& username,
+ const base::char16* password_source,
+ const Component& password,
CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password) {
- return DoUserInfo<char16, char16>(
+ Component* out_username,
+ Component* out_password) {
+ return DoUserInfo<base::char16, base::char16>(
username_source, username, password_source, password,
output, out_username, out_password);
}
bool CanonicalizePort(const char* spec,
- const url_parse::Component& port,
+ const Component& port,
int default_port_for_scheme,
CanonOutput* output,
- url_parse::Component* out_port) {
+ Component* out_port) {
return DoPort<char, unsigned char>(spec, port,
default_port_for_scheme,
output, out_port);
}
-bool CanonicalizePort(const char16* spec,
- const url_parse::Component& port,
+bool CanonicalizePort(const base::char16* spec,
+ const Component& port,
int default_port_for_scheme,
CanonOutput* output,
- url_parse::Component* out_port) {
- return DoPort<char16, char16>(spec, port, default_port_for_scheme,
- output, out_port);
+ Component* out_port) {
+ return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
+ output, out_port);
}
void CanonicalizeRef(const char* spec,
- const url_parse::Component& ref,
+ const Component& ref,
CanonOutput* output,
- url_parse::Component* out_ref) {
+ Component* out_ref) {
DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
}
-void CanonicalizeRef(const char16* spec,
- const url_parse::Component& ref,
+void CanonicalizeRef(const base::char16* spec,
+ const Component& ref,
CanonOutput* output,
- url_parse::Component* out_ref) {
- DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
+ Component* out_ref) {
+ DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
}
-} // namespace url_canon
+} // namespace url
diff --git a/src/url/url_canon_filesystemurl.cc b/src/url/url_canon_filesystemurl.cc
new file mode 100644
index 0000000..18e9055
--- /dev/null
+++ b/src/url/url_canon_filesystemurl.cc
@@ -0,0 +1,129 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "filesystem:file:" URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// We use the URLComponentSource for the outer URL, as it can have replacements,
+// whereas the inner_url can't, so it uses spec.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileSystemURL(const CHAR* spec,
+ const URLComponentSource<CHAR>& source,
+ const Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ // filesystem only uses {scheme, path, query, ref} -- clear the rest.
+ new_parsed->username.reset();
+ new_parsed->password.reset();
+ new_parsed->host.reset();
+ new_parsed->port.reset();
+
+ const Parsed* inner_parsed = parsed.inner_parsed();
+ Parsed new_inner_parsed;
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->Append("filesystem:", 11);
+ new_parsed->scheme.len = 10;
+
+ if (!parsed.inner_parsed() || !parsed.inner_parsed()->scheme.is_valid())
+ return false;
+
+ bool success = true;
+ if (CompareSchemeComponent(spec, inner_parsed->scheme, url::kFileScheme)) {
+ new_inner_parsed.scheme.begin = output->length();
+ output->Append("file://", 7);
+ new_inner_parsed.scheme.len = 4;
+ success &= CanonicalizePath(spec, inner_parsed->path, output,
+ &new_inner_parsed.path);
+ } else if (IsStandard(spec, inner_parsed->scheme)) {
+ success = CanonicalizeStandardURL(spec, parsed.inner_parsed()->Length(),
+ *parsed.inner_parsed(), charset_converter,
+ output, &new_inner_parsed);
+ } else {
+ // TODO(ericu): The URL is wrong, but should we try to output more of what
+ // we were given? Echoing back filesystem:mailto etc. doesn't seem all that
+ // useful.
+ return false;
+ }
+ // The filesystem type must be more than just a leading slash for validity.
+ success &= parsed.inner_parsed()->path.len > 1;
+
+ success &= CanonicalizePath(source.path, parsed.path, output,
+ &new_parsed->path);
+
+ // Ignore failures for query/ref since the URL can probably still be loaded.
+ CanonicalizeQuery(source.query, parsed.query, charset_converter,
+ output, &new_parsed->query);
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+ if (success)
+ new_parsed->set_inner_parsed(new_inner_parsed);
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeFileSystemURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ spec, URLComponentSource<char>(spec), parsed, charset_converter, output,
+ new_parsed);
+}
+
+bool CanonicalizeFileSystemURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeFileSystemURL<base::char16, base::char16>(
+ spec, URLComponentSource<base::char16>(spec), parsed, charset_converter,
+ output, new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupOverrideComponents(base, replacements, &source, &parsed);
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ base, source, parsed, charset_converter, output, new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ base, source, parsed, charset_converter, output, new_parsed);
+}
+
+} // namespace url
diff --git a/src/url/url_canon_fileurl.cc b/src/url/url_canon_fileurl.cc
new file mode 100644
index 0000000..6191f8f
--- /dev/null
+++ b/src/url/url_canon_fileurl.cc
@@ -0,0 +1,189 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "file:" URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+namespace {
+
+#ifdef WIN32
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter and colon to the output, if one is found. If there is not a drive
+// spec, it won't do anything. The index of the next character in the input
+// spec is returned (after the colon when a drive spec is found, the begin
+// offset if one is not).
+template<typename CHAR>
+int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+ // (with backslashes instead of slashes as well).
+ int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+ int after_slashes = begin + num_slashes;
+
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
+ return begin; // Haven't consumed any characters
+
+ // A drive spec is the start of a path, so we need to add a slash for the
+ // authority terminator (typically the third slash).
+ output->push_back('/');
+
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+ // and that it is followed by a colon/pipe.
+
+ // Normalize Windows drive letters to uppercase
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
+ output->push_back(static_cast<char>(spec[after_slashes] - 'a' + 'A'));
+ else
+ output->push_back(static_cast<char>(spec[after_slashes]));
+
+ // Normalize the character following it to a colon rather than pipe.
+ output->push_back(':');
+ return after_slashes + 2;
+}
+
+#endif // WIN32
+
+template<typename CHAR, typename UCHAR>
+bool DoFileCanonicalizePath(const CHAR* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path) {
+ // Copies and normalizes the "c:" at the beginning, if present.
+ out_path->begin = output->length();
+ int after_drive;
+#ifdef WIN32
+ after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
+#else
+ after_drive = path.begin;
+#endif
+
+ // Copies the rest of the path, starting from the slash following the
+ // drive colon (if any, Windows only), or the first slash of the path.
+ bool success = true;
+ if (after_drive < path.end()) {
+ // Use the regular path canonicalizer to canonicalize the rest of the
+ // path. Give it a fake output component to write into; this function
+ // computes the full path component itself once the output is written.
+ Component sub_path = MakeRange(after_drive, path.end());
+ Component fake_output_path;
+ success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
+ } else {
+ // No input path, canonicalize to a slash.
+ output->push_back('/');
+ }
+
+ out_path->len = output->length() - out_path->begin;
+ return success;
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ // Things we don't set in file: URLs.
+ new_parsed->username = Component();
+ new_parsed->password = Component();
+ new_parsed->port = Component();
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->Append("file://", 7);
+ new_parsed->scheme.len = 4;
+
+ // Append the host. For many file URLs, this will be empty. For UNC, this
+ // will be present.
+ // TODO(brettw) This doesn't do any checking for host name validity. We
+ // should probably handle validity checking of UNC hosts differently than
+ // for regular IP hosts.
+ bool success = CanonicalizeHost(source.host, parsed.host,
+ output, &new_parsed->host);
+ success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
+ output, &new_parsed->path);
+ CanonicalizeQuery(source.query, parsed.query, query_converter,
+ output, &new_parsed->query);
+
+ // Ignore failure for refs since the URL can probably still be loaded.
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeFileURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ URLComponentSource<char>(spec), parsed, query_converter,
+ output, new_parsed);
+}
+
+bool CanonicalizeFileURL(const base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeFileURL<base::char16, base::char16>(
+ URLComponentSource<base::char16>(spec), parsed, query_converter,
+ output, new_parsed);
+}
+
+bool FileCanonicalizePath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path) {
+ return DoFileCanonicalizePath<char, unsigned char>(spec, path,
+ output, out_path);
+}
+
+bool FileCanonicalizePath(const base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path) {
+ return DoFileCanonicalizePath<base::char16, base::char16>(spec, path,
+ output, out_path);
+}
+
+bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupOverrideComponents(base, replacements, &source, &parsed);
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ source, parsed, query_converter, output, new_parsed);
+}
+
+bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ source, parsed, query_converter, output, new_parsed);
+}
+
+} // namespace url
diff --git a/googleurl/src/url_canon_host.cc b/src/url/url_canon_host.cc
similarity index 83%
rename from googleurl/src/url_canon_host.cc
rename to src/url/url_canon_host.cc
index 6642004..513248a 100644
--- a/googleurl/src/url_canon_host.cc
+++ b/src/url/url_canon_host.cc
@@ -1,37 +1,12 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
#include "base/logging.h"
-#include "googleurl/src/url_canon.h"
-#include "googleurl/src/url_canon_internal.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
-namespace url_canon {
+namespace url {
namespace {
@@ -94,14 +69,16 @@
const int kTempHostBufferLen = 1024;
typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
-typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
+typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
// Scans a host name and fills in the output flags according to what we find.
// |has_non_ascii| will be true if there are any non-7-bit characters, and
// |has_escaped| will be true if there is a percent sign.
template<typename CHAR, typename UCHAR>
-void ScanHostname(const CHAR* spec, const url_parse::Component& host,
- bool* has_non_ascii, bool* has_escaped) {
+void ScanHostname(const CHAR* spec,
+ const Component& host,
+ bool* has_non_ascii,
+ bool* has_escaped) {
int end = host.end();
*has_non_ascii = false;
*has_escaped = false;
@@ -187,7 +164,7 @@
}
// Canonicalizes a host that requires IDN conversion. Returns true on success
-bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
// We need to escape URL before doing IDN conversion, since punicode strings
// cannot be escaped after they are created.
RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
@@ -280,7 +257,7 @@
// UTF-16 convert host to its ASCII version. The set up is already ready for
// the backend, so we just pass through. The has_escaped flag should be set if
// the input string requires unescaping.
-bool DoComplexHost(const char16* host, int host_len,
+bool DoComplexHost(const base::char16* host, int host_len,
bool has_non_ascii, bool has_escaped, CanonOutput* output) {
if (has_escaped) {
// Yikes, we have escaped characters with wide input. The escaped
@@ -312,13 +289,13 @@
template<typename CHAR, typename UCHAR>
void DoHost(const CHAR* spec,
- const url_parse::Component& host,
+ const Component& host,
CanonOutput* output,
CanonHostInfo* host_info) {
if (host.len <= 0) {
// Empty hosts don't need anything.
host_info->family = CanonHostInfo::NEUTRAL;
- host_info->out_host = url_parse::Component();
+ host_info->out_host = Component();
return;
}
@@ -347,7 +324,7 @@
// should not cause an allocation.
RawCanonOutput<64> canon_ip;
CanonicalizeIPAddress(output->data(),
- url_parse::MakeRange(output_begin, output->length()),
+ MakeRange(output_begin, output->length()),
&canon_ip, host_info);
// If we got an IPv4/IPv6 address, copy the canonical form back to the
@@ -359,43 +336,43 @@
}
}
- host_info->out_host = url_parse::MakeRange(output_begin, output->length());
+ host_info->out_host = MakeRange(output_begin, output->length());
}
} // namespace
bool CanonicalizeHost(const char* spec,
- const url_parse::Component& host,
+ const Component& host,
CanonOutput* output,
- url_parse::Component* out_host) {
+ Component* out_host) {
CanonHostInfo host_info;
DoHost<char, unsigned char>(spec, host, output, &host_info);
*out_host = host_info.out_host;
return (host_info.family != CanonHostInfo::BROKEN);
}
-bool CanonicalizeHost(const char16* spec,
- const url_parse::Component& host,
+bool CanonicalizeHost(const base::char16* spec,
+ const Component& host,
CanonOutput* output,
- url_parse::Component* out_host) {
+ Component* out_host) {
CanonHostInfo host_info;
- DoHost<char16, char16>(spec, host, output, &host_info);
+ DoHost<base::char16, base::char16>(spec, host, output, &host_info);
*out_host = host_info.out_host;
return (host_info.family != CanonHostInfo::BROKEN);
}
void CanonicalizeHostVerbose(const char* spec,
- const url_parse::Component& host,
+ const Component& host,
CanonOutput* output,
- CanonHostInfo *host_info) {
+ CanonHostInfo* host_info) {
DoHost<char, unsigned char>(spec, host, output, host_info);
}
-void CanonicalizeHostVerbose(const char16* spec,
- const url_parse::Component& host,
+void CanonicalizeHostVerbose(const base::char16* spec,
+ const Component& host,
CanonOutput* output,
- CanonHostInfo *host_info) {
- DoHost<char16, char16>(spec, host, output, host_info);
+ CanonHostInfo* host_info) {
+ DoHost<base::char16, base::char16>(spec, host, output, host_info);
}
-} // namespace url_canon
+} // namespace url
diff --git a/src/url/url_canon_icu.cc b/src/url/url_canon_icu.cc
new file mode 100644
index 0000000..743ff00
--- /dev/null
+++ b/src/url/url_canon_icu.cc
@@ -0,0 +1,186 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ICU integration functions.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "base/logging.h"
+#include "third_party/icu/include/unicode/ucnv.h"
+#include "third_party/icu/include/unicode/ucnv_cb.h"
+#include "third_party/icu/include/unicode/uidna.h"
+#include "url/url_canon_icu.h"
+#include "url/url_canon_internal.h" // for _itoa_s
+#include "util/gtl/lazy_static_ptr.h"
+
+namespace url {
+
+namespace {
+
+// Called when converting a character that can not be represented, this will
+// append an escaped version of the numerical character reference for that code
+// point. It is of the form "&#1234;" and we will escape the non-digits to
+// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
+void appendURLEscapedChar(const void* context,
+ UConverterFromUnicodeArgs* from_args,
+ const UChar* code_units,
+ int32_t length,
+ UChar32 code_point,
+ UConverterCallbackReason reason,
+ UErrorCode* err) {
+ if (reason == UCNV_UNASSIGNED) {
+ *err = U_ZERO_ERROR;
+
+ const static int prefix_len = 6;
+ const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped
+ ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
+
+ DCHECK(code_point < 0x110000);
+ char number[8]; // Max Unicode code point is 7 digits.
+ _itoa_s(code_point, number, 10);
+ int number_len = static_cast<int>(strlen(number));
+ ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
+
+ const static int postfix_len = 3;
+ const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped
+ ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
+ }
+}
+
+// A class for scoping the installation of the invalid character callback.
+class AppendHandlerInstaller {
+ public:
+ // The owner of this object must ensure that the converter is alive for the
+ // duration of this object's lifetime.
+ AppendHandlerInstaller(UConverter* converter) : converter_(converter) {
+ UErrorCode err = U_ZERO_ERROR;
+ ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+ &old_callback_, &old_context_, &err);
+ }
+
+ ~AppendHandlerInstaller() {
+ UErrorCode err = U_ZERO_ERROR;
+ ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+ }
+
+ private:
+ UConverter* converter_;
+
+ UConverterFromUCallback old_callback_;
+ const void* old_context_;
+};
+
+// A wrapper to use LazyStaticPtr with ICU's UIDNA, a C pointer to
+// a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().
+//
+// We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned
+// code points allowed) to IDNA 2008 with backward compatibility in mind.
+// What it does:
+//
+// 1. Use the up-to-date Unicode data.
+// 2. Define a case folding/mapping with the up-to-date Unicode data as
+// in IDNA 2003.
+// 3. Use transitional mechanism for 4 deviation characters (sharp-s,
+// final sigma, ZWJ and ZWNJ) for now.
+// 4. Continue to allow symbols and punctuation.
+// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDi rules.
+// 6. Do not apply STD3 rules
+// 7. Do not allow unassigned code points.
+//
+// It also closely matches what IE 10 does except for the BiDi check (
+// http://goo.gl/3XBhqw ).
+// See http://unicode.org/reports/tr46/ and references therein
+// for more details.
+struct UIDNAWrapper {
+ UIDNAWrapper() {
+ UErrorCode err = U_ZERO_ERROR;
+ // TODO(jungshik): Change options as different parties (browsers,
+ // registrars, search engines) converge toward a consensus.
+ value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
+ if (U_FAILURE(err))
+ value = NULL;
+ }
+
+ UIDNA* value;
+};
+
+} // namespace
+
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+ : converter_(converter) {
+}
+
+ICUCharsetConverter::~ICUCharsetConverter() {
+}
+
+void ICUCharsetConverter::ConvertFromUTF16(const base::char16* input,
+ int input_len,
+ CanonOutput* output) {
+ // Install our error handler. It will be called for characters that cannot
+ // be represented in the destination character set.
+ AppendHandlerInstaller handler(converter_);
+
+ int begin_offset = output->length();
+ int dest_capacity = output->capacity() - begin_offset;
+ output->set_length(output->length());
+
+ do {
+ UErrorCode err = U_ZERO_ERROR;
+ char* dest = &output->data()[begin_offset];
+ int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
+ input, input_len, &err);
+ if (err != U_BUFFER_OVERFLOW_ERROR) {
+ output->set_length(begin_offset + required_capacity);
+ return;
+ }
+
+ // Output didn't fit, expand
+ dest_capacity = required_capacity;
+ output->Resize(begin_offset + dest_capacity);
+ } while (true);
+}
+
+static util::gtl::LazyStaticPtr<UIDNAWrapper> g_uidna;
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+// TODO(jungshik): use UTF-8/ASCII version of nameToASCII.
+// Change the function signature and callers accordingly to avoid unnecessary
+// conversions in our code. In addition, consider using icu::IDNA's UTF-8/ASCII
+// version with StringByteSink. That way, we can avoid C wrappers and additional
+// string conversion.
+bool IDNToASCII(const base::char16* src, int src_len, CanonOutputW* output) {
+ DCHECK(output->length() == 0); // Output buffer is assumed empty.
+
+ UIDNA* uidna = g_uidna->value;
+ DCHECK(uidna != NULL);
+ while (true) {
+ UErrorCode err = U_ZERO_ERROR;
+ UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+ int output_length = uidna_nameToASCII(uidna, src, src_len, output->data(),
+ output->capacity(), &info, &err);
+ if (U_SUCCESS(err) && info.errors == 0) {
+ output->set_length(output_length);
+ return true;
+ }
+
+ // TODO(jungshik): Look at info.errors to handle them on a case-by-case
+ // basis if necessary.
+ if (err != U_BUFFER_OVERFLOW_ERROR || info.errors != 0)
+ return false; // Unknown error, give up.
+
+ // Not enough room in our buffer, expand.
+ output->Resize(output_length);
+ }
+}
+
+} // namespace url
diff --git a/src/url/url_canon_icu.h b/src/url/url_canon_icu.h
new file mode 100644
index 0000000..c3c1f01
--- /dev/null
+++ b/src/url/url_canon_icu.h
@@ -0,0 +1,39 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_ICU_H_
+#define URL_URL_CANON_ICU_H_
+
+// ICU integration functions.
+
+#include "url/url_canon.h"
+#include "url/url_export.h"
+
+typedef struct UConverter UConverter;
+
+namespace url {
+
+// An implementation of CharsetConverter that implementations can use to
+// interface the canonicalizer with ICU's conversion routines.
+class URL_EXPORT ICUCharsetConverter : public CharsetConverter {
+ public:
+ // Constructs a converter using an already-existing ICU character set
+ // converter. This converter is NOT owned by this object; the lifetime must
+ // be managed by the creator such that it is alive as long as this is.
+ ICUCharsetConverter(UConverter* converter);
+
+ ~ICUCharsetConverter() override;
+
+ void ConvertFromUTF16(const base::char16* input,
+ int input_len,
+ CanonOutput* output) override;
+
+ private:
+ // The ICU converter, not owned by this class.
+ UConverter* converter_;
+};
+
+} // namespace url
+
+#endif // URL_URL_CANON_ICU_H_
diff --git a/src/url/url_canon_icu_unittest.cc b/src/url/url_canon_icu_unittest.cc
new file mode 100644
index 0000000..cfa4b49
--- /dev/null
+++ b/src/url/url_canon_icu_unittest.cc
@@ -0,0 +1,160 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/macros.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/icu/include/unicode/ucnv.h"
+#include "url/url_canon.h"
+#include "url/url_canon_icu.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+using test_utils::WStringToUTF16;
+
+namespace {
+
+// Wrapper around a UConverter object that manages creation and destruction.
+class UConvScoper {
+ public:
+ explicit UConvScoper(const char* charset_name) {
+ UErrorCode err = U_ZERO_ERROR;
+ converter_ = ucnv_open(charset_name, &err);
+ }
+
+ ~UConvScoper() {
+ if (converter_)
+ ucnv_close(converter_);
+ }
+
+ // Returns the converter object, may be NULL.
+ UConverter* converter() const { return converter_; }
+
+ private:
+ UConverter* converter_;
+};
+
+TEST(URLCanonIcuTest, ICUCharsetConverter) {
+ struct ICUCase {
+ const wchar_t* input;
+ const char* encoding;
+ const char* expected;
+ } icu_cases[] = {
+ // UTF-8.
+ {L"Hello, world", "utf-8", "Hello, world"},
+ {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+ // Non-BMP UTF-8.
+ {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
+ // Big5
+ {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
+ // Unrepresentable character in the destination set.
+ {L"hello\x4f60\x06de\x597dworld", "big5",
+ "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
+ };
+
+ for (size_t i = 0; i < arraysize(icu_cases); i++) {
+ UConvScoper conv(icu_cases[i].encoding);
+ ASSERT_TRUE(conv.converter() != NULL);
+ ICUCharsetConverter converter(conv.converter());
+
+ std::string str;
+ StdStringCanonOutput output(&str);
+
+ base::string16 input_str(WStringToUTF16(icu_cases[i].input));
+ int input_len = static_cast<int>(input_str.length());
+ converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
+ output.Complete();
+
+ EXPECT_STREQ(icu_cases[i].expected, str.c_str());
+ }
+
+ // Test string sizes around the resize boundary for the output to make sure
+ // the converter resizes as needed.
+ const int static_size = 16;
+ UConvScoper conv("utf-8");
+ ASSERT_TRUE(conv.converter());
+ ICUCharsetConverter converter(conv.converter());
+ for (int i = static_size - 2; i <= static_size + 2; i++) {
+ // Make a string with the appropriate length.
+ base::string16 input;
+ for (int ch = 0; ch < i; ch++)
+ input.push_back('a');
+
+ RawCanonOutput<static_size> output;
+ converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
+ &output);
+ EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
+ }
+}
+
+TEST(URLCanonIcuTest, QueryWithConverter) {
+ struct QueryCase {
+ const char* input8;
+ const wchar_t* input16;
+ const char* encoding;
+ const char* expected;
+ } query_cases[] = {
+ // Regular ASCII case in some different encodings.
+ {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
+ {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
+ {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
+ // Chinese input/output
+ {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
+ "?q=%C4%E3%BA%C3"},
+ {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
+ // Unencodable character in the destination character set should be
+ // escaped. The escape sequence unescapes to be the entity name:
+ // "?q=Chinese&#65319;"
+ {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
+ "?q=Chinese%26%2365319%3B"},
+ };
+
+ for (size_t i = 0; i < arraysize(query_cases); i++) {
+ Component out_comp;
+
+ UConvScoper conv(query_cases[i].encoding);
+ ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
+ ICUCharsetConverter converter(conv.converter());
+
+ if (query_cases[i].input8) {
+ int len = static_cast<int>(strlen(query_cases[i].input8));
+ Component in_comp(0, len);
+ std::string out_str;
+
+ StdStringCanonOutput output(&out_str);
+ CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
+ &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(query_cases[i].expected, out_str);
+ }
+
+ if (query_cases[i].input16) {
+ base::string16 input16(WStringToUTF16(query_cases[i].input16));
+ int len = static_cast<int>(input16.length());
+ Component in_comp(0, len);
+ std::string out_str;
+
+ StdStringCanonOutput output(&out_str);
+ CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
+ &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(query_cases[i].expected, out_str);
+ }
+ }
+
+ // Extra test for input with embedded NUL.
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ Component out_comp;
+ CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
+ output.Complete();
+ EXPECT_EQ("?a%20%00z%01", out_str);
+}
+
+} // namespace
+
+} // namespace url
diff --git a/googleurl/src/url_canon_internal.cc b/src/url/url_canon_internal.cc
similarity index 61%
rename from googleurl/src/url_canon_internal.cc
rename to src/url/url_canon_internal.cc
index 6b776bc..1554814 100644
--- a/googleurl/src/url_canon_internal.cc
+++ b/src/url/url_canon_internal.cc
@@ -1,40 +1,18 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#include <cstdio>
+#include "url/url_canon_internal.h"
+
#include <errno.h>
#include <stdlib.h>
+
+#include <cstdio>
#include <string>
-#include "googleurl/src/url_canon_internal.h"
+#include "base/strings/utf_string_conversion_utils.h"
-namespace url_canon {
+namespace url {
namespace {
@@ -82,12 +60,12 @@
}
}
-// Overrides one component, see the url_canon::Replacements structure for
+// Overrides one component, see the Replacements structure for
// what the various combionations of source pointer and component mean.
void DoOverrideComponent(const char* override_source,
- const url_parse::Component& override_component,
+ const Component& override_component,
const char** dest,
- url_parse::Component* dest_component) {
+ Component* dest_component) {
if (override_source) {
*dest = override_source;
*dest_component = override_component;
@@ -98,7 +76,7 @@
// not actually set the output character pointer.
//
// The input is converted to UTF-8 at the end of the given buffer as a temporary
-// holding place. The component indentifying the portion of the buffer used in
+// holding place. The component identifying the portion of the buffer used in
// the |utf8_buffer| will be specified in |*dest_component|.
//
// This will not actually set any |dest| pointer like DoOverrideComponent
@@ -106,16 +84,15 @@
// may get resized while we're overriding a subsequent component. Instead, the
// caller should use the beginning of the |utf8_buffer| as the string pointer
// for all components once all overrides have been prepared.
-bool PrepareUTF16OverrideComponent(
- const char16* override_source,
- const url_parse::Component& override_component,
- CanonOutput* utf8_buffer,
- url_parse::Component* dest_component) {
+bool PrepareUTF16OverrideComponent(const base::char16* override_source,
+ const Component& override_component,
+ CanonOutput* utf8_buffer,
+ Component* dest_component) {
bool success = true;
if (override_source) {
if (!override_component.is_valid()) {
// Non-"valid" component (means delete), so we need to preserve that.
- *dest_component = url_parse::Component();
+ *dest_component = Component();
} else {
// Convert to UTF-8.
dest_component->begin = utf8_buffer->length();
@@ -134,31 +111,31 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f
0, // 0x20 ' ' (escape spaces in queries)
- CHAR_QUERY | CHAR_USERINFO, // 0x21 !
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 !
0, // 0x22 "
0, // 0x23 # (invalid in query since it marks the ref)
CHAR_QUERY | CHAR_USERINFO, // 0x24 $
CHAR_QUERY | CHAR_USERINFO, // 0x25 %
CHAR_QUERY | CHAR_USERINFO, // 0x26 &
- CHAR_QUERY | CHAR_USERINFO, // 0x27 '
- CHAR_QUERY | CHAR_USERINFO, // 0x28 (
- CHAR_QUERY | CHAR_USERINFO, // 0x29 )
- CHAR_QUERY | CHAR_USERINFO, // 0x2a *
+ 0, // 0x27 ' (Try to prevent XSS.)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 (
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 )
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a *
CHAR_QUERY | CHAR_USERINFO, // 0x2b +
CHAR_QUERY | CHAR_USERINFO, // 0x2c ,
- CHAR_QUERY | CHAR_USERINFO, // 0x2d -
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x2e .
- CHAR_QUERY, // 0x2f /
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x30 0
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x31 1
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x32 2
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x33 3
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x34 4
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x35 5
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x36 6
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x37 7
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x38 8
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x39 9
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d -
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e .
+ CHAR_QUERY, // 0x2f /
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x30 0
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x31 1
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x32 2
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x33 3
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x34 4
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x35 5
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x36 6
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x37 7
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x38 8
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x39 9
CHAR_QUERY, // 0x3a :
CHAR_QUERY, // 0x3b ;
0, // 0x3c < (Try to prevent certain types of XSS.)
@@ -166,68 +143,68 @@
0, // 0x3e > (Try to prevent certain types of XSS.)
CHAR_QUERY, // 0x3f ?
CHAR_QUERY, // 0x40 @
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x41 A
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x42 B
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x43 C
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x44 D
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x45 E
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x46 F
- CHAR_QUERY | CHAR_USERINFO, // 0x47 G
- CHAR_QUERY | CHAR_USERINFO, // 0x48 H
- CHAR_QUERY | CHAR_USERINFO, // 0x49 I
- CHAR_QUERY | CHAR_USERINFO, // 0x4a J
- CHAR_QUERY | CHAR_USERINFO, // 0x4b K
- CHAR_QUERY | CHAR_USERINFO, // 0x4c L
- CHAR_QUERY | CHAR_USERINFO, // 0x4d M
- CHAR_QUERY | CHAR_USERINFO, // 0x4e N
- CHAR_QUERY | CHAR_USERINFO, // 0x4f O
- CHAR_QUERY | CHAR_USERINFO, // 0x50 P
- CHAR_QUERY | CHAR_USERINFO, // 0x51 Q
- CHAR_QUERY | CHAR_USERINFO, // 0x52 R
- CHAR_QUERY | CHAR_USERINFO, // 0x53 S
- CHAR_QUERY | CHAR_USERINFO, // 0x54 T
- CHAR_QUERY | CHAR_USERINFO, // 0x55 U
- CHAR_QUERY | CHAR_USERINFO, // 0x56 V
- CHAR_QUERY | CHAR_USERINFO, // 0x57 W
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58 X
- CHAR_QUERY | CHAR_USERINFO, // 0x59 Y
- CHAR_QUERY | CHAR_USERINFO, // 0x5a Z
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z
CHAR_QUERY, // 0x5b [
CHAR_QUERY, // 0x5c '\'
CHAR_QUERY, // 0x5d ]
CHAR_QUERY, // 0x5e ^
- CHAR_QUERY | CHAR_USERINFO, // 0x5f _
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _
CHAR_QUERY, // 0x60 `
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x61 a
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x62 b
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x63 c
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x64 d
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x65 e
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x66 f
- CHAR_QUERY | CHAR_USERINFO, // 0x67 g
- CHAR_QUERY | CHAR_USERINFO, // 0x68 h
- CHAR_QUERY | CHAR_USERINFO, // 0x69 i
- CHAR_QUERY | CHAR_USERINFO, // 0x6a j
- CHAR_QUERY | CHAR_USERINFO, // 0x6b k
- CHAR_QUERY | CHAR_USERINFO, // 0x6c l
- CHAR_QUERY | CHAR_USERINFO, // 0x6d m
- CHAR_QUERY | CHAR_USERINFO, // 0x6e n
- CHAR_QUERY | CHAR_USERINFO, // 0x6f o
- CHAR_QUERY | CHAR_USERINFO, // 0x70 p
- CHAR_QUERY | CHAR_USERINFO, // 0x71 q
- CHAR_QUERY | CHAR_USERINFO, // 0x72 r
- CHAR_QUERY | CHAR_USERINFO, // 0x73 s
- CHAR_QUERY | CHAR_USERINFO, // 0x74 t
- CHAR_QUERY | CHAR_USERINFO, // 0x75 u
- CHAR_QUERY | CHAR_USERINFO, // 0x76 v
- CHAR_QUERY | CHAR_USERINFO, // 0x77 w
- CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x78 x
- CHAR_QUERY | CHAR_USERINFO, // 0x79 y
- CHAR_QUERY | CHAR_USERINFO, // 0x7a z
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z
CHAR_QUERY, // 0x7b {
CHAR_QUERY, // 0x7c |
CHAR_QUERY, // 0x7d }
- CHAR_QUERY | CHAR_USERINFO, // 0x7e ~
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~
0, // 0x7f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f
@@ -255,7 +232,7 @@
0, // 0xE0 - 0xFF
};
-const char16 kUnicodeReplacementCharacter = 0xfffd;
+const base::char16 kUnicodeReplacementCharacter = 0xfffd;
void AppendStringOfType(const char* source, int length,
SharedCharTypes type,
@@ -263,10 +240,37 @@
DoAppendStringOfType<char, unsigned char>(source, length, type, output);
}
-void AppendStringOfType(const char16* source, int length,
+void AppendStringOfType(const base::char16* source, int length,
SharedCharTypes type,
CanonOutput* output) {
- DoAppendStringOfType<char16, char16>(source, length, type, output);
+ DoAppendStringOfType<base::char16, base::char16>(
+ source, length, type, output);
+}
+
+bool ReadUTFChar(const char* str, int* begin, int length,
+ unsigned* code_point_out) {
+ // Decodes one character from the 8-bit input; a decode or validity failure
+ // yields the replacement character and returns false. This only compiles if
+ // int and int32 are the same type. TODO(mmenke): This should probably be fixed.
+ if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+ !base::IsValidCharacter(*code_point_out)) {
+ *code_point_out = kUnicodeReplacementCharacter;
+ return false;
+ }
+ return true;
+}
+
+bool ReadUTFChar(const base::char16* str, int* begin, int length,
+ unsigned* code_point_out) {
+ // Decodes one character from the 16-bit input; a decode or validity failure
+ // yields the replacement character and returns false. This only compiles if
+ // int and int32 are the same type. TODO(mmenke): This should probably be fixed.
+ if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+ !base::IsValidCharacter(*code_point_out)) {
+ *code_point_out = kUnicodeReplacementCharacter;
+ return false;
+ }
+ return true;
}
void AppendInvalidNarrowString(const char* spec, int begin, int end,
@@ -274,12 +278,13 @@
DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
}
-void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+void AppendInvalidNarrowString(const base::char16* spec, int begin, int end,
CanonOutput* output) {
- DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output);
+ DoAppendInvalidNarrowString<base::char16, base::char16>(
+ spec, begin, end, output);
}
-bool ConvertUTF16ToUTF8(const char16* input, int input_len,
+bool ConvertUTF16ToUTF8(const base::char16* input, int input_len,
CanonOutput* output) {
bool success = true;
for (int i = 0; i < input_len; i++) {
@@ -291,7 +296,7 @@
}
bool ConvertUTF8ToUTF16(const char* input, int input_len,
- CanonOutputT<char16>* output) {
+ CanonOutputT<base::char16>* output) {
bool success = true;
for (int i = 0; i < input_len; i++) {
unsigned code_point;
@@ -304,10 +309,10 @@
void SetupOverrideComponents(const char* base,
const Replacements<char>& repl,
URLComponentSource<char>* source,
- url_parse::Parsed* parsed) {
+ Parsed* parsed) {
// Get the source and parsed structures of the things we are replacing.
const URLComponentSource<char>& repl_source = repl.sources();
- const url_parse::Parsed& repl_parsed = repl.components();
+ const Parsed& repl_parsed = repl.components();
DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
&source->scheme, &parsed->scheme);
@@ -333,15 +338,15 @@
}
bool SetupUTF16OverrideComponents(const char* base,
- const Replacements<char16>& repl,
+ const Replacements<base::char16>& repl,
CanonOutput* utf8_buffer,
URLComponentSource<char>* source,
- url_parse::Parsed* parsed) {
+ Parsed* parsed) {
bool success = true;
// Get the source and parsed structures of the things we are replacing.
- const URLComponentSource<char16>& repl_source = repl.sources();
- const url_parse::Parsed& repl_parsed = repl.components();
+ const URLComponentSource<base::char16>& repl_source = repl.sources();
+ const Parsed& repl_parsed = repl.components();
success &= PrepareUTF16OverrideComponent(
repl_source.scheme, repl_parsed.scheme,
@@ -402,7 +407,7 @@
return 0;
}
-int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
+int _itow_s(int value, base::char16* buffer, size_t size_in_chars, int radix) {
if (radix != 10)
return EINVAL;
@@ -416,7 +421,7 @@
}
for (int i = 0; i < written; ++i) {
- buffer[i] = static_cast<char16>(temp[i]);
+ buffer[i] = static_cast<base::char16>(temp[i]);
}
buffer[written] = '\0';
return 0;
@@ -424,4 +429,4 @@
#endif // !WIN32
-} // namespace url_canon
+} // namespace url
diff --git a/googleurl/src/url_canon_internal.h b/src/url/url_canon_internal.h
similarity index 77%
rename from googleurl/src/url_canon_internal.h
rename to src/url/url_canon_internal.h
index 4b1e45a..a66cd8d 100644
--- a/googleurl/src/url_canon_internal.h
+++ b/src/url/url_canon_internal.h
@@ -1,45 +1,21 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_INTERNAL_H_
+#define URL_URL_CANON_INTERNAL_H_
// This file is intended to be included in another C++ file where the character
// types are defined. This allows us to write mostly generic code, but not have
// templace bloat because everything is inlined when anybody calls any of our
// functions.
-#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
-#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
-
#include <stdlib.h>
-#include "googleurl/src/url_canon.h"
+#include "base/logging.h"
+#include "url/url_canon.h"
-namespace url_canon {
+namespace url {
// Character type handling -----------------------------------------------------
@@ -47,7 +23,7 @@
// bits that are set for each 8-bit character in the kSharedCharTypeTable.
enum SharedCharTypes {
// Characters that do not require escaping in queries. Characters that do
- // not have this flag will be escaped, see url_canon_query.cc
+ // not have this flag will be escaped; see url_canon_query.cc
CHAR_QUERY = 1,
// Valid in the username/password field.
@@ -64,6 +40,10 @@
// Valid in an ASCII-representation of an octal digit.
CHAR_OCT = 32,
+
+ // Characters that do not require escaping in encodeURIComponent. Characters
+ // that do not have this flag will be escaped; see url_util.cc.
+ CHAR_COMPONENT = 64,
};
// This table contains the flags in SharedCharTypes for each 8-bit character.
@@ -88,19 +68,22 @@
inline bool IsHexChar(unsigned char c) {
return IsCharOfType(c, CHAR_HEX);
}
+inline bool IsComponentChar(unsigned char c) {
+ return IsCharOfType(c, CHAR_COMPONENT);
+}
// Appends the given string to the output, escaping characters that do not
// match the given |type| in SharedCharTypes.
void AppendStringOfType(const char* source, int length,
SharedCharTypes type,
CanonOutput* output);
-void AppendStringOfType(const char16* source, int length,
+void AppendStringOfType(const base::char16* source, int length,
SharedCharTypes type,
CanonOutput* output);
// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit
// that will be used to represent it.
-extern const char kHexCharLookup[0x10];
+URL_EXPORT extern const char kHexCharLookup[0x10];
// This lookup table allows fast conversion between ASCII hex letters and their
// corresponding numerical value. The 8-bit range is divided up into 8
@@ -138,7 +121,7 @@
// required for relative URL resolving to test for scheme equality.
//
// Returns 0 if the input character is not a valid scheme character.
-char CanonicalSchemeChar(char16 ch);
+char CanonicalSchemeChar(base::char16 ch);
// Write a single character, escaped, to the output. This always escapes: it
// does no checking that thee character requires escaping.
@@ -148,12 +131,12 @@
inline void AppendEscapedChar(UINCHAR ch,
CanonOutputT<OUTCHAR>* output) {
output->push_back('%');
- output->push_back(kHexCharLookup[ch >> 4]);
+ output->push_back(kHexCharLookup[(ch >> 4) & 0xf]);
output->push_back(kHexCharLookup[ch & 0xf]);
}
// The character we'll substitute for undecodable or invalid characters.
-extern const char16 kUnicodeReplacementCharacter;
+extern const base::char16 kUnicodeReplacementCharacter;
// UTF-8 functions ------------------------------------------------------------
@@ -165,14 +148,15 @@
// |*begin| will be updated to point to the last character consumed so it
// can be incremented in a loop and will be ready for the next character.
// (for a single-byte ASCII character, it will not be changed).
-//
-// Implementation is in url_canon_icu.cc.
-bool ReadUTFChar(const char* str, int* begin, int length,
- unsigned* code_point_out);
+URL_EXPORT bool ReadUTFChar(const char* str, int* begin, int length,
+ unsigned* code_point_out);
// Generic To-UTF-8 converter. This will call the given append method for each
// character that should be appended, with the given output method. Wrappers
// are provided below for escaped and non-escaped versions of this.
+//
+// The caller must have already verified that char_value is a valid Unicode
+// character.
template<class Output, void Appender(unsigned char, Output*)>
inline void DoAppendUTF8(unsigned char_value, Output* output) {
if (char_value <= 0x7f) {
@@ -191,7 +175,7 @@
output);
Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
output);
- } else if (char_value <= 0x1fffff) {
+ } else if (char_value <= 0x10FFFF) { // Max unicode code point.
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
output);
@@ -201,20 +185,9 @@
output);
Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
output);
- } else if (char_value <= 0x10FFFF) { // Max unicode code point.
- // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- Appender(static_cast<unsigned char>(0xf8 | (char_value >> 24)),
- output);
- Appender(static_cast<unsigned char>(0x80 | ((char_value >> 18) & 0x3f)),
- output);
- Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
- output);
- Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
- output);
- Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
- output);
} else {
- // Invalid UTF-8 character (>20 bits)
+ // Invalid code point (above 0x10FFFF, the Unicode maximum).
+ DCHECK(false); // NOTREACHED();
}
}
@@ -250,19 +223,17 @@
// |*begin| will be updated to point to the last character consumed so it
// can be incremented in a loop and will be ready for the next character.
// (for a single-16-bit-word character, it will not be changed).
-//
-// Implementation is in url_canon_icu.cc.
-bool ReadUTFChar(const char16* str, int* begin, int length,
- unsigned* code_point);
+URL_EXPORT bool ReadUTFChar(const base::char16* str, int* begin, int length,
+ unsigned* code_point_out);
// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
inline void AppendUTF16Value(unsigned code_point,
- CanonOutputT<char16>* output) {
+ CanonOutputT<base::char16>* output) {
if (code_point > 0xffff) {
- output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0));
- output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00));
+ output->push_back(static_cast<base::char16>((code_point >> 10) + 0xd7c0));
+ output->push_back(static_cast<base::char16>((code_point & 0x3ff) | 0xdc00));
} else {
- output->push_back(static_cast<char16>(code_point));
+ output->push_back(static_cast<base::char16>(code_point));
}
}
@@ -287,8 +258,8 @@
//
// Assumes that ch[begin] is within range in the array, but does not assume
// that any following characters are.
-inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length,
- CanonOutput* output) {
+inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin,
+ int length, CanonOutput* output) {
// UTF-16 input. Readchar16 will handle invalid characters for us and give
// us the kUnicodeReplacementCharacter, so we don't have to do special
// checking after failure, just pass through the failure to the caller.
@@ -322,7 +293,7 @@
inline bool Is8BitChar(char c) {
return true; // this case is specialized to avoid a warning
}
-inline bool Is8BitChar(char16 c) {
+inline bool Is8BitChar(base::char16 c) {
return c <= 255;
}
@@ -358,7 +329,7 @@
// the escaping rules are not guaranteed!
void AppendInvalidNarrowString(const char* spec, int begin, int end,
CanonOutput* output);
-void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+void AppendInvalidNarrowString(const base::char16* spec, int begin, int end,
CanonOutput* output);
// Misc canonicalization helpers ----------------------------------------------
@@ -371,15 +342,15 @@
// replacing the invalid characters with the "invalid character". It will
// return false in the failure case, and the caller should not continue as
// normal.
-bool ConvertUTF16ToUTF8(const char16* input, int input_len,
- CanonOutput* output);
-bool ConvertUTF8ToUTF16(const char* input, int input_len,
- CanonOutputT<char16>* output);
+URL_EXPORT bool ConvertUTF16ToUTF8(const base::char16* input, int input_len,
+ CanonOutput* output);
+URL_EXPORT bool ConvertUTF8ToUTF16(const char* input, int input_len,
+ CanonOutputT<base::char16>* output);
// Converts from UTF-16 to 8-bit using the character set converter. If the
// converter is NULL, this will use UTF-8.
-void ConvertUTF16ToQueryEncoding(const char16* input,
- const url_parse::Component& query,
+void ConvertUTF16ToQueryEncoding(const base::char16* input,
+ const Component& query,
CharsetConverter* converter,
CanonOutput* output);
@@ -395,7 +366,7 @@
void SetupOverrideComponents(const char* base,
const Replacements<char>& repl,
URLComponentSource<char>* source,
- url_parse::Parsed* parsed);
+ Parsed* parsed);
// Like the above 8-bit version, except that it additionally converts the
// UTF-16 input to UTF-8 before doing the overrides.
@@ -410,31 +381,33 @@
// |source| will point into this buffer, which could be invalidated if
// additional data is added and the CanonOutput resizes its buffer.
//
-// Returns true on success. Fales means that the input was not valid UTF-16,
+// Returns true on success. False means that the input was not valid UTF-16,
// although we will have still done the override with "invalid characters" in
// place of errors.
bool SetupUTF16OverrideComponents(const char* base,
- const Replacements<char16>& repl,
+ const Replacements<base::char16>& repl,
CanonOutput* utf8_buffer,
URLComponentSource<char>* source,
- url_parse::Parsed* parsed);
+ Parsed* parsed);
// Implemented in url_canon_path.cc, these are required by the relative URL
// resolver as well, so we declare them here.
bool CanonicalizePartialPath(const char* spec,
- const url_parse::Component& path,
+ const Component& path,
int path_begin_in_output,
CanonOutput* output);
-bool CanonicalizePartialPath(const char16* spec,
- const url_parse::Component& path,
+bool CanonicalizePartialPath(const base::char16* spec,
+ const Component& path,
int path_begin_in_output,
CanonOutput* output);
#ifndef WIN32
// Implementations of Windows' int-to-string conversions
-int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
-int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix);
+URL_EXPORT int _itoa_s(int value, char* buffer, size_t size_in_chars,
+ int radix);
+URL_EXPORT int _itow_s(int value, base::char16* buffer, size_t size_in_chars,
+ int radix);
// Secure template overloads for these functions
template<size_t N>
@@ -443,7 +416,7 @@
}
template<size_t N>
-inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
+inline int _itow_s(int value, base::char16 (&buffer)[N], int radix) {
return _itow_s(value, buffer, N, radix);
}
@@ -455,6 +428,6 @@
#endif // WIN32
-} // namespace url_canon
+} // namespace url
-#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
+#endif // URL_URL_CANON_INTERNAL_H_
diff --git a/googleurl/src/url_canon_internal_file.h b/src/url/url_canon_internal_file.h
similarity index 70%
rename from googleurl/src/url_canon_internal_file.h
rename to src/url/url_canon_internal_file.h
index 63a9c5b..6903098 100644
--- a/googleurl/src/url_canon_internal_file.h
+++ b/src/url/url_canon_internal_file.h
@@ -1,31 +1,9 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_INTERNAL_FILE_H_
+#define URL_URL_CANON_INTERNAL_FILE_H_
// As with url_canon_internal.h, this file is intended to be included in
// another C++ file where the template types are defined. This allows the
@@ -36,13 +14,11 @@
// *** This file must be included after url_canon_internal as we depend on some
// functions in it. ***
-#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
-#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
-#include "googleurl/src/url_file.h"
-#include "googleurl/src/url_parse_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
-using namespace url_canon;
+namespace url {
// Given a pointer into the spec, this copies and canonicalizes the drive
// letter and colon to the output, if one is found. If there is not a drive
@@ -90,11 +66,11 @@
// path. We supply it with the path following the slashes. It won't prepend
// a slash because it assumes any nonempty path already starts with one.
// We explicitly filter out calls with no path here to prevent that case.
- ParsedURL::Component sub_path(after_slashes, end - after_slashes);
+ ParsedComponent sub_path(after_slashes, end - after_slashes);
if (sub_path.len > 0) {
// Give it a fake output component to write into. DoCanonicalizeFile will
// compute the full path component.
- ParsedURL::Component fake_output_path;
+ ParsedComponent fake_output_path;
URLCanonInternal<CHAR, UCHAR>::DoPath(
spec, sub_path, output, &fake_output_path);
}
@@ -106,9 +82,9 @@
CanonOutput* output,
ParsedURL* new_parsed) {
// Things we don't set in file: URLs.
- new_parsed->username = ParsedURL::Component(0, -1);
- new_parsed->password = ParsedURL::Component(0, -1);
- new_parsed->port = ParsedURL::Component(0, -1);
+ new_parsed->username = ParsedComponent(0, -1);
+ new_parsed->password = ParsedComponent(0, -1);
+ new_parsed->port = ParsedComponent(0, -1);
// Scheme (known, so we don't bother running it through the more
// complicated scheme canonicalizer).
@@ -154,4 +130,6 @@
return success;
}
-#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
+} // namespace url
+
+#endif // URL_URL_CANON_INTERNAL_FILE_H_
diff --git a/googleurl/src/url_canon_ip.cc b/src/url/url_canon_ip.cc
similarity index 78%
rename from googleurl/src/url_canon_ip.cc
rename to src/url/url_canon_ip.cc
index 86f7c9c..45f95de 100644
--- a/googleurl/src/url_canon_ip.cc
+++ b/src/url/url_canon_ip.cc
@@ -1,41 +1,16 @@
-// Copyright 2009, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
-#include "googleurl/src/url_canon_ip.h"
+#include "url/url_canon_ip.h"
#include <stdlib.h>
#include "base/basictypes.h"
#include "base/logging.h"
-#include "googleurl/src/url_canon_internal.h"
+#include "url/url_canon_internal.h"
-namespace url_canon {
+namespace url {
namespace {
@@ -56,8 +31,8 @@
template<typename CHAR, typename UCHAR>
bool DoFindIPv4Components(const CHAR* spec,
- const url_parse::Component& host,
- url_parse::Component components[4]) {
+ const Component& host,
+ Component components[4]) {
if (!host.is_nonempty())
return false;
@@ -68,8 +43,7 @@
if (i >= end || spec[i] == '.') {
// Found the end of the current component.
int component_len = i - cur_component_begin;
- components[cur_component] =
- url_parse::Component(cur_component_begin, component_len);
+ components[cur_component] = Component(cur_component_begin, component_len);
// The next component starts after the dot.
cur_component_begin = i + 1;
@@ -101,7 +75,7 @@
// Fill in any unused components.
while (cur_component < 4)
- components[cur_component++] = url_parse::Component();
+ components[cur_component++] = Component();
return true;
}
@@ -116,10 +90,9 @@
// out any input that is greater than 7 bits. The components are assumed
// to be non-empty.
template<typename CHAR>
-CanonHostInfo::Family IPv4ComponentToNumber(
- const CHAR* spec,
- const url_parse::Component& component,
- uint32* number) {
+CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
+ const Component& component,
+ uint32* number) {
// Figure out the base
SharedCharTypes base;
int base_prefix_len = 0; // Size of the prefix for this base.
@@ -181,34 +154,14 @@
return CanonHostInfo::IPV4;
}
-// Writes the given address (with each character representing one dotted
-// part of an IPv4 address) to the output, and updating |*out_host| to
-// identify the added portion.
-void AppendIPv4Address(const unsigned char address[4],
- CanonOutput* output,
- url_parse::Component* out_host) {
- out_host->begin = output->length();
- for (int i = 0; i < 4; i++) {
- char str[16];
- _itoa_s(address[i], str, 10);
-
- for (int ch = 0; str[ch] != 0; ch++)
- output->push_back(str[ch]);
-
- if (i != 3)
- output->push_back('.');
- }
- out_host->len = output->length() - out_host->begin;
-}
-
// See declaration of IPv4AddressToNumber for documentation.
template<typename CHAR>
CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
- const url_parse::Component& host,
+ const Component& host,
unsigned char address[4],
int* num_ipv4_components) {
// The identified components. Not all may exist.
- url_parse::Component components[4];
+ Component components[4];
if (!FindIPv4Components(spec, host, components))
return CanonHostInfo::NEUTRAL;
@@ -216,19 +169,30 @@
// |existing_components| will be valid.
uint32 component_values[4];
int existing_components = 0;
+
+ // Set to true if one or more components are BROKEN. BROKEN is only
+ // returned if all components are IPV4 or BROKEN, so, for example,
+ // 12345678912345.de returns NEUTRAL rather than BROKEN.
+ bool broken = false;
for (int i = 0; i < 4; i++) {
if (components[i].len <= 0)
continue;
CanonHostInfo::Family family = IPv4ComponentToNumber(
spec, components[i], &component_values[existing_components]);
- // Stop if we hit an invalid non-empty component.
- if (family != CanonHostInfo::IPV4)
+ if (family == CanonHostInfo::BROKEN) {
+ broken = true;
+ } else if (family != CanonHostInfo::IPV4) {
+ // Stop at the first non-empty component that is neither IPV4 nor BROKEN.
return family;
+ }
existing_components++;
}
+ if (broken)
+ return CanonHostInfo::BROKEN;
+
// Use that sequence of numbers to fill out the 4-component IP address.
// First, process all components but the last, while making sure each fits
@@ -240,7 +204,15 @@
}
// Next, consume the last component to fill in the remaining bytes.
+ // Work around a gcc 4.9 bug by silencing -Warray-bounds. crbug.com/392872
+#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
uint32 last_value = component_values[existing_components - 1];
+#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
+#pragma GCC diagnostic pop
+#endif
for (int i = 3; i >= existing_components - 1; i--) {
address[i] = static_cast<unsigned char>(last_value);
last_value >>= 8;
@@ -261,17 +233,18 @@
// is NEUTRAL, and we could use a second opinion.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizeIPv4Address(const CHAR* spec,
- const url_parse::Component& host,
+ const Component& host,
CanonOutput* output,
CanonHostInfo* host_info) {
- unsigned char address[4];
host_info->family = IPv4AddressToNumber(
- spec, host, address, &host_info->num_ipv4_components);
+ spec, host, host_info->address, &host_info->num_ipv4_components);
switch (host_info->family) {
case CanonHostInfo::IPV4:
// Definitely an IPv4 address.
- AppendIPv4Address(address, output, &host_info->out_host);
+ host_info->out_host.begin = output->length();
+ AppendIPv4Address(host_info->address, output);
+ host_info->out_host.len = output->length() - host_info->out_host.begin;
return true;
case CanonHostInfo::BROKEN:
// Definitely broken.
@@ -328,7 +301,7 @@
}
// There can be up to 8 hex components (colon separated) in the literal.
- url_parse::Component hex_components[8];
+ Component hex_components[8];
// The count of hex components present. Ranges from [0,8].
int num_hex_components;
@@ -338,16 +311,14 @@
int index_of_contraction;
// The range of characters which are an IPv4 literal.
- url_parse::Component ipv4_component;
+ Component ipv4_component;
};
// Parse the IPv6 input string. If parsing succeeded returns true and fills
// |parsed| with the information. If parsing failed (because the input is
// invalid) returns false.
template<typename CHAR, typename UCHAR>
-bool DoParseIPv6(const CHAR* spec,
- const url_parse::Component& host,
- IPv6Parsed* parsed) {
+bool DoParseIPv6(const CHAR* spec, const Component& host, IPv6Parsed* parsed) {
// Zero-out the info.
parsed->reset();
@@ -392,7 +363,7 @@
return false;
parsed->hex_components[parsed->num_hex_components++] =
- url_parse::Component(cur_component_begin, component_len);
+ Component(cur_component_begin, component_len);
}
}
@@ -423,8 +394,8 @@
// Since IPv4 address can only appear at the end, assume the rest
// of the string is an IPv4 address. (We will parse this separately
// later).
- parsed->ipv4_component = url_parse::Component(
- cur_component_begin, end - cur_component_begin);
+ parsed->ipv4_component =
+ Component(cur_component_begin, end - cur_component_begin);
break;
} else {
// The character was neither a hex digit, nor an IPv4 character.
@@ -473,8 +444,7 @@
// already verified that each character in the string was a hex digit, and
// that there were no more than 4 characters.
template<typename CHAR>
-uint16 IPv6HexComponentToNumber(const CHAR* spec,
- const url_parse::Component& component) {
+uint16 IPv6HexComponentToNumber(const CHAR* spec, const Component& component) {
DCHECK(component.len <= 4);
// Copy the hex string into a C-string.
@@ -492,7 +462,7 @@
// true on success. False means that the input was not a valid IPv6 address.
template<typename CHAR, typename UCHAR>
bool DoIPv6AddressToNumber(const CHAR* spec,
- const url_parse::Component& host,
+ const Component& host,
unsigned char address[16]) {
// Make sure the component is bounded by '[' and ']'.
int end = host.end();
@@ -500,7 +470,7 @@
return false;
// Exclude the square brackets.
- url_parse::Component ipv6_comp(host.begin + 1, host.len - 2);
+ Component ipv6_comp(host.begin + 1, host.len - 2);
// Parse the IPv6 address -- identify where all the colon separated hex
// components are, the "::" contraction, and the embedded IPv4 address.
@@ -538,18 +508,6 @@
// If there was an IPv4 section, convert it into a 32-bit number and append
// it to |address|.
if (ipv6_parsed.ipv4_component.is_valid()) {
- // We only allow the embedded IPv4 syntax to be used for "compat" and
- // "mapped" formats:
- // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal>
- // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal>
- for (int j = 0; j < 10; ++j) {
- if (address[j] != 0)
- return false;
- }
- if (!((address[10] == 0 && address[11] == 0) ||
- (address[10] == 0xFF && address[11] == 0xFF)))
- return false;
-
// Append the 32-bit number to |address|.
int ignored_num_ipv4_components;
if (CanonHostInfo::IPV4 !=
@@ -567,12 +525,12 @@
// range into |contraction_range|. The run of zeros must be at least 16 bits,
// and if there is a tie the first is chosen.
void ChooseIPv6ContractionRange(const unsigned char address[16],
- url_parse::Component* contraction_range) {
+ Component* contraction_range) {
// The longest run of zeros in |address| seen so far.
- url_parse::Component max_range;
+ Component max_range;
// The current run of zeros in |address| being iterated over.
- url_parse::Component cur_range;
+ Component cur_range;
for (int i = 0; i < 16; i += 2) {
// Test for 16 bits worth of zero.
@@ -581,7 +539,7 @@
if (is_zero) {
// Add the zero to the current range (or start a new one).
if (!cur_range.is_valid())
- cur_range = url_parse::Component(i, 0);
+ cur_range = Component(i, 0);
cur_range.len += 2;
}
@@ -601,12 +559,11 @@
// is NEUTRAL, and we could use a second opinion.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizeIPv6Address(const CHAR* spec,
- const url_parse::Component& host,
+ const Component& host,
CanonOutput* output,
CanonHostInfo* host_info) {
// Turn the IP address into a 128 bit number.
- unsigned char address[16];
- if (!IPv6AddressToNumber(spec, host, address)) {
+ if (!IPv6AddressToNumber(spec, host, host_info->address)) {
// If it's not an IPv6 address, scan for characters that should *only*
// exist in an IPv6 address.
for (int i = host.begin; i < host.end(); i++) {
@@ -626,12 +583,35 @@
host_info->out_host.begin = output->length();
output->push_back('[');
+ AppendIPv6Address(host_info->address, output);
+ output->push_back(']');
+ host_info->out_host.len = output->length() - host_info->out_host.begin;