Internal change PiperOrigin-RevId: 16620190 Change-Id: I0ab3e5a57f8717f3e6511be07e4d2d109591209d

commit: 6ee28c9143d800380b6ab2efe45a682651e8e815 [log] [tgz]
author: Devany Sandoval <sandovad@google.com> Thu Jul 29 08:56:43 2010 -0700
committer: sandovad <sandovad@google.com> Tue Sep 03 12:51:02 2019 -0700
tree: 67720a304b2340a49d098d619a7483d2fc15d803
parent: 2199655e9cb5804f00eaeeccc7be22de10012f3d [diff]
diff --git a/README.google b/README.google
index a283c92..d0678e1 100644
--- a/README.google
+++ b/README.google

@@ -1,5 +1,5 @@
 URL: http://google-url.googlecode.com/svn/trunk/
-Version: Snapshot of Subversion trunk, revision [7]
+Version: Snapshot of Subversion trunk, revision [139]
 License: BSD and MPL (one source file under MPL)
 License File: googleurl/LICENSE.txt
 

diff --git a/googleurl/README.txt b/googleurl/README.txt
index 9265d75..b28fd04 100644
--- a/googleurl/README.txt
+++ b/googleurl/README.txt

@@ -149,6 +149,18 @@
 example implementation is provided in src/gurl.*. You may wish to use this
 object, extend or modify it, or write your own.
 
+Whitespace
+----------
+Sometimes, you may want to remove linefeeds and tabs from the content of a URL.
+Some web pages, for example, expect that a URL spanning two lines should be
+treated as one with the newline removed. Depending on the source of the URLs
+you are canonicalizing, these newlines may or may not be trimmed off.
+
+If you want this behavior, call RemoveURLWhitespace before parsing. This will
+remove CR, LF and TAB from the input. Note that it preserves spaces. On typical
+URLs, this function produces a 10-15% speed reduction, so it is optional and
+not done automatically. The example GURL object and the url_util wrapper do
+this for you.
 
 Tests
 =====

diff --git a/googleurl/base/basictypes.h b/googleurl/base/basictypes.h
deleted file mode 100644
index ef1f2bd..0000000
--- a/googleurl/base/basictypes.h
+++ /dev/null

@@ -1,78 +0,0 @@
-// Copyright 2001 - 2003 Google Inc. All Rights Reserved
-
-#ifndef BASE_BASICTYPES_H__
-#define BASE_BASICTYPES_H__
-
-typedef unsigned short uint16;
-
-// The arraysize(arr) macro returns the # of elements in an array arr.
-// The expression is a compile-time constant, and therefore can be
-// used in defining new arrays, for example.  If you use arraysize on
-// a pointer by mistake, you will get a compile-time error.
-//
-// One caveat is that arraysize() doesn't accept any array of an
-// anonymous type or a type defined inside a function.  In these rare
-// cases, you have to use the unsafe ARRAYSIZE() macro below.  This is
-// due to a limitation in C++'s template system.  The limitation might
-// eventually be removed, but it hasn't happened yet.
-
-// This template function declaration is used in defining arraysize.
-// Note that the function doesn't need an implementation, as we only
-// use its type.
-template <typename T, size_t N>
-char (&ArraySizeHelper(T (&array)[N]))[N];
-
-// That gcc wants both of these prototypes seems mysterious. VC, for
-// its part, can't decide which to use (another mystery). Matching of
-// template overloads: the final frontier.
-#ifndef _MSC_VER
-template <typename T, size_t N>
-char (&ArraySizeHelper(const T (&array)[N]))[N];
-#endif
-
-#define arraysize(array) (sizeof(ArraySizeHelper(array)))
-
-// ARRAYSIZE performs essentially the same calculation as arraysize,
-// but can be used on anonymous types or types defined inside
-// functions.  It's less safe than arraysize as it accepts some
-// (although not all) pointers.  Therefore, you should use arraysize
-// whenever possible.
-//
-// The expression ARRAYSIZE(a) is a compile-time constant of type
-// size_t.
-//
-// ARRAYSIZE catches a few type errors.  If you see a compiler error
-//
-//   "warning: division by zero in ..."
-//
-// when using ARRAYSIZE, you are (wrongfully) giving it a pointer.
-// You should only use ARRAYSIZE on statically allocated arrays.
-//
-// The following comments are on the implementation details, and can
-// be ignored by the users.
-//
-// ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in
-// the array) and sizeof(*(arr)) (the # of bytes in one array
-// element).  If the former is divisible by the latter, perhaps arr is
-// indeed an array, in which case the division result is the # of
-// elements in the array.  Otherwise, arr cannot possibly be an array,
-// and we generate a compiler error to prevent the code from
-// compiling.
-//
-// Since the size of bool is implementation-defined, we need to cast
-// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
-// result has type size_t.
-//
-// This macro is not perfect as it wrongfully accepts certain
-// pointers, namely where the pointer size is divisible by the pointee
-// size.  Since all our code has to go through a 32-bit compiler,
-// where a pointer is 4 bytes, this means all pointers to a type whose
-// size is 3 or greater than 4 will be (righteously) rejected.
-//
-// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE.
-#define ARRAYSIZE_UNSAFE(a) \
-  ((sizeof(a) / sizeof(*(a))) / \
-   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
-#endif  // BASE_BASICTYPES_H__

diff --git a/googleurl/base/logging.cc b/googleurl/base/logging.cc
deleted file mode 100644
index b7710a1..0000000
--- a/googleurl/base/logging.cc
+++ /dev/null

@@ -1,380 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <ctime>
-#include <iomanip>
-#include <cstring>
-#include <windows.h>
-#include <tchar.h>
-#include <algorithm>
-#include "base/logging.h"
-
-namespace logging {
-
-const char* const log_severity_names[LOG_NUM_SEVERITIES] = {
-  "INFO", "WARNING", "ERROR", "FATAL" };
-
-int min_log_level = 0;
-LogLockingState lock_log_file = LOCK_LOG_FILE;
-LoggingDestination logging_destination = LOG_ONLY_TO_FILE;
-
-const int kMaxFilteredLogLevel = LOG_WARNING;
-char* log_filter_prefix = NULL;
-
-// which log file to use? This is initialized by InitLogging or
-// will be lazily initialized to the default value when it is
-// first needed.
-TCHAR log_file_name[MAX_PATH] = { 0 };
-
-// this file is lazily opened and the handle may be NULL
-HANDLE log_file = NULL;
-
-// what should be prepended to each message?
-bool log_process_id = false;
-bool log_thread_id = false;
-bool log_timestamp = true;
-bool log_tickcount = false;
-
-// An assert handler override specified by the client to be called instead of 
-// the debug message dialog.
-LogAssertHandlerFunction log_assert_handler = NULL;
-
-// The critical section is used if log file locking is false. It helps us
-// avoid problems with multiple threads writing to the log file at the same
-// time.
-bool initialized_critical_section = false;
-CRITICAL_SECTION log_critical_section;
-
-// When we don't use a critical section, we are using a global mutex. We
-// need to do this because LockFileEx is not thread safe
-HANDLE log_mutex = NULL;
-
-// Called by logging functions to ensure that log_file is initialized
-// and can be used for writing. Returns false if the file could not be
-// initialized. log_file will be NULL in this case.
-bool InitializeLogFileHandle() {
-  if (log_file)
-    return true;
-
-  if (!log_file_name[0]) {
-    // nobody has called InitLogging to specify a debug log file, so here we
-    // initialize the log file name to the default
-    GetModuleFileName(NULL, log_file_name, MAX_PATH);
-    TCHAR* last_backslash = _tcsrchr(log_file_name, '\\');
-    if (last_backslash)
-      last_backslash[1] = 0; // name now ends with the backslash
-    _tcscat_s(log_file_name, _T("debug.log"));
-  }
-
-  log_file = CreateFile(log_file_name, GENERIC_WRITE,
-                        FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
-                        OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
-  if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) {
-    // try the current directory 
-    log_file = CreateFile(_T(".\\debug.log"), GENERIC_WRITE,
-                          FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
-                          OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
-    if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) {
-      log_file = NULL;
-      return false;
-    }
-  }
-  SetFilePointer(log_file, 0, 0, FILE_END);
-  return true;
-}
-
-void InitLogMutex() {
-  if (!log_mutex) {
-    // \ is not a legal character in mutex names so we replace \ with /
-    std::wstring safe_name(log_file_name);
-    std::replace(safe_name.begin(), safe_name.end(), '\\', '/');
-    std::wstring t(L"Global\\");
-    t.append(safe_name);
-    log_mutex = ::CreateMutex(NULL, FALSE, t.c_str());
-  }
-}
-
-void InitLogging(const TCHAR* new_log_file, LoggingDestination logging_dest,
-                 LogLockingState lock_log, OldFileDeletionState delete_old) {
-  if (log_file) {
-    // calling InitLogging twice or after some log call has already opened the
-    // default log file will re-initialize to the new options
-    CloseHandle(log_file);
-    log_file = NULL;
-  }
-
-  lock_log_file = lock_log;
-  logging_destination = logging_dest;
-
-  // ignore file options if logging is only to system
-  if (logging_destination == LOG_ONLY_TO_SYSTEM_DEBUG_LOG)
-    return; 
-
-  _tcscpy_s(log_file_name, MAX_PATH, new_log_file);
-  if (delete_old == DELETE_OLD_LOG_FILE)
-    DeleteFile(log_file_name);
-
-  if (lock_log_file == LOCK_LOG_FILE) {
-    InitLogMutex();
-  } else if (!initialized_critical_section) {
-    // initialize the critical section
-    InitializeCriticalSection(&log_critical_section);
-    initialized_critical_section = true;
-  }
-
-  InitializeLogFileHandle();
-}
-
-void SetMinLogLevel(int level) {
-  min_log_level = level;
-}
-
-void SetLogFilterPrefix(char* filter)  {
-  if (log_filter_prefix) {
-    delete[] log_filter_prefix;
-    log_filter_prefix = NULL;
-  }
-
-  if (filter) {
-    size_t size = strlen(filter)+1;
-    log_filter_prefix = new char[size];
-    strcpy_s(log_filter_prefix, size, filter);
-  }
-}
-
-void SetLogItems(bool enable_process_id, bool enable_thread_id,
-                 bool enable_timestamp, bool enable_tickcount) {
-  log_process_id = enable_process_id;
-  log_thread_id = enable_thread_id;
-  log_timestamp = enable_timestamp;
-  log_tickcount = enable_tickcount;
-}
-
-void SetLogAssertHandler(LogAssertHandlerFunction handler) {
-  log_assert_handler = handler;
-}
-
-// Displays a message box to the user with the error message in it. For
-// Windows programs, it's possible that the message loop is messed up on
-// a fatal error, and creating a MessageBox will cause that message loop
-// to be run. Instead, we try to spawn another process that displays its
-// command line. We look for "debug_message.exe" in the same directory as
-// the application. If it exists, we use it, otherwise, we use a regular
-// message box.
-void DisplayDebugMessage(const std::string& str) {
-  if (str.empty())
-    return;
-
-  // look for the debug dialog program next to our application
-  wchar_t prog_name[MAX_PATH];
-  GetModuleFileNameW(NULL, prog_name, MAX_PATH);
-  wchar_t* backslash = wcsrchr(prog_name, '\\');
-  if (backslash)
-    backslash[1] = 0;
-  wcscat_s(prog_name, MAX_PATH, L"debug_message.exe");
-
-  // stupid CreateProcess requires a non-const command line and may modify it.
-  // We also want to use the wide string
-  int charcount = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0);
-  if (!charcount)
-    return;
-  scoped_array<wchar_t> cmdline(new wchar_t[charcount]);
-  if (!MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, cmdline.get(), charcount))
-    return;
-
-  STARTUPINFO startup_info;
-  memset(&startup_info, 0, sizeof(startup_info));
-  startup_info.cb = sizeof(startup_info);
-
-  PROCESS_INFORMATION process_info;
-  if (CreateProcessW(prog_name, cmdline.get(), NULL, NULL, false, 0, NULL,
-                     NULL, &startup_info, &process_info)) {
-    WaitForSingleObject(process_info.hProcess, INFINITE);
-    CloseHandle(process_info.hThread);
-    CloseHandle(process_info.hProcess);
-  } else {
-    // debug process broken, let's just do a message box
-    MessageBoxW(NULL, cmdline.get(), L"Fatal error", MB_OK | MB_ICONHAND);
-  }
-}
-
-LogMessage::LogMessage(const char* file, int line, LogSeverity severity,
-		       int ctr)
-    : severity_(severity) {
-  Init(file, line);
-}
-
-LogMessage::LogMessage(const char* file, int line, const CheckOpString& result)
-    : severity_(LOG_FATAL) {
-  Init(file, line);
-  stream_ << "Check failed: " << (*result.str_);
-}
-
-LogMessage::LogMessage(const char* file, int line)
-     : severity_(LOG_INFO) {
-  Init(file, line);
-}
-
-LogMessage::LogMessage(const char* file, int line, LogSeverity severity)
-    : severity_(severity) {
-  Init(file, line);
-}
-
-// writes the common header info to the stream
-void LogMessage::Init(const char* file, int line) {
-  // log only the filename
-  const char* last_slash = strrchr(file, '\\');
-  if (last_slash)
-    file = last_slash + 1;
-
-  stream_ <<  '[';
-  if (log_process_id)
-    stream_ << GetCurrentProcessId() << ':';
-  if (log_thread_id)
-    stream_ << GetCurrentThreadId() << ':';
-  if (log_timestamp) {
-    time_t t = time(NULL);
-    struct tm tm_time;
-    localtime_s(&tm_time, &t);
-    stream_ << std::setfill('0')
-            << std::setw(2) << 1 + tm_time.tm_mon
-            << std::setw(2) << tm_time.tm_mday
-            << '/'
-            << std::setw(2) << tm_time.tm_hour
-            << std::setw(2) << tm_time.tm_min
-            << std::setw(2) << tm_time.tm_sec
-            << ':';
-  }
-  if (log_tickcount)
-    stream_ << GetTickCount() << ':';
-  stream_ << log_severity_names[severity_] << ":" << file << "(" << line << ")] ";
-
-  message_start_ = stream_.pcount();
-}
-
-LogMessage::~LogMessage() {
-  if (severity_ < min_log_level)
-    return;
-
-  std::string str_newline(stream_.str(), stream_.pcount());
-  str_newline.append("\r\n");
-
-  if (log_filter_prefix && severity_ <= kMaxFilteredLogLevel &&
-      str_newline.compare(message_start_, strlen(log_filter_prefix),
-                          log_filter_prefix) != 0) {
-    goto cleanup;
-  }
-
-  if (logging_destination != LOG_ONLY_TO_FILE)
-    OutputDebugStringA(str_newline.c_str());
-
-  // write to log file
-  if (logging_destination != LOG_ONLY_TO_SYSTEM_DEBUG_LOG &&
-      InitializeLogFileHandle()) {
-    // we can have multiple threads and/or processes, so try to prevent them from
-    // clobbering each other's writes
-    if (lock_log_file == LOCK_LOG_FILE) {
-      // Ensure that the mutex is initialized in case the client app did not
-      // call InitLogging. This is not thread safe. See below
-      InitLogMutex();
-
-      DWORD r = ::WaitForSingleObject(log_mutex, INFINITE);
-      DCHECK(r != WAIT_ABANDONED);
-    } else {
-      // use the critical section
-      if (!initialized_critical_section) {
-        // The client app did not call InitLogging, and so the critical section
-        // has not been created. We do this on demand, but if two threads try to
-        // do this at the same time, there will be a race condition to create
-        // the critical section. This is why InitLogging should be called from
-        // the main thread at the beginning of execution.
-        InitializeCriticalSection(&log_critical_section);
-        initialized_critical_section = true;
-      }
-      EnterCriticalSection(&log_critical_section);
-    }
-
-    SetFilePointer(log_file, 0, 0, SEEK_END);
-    DWORD num_written;
-    WriteFile(log_file, (void*)str_newline.c_str(), (DWORD)str_newline.length(), &num_written, NULL);
-
-    if (lock_log_file == LOCK_LOG_FILE) {
-      ReleaseMutex(log_mutex);
-    } else {
-      LeaveCriticalSection(&log_critical_section);
-    }
-  }
-
-  if (severity_ == LOG_FATAL) {
-    // display a message or break into the debugger on a fatal error
-    if (::IsDebuggerPresent()) {
-      DebugBreak();
-    } else {
-      if (log_assert_handler) {
-        log_assert_handler(std::string(stream_.str(), stream_.pcount()));
-      } else {
-        // don't use the string with the newline, get a fresh version to send to
-        // the debug message process
-        DisplayDebugMessage(std::string(stream_.str(), stream_.pcount()));
-        TerminateProcess(GetCurrentProcess(), 1);
-      }
-    }
-  }
-
-cleanup:
-  // Calling stream_.str() freezes the stream buffer.  A frozen buffer will
-  // not be freed during strstreambuf destruction.
-  stream_.freeze(false);
-}
-
-void CloseLogFile() {
-  if (!log_file)
-    return;
-
-  CloseHandle(log_file);
-  log_file = NULL;
-}
-
-} // namespace logging
-
-std::ostream& operator<<(std::ostream& out, const wchar_t* wstr) {
-  if (!wstr || !wstr[0])
-    return out;
-
-  // compute the length of the buffer we'll need
-  int charcount = WideCharToMultiByte(CP_UTF8, 0, wstr, -1,
-                                      NULL, 0, NULL, NULL);
-  if (charcount == 0)
-    return out;
-
-  // convert
-  scoped_array<char> buf(new char[charcount]);
-  WideCharToMultiByte(CP_UTF8, 0, wstr, -1, buf.get(), charcount, NULL, NULL);
-  return out << buf.get();
-}

diff --git a/googleurl/base/logging.h b/googleurl/base/logging.h
deleted file mode 100644
index d7c0f46..0000000
--- a/googleurl/base/logging.h
+++ /dev/null

@@ -1,482 +0,0 @@
-// Copyright 2006 Google Inc. All Rights Reserved.
-// Author: brettw (Brett Wilson)
-
-#ifndef BASE_LOGGING_H__
-#define BASE_LOGGING_H__
-
-#include <string>
-#include <cstring>
-#include <strstream>
-#include <tchar.h>
-
-#include "base/basictypes.h"
-#include "base/scoped_ptr.h"
-
-// Optional message capabilities
-// -----------------------------
-// Assertion failed messages and fatal errors are displayed in a dialog box
-// before the application exits. However, running this UI creates a message
-// loop, which causes application messages to be processed and potentially
-// dispatched to existing application windows. Since the application is in a
-// bad state when this assertion dialog is displayed, these messages may not
-// get processed and hang the dialog, or the application might go crazy.
-//
-// Therefore, it can be beneficial to display the error dialog in a separate
-// process from the main application. When the logging system needs to display
-// a fatal error dialog box, it will look for a program called
-// "debug_message.exe" in the same directory as the application executable. It
-// will run this application with the message as the command line, and will
-// not include the name of the application as is traditional for easier
-// parsing.
-//
-// The code for debug_message.exe is only one line. In WinMain, do:
-//   MessageBox(NULL, GetCommandLineW(), L"Fatal Error", 0);
-//
-// If debug_message.exe is not found, the logging code will use a normal
-// MessageBox, potentially causing the problems discussed above.
-
-
-// Instructions
-// ------------
-//
-// Make a bunch of macros for logging.  The way to log things is to stream
-// things to LOG(<a particular severity level>).  E.g.,
-//
-//   LOG(INFO) << "Found " << num_cookies << " cookies";
-//
-// You can also do conditional logging:
-//
-//   LOG_IF(INFO, num_cookies > 10) << "Got lots of cookies";
-//
-// The above will cause the log message to be output only when the condition
-// (here, num_cookies > 10) is true.  When the condition is false, nothing is
-// logged and the streamed arguments are not evaluated.
-//
-// There are also "debug mode" logging macros like the ones above:
-//
-//   DLOG(INFO) << "Found cookies";
-//
-//   DLOG_IF(INFO, num_cookies > 10) << "Got lots of cookies";
-//
-// All "debug mode" logging is compiled away to nothing for non-debug mode
-// compiles.  LOG_IF and development flags also work well together
-// because the code can be compiled away sometimes.
-//
-// We also have
-//
-//   LOG_ASSERT(assertion);
-//   DLOG_ASSERT(assertion);
-//
-// which is syntactic sugar for {,D}LOG_IF(FATAL, assert fails) << assertion;
-//
-// We also override the standard 'assert' to use 'DLOG_ASSERT'.
-//
-// The supported severity levels for macros that allow you to specify one
-// are (in increasing order of severity) INFO, WARNING, ERROR, and FATAL.
-//
-// There is also the special severity of DFATAL, which logs FATAL in
-// debug mode, ERROR in normal mode.
-//
-// Very important: logging a message at the FATAL severity level causes
-// the program to terminate (after the message is logged).
-
-namespace logging {
-
-// Where to record logging output? A flat file and/or system debug log via
-// OutputDebugString. Defaults to LOG_ONLY_TO_FILE.
-enum LoggingDestination { LOG_ONLY_TO_FILE, 
-                          LOG_ONLY_TO_SYSTEM_DEBUG_LOG,
-                          LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG };
-
-// Indicates that the log file should be locked when being written to.
-// Often, there is no locking, which is fine for a single threaded program.
-// If logging is being done from multiple threads or there can be more than
-// one process doing the logging, the file should be locked during writes to
-// make each log output atomic. Other writers will block.
-//
-// All processes writing to the log file must have their locking set for it to
-// work properly. Defaults to LOCK_LOG_FILE.
-enum LogLockingState { LOCK_LOG_FILE, DONT_LOCK_LOG_FILE };
-
-// On startup, should we delete or append to an existing log file (if any)?
-// Defaults to APPEND_TO_OLD_LOG_FILE.
-enum OldFileDeletionState { DELETE_OLD_LOG_FILE, APPEND_TO_OLD_LOG_FILE };
-
-// Sets the log file name and other global logging state. Calling this function
-// is recommended, and is normally done at the beginning of application init.
-// If you don't call it, all the flags will be initialized to their default
-// values, and there is a race condition that may leak a critical section
-// object if two threads try to do the first log at the same time.
-// See the definition of the enums above for descriptions and default values.
-//
-// The default log file is initialized to "debug.log" in the application
-// directory. You probably don't want this, especially since the program
-// directory may not be writable on an enduser's system.
-void InitLogging(const TCHAR* log_file, LoggingDestination logging_dest,
-                 LogLockingState lock_log, OldFileDeletionState delete_old);
-
-// Sets the log level. Anything at or above this level will be written to the
-// log file/displayed to the user (if applicable). Anything below this level
-// will be silently ignored. The log level defaults to 0 (everything is logged)
-// if this function is not called.
-void SetMinLogLevel(int level);
-
-// Sets the log filter prefix.  Any log message below LOG_ERROR severity that
-// doesn't start with this prefix will be silently ignored.  The filter defaults
-// to NULL (everything is logged) if this function is not called.  Messages
-// with severity of LOG_ERROR or higher will not be filtered.
-void SetLogFilterPrefix(char* filter);
-
-// Sets the common items you want to be prepended to each log message.
-// process and thread IDs default to off, the timestamp defaults to on.
-// If this function is not called, logging defaults to writing the timestamp
-// only.
-void SetLogItems(bool enable_process_id, bool enable_thread_id,
-                 bool enable_timestamp, bool enable_tickcount);
-
-// Sets the Log Assert Handler that will be used to notify of check failures.
-// The default handler shows a dialog box, however clients can use this 
-// function to override with their own handling (e.g. a silent one for Unit
-// Tests)
-typedef void (*LogAssertHandlerFunction)(const std::string& str);
-void SetLogAssertHandler(LogAssertHandlerFunction handler);
-
-typedef int LogSeverity;
-const LogSeverity LOG_INFO = 0;
-const LogSeverity LOG_WARNING = 1;
-const LogSeverity LOG_ERROR = 2;
-const LogSeverity LOG_FATAL = 3;
-const LogSeverity LOG_NUM_SEVERITIES = 4;
-
-// LOG_DFATAL_LEVEL is LOG_FATAL in debug mode, ERROR in normal mode
-#ifdef NDEBUG
-const LogSeverity LOG_DFATAL_LEVEL = LOG_ERROR;
-#else
-const LogSeverity LOG_DFATAL_LEVEL = LOG_FATAL;
-#endif
-
-// A few definitions of macros that don't generate much code. These are used
-// by LOG() and LOG_IF, etc. Since these are used all over our code, it's
-// better to have compact code for these operations.
-#define COMPACT_GOOGLE_LOG_INFO \
-  logging::LogMessage(__FILE__, __LINE__)
-#define COMPACT_GOOGLE_LOG_WARNING \
-  logging::LogMessage(__FILE__, __LINE__, logging::LOG_WARNING)
-#define COMPACT_GOOGLE_LOG_ERROR \
-  logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR)
-#define COMPACT_GOOGLE_LOG_FATAL \
-  logging::LogMessage(__FILE__, __LINE__, logging::LOG_FATAL)
-#define COMPACT_GOOGLE_LOG_DFATAL \
-  logging::LogMessage(__FILE__, __LINE__, logging::LOG_DFATAL_LEVEL)
-
-// wingdi.h defines ERROR to be 0. When we call LOG(ERROR), it gets
-// substituted with 0, and it expands to COMPACT_GOOGLE_LOG_0. To allow us
-// to keep using this syntax, we define this macro to do the same thing
-// as COMPACT_GOOGLE_LOG_ERROR, and also define ERROR the same way that
-// the Windows SDK does for consistency.
-#define ERROR 0
-#define COMPACT_GOOGLE_LOG_0 \
-  logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR)
-
-// We use the preprocessor's merging operator, "##", so that, e.g.,
-// LOG(INFO) becomes the token COMPACT_GOOGLE_LOG_INFO.  There's some funny
-// subtle difference between ostream member streaming functions (e.g.,
-// ostream::operator<<(int)) and ostream non-member streaming functions
-// (e.g., ::operator<<(ostream&, string&)): it turns out that it's
-// impossible to stream something like a string directly to an unnamed
-// ostream. We employ a neat hack by calling the stream() member
-// function of LogMessage which seems to avoid the problem.
-
-#define LOG(severity) COMPACT_GOOGLE_LOG_ ## severity.stream()
-#define SYSLOG(severity) LOG(severity)
-
-#define LOG_IF(severity, condition) \
-  !(condition) ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
-#define SYSLOG_IF(severity, condition) LOG_IF(severity, condition)
-
-#define LOG_ASSERT(condition)  \
-  LOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". "
-#define SYSLOG_ASSERT(condition) \
-  SYSLOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". "
-
-// A container for a string pointer which can be evaluated to a bool -
-// true iff the pointer is non-NULL.
-struct CheckOpString {
-  CheckOpString(std::string* str) : str_(str) { }
-  // No destructor: if str_ is non-NULL, we're about to LOG(FATAL),
-  // so there's no point in cleaning up str_.
-  operator bool() const { return str_ != NULL; }
-  std::string* str_;
-};
-
-// Build the error message string.  This is separate from the "Impl"
-// function template because it is not performance critical and so can
-// be out of line, while the "Impl" code should be inline.
-template<class t1, class t2>
-std::string* MakeCheckOpString(const t1& v1, const t2& v2, const char* names) {
-  std::ostrstream ss;
-  ss << names << " (" << v1 << " vs. " << v2 << ")";
-  return new std::string(ss.str(), ss.pcount());
-}
-
-extern std::string* MakeCheckOpStringIntInt(int v1, int v2, const char* names);
-
-template<int, int>
-std::string* MakeCheckOpString(const int& v1, const int& v2, const char* names) {
-  return MakeCheckOpStringIntInt(v1, v2, names);
-}
-
-// Plus some debug-logging macros that get compiled to nothing for production
-//
-// DEBUG_MODE is for uses like
-//   if (DEBUG_MODE) foo.CheckThatFoo();
-// instead of
-//   #ifndef NDEBUG
-//     foo.CheckThatFoo();
-//   #endif
-
-#ifndef NDEBUG
-
-#define DLOG(severity) LOG(severity)
-#define DLOG_IF(severity, condition) LOG_IF(severity, condition)
-#define DLOG_ASSERT(condition) LOG_ASSERT(condition)
-
-// debug-only checking.  not executed in NDEBUG mode.
-enum { DEBUG_MODE = 1 };
-#define DCHECK(condition) \
-  LOG_IF(FATAL, !(condition)) << "Check failed: " #condition ". "
-
-// Helper functions for DCHECK_OP macro.
-// The (int, int) specialization works around the issue that the compiler
-// will not instantiate the template version of the function on values of
-// unnamed enum type - see comment below.
-#define DEFINE_DCHECK_OP_IMPL(name, op) \
-  template <class t1, class t2> \
-  inline std::string* Check##name##Impl(const t1& v1, const t2& v2, \
-                                        const char* names) { \
-    if (v1 op v2) return NULL; \
-    else return MakeCheckOpString(v1, v2, names); \
-  } \
-  inline std::string* Check##name##Impl(int v1, int v2, const char* names) { \
-    if (v1 op v2) return NULL; \
-    else return MakeCheckOpString(v1, v2, names); \
-  }
-DEFINE_DCHECK_OP_IMPL(EQ, ==)
-DEFINE_DCHECK_OP_IMPL(NE, !=)
-DEFINE_DCHECK_OP_IMPL(LE, <=)
-DEFINE_DCHECK_OP_IMPL(LT, < )
-DEFINE_DCHECK_OP_IMPL(GE, >=)
-DEFINE_DCHECK_OP_IMPL(GT, > )
-#undef DEFINE_DCHECK_OP_IMPL
-
-// Helper macro for binary operators.
-// Don't use this macro directly in your code, use CHECK_EQ et al below.
-#define DCHECK_OP(name, op, val1, val2)  \
-  while (logging::CheckOpString _result = \
-         logging::Check##name##Impl((val1), (val2), #val1 " " #op " " #val2)) \
-    logging::LogMessage(__FILE__, __LINE__, _result).stream()
-
-// Equality/Inequality checks - compare two values, and log a LOG_FATAL message
-// including the two values when the result is not as expected.  The values
-// must have operator<<(ostream, ...) defined.
-//
-// You may append to the error message like so:
-//   CHECK_NE(1, 2) << ": The world must be ending!";
-//
-// We are very careful to ensure that each argument is evaluated exactly
-// once, and that anything which is legal to pass as a function argument is
-// legal here.  In particular, the arguments may be temporary expressions
-// which will end up being destroyed at the end of the apparent statement,
-// for example:
-//   CHECK_EQ(string("abc")[1], 'b');
-//
-// WARNING: These don't compile correctly if one of the arguments is a pointer
-// and the other is NULL. To work around this, simply static_cast NULL to the
-// type of the desired pointer.
-
-#define DCHECK_EQ(val1, val2) DCHECK_OP(EQ, ==, val1, val2)
-#define DCHECK_NE(val1, val2) DCHECK_OP(NE, !=, val1, val2)
-#define DCHECK_LE(val1, val2) DCHECK_OP(LE, <=, val1, val2)
-#define DCHECK_LT(val1, val2) DCHECK_OP(LT, < , val1, val2)
-#define DCHECK_GE(val1, val2) DCHECK_OP(GE, >=, val1, val2)
-#define DCHECK_GT(val1, val2) DCHECK_OP(GT, > , val1, val2)
-
-// Helper functions for string comparisons.
-// To avoid bloat, the definitions are in logging.cc.
-#define DECLARE_DCHECK_STROP_IMPL(func, expected) \
-  std::string* Check##func##expected##Impl(const char* s1, \
-                                           const char* s2, \
-                                           const char* names);
-DECLARE_DCHECK_STROP_IMPL(strcmp, true)
-DECLARE_DCHECK_STROP_IMPL(strcmp, false)
-DECLARE_DCHECK_STROP_IMPL(_stricmp, true)
-DECLARE_DCHECK_STROP_IMPL(_stricmp, false)
-#undef DECLARE_DCHECK_STROP_IMPL
-
-// Helper macro for string comparisons.
-// Don't use this macro directly in your code, use CHECK_STREQ et al below.
-#define DCHECK_STROP(func, op, expected, s1, s2) \
-  while (CheckOpString _result = \
-      logging::Check##func##expected##Impl((s1), (s2), \
-                                           #s1 " " #op " " #s2)) \
-    LOG(FATAL) << *_result.str_
-
-// String (char*) equality/inequality checks.
-// CASE versions are case-insensitive.
-//
-// Note that "s1" and "s2" may be temporary strings which are destroyed
-// by the compiler at the end of the current "full expression"
-// (e.g. DCHECK_STREQ(Foo().c_str(), Bar().c_str())).
-
-#define DCHECK_STREQ(s1, s2) DCHECK_STROP(strcmp, ==, true, s1, s2)
-#define DCHECK_STRNE(s1, s2) DCHECK_STROP(strcmp, !=, false, s1, s2)
-#define DCHECK_STRCASEEQ(s1, s2) DCHECK_STROP(_stricmp, ==, true, s1, s2)
-#define DCHECK_STRCASENE(s1, s2) DCHECK_STROP(_stricmp, !=, false, s1, s2)
-
-#define DCHECK_INDEX(I,A) DCHECK(I < (sizeof(A)/sizeof(A[0])))
-#define DCHECK_BOUND(B,A) DCHECK(B <= (sizeof(A)/sizeof(A[0])))
-
-#else  // NDEBUG
-
-#define DLOG(severity) \
-  true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
-
-#define DLOG_IF(severity, condition) \
-  true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
-
-#define DLOG_ASSERT(condition) \
-  true ? (void) 0 : LOG_ASSERT(condition)
-
-enum { DEBUG_MODE = 0 };
-
-// This macro can be followed by a sequence of stream parameters in
-// non-debug mode. The DCHECK and friends macros use this so that
-// the expanded expression DCHECK(foo) << "asdf" is still syntactically
-// valid, even though the expression will get optimized away.
-#define NDEBUG_EAT_STREAM_PARAMETERS \
-  logging::LogMessage(__FILE__, __LINE__).stream()
-
-#define DCHECK(condition) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_EQ(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_NE(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_LE(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_LT(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_GE(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_GT(val1, val2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_STREQ(str1, str2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_STRCASEEQ(str1, str2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_STRNE(str1, str2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#define DCHECK_STRCASENE(str1, str2) \
-  while (false) NDEBUG_EAT_STREAM_PARAMETERS
-
-#endif  // NDEBUG
-
-#define NOTREACHED() DCHECK(false)
-
-// Redefine the standard assert to use our nice log files
-#undef assert
-#define assert(x) DLOG_ASSERT(x)
-
-// This class more or less represents a particular log message.  You
-// create an instance of LogMessage and then stream stuff to it.
-// When you finish streaming to it, ~LogMessage is called and the
-// full message gets streamed to the appropriate destination.
-//
-// You shouldn't actually use LogMessage's constructor to log things,
-// though.  You should use the LOG() macro (and variants thereof)
-// above.
-class LogMessage {
- public:
-  LogMessage(const char* file, int line, LogSeverity severity, int ctr);
-
-  // Two special constructors that generate reduced amounts of code at
-  // LOG call sites for common cases.
-  //
-  // Used for LOG(INFO): Implied are:
-  // severity = LOG_INFO, ctr = 0
-  //
-  // Using this constructor instead of the more complex constructor above
-  // saves a couple of bytes per call site.
-  LogMessage(const char* file, int line);
-
-  // Used for LOG(severity) where severity != INFO.  Implied
-  // are: ctr = 0
-  //
-  // Using this constructor instead of the more complex constructor above
-  // saves a couple of bytes per call site.
-  LogMessage(const char* file, int line, LogSeverity severity);
-
-  // A special constructor used for check failures.
-  // Implied severity = LOG_FATAL
-  LogMessage(const char* file, int line, const CheckOpString& result);
-
-  ~LogMessage();
-
-  std::ostream& stream() { return stream_; }
-
- private:
-  void Init(const char* file, int line);
-
-  LogSeverity severity_;
-  std::ostrstream stream_;
-  int message_start_;  // offset of the start of the message (past prefix info).
-
-  DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
-};
-
-// A non-macro interface to the log facility; (useful
-// when the logging level is not a compile-time constant).
-inline void LogAtLevel(int const log_level, std::string const &msg) {
-  LogMessage(__FILE__, __LINE__, log_level).stream() << msg;
-}
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros.  This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-class LogMessageVoidify {
- public:
-  LogMessageVoidify() { }
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(std::ostream&) { }
-};
-
-// Closes the log file explicitly if open.
-// NOTE: Since the log file is opened as necessary by the action of logging
-//       statements, there's no guarantee that it will stay closed
-//       after this call.
-void CloseLogFile();
-
-} // namespace Logging
-
-// These functions are provided as a convenience for logging, which is where we
-// use streams (it is against Google style to use streams in other places). It
-// is designed to allow you to emit non-ASCII Unicode strings to the log file,
-// which is normally ASCII. It is relatively slow, so try not to use it for
-// common cases. Non-ASCII characters will be converted to UTF-8 by these operators.
-std::ostream& operator<<(std::ostream& out, const wchar_t* wstr);
-inline std::ostream& operator<<(std::ostream& out, const std::wstring& wstr) {
-  return out << wstr.c_str();
-}
-
-#endif  // BASE_LOGGING_H__

diff --git a/googleurl/base/scoped_ptr.h b/googleurl/base/scoped_ptr.h
deleted file mode 100644
index aa21691..0000000
--- a/googleurl/base/scoped_ptr.h
+++ /dev/null

@@ -1,322 +0,0 @@
-#ifndef BASE_SCOPED_PTR_H
-#define BASE_SCOPED_PTR_H
-
-//  (C) Copyright Greg Colvin and Beman Dawes 1998, 1999.
-//  Copyright (c) 2001, 2002 Peter Dimov
-//
-//  Permission to copy, use, modify, sell and distribute this software
-//  is granted provided this copyright notice appears in all copies.
-//  This software is provided "as is" without express or implied
-//  warranty, and with no claim as to its suitability for any purpose.
-//
-//  See http://www.boost.org/libs/smart_ptr/scoped_ptr.htm for documentation.
-//
-
-//  scoped_ptr mimics a built-in pointer except that it guarantees deletion
-//  of the object pointed to, either on destruction of the scoped_ptr or via
-//  an explicit reset(). scoped_ptr is a simple solution for simple needs;
-//  use shared_ptr or std::auto_ptr if your needs are more complex.
-
-//  *** NOTE ***
-//  If your scoped_ptr is a class member of class FOO pointing to a 
-//  forward declared type BAR (as shown below), then you MUST use a non-inlined 
-//  version of the destructor.  The destructor of a scoped_ptr (called from
-//  FOO's destructor) must have a complete definition of BAR in order to 
-//  destroy it.  Example:
-//
-//  -- foo.h --
-//  class BAR;
-//
-//  class FOO {
-//   public:
-//    FOO();
-//    ~FOO();  // Required for sources that instantiate class FOO to compile!
-//    
-//   private:
-//    scoped_ptr<BAR> bar_;
-//  };
-//
-//  -- foo.cc --
-//  #include "foo.h"
-//  FOO::~FOO() {} // Empty, but must be non-inlined to FOO's class definition.
-
-#include <cstddef>            // for std::ptrdiff_t
-#include <assert.h>           // for assert
-#include <stdlib.h>           // for free() decl
-
-template <typename T>
-class scoped_ptr {
- private:
-
-  T* ptr;
-
-  scoped_ptr(scoped_ptr const &);
-  scoped_ptr & operator=(scoped_ptr const &);
-
- public:
-
-  typedef T element_type;
-
-  explicit scoped_ptr(T* p = 0): ptr(p) {}
-
-  ~scoped_ptr() {
-    typedef char type_must_be_complete[sizeof(T)];
-    delete ptr;
-  }
-
-  void reset(T* p = 0) {
-    typedef char type_must_be_complete[sizeof(T)];
-
-    if (ptr != p) {
-      delete ptr;
-      ptr = p;
-    }
-  }
-
-  T& operator*() const {
-    assert(ptr != 0);
-    return *ptr;
-  }
-
-  T* operator->() const  {
-    assert(ptr != 0);
-    return ptr;
-  }
-
-  bool operator==(T* p) const {
-    return ptr == p;
-  }
-
-  bool operator!=(T* p) const {
-    return ptr != p;
-  }
-
-  T* get() const  {
-    return ptr;
-  }
-
-  void swap(scoped_ptr & b) {
-    T* tmp = b.ptr;
-    b.ptr = ptr;
-    ptr = tmp;
-  }
-
-  T* release() {
-    T* tmp = ptr;
-    ptr = 0;
-    return tmp;
-  }
-
- private:
-
-  // no reason to use these: each scoped_ptr should have its own object
-  template <typename U> bool operator==(scoped_ptr<U> const& p) const;
-  template <typename U> bool operator!=(scoped_ptr<U> const& p) const;
-};
-
-template<typename T> inline
-void swap(scoped_ptr<T>& a, scoped_ptr<T>& b) {
-  a.swap(b);
-}
-
-template<typename T> inline
-bool operator==(T* p, const scoped_ptr<T>& b) {
-  return p == b.get();
-}
-
-template<typename T> inline
-bool operator!=(T* p, const scoped_ptr<T>& b) {
-  return p != b.get();
-}
-
-//  scoped_array extends scoped_ptr to arrays. Deletion of the array pointed to
-//  is guaranteed, either on destruction of the scoped_array or via an explicit
-//  reset(). Use shared_array or std::vector if your needs are more complex.
-
-template<typename T>
-class scoped_array {
- private:
-
-  T* ptr;
-
-  scoped_array(scoped_array const &);
-  scoped_array & operator=(scoped_array const &);
-
- public:
-
-  typedef T element_type;
-
-  explicit scoped_array(T* p = 0) : ptr(p) {}
-
-  ~scoped_array() {
-    typedef char type_must_be_complete[sizeof(T)];
-    delete[] ptr;
-  }
-
-  void reset(T* p = 0) {
-    typedef char type_must_be_complete[sizeof(T)];
-
-    if (ptr != p) {
-      delete [] ptr;
-      ptr = p;
-    }
-  }
-
-  T& operator[](std::ptrdiff_t i) const {
-    assert(ptr != 0);
-    assert(i >= 0);
-    return ptr[i];
-  }
-
-  bool operator==(T* p) const {
-    return ptr == p;
-  }
-
-  bool operator!=(T* p) const {
-    return ptr != p;
-  }
-
-  T* get() const {
-    return ptr;
-  }
-
-  void swap(scoped_array & b) {
-    T* tmp = b.ptr;
-    b.ptr = ptr;
-    ptr = tmp;
-  }
-
-  T* release() {
-    T* tmp = ptr;
-    ptr = 0;
-    return tmp;
-  }
-
- private:
-
-  // no reason to use these: each scoped_array should have its own object
-  template <typename U> bool operator==(scoped_array<U> const& p) const;
-  template <typename U> bool operator!=(scoped_array<U> const& p) const;
-};
-
-template<class T> inline
-void swap(::scoped_array<T>& a, ::scoped_array<T>& b) {
-  a.swap(b);
-}
-
-template<typename T> inline
-bool operator==(T* p, const ::scoped_array<T>& b) {
-  return p == b.get();
-}
-
-template<typename T> inline
-bool operator!=(T* p, const ::scoped_array<T>& b) {
-  return p != b.get();
-}
-
-
-// This class wraps the c library function free() in a class that can be
-// passed as a template argument to scoped_ptr_malloc below.
-class ScopedPtrMallocFree {
- public:
-  inline void operator()(void* x) const {
-    free(x);
-  }
-};
-
-// scoped_ptr_malloc<> is similar to scoped_ptr<>, but it accepts a
-// second template argument, the functor used to free the object.
-
-template<typename T, typename FreeProc = ScopedPtrMallocFree>
-class scoped_ptr_malloc {
- private:
-
-  T* ptr;
-
-  scoped_ptr_malloc(scoped_ptr_malloc const &);
-  scoped_ptr_malloc & operator=(scoped_ptr_malloc const &);
-
- public:
-
-  typedef T element_type;
-
-  explicit scoped_ptr_malloc(T* p = 0): ptr(p) {}
-
-  ~scoped_ptr_malloc() {
-    typedef char type_must_be_complete[sizeof(T)];
-    free_((void*) ptr);
-  }
-
-  void reset(T* p = 0) {
-    typedef char type_must_be_complete[sizeof(T)];
-
-    if (ptr != p) {
-      free_((void*) ptr);
-      ptr = p;
-    }
-  }
-
-  T& operator*() const {
-    assert(ptr != 0);
-    return *ptr;
-  }
-
-  T* operator->() const {
-    assert(ptr != 0);
-    return ptr;
-  }
-
-  bool operator==(T* p) const {
-    return ptr == p;
-  }
-
-  bool operator!=(T* p) const {
-    return ptr != p;
-  }
-
-  T* get() const {
-    return ptr;
-  }
-
-  void swap(scoped_ptr_malloc & b) {
-    T* tmp = b.ptr;
-    b.ptr = ptr;
-    ptr = tmp;
-  }
-
-  T* release() {
-    T* tmp = ptr;
-    ptr = 0;
-    return tmp;
-  }
-
- private:
-
-  // no reason to use these: each scoped_ptr_malloc should have its own object
-  template <typename U, typename GP>
-  bool operator==(scoped_ptr_malloc<U, GP> const& p) const;
-  template <typename U, typename GP>
-  bool operator!=(scoped_ptr_malloc<U, GP> const& p) const;
-
-  static FreeProc const free_;
-};
-
-template<typename T, typename FP>
-FP const scoped_ptr_malloc<T,FP>::free_ = FP();
-
-template<typename T, typename FP> inline
-void swap(scoped_ptr_malloc<T,FP>& a, scoped_ptr_malloc<T,FP>& b) {
-  a.swap(b);
-}
-
-template<typename T, typename FP> inline
-bool operator==(T* p, const scoped_ptr_malloc<T,FP>& b) {
-  return p == b.get();
-}
-
-template<typename T, typename FP> inline
-bool operator!=(T* p, const scoped_ptr_malloc<T,FP>& b) {
-  return p != b.get();
-}
-
-#endif  // #ifndef BASE_SCOPED_PTR_H

diff --git a/googleurl/base/string16.cc b/googleurl/base/string16.cc
new file mode 100644
index 0000000..fc25809
--- /dev/null
+++ b/googleurl/base/string16.cc

@@ -0,0 +1,94 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/string16.h"
+
+#ifdef WIN32
+
+#error This file should not be used on 2-byte wchar_t systems
+// If this winds up being needed on 2-byte wchar_t systems, either the
+// definitions below can be used, or the host system's wide character
+// functions like wmemcmp can be wrapped.
+
+#else  // !WIN32
+
+namespace base {
+
+int c16memcmp(const char16* s1, const char16* s2, size_t n) {
+  // memcmp is not used here because it would change the semantics.
+  for (size_t i = 0; i < n; ++i) {
+    const char16 lhs = s1[i];
+    const char16 rhs = s2[i];
+    // char16 is unsigned, so the sign of the result comes from an explicit
+    // comparison rather than from the difference (lhs - rhs).
+    if (lhs != rhs)
+      return lhs < rhs ? -1 : 1;
+  }
+  return 0;
+}
+
+size_t c16len(const char16* s) {
+  // Count the code units that come before the terminating zero.
+  size_t length = 0;
+  while (s[length] != 0)
+    ++length;
+  return length;
+}
+
+const char16* c16memchr(const char16* s, char16 c, size_t n) {
+  // Look at the first |n| code units only; report the earliest match.
+  for (size_t i = 0; i < n; ++i) {
+    if (s[i] == c)
+      return s + i;
+  }
+  // |c| does not occur in the range.
+  return 0;
+}
+
+char16* c16memmove(char16* s1, const char16* s2, size_t n) {  // |n| in char16s
+  return static_cast<char16*>(memmove(s1, s2, sizeof(char16) * n));
+}
+
+char16* c16memcpy(char16* s1, const char16* s2, size_t n) {  // |n| in char16s
+  return static_cast<char16*>(memcpy(s1, s2, sizeof(char16) * n));
+}
+
+char16* c16memset(char16* s, char16 c, size_t n) {
+  // Store |c| into each of the first |n| code units, then return the start
+  // of the buffer.
+  for (size_t i = 0; i < n; ++i) {
+    s[i] = c;
+  }
+  return s;
+}
+
+}  // namespace base
+
+template class std::basic_string<char16, base::string16_char_traits>;
+
+#endif  // WIN32

diff --git a/googleurl/base/string16.h b/googleurl/base/string16.h
new file mode 100644
index 0000000..deedaf6
--- /dev/null
+++ b/googleurl/base/string16.h

@@ -0,0 +1,193 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef BASE_STRING16_H_
+#define BASE_STRING16_H_
+
+// WHAT:
+// A version of std::basic_string that provides 2-byte characters even when
+// wchar_t is not implemented as a 2-byte type. You can access this class as
+// string16. We also define char16, which string16 is based upon.
+//
+// WHY:
+// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
+// data. Plenty of existing code operates on strings encoded as UTF-16.
+//
+// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
+// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
+// at run time, because it calls some functions (like wcslen) that come from
+// the system's native C library -- which was built with a 4-byte wchar_t!
+// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
+// entirely improper on those systems where the encoding of wchar_t is defined
+// as UTF-32.
+//
+// Here, we define string16, which is similar to std::wstring but replaces all
+// libc functions with custom, 2-byte-char compatible routines. It is capable
+// of carrying UTF-16-encoded data.
+
+#include <stdio.h>
+#include <string>
+
+#include "base/basictypes.h"
+
+#ifdef WIN32
+
+typedef wchar_t char16;
+typedef std::wstring string16;
+
+#else  // !WIN32
+
+typedef uint16 char16;
+
+namespace base {
+
+// char16 versions of the functions required by string16_char_traits; these
+// are based on the wide character functions of similar names ("w" or "wcs"
+// instead of "c16").
+int c16memcmp(const char16* s1, const char16* s2, size_t n);
+size_t c16len(const char16* s);
+const char16* c16memchr(const char16* s, char16 c, size_t n);
+char16* c16memmove(char16* s1, const char16* s2, size_t n);
+char16* c16memcpy(char16* s1, const char16* s2, size_t n);
+char16* c16memset(char16* s, char16 c, size_t n);
+
+struct string16_char_traits {  // Character traits used to build string16.
+  typedef char16 char_type;
+  typedef int int_type;
+  // Stream offset/state/position types named by the traits class.
+  typedef std::streamoff off_type;
+  typedef mbstate_t state_type;
+  typedef std::fpos<state_type> pos_type;
+  // Assigns the single code unit |c2| to |c1|.
+  static void assign(char_type& c1, const char_type& c2) {
+    c1 = c2;
+  }
+  // Equality and ordering of individual code units.
+  static bool eq(const char_type& c1, const char_type& c2) {
+    return c1 == c2;
+  }
+  static bool lt(const char_type& c1, const char_type& c2) {
+    return c1 < c2;
+  }
+  // Three-way comparison of the first |n| code units of |s1| and |s2|.
+  static int compare(const char_type* s1, const char_type* s2, size_t n) {
+    return c16memcmp(s1, s2, n);
+  }
+  // Number of code units before the terminating zero in |s|.
+  static size_t length(const char_type* s) {
+    return c16len(s);
+  }
+  // First occurrence of |a| in the first |n| code units of |s|, or NULL.
+  static const char_type* find(const char_type* s, size_t n,
+                               const char_type& a) {
+    return c16memchr(s, a, n);
+  }
+  // Moves |n| code units; |n| is size_t so large counts are not narrowed.
+  static char_type* move(char_type* s1, const char_type* s2, size_t n) {
+    return c16memmove(s1, s2, n);
+  }
+  // Copies |n| code units from |s2| to |s1| via c16memcpy.
+  static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
+    return c16memcpy(s1, s2, n);
+  }
+  // Fills the first |n| code units of |s| with |a|.
+  static char_type* assign(char_type* s, size_t n, char_type a) {
+    return c16memset(s, a, n);
+  }
+  // Returns |c| unless it equals eof(), in which case returns 0.
+  static int_type not_eof(const int_type& c) {
+    return eq_int_type(c, eof()) ? 0 : c;
+  }
+  // Converts an int_type value back to a code unit.
+  static char_type to_char_type(const int_type& c) {
+    return char_type(c);
+  }
+  // Converts a code unit to int_type.
+  static int_type to_int_type(const char_type& c) {
+    return int_type(c);
+  }
+  // Equality of two int_type values.
+  static bool eq_int_type(const int_type& c1, const int_type& c2) {
+    return c1 == c2;
+  }
+  // The end-of-file marker: EOF converted to int_type.
+  static int_type eof() {
+    return static_cast<int_type>(EOF);
+  }
+};
+
+}  // namespace base
+
+// The string class will be explicitly instantiated only once, in string16.cc.
+//
+// std::basic_string<> in GNU libstdc++ contains a static data member,
+// _S_empty_rep_storage, to represent empty strings.  When an operation such
+// as assignment or destruction is performed on a string, causing its existing
+// data member to be invalidated, it must not be freed if this static data
+// member is being used.  Otherwise, it counts as an attempt to free static
+// (and not allocated) data, which is a memory error.
+//
+// Generally, due to C++ template magic, _S_empty_rep_storage will be marked
+// as a coalesced symbol, meaning that the linker will combine multiple
+// instances into a single one when generating output.
+//
+// If a string class is used by multiple shared libraries, a problem occurs.
+// Each library will get its own copy of _S_empty_rep_storage.  When strings
+// are passed across a library boundary for alteration or destruction, memory
+// errors will result.  GNU libstdc++ contains a configuration option,
+// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
+// disables the static data member optimization, but it's a good optimization
+// and non-STL code is generally at the mercy of the system's STL
+// configuration.  Fully-dynamic strings are not the default for GNU
+// libstdc++ itself or for the libstdc++ installations on the systems we care
+// about, such as Mac OS X and relevant flavors of Linux.
+//
+// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
+//
+// To avoid problems, string classes need to be explicitly instantiated only
+// once, in exactly one library.  All other string users see it via an "extern"
+// declaration.  This is precisely how GNU libstdc++ handles
+// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
+//
+// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
+// in which the linker does not fully coalesce symbols when dead code
+// stripping is enabled.  This bug causes the memory errors described above
+// to occur even when a std::basic_string<> does not cross shared library
+// boundaries, such as in statically-linked executables.
+//
+// TODO(mark): File this bug with Apple and update this note with a bug number.
+
+extern template class std::basic_string<char16, base::string16_char_traits>;
+
+typedef std::basic_string<char16, base::string16_char_traits> string16;
+
+extern std::ostream& operator<<(std::ostream& out, const string16& str);
+
+#endif  // !WIN32
+
+#endif  // BASE_STRING16_H_

diff --git a/googleurl/build/README.txt b/googleurl/build/README.txt
index 81a47dc..eab011a 100644
--- a/googleurl/build/README.txt
+++ b/googleurl/build/README.txt

@@ -1,2 +1,4 @@
 This directory includes solution and project files for compiling with
 Visual Studio 2005 on Windows.
+
+The base checkout directory must be named 'googleurl'.

diff --git a/googleurl/build/base.vcproj b/googleurl/build/base.vcproj
index 95be0ce..0e923cf 100644
--- a/googleurl/build/base.vcproj
+++ b/googleurl/build/base.vcproj

@@ -18,7 +18,7 @@
 		<Configuration
 			Name="Debug|Win32"
 			ConfigurationType="4"
-			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops"
+			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
@@ -69,8 +69,7 @@
 		<Configuration
 			Name="Release|Win32"
 			ConfigurationType="4"
-			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops"
-			WholeProgramOptimization="1"
+			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
@@ -135,15 +134,15 @@
 			>
 		</File>
 		<File
+			RelativePath="..\base\README.txt"
+			>
+		</File>
+		<File
 			RelativePath="..\base\scoped_ptr.h"
 			>
 		</File>
 		<File
-			RelativePath="..\base\string_util.cc"
-			>
-		</File>
-		<File
-			RelativePath="..\base\string_util.h"
+			RelativePath="..\base\string16.h"
 			>
 		</File>
 	</Files>

diff --git a/googleurl/build/common.vsprops b/googleurl/build/common.vsprops
new file mode 100644
index 0000000..ede28e9
--- /dev/null
+++ b/googleurl/build/common.vsprops

@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioPropertySheet
+	ProjectType="Visual C++"
+	Version="8.00"
+	Name="common"
+	OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+	IntermediateDirectory="$(SolutionDir)$(ConfigurationName)\obj\$(ProjectName)"
+	CharacterSet="1"
+	>
+	<Tool
+		Name="VCCLCompilerTool"
+		AdditionalIncludeDirectories="$(SolutionDir)..\..;$(SolutionDir).."
+		PreprocessorDefinitions="_WIN32_WINNT=0x0501;WINVER=0x0501;WIN32;_WINDOWS"
+		MinimalRebuild="false"
+		BufferSecurityCheck="true"
+		EnableFunctionLevelLinking="true"
+		WarningLevel="3"
+		WarnAsError="true"
+		Detect64BitPortabilityProblems="true"
+		DebugInformationFormat="3"
+	/>
+</VisualStudioPropertySheet>

diff --git a/googleurl/build/debug.vsprops b/googleurl/build/debug.vsprops
index 7762c65..d2aa43f 100644
--- a/googleurl/build/debug.vsprops
+++ b/googleurl/build/debug.vsprops

@@ -15,8 +15,4 @@
 		Name="VCLinkerTool"
 		LinkIncremental="2"
 	/>
-	<Tool
-		Name="VCResourceCompilerTool"
-		PreprocessorDefinitions="_DEBUG"
-	/>
 </VisualStudioPropertySheet>

diff --git a/googleurl/build/googleurl.sln b/googleurl/build/googleurl.sln
index 80db0e8..347810d 100644
--- a/googleurl/build/googleurl.sln
+++ b/googleurl/build/googleurl.sln

@@ -5,6 +5,12 @@
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "base", "base.vcproj", "{ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}"
 EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{D8E84C85-89D3-4B8D-9A3A-C44B63C3383A}"
+	ProjectSection(SolutionItems) = preProject
+		..\LICENSE.txt = ..\LICENSE.txt
+		..\README.txt = ..\README.txt
+	EndProjectSection
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32

diff --git a/googleurl/build/googleurl.vcproj b/googleurl/build/googleurl.vcproj
index 43ba87b..71b3123 100644
--- a/googleurl/build/googleurl.vcproj
+++ b/googleurl/build/googleurl.vcproj

@@ -18,7 +18,7 @@
 		<Configuration
 			Name="Debug|Win32"
 			ConfigurationType="4"
-			InheritedPropertySheets="$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)..\build\common.vsprops;$(SolutionDir)../third_party/icu36/build/using_icu.vsprops"
+			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
@@ -69,8 +69,7 @@
 		<Configuration
 			Name="Release|Win32"
 			ConfigurationType="4"
-			InheritedPropertySheets="$(SolutionDir)..\build\release.vsprops;$(SolutionDir)..\build\common.vsprops;$(SolutionDir)../third_party/icu36/build/using_icu.vsprops"
-			WholeProgramOptimization="1"
+			InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
@@ -131,6 +130,10 @@
 			>
 		</File>
 		<File
+			RelativePath=".\README.txt"
+			>
+		</File>
+		<File
 			RelativePath="..\src\url_canon.h"
 			>
 		</File>
@@ -171,6 +174,14 @@
 			>
 		</File>
 		<File
+			RelativePath="..\src\url_canon_ip.h"
+			>
+		</File>
+		<File
+			RelativePath="..\src\url_canon_mailtourl.cc"
+			>
+		</File>
+		<File
 			RelativePath="..\src\url_canon_path.cc"
 			>
 		</File>

diff --git a/googleurl/build/release.vsprops b/googleurl/build/release.vsprops
index 9575e54..2e59356 100644
--- a/googleurl/build/release.vsprops
+++ b/googleurl/build/release.vsprops

@@ -6,16 +6,18 @@
 	>
 	<Tool
 		Name="VCCLCompilerTool"
+		WholeProgramOptimization="true"
 		PreprocessorDefinitions="NDEBUG"
 	/>
 	<Tool
+		Name="VCLibrarianTool"
+		AdditionalOptions="/ltcg"
+	/>
+	<Tool
 		Name="VCLinkerTool"
 		LinkIncremental="1"
 		OptimizeReferences="2"
 		EnableCOMDATFolding="2"
-	/>
-	<Tool
-		Name="VCResourceCompilerTool"
-		PreprocessorDefinitions="NDEBUG"
+		LinkTimeCodeGeneration="1"
 	/>
 </VisualStudioPropertySheet>

diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc
index a89ea22..a0bfd26 100644
--- a/googleurl/src/gurl.cc
+++ b/googleurl/src/gurl.cc

@@ -29,8 +29,12 @@
 
 #ifdef WIN32
 #include <windows.h>
+#else
+#include <pthread.h>
 #endif
 
+#include <algorithm>
+
 #include "googleurl/src/gurl.h"
 
 #include "base/logging.h"
@@ -42,8 +46,8 @@
 // External template that can handle initialization of either character type.
 // The input spec is given, and the canonical version will be placed in
 // |*canonical|, along with the parsing of the canonical spec in |*parsed|.
-template<typename CHAR>
-bool InitCanonical(const std::basic_string<CHAR>& input_spec,
+template<typename STR>
+bool InitCanonical(const STR& input_spec,
                    std::string* canonical,
                    url_parse::Parsed* parsed) {
   // Reserve enough room in the output for the input, plus some extra so that
@@ -52,18 +56,21 @@
   url_canon::StdStringCanonOutput output(canonical);
   bool success = url_util::Canonicalize(
       input_spec.data(), static_cast<int>(input_spec.length()),
-      &output, parsed);
+      NULL, &output, parsed);
 
   output.Complete();  // Must be done before using string.
   return success;
 }
 
+static std::string* empty_string = NULL;
+static GURL* empty_gurl = NULL;
+
+#ifdef WIN32
+
 // Returns a static reference to an empty string for returning a reference
 // when there is no underlying string.
 const std::string& EmptyStringForGURL() {
-#ifdef WIN32
   // Avoid static object construction/destruction on startup/shutdown.
-  static std::string* empty_string = NULL;
   if (!empty_string) {
     // Create the string. Be careful that we don't break in the case that this
     // is being called from multiple threads. Statics are not threadsafe.
@@ -76,13 +83,25 @@
     }
   }
   return *empty_string;
-#else
-  // TODO(brettw) Write a threadsafe Unix version!
-  static std::string empty_string;
-  return empty_string;
-#endif
 }
 
+#else
+
+static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
+static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
+
+void EmptyStringForGURLOnce(void) {
+  empty_string = new std::string;
+}
+
+const std::string& EmptyStringForGURL() {
+  // Avoid static object construction/destruction on startup/shutdown.
+  pthread_once(&empty_string_once, EmptyStringForGURLOnce);
+  return *empty_string;
+}
+
+#endif  // WIN32
+
 } // namespace
 
 GURL::GURL() : is_valid_(false) {
@@ -98,31 +117,34 @@
   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
 }
 
-GURL::GURL(const UTF16String& url_string) {
+GURL::GURL(const string16& url_string) {
   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
 }
 
-GURL::GURL(const char* canonical_spec, int canonical_spec_len,
+GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
            const url_parse::Parsed& parsed, bool is_valid)
     : spec_(canonical_spec, canonical_spec_len),
       is_valid_(is_valid),
       parsed_(parsed) {
 #ifndef NDEBUG
-  // For testing purposes, check that the parsed canonical URL is
-  // identical to what we would have produced.
-  GURL test_url(spec_);
+  // For testing purposes, check that the parsed canonical URL is identical to
+  // what we would have produced. Skip the check for invalid URLs: they have no
+  // meaning and we can't always canonicalize them reproducibly.
+  if (is_valid_) {
+    GURL test_url(spec_);
 
-  DCHECK(test_url.is_valid_ == is_valid_);
-  DCHECK(test_url.spec_ == spec_);
+    DCHECK(test_url.is_valid_ == is_valid_);
+    DCHECK(test_url.spec_ == spec_);
 
-  DCHECK(test_url.parsed_.scheme == parsed_.scheme);
-  DCHECK(test_url.parsed_.username == parsed_.username);
-  DCHECK(test_url.parsed_.password == parsed_.password);
-  DCHECK(test_url.parsed_.host == parsed_.host);
-  DCHECK(test_url.parsed_.port == parsed_.port);
-  DCHECK(test_url.parsed_.path == parsed_.path);
-  DCHECK(test_url.parsed_.query == parsed_.query);
-  DCHECK(test_url.parsed_.ref == parsed_.ref);
+    DCHECK(test_url.parsed_.scheme == parsed_.scheme);
+    DCHECK(test_url.parsed_.username == parsed_.username);
+    DCHECK(test_url.parsed_.password == parsed_.password);
+    DCHECK(test_url.parsed_.host == parsed_.host);
+    DCHECK(test_url.parsed_.port == parsed_.port);
+    DCHECK(test_url.parsed_.path == parsed_.path);
+    DCHECK(test_url.parsed_.query == parsed_.query);
+    DCHECK(test_url.parsed_.ref == parsed_.ref);
+  }
 #endif
 }
 
@@ -134,8 +156,17 @@
   return EmptyStringForGURL();
 }
 
-// Note: code duplicated below (it's inconvenient to use a template here).
 GURL GURL::Resolve(const std::string& relative) const {
+  return ResolveWithCharsetConverter(relative, NULL);
+}
+GURL GURL::Resolve(const string16& relative) const {
+  return ResolveWithCharsetConverter(relative, NULL);
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::ResolveWithCharsetConverter(
+    const std::string& relative,
+    url_canon::CharsetConverter* charset_converter) const {
   // Not allowed for invalid URLs.
   if (!is_valid_)
     return GURL();
@@ -147,9 +178,10 @@
   result.spec_.reserve(spec_.size() + 32);
   url_canon::StdStringCanonOutput output(&result.spec_);
 
-  if (!url_util::ResolveRelative(spec_.data(), parsed_, relative.data(),
-                                 static_cast<int>(relative.length()),
-                                 &output, &result.parsed_)) {
+  if (!url_util::ResolveRelative(
+          spec_.data(), static_cast<int>(spec_.length()), parsed_,
+          relative.data(), static_cast<int>(relative.length()),
+          charset_converter, &output, &result.parsed_)) {
     // Error resolving, return an empty URL.
     return GURL();
   }
@@ -160,7 +192,9 @@
 }
 
 // Note: code duplicated above (it's inconvenient to use a template here).
-GURL GURL::Resolve(const UTF16String& relative) const {
+GURL GURL::ResolveWithCharsetConverter(
+    const string16& relative,
+    url_canon::CharsetConverter* charset_converter) const {
   // Not allowed for invalid URLs.
   if (!is_valid_)
     return GURL();
@@ -172,9 +206,10 @@
   result.spec_.reserve(spec_.size() + 32);
   url_canon::StdStringCanonOutput output(&result.spec_);
 
-  if (!url_util::ResolveRelative(spec_.data(), parsed_, relative.data(),
-                                 static_cast<int>(relative.length()),
-                                 &output, &result.parsed_)) {
+  if (!url_util::ResolveRelative(
+          spec_.data(), static_cast<int>(spec_.length()), parsed_,
+          relative.data(), static_cast<int>(relative.length()),
+          charset_converter, &output, &result.parsed_)) {
     // Error resolving, return an empty URL.
     return GURL();
   }
@@ -199,8 +234,8 @@
   url_canon::StdStringCanonOutput output(&result.spec_);
 
   result.is_valid_ = url_util::ReplaceComponents(
-      spec_.data(), parsed_, replacements,
-      &output, &result.parsed_);
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
 
   output.Complete();
   return result;
@@ -208,7 +243,7 @@
 
 // Note: code duplicated above (it's inconvenient to use a template here).
 GURL GURL::ReplaceComponents(
-    const url_canon::Replacements<UTF16Char>& replacements) const {
+    const url_canon::Replacements<char16>& replacements) const {
   GURL result;
 
   // Not allowed for invalid URLs.
@@ -221,17 +256,33 @@
   url_canon::StdStringCanonOutput output(&result.spec_);
 
   result.is_valid_ = url_util::ReplaceComponents(
-      spec_.data(), parsed_, replacements,
-      &output, &result.parsed_);
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
 
   output.Complete();
   return result;
 }
 
+GURL GURL::GetOrigin() const {
+  // This doesn't make sense for invalid or nonstandard URLs, so return
+  // the empty URL.
+  if (!is_valid_ || !IsStandard())
+    return GURL();
+
+  url_canon::Replacements<char> replacements;
+  replacements.ClearUsername();
+  replacements.ClearPassword();
+  replacements.ClearPath();
+  replacements.ClearQuery();
+  replacements.ClearRef();
+
+  return ReplaceComponents(replacements);
+}
+
 GURL GURL::GetWithEmptyPath() const {
   // This doesn't make sense for invalid or nonstandard URLs, so return
   // the empty URL.
-  if (!is_valid_ || !SchemeIsStandard()) 
+  if (!is_valid_ || !IsStandard())
     return GURL();
 
   // We could optimize this since we know that the URL is canonical, and we are
@@ -252,6 +303,10 @@
   return other;
 }
 
+bool GURL::IsStandard() const {
+  return url_util::IsStandard(spec_.data(), parsed_.scheme);
+}
+
 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
   if (parsed_.scheme.len <= 0)
     return lower_ascii_scheme == NULL;
@@ -260,48 +315,26 @@
                                         lower_ascii_scheme);
 }
 
-bool GURL::SchemeIsStandard() const {
-  return url_util::IsStandardScheme(&spec_[parsed_.scheme.begin],
-                                    parsed_.scheme.len);
-}
-
 int GURL::IntPort() const {
   if (parsed_.port.is_nonempty())
     return url_parse::ParsePort(spec_.data(), parsed_.port);
   return url_parse::PORT_UNSPECIFIED;
 }
 
+int GURL::EffectiveIntPort() const {
+  int int_port = IntPort();
+  if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
+    return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
+                                           parsed_.scheme.len);
+  return int_port;
+}
+
 std::string GURL::ExtractFileName() const {
   url_parse::Component file_component;
   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
   return ComponentString(file_component);
 }
 
-void GURL::ExtractQuery(QueryMap* r) const {
-  url_parse::Component q(parsed_.query);
-  url_parse::Component key;
-  url_parse::Component value;
-
-  while (q.is_nonempty()) {
-    key = url_parse::Component();
-    value = url_parse::Component();
-
-    url_parse::ExtractQueryFragment(spec_.data(),
-                                    &q,
-                                    &key,
-                                    &value);
-
-    // Something may or may not have been found. For example, the key and value
-    // will both be empty for the query string "&".
-    if (key.len > 0) {
-      if (value.len > 0)
-        (*r)[ComponentString(key)] = ComponentString(value);
-      else
-        (*r)[ComponentString(key)] = std::string("");
-    }
-  }
-}
-
 std::string GURL::PathForRequest() const {
   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
   if (parsed_.ref.len >= 0) {
@@ -315,22 +348,31 @@
   return std::string(spec_, parsed_.path.begin);
 }
 
+std::string GURL::HostNoBrackets() const {
+  // If host looks like an IPv6 literal, strip the square brackets.
+  url_parse::Component h(parsed_.host);
+  if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
+    h.begin++;
+    h.len -= 2;
+  }
+  return ComponentString(h);
+}
+
 bool GURL::HostIsIPAddress() const {
   if (!is_valid_ || spec_.empty())
      return false;
 
   url_canon::RawCanonOutputT<char, 128> ignored_output;
-  url_parse::Component ignored_component;
-  return url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
-                                          &ignored_output, &ignored_component);
+  url_canon::CanonHostInfo host_info;
+  url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
+                                   &ignored_output, &host_info);
+  return host_info.IsIPAddress();
 }
 
 #ifdef WIN32
 
-// static
 const GURL& GURL::EmptyGURL() {
   // Avoid static object construction/destruction on startup/shutdown.
-  static GURL* empty_gurl = NULL;
   if (!empty_gurl) {
     // Create the string. Be careful that we don't break in the case that this
     // is being called from multiple threads.
@@ -345,6 +387,18 @@
   return *empty_gurl;
 }
 
+#else
+
+void EmptyGURLOnce(void) {
+  empty_gurl = new GURL;
+}
+
+const GURL& GURL::EmptyGURL() {
+  // Avoid static object construction/destruction on startup/shutdown.
+  pthread_once(&empty_gurl_once, EmptyGURLOnce);
+  return *empty_gurl;
+}
+
 #endif  // WIN32
 
 bool GURL::DomainIs(const char* lower_ascii_domain,
@@ -354,7 +408,7 @@
     return false;
 
   // Check whether the host name is end with a dot. If yes, treat it
-  // the same as no-dot unless the input comparison domain is end 
+  // the same as no-dot unless the input comparison domain is end
   // with dot.
   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
   int host_len = parsed_.host.len;
@@ -377,7 +431,7 @@
                                       lower_ascii_domain + domain_len))
     return false;
 
-  // Check whether host has right domain start with dot, make sure we got 
+  // Check whether host has right domain start with dot, make sure we got
   // right domain range. For example www.google.com has domain
   // "google.com" but www.iamnotgoogle.com does not.
   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
@@ -387,3 +441,9 @@
   return true;
 }
 
+void GURL::Swap(GURL* other) {
+  spec_.swap(other->spec_);
+  std::swap(is_valid_, other->is_valid_);
+  std::swap(parsed_, other->parsed_);
+}
+

diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h
index d3ef846..29fea81 100644
--- a/googleurl/src/gurl.h
+++ b/googleurl/src/gurl.h

@@ -30,27 +30,26 @@
 #ifndef GOOGLEURL_SRC_GURL_H__
 #define GOOGLEURL_SRC_GURL_H__
 
+#include <iostream>
 #include <string>
-#include <map>
 
+#include "base/string16.h"
 #include "googleurl/src/url_canon.h"
 #include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_common.h"
 #include "googleurl/src/url_parse.h"
 
 class GURL {
  public:
-  typedef url_canon::UTF16Char UTF16Char;
-  typedef url_canon::UTF16String UTF16String;
-
-  typedef url_canon::StdStringReplacements<char> Replacements;
-  typedef url_canon::StdStringReplacements<UTF16Char> ReplacementsW;
+  typedef url_canon::StdStringReplacements<std::string> Replacements;
+  typedef url_canon::StdStringReplacements<string16> ReplacementsW;
 
   // Creates an empty, invalid URL.
-  GURL();
+  GURL_API GURL();
 
   // Copy construction is relatively inexpensive, with most of the time going
   // to reallocating the string. It does not re-parse.
-  GURL(const GURL& other);
+  GURL_API GURL(const GURL& other);
 
   // The narrow version requires the input be UTF-8. Invalid UTF-8 input will
   // result in an invalid URL.
@@ -59,14 +58,16 @@
   // encode the query parameters. It is probably sufficient for the narrow
   // version to assume the query parameter encoding should be the same as the
   // input encoding.
-  explicit GURL(const std::string& url_string /*, output_param_encoding*/);
-  explicit GURL(const UTF16String& url_string /*, output_param_encoding*/);
+  GURL_API explicit GURL(const std::string& url_string
+                         /*, output_param_encoding*/);
+  GURL_API explicit GURL(const string16& url_string
+                         /*, output_param_encoding*/);
 
   // Constructor for URLs that have already been parsed and canonicalized. This
   // is used for conversions from KURL, for example. The caller must supply all
   // information associated with the URL, which must be correct and consistent.
-  GURL(const char* canonical_spec, int canonical_spec_len,
-       const url_parse::Parsed& parsed, bool is_valid);
+  GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len,
+                const url_parse::Parsed& parsed, bool is_valid);
 
   // Returns true when this object represents a valid parsed URL. When not
   // valid, other functions will still succeed, but you will not get canonical
@@ -98,7 +99,7 @@
   // Used invalid_spec() below to get the unusable spec of an invalid URL. This
   // separation is designed to prevent errors that may cause security problems
   // that could result from the mistaken use of an invalid URL.
-  const std::string& spec() const;
+  GURL_API const std::string& spec() const;
 
   // Returns the potentially invalid spec for a the URL. This spec MUST NOT be
   // modified or sent over the network. It is designed to be displayed in error
@@ -150,8 +151,22 @@
   //
   // It is an error to resolve a URL relative to an invalid URL. The result
   // will be the empty URL.
-  GURL Resolve(const std::string& relative) const;
-  GURL Resolve(const UTF16String& relative) const;
+  GURL_API GURL Resolve(const std::string& relative) const;
+  GURL_API GURL Resolve(const string16& relative) const;
+
+  // Like Resolve() above but takes a character set encoder which will be used
+  // for any query text specified in the input. The charset converter parameter
+  // may be NULL, in which case it will be treated as UTF-8.
+  //
+  // TODO(brettw): These should be replaced with versions that take something
+  // more friendly than a raw CharsetConverter (maybe like an ICU character set
+  // name).
+  GURL_API GURL ResolveWithCharsetConverter(
+      const std::string& relative,
+      url_canon::CharsetConverter* charset_converter) const;
+  GURL_API GURL ResolveWithCharsetConverter(
+      const string16& relative,
+      url_canon::CharsetConverter* charset_converter) const;
 
   // Creates a new GURL by replacing the current URL's components with the
   // supplied versions. See the Replacements class in url_canon.h for more.
@@ -164,10 +179,10 @@
   //
   // Note that we use the more general url_canon::Replacements type to give
   // callers extra flexibility rather than our override.
-  GURL ReplaceComponents(
+  GURL_API GURL ReplaceComponents(
       const url_canon::Replacements<char>& replacements) const;
-  GURL ReplaceComponents(
-      const url_canon::Replacements<UTF16Char>& replacements) const;
+  GURL_API GURL ReplaceComponents(
+      const url_canon::Replacements<char16>& replacements) const;
 
   // A helper function that is equivalent to replacing the path with a slash
   // and clearing out everything after that. We sometimes need to know just the
@@ -178,18 +193,30 @@
   //
   // It is an error to get an empty path on an invalid URL. The result
   // will be the empty URL.
-  GURL GetWithEmptyPath() const;
+  GURL_API GURL GetWithEmptyPath() const;
+
+  // A helper function to return a GURL containing just the scheme, host,
+  // and port from a URL. Equivalent to clearing any username and password,
+  // replacing the path with a slash, and clearing everything after that. If
+  // this URL is not a standard URL, then the result will be an empty,
+  // invalid GURL. If the URL has neither username nor password, this
+  // degenerates to GetWithEmptyPath().
+  //
+  // It is an error to get the origin of an invalid URL. The result
+  // will be the empty URL.
+  GURL_API GURL GetOrigin() const;
+
+  // Returns true if the scheme for the current URL is a known "standard"
+  // scheme. Standard schemes have an authority and a path section. This
+  // includes file:, which some callers may want to filter out explicitly by
+  // calling SchemeIsFile.
+  GURL_API bool IsStandard() const;
 
   // Returns true if the given parameter (should be lower-case ASCII to match
   // the canonicalized scheme) is the scheme for this URL. This call is more
   // efficient than getting the scheme and comparing it because no copies or
   // object constructions are done.
-  bool SchemeIs(const char* lower_ascii_scheme) const;
-
-  // Returns true if the scheme for the current URL is a "standard" scheme that
-  // has an authority and a path section. This includes file:, which some
-  // callers may want to filter out explicitly.
-  bool SchemeIsStandard() const;
+  GURL_API bool SchemeIs(const char* lower_ascii_scheme) const;
 
   // We often need to know if this is a file URL. File URLs are "standard", but
   // are often treated separately by some programs.
@@ -205,7 +232,7 @@
   // Returns true if the hostname is an IP address. Note: this function isn't
   // as cheap as a simple getter because it re-parses the hostname to verify.
   // This currently identifies only IPv4 addresses (bug 822685).
-  bool HostIsIPAddress() const;
+  GURL_API bool HostIsIPAddress() const;
 
   // Getters for various components of the URL. The returned string will be
   // empty if the component is empty or is not present.
@@ -218,6 +245,9 @@
   std::string password() const {
     return ComponentString(parsed_.password);
   }
+  // Note that this may be a hostname, an IPv4 address, or an IPv6 literal
+  // surrounded by square brackets, like "[2001:db8::1]".  To exclude these
+  // brackets, use HostNoBrackets() below.
   std::string host() const {
     return ComponentString(parsed_.host);
   }
@@ -268,31 +298,36 @@
 
   // Returns a parsed version of the port. Can also be any of the special
   // values defined in Parsed for ExtractPort.
-  int IntPort() const;
+  GURL_API int IntPort() const;
+
+  // Returns the port number of the url, or the default port number.
+  // If the scheme has no concept of port (or unknown default) returns
+  // PORT_UNSPECIFIED.
+  GURL_API int EffectiveIntPort() const;
 
   // Extracts the filename portion of the path and returns it. The filename
   // is everything after the last slash in the path. This may be empty.
-  std::string ExtractFileName() const;
-
-  // Decompose the query component into a map of key value pairs.
-  typedef std::map<std::string, std::string> QueryMap;
-  void ExtractQuery(QueryMap* r) const;
+  GURL_API std::string ExtractFileName() const;
 
   // Returns the path that should be sent to the server. This is the path,
   // parameter, and query portions of the URL. It is guaranteed to be ASCII.
-  std::string PathForRequest() const;
+  GURL_API std::string PathForRequest() const;
+
+  // Returns the host, excluding the square brackets surrounding IPv6 address
+  // literals.  This can be useful for passing to getaddrinfo().
+  GURL_API std::string HostNoBrackets() const;
 
   // Returns true if this URL's host matches or is in the same domain as
   // the given input string. For example if this URL was "www.google.com",
   // this would match "com", "google.com", and "www.google.com
   // (input domain should be lower-case ASCII to match the canonicalized
-  // scheme). This call is more efficient than getting the host and check 
-  // whether host has the specific domain or not because no copies or 
+  // scheme). This call is more efficient than getting the host and check
+  // whether host has the specific domain or not because no copies or
   // object constructions are done.
   //
   // If function DomainIs has parameter domain_len, which means the parameter
   // lower_ascii_domain does not gurantee to terminate with NULL character.
-  bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+  GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
 
   // If function DomainIs only has parameter lower_ascii_domain, which means
   // domain string should be terminate with NULL character.
@@ -301,12 +336,14 @@
                     static_cast<int>(strlen(lower_ascii_domain)));
   }
 
-#ifdef WIN32  // Currently defined only for Windows.
+  // Swaps the contents of this GURL object with the argument without doing
+  // any memory allocations.
+  GURL_API void Swap(GURL* other);
+
   // Returns a reference to a singleton empty GURL. This object is for callers
   // who return references but don't have anything to return in some cases.
   // This function may be called from any thread.
-  static const GURL& EmptyGURL();
-#endif
+  GURL_API static const GURL& EmptyGURL();
 
  private:
   // Returns the substring of the input identified by the given component.
@@ -330,4 +367,9 @@
   // TODO bug 684583: Add encoding for query params.
 };
 
+// Stream operator so GURL can be used in assertion statements.
+inline std::ostream& operator<<(std::ostream& out, const GURL& url) {
+  return out << url.possibly_invalid_spec();
+}
+
 #endif  // GOOGLEURL_SRC_GURL_H__

diff --git a/googleurl/src/gurl_test_main.cc b/googleurl/src/gurl_test_main.cc
new file mode 100644
index 0000000..9a7c9f4
--- /dev/null
+++ b/googleurl/src/gurl_test_main.cc

@@ -0,0 +1,97 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "build/build_config.h"
+
+#if defined(OS_WIN)
+#include <windows.h>
+#endif
+
+#include <string>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "unicode/putil.h"
+#include "unicode/udata.h"
+
+#define ICU_UTIL_DATA_FILE   0  // Packaged data file (Linux branch below).
+#define ICU_UTIL_DATA_SHARED 1  // Data in a shared library (Windows).
+#define ICU_UTIL_DATA_STATIC 2  // Data linked into the binary (Mac).
+#ifndef ICU_UTIL_DATA_IMPL
+
+#if defined(OS_WIN)
+#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_SHARED
+#elif defined(OS_MACOSX)
+#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_STATIC
+#elif defined(OS_LINUX)
+#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_FILE
+#endif
+
+#endif  // ICU_UTIL_DATA_IMPL
+
+#if defined(OS_WIN)
+#define ICU_UTIL_DATA_SYMBOL "icudt" U_ICU_VERSION_SHORT "_dat"
+#define ICU_UTIL_DATA_SHARED_MODULE_NAME "icudt" U_ICU_VERSION_SHORT ".dll"
+#endif
+
+bool InitializeICU() {
+#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_SHARED)
+  // We expect to find the ICU data module alongside the current module.
+  // Because the module name is ASCII-only, "A" API should be safe.
+  HMODULE module = LoadLibraryA(ICU_UTIL_DATA_SHARED_MODULE_NAME);
+  if (!module)
+    return false;
+
+  FARPROC addr = GetProcAddress(module, ICU_UTIL_DATA_SYMBOL);
+  if (!addr)
+    return false;
+
+  UErrorCode err = U_ZERO_ERROR;
+  udata_setCommonData(reinterpret_cast<void*>(addr), &err);
+  return err == U_ZERO_ERROR;
+#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC)
+  // Mac bundles the ICU data in.
+  return true;
+#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
+  // We expect to find the ICU data module alongside the current module.
+  u_setDataDirectory(".");
+  // Only look for the packaged data file;
+  // the default behavior is to look for individual files.
+  UErrorCode err = U_ZERO_ERROR;
+  udata_setFileAccess(UDATA_ONLY_PACKAGES, &err);
+  return err == U_ZERO_ERROR;
+#endif
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  InitializeICU();
+
+  return RUN_ALL_TESTS();
+}

diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc
index a3ce1f6..b548cc2 100644
--- a/googleurl/src/gurl_unittest.cc
+++ b/googleurl/src/gurl_unittest.cc

@@ -31,8 +31,38 @@
   }
 }
 
+// Returns the canonicalized string for the given URL string for the
+// GURLTest.Types test.
+std::string TypesTestCase(const char* src) {
+  GURL gurl(src);
+  return gurl.possibly_invalid_spec();
+}
+
 }  // namespace
 
+// Different types of URLs should be handled differently by url_util, and
+// handed off to different canonicalizers.
+TEST(GURLTest, Types) {
+  // URLs with unknown schemes should be treated as path URLs, even when they
+  // have things like "://".
+  EXPECT_EQ("something:///HOSTNAME.com/",
+            TypesTestCase("something:///HOSTNAME.com/"));
+
+  // In the reverse, known schemes should always trigger standard URL handling.
+  EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
+  EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
+  EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
+  EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
+
+#ifdef WIN32
+  // URLs that look like absolute Windows drive specs.
+  EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
+  EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
+  EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
+  EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
+#endif
+}
+
 // Test the basic creation and querying of components in a GURL. We assume
 // the parser is already tested and works, so we are mostly interested if the
 // object does the right thing with the results.
@@ -137,24 +167,44 @@
     {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
     {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
     {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
+    // Unknown schemes are not standard.
     {"data:blahblah", "http://google.com/", true, "http://google.com/"},
     {"data:blahblah", "http:google.com", true, "http://google.com/"},
-    {"data:blahblah", "file.html", false, ""},
+    {"data:/blahblah", "file.html", false, ""},
   };
 
-  for (int i = 0; i < ARRAYSIZE(resolve_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(resolve_cases); i++) {
     // 8-bit code path.
     GURL input(resolve_cases[i].base);
     GURL output = input.Resolve(resolve_cases[i].relative);
-    EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid());
-    EXPECT_EQ(resolve_cases[i].expected, output.spec());
+    EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
 
     // Wide code path.
     GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base));
     GURL outputw =
         input.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative));
-    EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid());
-    EXPECT_EQ(resolve_cases[i].expected, outputw.spec());
+    EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
+  }
+}
+
+TEST(GURLTest, GetOrigin) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+  } cases[] = {
+    {"http://www.google.com", "http://www.google.com/"},
+    {"javascript:window.alert(\"hello,world\");", ""},
+    {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"},
+    {"http://user@www.google.com", "http://www.google.com/"},
+    {"http://:pass@www.google.com", "http://www.google.com/"},
+    {"http://:@www.google.com", "http://www.google.com/"},
+  };
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+    GURL url(cases[i].input);
+    GURL origin = url.GetOrigin();
+    EXPECT_EQ(cases[i].expected, origin.spec());
   }
 }
 
@@ -168,7 +218,7 @@
     {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
     GURL url(cases[i].input);
     GURL empty_path = url.GetWithEmptyPath();
     EXPECT_EQ(cases[i].expected, empty_path.spec());
@@ -199,7 +249,7 @@
 #endif
   };
 
-  for (int i = 0; i < ARRAYSIZE(replace_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(replace_cases); i++) {
     const ReplaceCase& cur = replace_cases[i];
     GURL url(cur.base);
     GURL::Replacements repl;
@@ -229,55 +279,50 @@
     {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
     GURL url(cases[i].input);
     std::string path_request = url.PathForRequest();
     EXPECT_EQ(cases[i].expected, path_request);
   }
 }
 
-TEST(GURLTest, ExtractQuery) {
-  GURL::QueryMap map;
-  GURL::QueryMap::iterator i;
+TEST(GURLTest, EffectiveIntPort) {
+  struct PortTest {
+    const char* spec;
+    int expected_int_port;
+  } port_tests[] = {
+    // http
+    {"http://www.google.com/", 80},
+    {"http://www.google.com:80/", 80},
+    {"http://www.google.com:443/", 443},
 
-  // empty URL
-  GURL a("http://www.google.com");
-  a.ExtractQuery(&map);
-  i = map.find("foo");
-  EXPECT_TRUE(i == map.end());
+    // https
+    {"https://www.google.com/", 443},
+    {"https://www.google.com:443/", 443},
+    {"https://www.google.com:80/", 80},
 
-  // simple case
-  GURL b("http://www.google.com?arg1=1&arg2=2&bar");
-  b.ExtractQuery(&map);
-  EXPECT_EQ(map["arg1"], "1");
-  EXPECT_EQ(map["arg2"], "2");
-  EXPECT_EQ(map["bar"], "");
+    // ftp
+    {"ftp://www.google.com/", 21},
+    {"ftp://www.google.com:21/", 21},
+    {"ftp://www.google.com:80/", 80},
 
-  // Various terminations
-  const char* urls[] = {
-    "http://www.google.com?foo=bar",
-    "http://www.google.com?foo=bar&",
-    "http://www.google.com?&foo=bar",
-    "http://www.google.com?blaz&foo=bar",
-    "http://www.google.com?blaz=&foo=bar"
+    // gopher
+    {"gopher://www.google.com/", 70},
+    {"gopher://www.google.com:70/", 70},
+    {"gopher://www.google.com:80/", 80},
+
+    // file - no port
+    {"file://www.google.com/", url_parse::PORT_UNSPECIFIED},
+    {"file://www.google.com:443/", url_parse::PORT_UNSPECIFIED},
+
+    // data - no port
+    {"data:www.google.com:90", url_parse::PORT_UNSPECIFIED},
+    {"data:www.google.com", url_parse::PORT_UNSPECIFIED},
   };
 
-  for (int i = 0; i < arraysize(urls); ++i) {
-    GURL c(urls[i]);
-    c.ExtractQuery(&map);
-    EXPECT_EQ(map["foo"], "bar");
-  }
-
-  const char* stress[] = {
-    "http://www.google.com?&=",
-    "http://www.google.com?&&=&",
-    "http://www.google.com?=",
-    "http://www.google.com?==",
-    "http://www.google.com?==&&&="
-  };
-  for (int i = 0; i < arraysize(stress); ++i) {
-    GURL d(stress[i]);
-    d.ExtractQuery(&map);
+  for (size_t i = 0; i < ARRAYSIZE(port_tests); i++) {
+    GURL url(port_tests[i].spec);
+    EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort());
   }
 }
 
@@ -290,16 +335,44 @@
     {"http://192.168.9.1/", true},
     {"http://192.168.9.1.2/", false},
     {"http://192.168.m.1/", false},
+    {"http://2001:db8::1/", false},
+    {"http://[2001:db8::1]/", true},
     {"", false},
     {"some random input!", false},
   };
 
-  for (int i = 0; i < ARRAYSIZE(ip_tests); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(ip_tests); i++) {
     GURL url(ip_tests[i].spec);
     EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress());
   }
 }
 
+TEST(GURLTest, HostNoBrackets) {
+  struct TestCase {
+    const char* input;
+    const char* expected_host;
+    const char* expected_plainhost;
+  } cases[] = {
+    {"http://www.google.com", "www.google.com", "www.google.com"},
+    {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"},
+    {"http://[::]/", "[::]", "::"},
+
+    // Don't require a valid URL, but don't crash either.
+    {"http://[]/", "[]", ""},
+    {"http://[x]/", "[x]", "x"},
+    {"http://[x/", "[x", "[x"},
+    {"http://x]/", "x]", "x]"},
+    {"http://[/", "[", "["},
+    {"http://]/", "]", "]"},
+    {"", "", ""},
+  };
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+    GURL url(cases[i].input);
+    EXPECT_EQ(cases[i].expected_host, url.host());
+    EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets());
+  }
+}
+
 TEST(GURLTest, DomainIs) {
   const char google_domain[] = "google.com";
 
@@ -333,3 +406,27 @@
   GURL url_10("http://www.iamnotgoogle.com../foo");
   EXPECT_FALSE(url_10.DomainIs(".com"));
 }
+
+// Newlines should be stripped from inputs.
+TEST(GURLTest, Newlines) {
+  // Constructor.
+  GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
+  EXPECT_EQ("http://www.google.com/asdf", url_1.spec());
+
+  // Relative path resolver.
+  GURL url_2 = url_1.Resolve(" \n /fo\to\r ");
+  EXPECT_EQ("http://www.google.com/foo", url_2.spec());
+
+  // Note that newlines are NOT stripped from ReplaceComponents.
+}
+
+TEST(GURLTest, IsStandard) {
+  GURL a("http:foo/bar");
+  EXPECT_TRUE(a.IsStandard());
+
+  GURL b("foo:bar/baz");
+  EXPECT_FALSE(b.IsStandard());
+
+  GURL c("foo://bar/baz");
+  EXPECT_FALSE(c.IsStandard());
+}

diff --git a/googleurl/src/url_canon.h b/googleurl/src/url_canon.h
index f33c74f..e2cfb55 100644
--- a/googleurl/src/url_canon.h
+++ b/googleurl/src/url_canon.h

@@ -32,13 +32,12 @@
 #include <memory.h>
 #include <stdlib.h>
 
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
 #include "googleurl/src/url_parse.h"
 
 namespace url_canon {
 
-typedef url_parse::UTF16Char UTF16Char;
-typedef url_parse::UTF16String UTF16String;
-
 // Canonicalizer output -------------------------------------------------------
 
 // Base class for the canonicalizer output, this maintains a buffer and
@@ -197,12 +196,12 @@
 // the templates so it can also be used internally if a wide buffer is
 // required.
 typedef CanonOutputT<char> CanonOutput;
-typedef CanonOutputT<UTF16Char> CanonOutputW;
+typedef CanonOutputT<char16> CanonOutputW;
 
 template<int fixed_capacity>
 class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
 template<int fixed_capacity>
-class RawCanonOutputW : public RawCanonOutputT<UTF16Char, fixed_capacity> {};
+class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
 
 // Character set converter ----------------------------------------------------
 //
@@ -228,11 +227,35 @@
   // decimal, (such as "&#20320;") with escaping of the ampersand, number
   // sign, and semicolon (in the previous example it would be
   // "%26%2320320%3B"). This rule is based on what IE does in this situation.
-  virtual void ConvertFromUTF16(const UTF16Char* input,
+  virtual void ConvertFromUTF16(const char16* input,
                                 int input_len,
                                 CanonOutput* output) = 0;
 };
 
+// Whitespace -----------------------------------------------------------------
+
+// Searches for whitespace that should be removed from the middle of URLs, and
+// removes it. Tabs and newlines are removed, but NOT spaces. Spaces are
+// preserved, which is what most browsers do. A pointer to the output will be
+// returned, and the length of that output will be in |*output_len|.
+//
+// This should be called before parsing if whitespace removal is desired (which
+// it normally is when you are canonicalizing).
+//
+// If no whitespace is removed, this function will not use the buffer and will
+// return a pointer to the input, to avoid the extra copy. If modification is
+// required, the given |buffer| will be used and the returned pointer will
+// point to the beginning of the buffer.
+//
+// Therefore, callers should not use the buffer, since it may actually be
+// empty; use the computed pointer and |*output_len| instead.
+GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
+                                         CanonOutputT<char>* buffer,
+                                         int* output_len);
+GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
+                                           CanonOutputT<char16>* buffer,
+                                           int* output_len);
+
 // IDN ------------------------------------------------------------------------
 
 // Converts the Unicode input representing a hostname to ASCII using IDN rules.
@@ -244,7 +267,7 @@
 // the length of the output will be set to the length of the new host name.
 //
 // On error, returns false. The output in this case is undefined.
-bool IDNToASCII(const UTF16Char* src, int src_len, CanonOutputW* output);
+GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
 
 // Piece-by-piece canonicalizers ----------------------------------------------
 //
@@ -270,14 +293,14 @@
 // URLs.
 //
 // The 8-bit version requires UTF-8 encoding.
-bool CanonicalizeScheme(const char* spec,
-                        const url_parse::Component& scheme,
-                        CanonOutput* output,
-                        url_parse::Component* out_scheme);
-bool CanonicalizeScheme(const UTF16Char* spec,
-                        const url_parse::Component& scheme,
-                        CanonOutput* output,
-                        url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char* spec,
+                                 const url_parse::Component& scheme,
+                                 CanonOutput* output,
+                                 url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char16* spec,
+                                 const url_parse::Component& scheme,
+                                 CanonOutput* output,
+                                 url_parse::Component* out_scheme);
 
 // User info: username/password. If present, this will add the delimiters so
 // the output will be "<username>:<password>@" or "<username>@". Empty
@@ -289,69 +312,122 @@
 // is legal as long as the two components don't overlap.
 //
 // The 8-bit version requires UTF-8 encoding.
-bool CanonicalizeUserInfo(const char* username_source,
-                          const url_parse::Component& username,
-                          const char* password_source,
-                          const url_parse::Component& password,
-                          CanonOutput* output,
-                          url_parse::Component* out_username,
-                          url_parse::Component* out_password);
-bool CanonicalizeUserInfo(const UTF16Char* username_source,
-                          const url_parse::Component& username,
-                          const UTF16Char* password_source,
-                          const url_parse::Component& password,
-                          CanonOutput* output,
-                          url_parse::Component* out_username,
-                          url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char* username_source,
+                                   const url_parse::Component& username,
+                                   const char* password_source,
+                                   const url_parse::Component& password,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_username,
+                                   url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char16* username_source,
+                                   const url_parse::Component& username,
+                                   const char16* password_source,
+                                   const url_parse::Component& password,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_username,
+                                   url_parse::Component* out_password);
+
+
+// This structure holds detailed state exported from the IP/Host canonicalizers.
+// Additional fields may be added as callers require them.
+struct CanonHostInfo {
+  CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
+
+  // Convenience function to test if family is an IP address.
+  bool IsIPAddress() const { return family == IPV4 || family == IPV6; }
+
+  // This field summarizes how the input was classified by the canonicalizer.
+  enum Family {
+    NEUTRAL,   // - Doesn't resemble an IP address.  As far as the IP
+               //   canonicalizer is concerned, it should be treated as a
+               //   hostname.
+    BROKEN,    // - Almost an IP, but was not canonicalized.  This could be an
+               //   IPv4 address where truncation occurred, or something
+               //   containing the special characters :[] which did not parse
+               //   as an IPv6 address.  Never attempt to connect to this
+               //   address, because it might actually succeed!
+    IPV4,      // - Successfully canonicalized as an IPv4 address.
+    IPV6,      // - Successfully canonicalized as an IPv6 address.
+  };
+  Family family;
+
+  // If |family| is IPV4, then this is the number of nonempty dot-separated
+  // components in the input text, from 1 to 4.  If |family| is not IPV4,
+  // this value is undefined.
+  int num_ipv4_components;
+
+  // Location of host within the canonicalized output.
+  // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
+  // CanonicalizeHostVerbose() always sets it.
+  url_parse::Component out_host;
+};
+
 
 // Host.
 //
-// The 8-bit version requires UTF-8 encoding.
-bool CanonicalizeHost(const char* spec,
-                      const url_parse::Component& host,
-                      CanonOutput* output,
-                      url_parse::Component* out_host);
-bool CanonicalizeHost(const UTF16Char* spec,
-                      const url_parse::Component& host,
-                      CanonOutput* output,
-                      url_parse::Component* out_host);
+// The 8-bit version requires UTF-8 encoding.  Use this version when you only
+// need to know whether canonicalization succeeded.
+GURL_API bool CanonicalizeHost(const char* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               url_parse::Component* out_host);
+GURL_API bool CanonicalizeHost(const char16* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               url_parse::Component* out_host);
+
+// Extended version of CanonicalizeHost, which returns additional information.
+// Use this when you need to know whether the hostname was an IP address.
+// A successful return is indicated by host_info->family != BROKEN.  See the
+// definition of CanonHostInfo above for details.
+GURL_API void CanonicalizeHostVerbose(const char* spec,
+                                      const url_parse::Component& host,
+                                      CanonOutput* output,
+                                      CanonHostInfo* host_info);
+GURL_API void CanonicalizeHostVerbose(const char16* spec,
+                                      const url_parse::Component& host,
+                                      CanonOutput* output,
+                                      CanonHostInfo* host_info);
 
 
 // IP addresses.
 //
-// Tries to interpret the given host name as an IP address. If it is an IP
-// address, it will canonicalize it as such, appending it to |output| and
-// identifying the added regions in |*out_host|, and will return true. If it
-// is not an IP address, it will do nothing and will return false. This means
-// that the host name should be treated as a non-IP address and resolved using
-// DNS like most names.
+// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
+// an IP address, it will canonicalize it as such, appending it to |output|.
+// Additional status information is returned via the |*host_info| parameter.
+// See the definition of CanonHostInfo above for details.
 //
 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that
 // the input is unescaped and name-prepped, etc. It should not normally be
-// necessary or wise to call this directly, other than to check if a given
-// canonical hostname is an IP address.
-bool CanonicalizeIPAddress(const char* spec,
-                           const url_parse::Component& host,
-                           CanonOutput* output,
-                           url_parse::Component* out_host);
-bool CanonicalizeIPAddress(const UTF16Char* spec,
-                           const url_parse::Component& host,
-                           CanonOutput* output,
-                           url_parse::Component* out_host);
+// necessary or wise to call this directly.
+GURL_API void CanonicalizeIPAddress(const char* spec,
+                                    const url_parse::Component& host,
+                                    CanonOutput* output,
+                                    CanonHostInfo* host_info);
+GURL_API void CanonicalizeIPAddress(const char16* spec,
+                                    const url_parse::Component& host,
+                                    CanonOutput* output,
+                                    CanonHostInfo* host_info);
 
 // Port: this function will add the colon for the port if a port is present.
+// The caller can pass url_parse::PORT_UNSPECIFIED as the
+// default_port_for_scheme argument if there is no default port.
 //
 // The 8-bit version requires UTF-8 encoding.
-bool CanonicalizePort(const char* spec,
-                      const url_parse::Component& port,
-                      int default_port_for_scheme,
-                      CanonOutput* output,
-                      url_parse::Component* out_port);
-bool CanonicalizePort(const UTF16Char* spec,
-                      const url_parse::Component& port,
-                      int default_port_for_scheme,
-                      CanonOutput* output,
-                      url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char* spec,
+                               const url_parse::Component& port,
+                               int default_port_for_scheme,
+                               CanonOutput* output,
+                               url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char16* spec,
+                               const url_parse::Component& port,
+                               int default_port_for_scheme,
+                               CanonOutput* output,
+                               url_parse::Component* out_port);
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
 
 // Path. If the input does not begin in a slash (including if the input is
 // empty), we'll prepend a slash to the path to make it canonical.
@@ -362,14 +438,14 @@
 // an issue. Somebody giving us an 8-bit path is responsible for generating
 // the path that the server expects (we'll escape high-bit characters), so
 // if something is invalid, it's their problem.
-bool CanonicalizePath(const char* spec,
-                      const url_parse::Component& path,
-                      CanonOutput* output,
-                      url_parse::Component* out_path);
-bool CanonicalizePath(const UTF16Char* spec,
-                      const url_parse::Component& path,
-                      CanonOutput* output,
-                      url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char* spec,
+                               const url_parse::Component& path,
+                               CanonOutput* output,
+                               url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char16* spec,
+                               const url_parse::Component& path,
+                               CanonOutput* output,
+                               url_parse::Component* out_path);
 
 // Canonicalizes the input as a file path. This is like CanonicalizePath except
 // that it also handles Windows drive specs. For example, the path can begin
@@ -377,14 +453,14 @@
 // The string will be appended to |*output| and |*out_path| will be updated.
 //
 // The 8-bit version requires UTF-8 encoding.
-bool FileCanonicalizePath(const char* spec,
-                          const url_parse::Component& path,
-                          CanonOutput* output,
-                          url_parse::Component* out_path);
-bool FileCanonicalizePath(const UTF16Char* spec,
-                          const url_parse::Component& path,
-                          CanonOutput* output,
-                          url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char* spec,
+                                   const url_parse::Component& path,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char16* spec,
+                                   const url_parse::Component& path,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_path);
 
 // Query: Prepends the ? if needed.
 //
@@ -398,35 +474,31 @@
 // if necessary, for ASCII input, no conversions are necessary.
 //
 // The converter can be NULL. In this case, the output encoding will be UTF-8.
-void CanonicalizeQuery(const char* spec,
-                       const url_parse::Component& query,
-                       CharsetConverter* converter,
-                       CanonOutput* output,
-                       url_parse::Component* out_query);
-void CanonicalizeQuery(const UTF16Char* spec,
-                       const url_parse::Component& query,
-                       CharsetConverter* converter,
-                       CanonOutput* output,
-                       url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char* spec,
+                                const url_parse::Component& query,
+                                CharsetConverter* converter,
+                                CanonOutput* output,
+                                url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char16* spec,
+                                const url_parse::Component& query,
+                                CharsetConverter* converter,
+                                CanonOutput* output,
+                                url_parse::Component* out_query);
 
 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
 // canonicalizer that does not produce ASCII output). The output is
 // guaranteed to be valid UTF-8.
 //
-// The only way this function will fail is if the input is invalid
-// UTF-8/UTF-16. In this case, we'll use the "Unicode replacement character"
-// for the confusing bits and copy the rest. The application will probably not
-// want to treat a failure converting the ref as a failure canonicalizing the
-// URL, since the page can probably still be loaded, just not scrolled
-// properly.
-bool CanonicalizeRef(const char* spec,
-                     const url_parse::Component& path,
-                     CanonOutput* output,
-                     url_parse::Component* out_path);
-bool CanonicalizeRef(const UTF16Char* spec,
-                     const url_parse::Component& path,
-                     CanonOutput* output,
-                     url_parse::Component* out_path);
+// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
+// the "Unicode replacement character" for the confusing bits and copy the rest.
+GURL_API void CanonicalizeRef(const char* spec,
+                              const url_parse::Component& path,
+                              CanonOutput* output,
+                              url_parse::Component* out_path);
+GURL_API void CanonicalizeRef(const char16* spec,
+                              const url_parse::Component& path,
+                              CanonOutput* output,
+                              url_parse::Component* out_path);
 
 // Full canonicalizer ---------------------------------------------------------
 //
@@ -439,45 +511,61 @@
 // The 8-bit versions require UTF-8 encoding.
 
 // Use for standard URLs with authorities and paths.
-bool CanonicalizeStandardURL(const char* spec,
-                             int spec_len,
-                             const url_parse::Parsed& parsed,
-                             CharsetConverter* query_converter,
-                             CanonOutput* output,
-                             url_parse::Parsed* new_parsed);
-bool CanonicalizeStandardURL(const UTF16Char* spec,
-                             int spec_len,
-                             const url_parse::Parsed& parsed,
-                             CharsetConverter* query_converter,
-                             CanonOutput* output,
-                             url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char* spec,
+                                      int spec_len,
+                                      const url_parse::Parsed& parsed,
+                                      CharsetConverter* query_converter,
+                                      CanonOutput* output,
+                                      url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char16* spec,
+                                      int spec_len,
+                                      const url_parse::Parsed& parsed,
+                                      CharsetConverter* query_converter,
+                                      CanonOutput* output,
+                                      url_parse::Parsed* new_parsed);
 
 // Use for file URLs.
-bool CanonicalizeFileURL(const char* spec,
-                         int spec_len,
-                         const url_parse::Parsed& parsed,
-                         CharsetConverter* query_converter,
-                         CanonOutput* output,
-                         url_parse::Parsed* new_parsed);
-bool CanonicalizeFileURL(const UTF16Char* spec,
-                         int spec_len,
-                         const url_parse::Parsed& parsed,
-                         CharsetConverter* query_converter,
-                         CanonOutput* output,
-                         url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CharsetConverter* query_converter,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char16* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CharsetConverter* query_converter,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
 
 // Use for path URLs such as javascript. This does not modify the path in any
 // way, for example, by escaping it.
-bool CanonicalizePathURL(const char* spec,
-                         int spec_len,
-                         const url_parse::Parsed& parsed,
-                         CanonOutput* output,
-                         url_parse::Parsed* new_parsed);
-bool CanonicalizePathURL(const UTF16Char* spec,
-                         int spec_len,
-                         const url_parse::Parsed& parsed,
-                         CanonOutput* output,
-                         url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char16* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+
+// Use for mailto URLs. This "canonicalizes" the url into a path and query
+// component. It does not attempt to merge "to" fields. It uses UTF-8 for
+// the query encoding if there is a query. This is because a mailto URL is
+// really intended for an external mail program, and the encoding of a page,
+// etc., which would normally influence a query encoding, are irrelevant.
+GURL_API bool CanonicalizeMailtoURL(const char* spec,
+                                    int spec_len,
+                                    const url_parse::Parsed& parsed,
+                                    CanonOutput* output,
+                                    url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeMailtoURL(const char16* spec,
+                                    int spec_len,
+                                    const url_parse::Parsed& parsed,
+                                    CanonOutput* output,
+                                    url_parse::Parsed* new_parsed);
 
 // Part replacer --------------------------------------------------------------
 
@@ -498,12 +586,12 @@
 struct URLComponentSource {
   // Constructor normally used by callers wishing to replace components. This
   // will make them all NULL, which is no replacement. The caller would then
-  // override the compoents they want to replace.
+  // override the components they want to replace.
   URLComponentSource()
       : scheme(NULL),
         username(NULL),
         password(NULL),
-        host(NULL), 
+        host(NULL),
         port(NULL),
         path(NULL),
         query(NULL),
@@ -662,46 +750,59 @@
 };
 
 // The base must be an 8-bit canonical URL.
-bool ReplaceStandardURL(const char* base,
-                        const url_parse::Parsed& base_parsed,
-                        const Replacements<char>& replacements,
-                        CharsetConverter* query_converter,
-                        CanonOutput* output,
-                        url_parse::Parsed* new_parsed);
-bool ReplaceStandardURL(const char* base,
-                        const url_parse::Parsed& base_parsed,
-                        const Replacements<UTF16Char>& replacements,
-                        CharsetConverter* query_converter,
-                        CanonOutput* output,
-                        url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+                                 const url_parse::Parsed& base_parsed,
+                                 const Replacements<char>& replacements,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+                                 const url_parse::Parsed& base_parsed,
+                                 const Replacements<char16>& replacements,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* new_parsed);
 
 // Replacing some parts of a file URL is not permitted. Everything except
 // the host, path, query, and ref will be ignored.
-bool ReplaceFileURL(const char* base,
-                    const url_parse::Parsed& base_parsed,
-                    const Replacements<char>& replacements,
-                    CharsetConverter* query_converter,
-                    CanonOutput* output,
-                    url_parse::Parsed* new_parsed);
-bool ReplaceFileURL(const char* base,
-                    const url_parse::Parsed& base_parsed,
-                    const Replacements<UTF16Char>& replacements,
-                    CharsetConverter* query_converter,
-                    CanonOutput* output,
-                    url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char>& replacements,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char16>& replacements,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
 
 // Path URLs can only have the scheme and path replaced. All other components
 // will be ignored.
-bool ReplacePathURL(const char* base,
-                    const url_parse::Parsed& base_parsed,
-                    const Replacements<char>& replacements,
-                    CanonOutput* output,
-                    url_parse::Parsed* new_parsed);
-bool ReplacePathURL(const char* base,
-                    const url_parse::Parsed& base_parsed,
-                    const Replacements<UTF16Char>& replacements,
-                    CanonOutput* output,
-                    url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char>& replacements,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char16>& replacements,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+
+// Mailto URLs can only have the scheme, path, and query replaced.
+// All other components will be ignored.
+GURL_API bool ReplaceMailtoURL(const char* base,
+                               const url_parse::Parsed& base_parsed,
+                               const Replacements<char>& replacements,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceMailtoURL(const char* base,
+                               const url_parse::Parsed& base_parsed,
+                               const Replacements<char16>& replacements,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed);
 
 // Relative URL ---------------------------------------------------------------
 
@@ -716,20 +817,20 @@
 // not). Failure means that the combination of URLs doesn't make any sense.
 //
 // The base URL should always be canonical, therefore is ASCII.
-bool IsRelativeURL(const char* base,
-                   const url_parse::Parsed& base_parsed,
-                   const char* fragment,
-                   int fragment_len,
-                   bool is_base_hierarchical,
-                   bool* is_relative,
-                   url_parse::Component* relative_component);
-bool IsRelativeURL(const char* base,
-                   const url_parse::Parsed& base_parsed,
-                   const UTF16Char* fragment,
-                   int fragment_len,
-                   bool is_base_hierarchical,
-                   bool* is_relative,
-                   url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+                            const url_parse::Parsed& base_parsed,
+                            const char* fragment,
+                            int fragment_len,
+                            bool is_base_hierarchical,
+                            bool* is_relative,
+                            url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+                            const url_parse::Parsed& base_parsed,
+                            const char16* fragment,
+                            int fragment_len,
+                            bool is_base_hierarchical,
+                            bool* is_relative,
+                            url_parse::Component* relative_component);
 
 // Given a canonical parsed source URL, a URL fragment known to be relative,
 // and the identified relevant portion of the relative URL (computed by
@@ -749,22 +850,22 @@
 // Returns true on success. On failure, the output will be "something
 // reasonable" that will be consistent and valid, just probably not what
 // was intended by the web page author or caller.
-bool ResolveRelativeURL(const char* base_url,
-                        const url_parse::Parsed& base_parsed,
-                        bool base_is_file,
-                        const char* relative_url,
-                        const url_parse::Component& relative_component,
-                        CharsetConverter* query_converter,
-                        CanonOutput* output,
-                        url_parse::Parsed* out_parsed);
-bool ResolveRelativeURL(const char* base_url,
-                        const url_parse::Parsed& base_parsed,
-                        bool base_is_file,
-                        const UTF16Char* relative_url,
-                        const url_parse::Component& relative_component,
-                        CharsetConverter* query_converter,
-                        CanonOutput* output,
-                        url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+                                 const url_parse::Parsed& base_parsed,
+                                 bool base_is_file,
+                                 const char* relative_url,
+                                 const url_parse::Component& relative_component,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+                                 const url_parse::Parsed& base_parsed,
+                                 bool base_is_file,
+                                 const char16* relative_url,
+                                 const url_parse::Component& relative_component,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* out_parsed);
 
 }  // namespace url_canon
 

diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc
index 4d6091b..aea181a 100644
--- a/googleurl/src/url_canon_etc.cc
+++ b/googleurl/src/url_canon_etc.cc

@@ -29,6 +29,8 @@
 
 // Canonicalizers for random bits that aren't big enough for their own files.
 
+#include <string.h>
+
 #include "googleurl/src/url_canon.h"
 #include "googleurl/src/url_canon_internal.h"
 
@@ -36,6 +38,46 @@
 
 namespace {
 
+// Returns true if the given character should be removed from the middle of a
+// URL.
+inline bool IsRemovableURLWhitespace(int ch) {
+  return ch == '\r' || ch == '\n' || ch == '\t';
+}
+
+// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
+// It sucks that we have to do this, since this takes about 13% of the total URL
+// canonicalization time.
+template<typename CHAR>
+const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
+                                  CanonOutputT<CHAR>* buffer,
+                                  int* output_len) {
+  // Fast verification that there's nothing that needs removal. This is the 99%
+  // case, so we want it to be fast and don't care about impacting the speed
+  // when we do find whitespace.
+  int found_whitespace = false;
+  for (int i = 0; i < input_len; i++) {
+    if (!IsRemovableURLWhitespace(input[i]))
+      continue;
+    found_whitespace = true;
+    break;
+  }
+
+  if (!found_whitespace) {
+    // Didn't find any whitespace, we don't need to do anything. We can just
+    // return the input as the output.
+    *output_len = input_len;
+    return input;
+  }
+
+  // Remove the whitespace into the new buffer and return it.
+  for (int i = 0; i < input_len; i++) {
+    if (!IsRemovableURLWhitespace(input[i]))
+      buffer->push_back(input[i]);
+  }
+  *output_len = buffer->length();
+  return buffer->data();
+}
+
 // Contains the canonical version of each possible input letter in the scheme
 // (basically, lower-cased). The corresponding entry will be 0 if the letter
 // is not allowed in a scheme.
@@ -53,14 +95,14 @@
     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
-//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
 
 // This could be a table lookup as well by setting the high bit for each
 // valid character, but it's only called once per URL, and it makes the lookup
 // table easier to read not having extra stuff in it.
 inline bool IsSchemeFirstChar(unsigned char c) {
-  return (c >= 'a' && c < 'z') || (c >= 'A' && c < 'Z');
+  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 }
 
 template<typename CHAR, typename UCHAR>
@@ -78,6 +120,11 @@
   // The output scheme starts from the current position.
   out_scheme->begin = output->length();
 
+  // Danger: it's important that this code does not strip any characters: it
+  // only emits the canonical version (be it valid or escaped) of each of
+  // the input characters. Stripping would put it out of sync with
+  // url_util::FindAndCompareScheme, which could cause some security checks on
+  // schemes to be incorrect.
   bool success = true;
   int end = scheme.end();
   for (int i = scheme.begin; i < end; i++) {
@@ -95,6 +142,12 @@
 
     if (replacement) {
       output->push_back(replacement);
+    } else if (ch == '%') {
+      // Canonicalizing the scheme multiple times should lead to the same
+      // result. Since invalid characters will be escaped, we need to preserve
+      // the percent to avoid multiple escaping. The scheme will be invalid.
+      success = false;
+      output->push_back('%');
     } else {
       // Invalid character, store it but mark this scheme as invalid.
       success = false;
@@ -131,14 +184,12 @@
     return true;
   }
 
-  // TODO(brettw) bug 735548: we should be checking that the user info
-  // characters are allowed in the input.
-
   // Write the username.
   out_username->begin = output->length();
   if (username.len > 0) {
-    AppendInvalidNarrowString(username_spec, username.begin, username.end(),
-                              output);
+    // This will escape characters not valid for the username.
+    AppendStringOfType(&username_spec[username.begin], username.len,
+                       CHAR_USERINFO, output);
   }
   out_username->len = output->length() - out_username->begin;
 
@@ -147,8 +198,8 @@
   if (password.len > 0) {
     output->push_back(':');
     out_password->begin = output->length();
-    AppendInvalidNarrowString(password_spec, password.begin, password.end(),
-                              output);
+    AppendStringOfType(&password_spec[password.begin], password.len,
+                       CHAR_USERINFO, output);
     out_password->len = output->length() - out_password->begin;
   } else {
     *out_password = url_parse::Component();
@@ -162,7 +213,7 @@
 inline void WritePortInt(char* output, int output_len, int port) {
   _itoa_s(port, output, output_len, 10);
 }
-inline void WritePortInt(UTF16Char* output, int output_len, int port) {
+inline void WritePortInt(char16* output, int output_len, int port) {
   _itow_s(port, output, output_len, 10);
 }
 
@@ -206,23 +257,66 @@
   return true;
 }
 
-// Like ConvertUTF?ToUTF? except this is a UTF-8 -> UTF-8 converter. We'll
-// validate the input, and use the unicode replacement character for invalid
-// input, and append the result to the output.
-bool CopyAndValidateUTF8(const char* input, int input_len,
-                         CanonOutput* output) {
-  bool success = true;
-  for (int i = 0; i < input_len; i++) {
-    unsigned code_point;
-    success &= ReadUTFChar(input, &i, input_len, &code_point);
-    AppendUTF8Value(code_point, output);
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeRef(const CHAR* spec,
+                       const url_parse::Component& ref,
+                       CanonOutput* output,
+                       url_parse::Component* out_ref) {
+  if (ref.len < 0) {
+    // Common case of no ref.
+    *out_ref = url_parse::Component();
+    return;
   }
-  return success;
+
+  // Append the ref separator. Note that we need to do this even when the ref
+  // is empty but present.
+  output->push_back('#');
+  out_ref->begin = output->length();
+
+  // Now iterate through all the characters, converting to UTF-8 and validating.
+  int end = ref.end();
+  for (int i = ref.begin; i < end; i++) {
+    if (spec[i] == 0) {
+      // IE just strips NULLs, so we do too.
+      continue;
+    } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
+      // Unlike what IE seems to do, we escape control characters. This will
+      // probably make the reference fragment unusable on a web page, but
+      // people shouldn't be using control characters in their anchor names.
+      AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
+    } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
+      // Normal ASCII characters are just appended.
+      output->push_back(static_cast<char>(spec[i]));
+    } else {
+      // Non-ASCII characters are appended unescaped, but only when they are
+      // valid. Invalid Unicode characters are replaced with the "invalid
+      // character" as IE seems to.
+      unsigned code_point;
+      if (!ReadUTFChar(spec, &i, end, &code_point))
+        AppendUTF8Value(kUnicodeReplacementCharacter, output);
+      else
+        AppendUTF8Value(code_point, output);
+    }
+  }
+
+  out_ref->len = output->length() - out_ref->begin;
 }
 
 }  // namespace
 
-char CanonicalSchemeChar(UTF16Char ch) {
+const char* RemoveURLWhitespace(const char* input, int input_len,
+                                CanonOutputT<char>* buffer,
+                                int* output_len) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+const char16* RemoveURLWhitespace(const char16* input, int input_len,
+                                  CanonOutputT<char16>* buffer,
+                                  int* output_len) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+char CanonicalSchemeChar(char16 ch) {
   if (ch >= 0x80)
     return 0;  // Non-ASCII is not supported by schemes.
   return kSchemeCanonical[ch];
@@ -235,11 +329,11 @@
   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
 }
 
-bool CanonicalizeScheme(const UTF16Char* spec,
+bool CanonicalizeScheme(const char16* spec,
                         const url_parse::Component& scheme,
                         CanonOutput* output,
                         url_parse::Component* out_scheme) {
-  return DoScheme<UTF16Char, UTF16Char>(spec, scheme, output, out_scheme);
+  return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
 }
 
 bool CanonicalizeUserInfo(const char* username_source,
@@ -254,14 +348,14 @@
       output, out_username, out_password);
 }
 
-bool CanonicalizeUserInfo(const UTF16Char* username_source,
+bool CanonicalizeUserInfo(const char16* username_source,
                           const url_parse::Component& username,
-                          const UTF16Char* password_source,
+                          const char16* password_source,
                           const url_parse::Component& password,
                           CanonOutput* output,
                           url_parse::Component* out_username,
                           url_parse::Component* out_password) {
-  return DoUserInfo<UTF16Char, UTF16Char>(
+  return DoUserInfo<char16, char16>(
       username_source, username, password_source, password,
       output, out_username, out_password);
 }
@@ -276,58 +370,27 @@
                                      output, out_port);
 }
 
-bool CanonicalizePort(const UTF16Char* spec,
+bool CanonicalizePort(const char16* spec,
                       const url_parse::Component& port,
                       int default_port_for_scheme,
                       CanonOutput* output,
                       url_parse::Component* out_port) {
-  return DoPort<UTF16Char, UTF16Char>(spec, port, default_port_for_scheme,
+  return DoPort<char16, char16>(spec, port, default_port_for_scheme,
                                       output, out_port);
 }
 
-// We don't do anything fancy with refs, we just validate that the input is
-// valid UTF-8 and return.
-bool CanonicalizeRef(const char* spec,
+void CanonicalizeRef(const char* spec,
                      const url_parse::Component& ref,
                      CanonOutput* output,
                      url_parse::Component* out_ref) {
-  if (ref.len < 0) {
-    // Common case of no ref.
-    *out_ref = url_parse::Component();
-    return true;
-  }
-
-  // Append the ref separator. Note that we need to do this even when the ref
-  // is empty but present.
-  output->push_back('#');
-  out_ref->begin = output->length();
-
-  bool success = CopyAndValidateUTF8(&spec[ref.begin], ref.len, output);
-  out_ref->len = output->length() - out_ref->begin;
-  return success;
+  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
 }
 
-// 16-bit-character refs need to get converted to UTF-8.
-bool CanonicalizeRef(const UTF16Char* spec,
+void CanonicalizeRef(const char16* spec,
                      const url_parse::Component& ref,
                      CanonOutput* output,
                      url_parse::Component* out_ref) {
-  if (ref.len < 0) {
-    // Common case of no ref.
-    *out_ref = url_parse::Component();
-    return true;
-  }
-
-  // Append the ref separator. Note that we need to do this even when the ref
-  // is empty but present.
-  output->push_back('#');
-  out_ref->begin = output->length();
-
-  // The handy-dandy conversion function will validate the UTF-16 and convert
-  // to UTF-8 for us!
-  bool success = ConvertUTF16ToUTF8(&spec[ref.begin], ref.len, output);
-  out_ref->len = output->length() - out_ref->begin;
-  return success;
+  DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
 }
 
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_fileurl.cc b/googleurl/src/url_canon_fileurl.cc
index 60c6ffe..97023eb 100644
--- a/googleurl/src/url_canon_fileurl.cc
+++ b/googleurl/src/url_canon_fileurl.cc

@@ -105,7 +105,7 @@
     // No input path, canonicalize to a slash.
     output->push_back('/');
   }
-  
+
   out_path->len = output->length() - out_path->begin;
   return success;
 }
@@ -124,16 +124,8 @@
   // Scheme (known, so we don't bother running it through the more
   // complicated scheme canonicalizer).
   new_parsed->scheme.begin = output->length();
-  output->push_back('f');
-  output->push_back('i');
-  output->push_back('l');
-  output->push_back('e');
-  new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
-  output->push_back(':');
-
-  // Write the separator for the host.
-  output->push_back('/');
-  output->push_back('/');
+  output->Append("file://", 7);
+  new_parsed->scheme.len = 4;
 
   // Append the host. For many file URLs, this will be empty. For UNC, this
   // will be present.
@@ -166,14 +158,14 @@
       output, new_parsed);
 }
 
-bool CanonicalizeFileURL(const UTF16Char* spec,
+bool CanonicalizeFileURL(const char16* spec,
                          int spec_len,
                          const url_parse::Parsed& parsed,
                          CharsetConverter* query_converter,
                          CanonOutput* output,
                          url_parse::Parsed* new_parsed) {
-  return DoCanonicalizeFileURL<UTF16Char, UTF16Char>(
-      URLComponentSource<UTF16Char>(spec), parsed, query_converter,
+  return DoCanonicalizeFileURL<char16, char16>(
+      URLComponentSource<char16>(spec), parsed, query_converter,
       output, new_parsed);
 }
 
@@ -185,12 +177,12 @@
                                                      output, out_path);
 }
 
-bool FileCanonicalizePath(const UTF16Char* spec,
+bool FileCanonicalizePath(const char16* spec,
                           const url_parse::Component& path,
                           CanonOutput* output,
                           url_parse::Component* out_path) {
-  return DoFileCanonicalizePath<UTF16Char, UTF16Char>(spec, path,
-                                                      output, out_path);
+  return DoFileCanonicalizePath<char16, char16>(spec, path,
+                                                output, out_path);
 }
 
 bool ReplaceFileURL(const char* base,
@@ -208,7 +200,7 @@
 
 bool ReplaceFileURL(const char* base,
                     const url_parse::Parsed& base_parsed,
-                    const Replacements<UTF16Char>& replacements,
+                    const Replacements<char16>& replacements,
                     CharsetConverter* query_converter,
                     CanonOutput* output,
                     url_parse::Parsed* new_parsed) {

diff --git a/googleurl/src/url_canon_host.cc b/googleurl/src/url_canon_host.cc
index 09816e4..6642004 100644
--- a/googleurl/src/url_canon_host.cc
+++ b/googleurl/src/url_canon_host.cc

@@ -27,6 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include "base/logging.h"
 #include "googleurl/src/url_canon.h"
 #include "googleurl/src/url_canon_internal.h"
 
@@ -43,7 +44,7 @@
 //         unescaped form)
 //      % (only allowed escaped in the input, will be unmodified).
 //      I left blank alpha numeric characters.
-// 
+//
 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
 //    -----------------------------------------------
 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
@@ -62,191 +63,155 @@
 // Surprisingly, space is accepted in the input and always escaped.
 
 // This table lists the canonical version of all characters we allow in the
-// input, with 0 indicating it is disallowed. We are more restricive than IE,
-// but less restrictive than Firefox, and we only have two modes: either the
-// character is allowed and it is unescaped if escaped in the input, or it is
-// disallowed and we will prohibit it.
+// input, with 0 indicating it is disallowed. We use the magic kEsc
+// value to indicate that this character should be escaped. We are a little more
+// restrictive than IE, but less restrictive than Firefox.
 //
-// Space is a special case, IE always escapes space, and some sites actually
-// use it, so we want to support it. We try to duplicate IE's behavior by treating
-// space as valid and unescaping it, and then doing a separate pass at the end of
-// canonicalization that looks for spaces. We'll then escape them at that point.
-const char kHostCharLookup[0x80] = {
+// Note that we disallow the % character. We will allow it when part of an
+// escape sequence, of course, but this disallows "%25". Even though IE allows
+// it, allowing it would put us in a funny state. If there was an invalid
+// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
+// Allowing percents means we'll succeed a second time, so validity would change
+// based on how many times you run the canonicalizer. We prefer to always report
+// the same validity, so reject this.
+const unsigned char kEsc = 0xff;
+const unsigned char kHostCharLookup[0x80] = {
 // 00-1f: all are invalid
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
-    ' ',  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
+   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
-    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
-     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
-     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
-//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    
-    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
 
 const int kTempHostBufferLen = 1024;
 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
-typedef RawCanonOutputT<UTF16Char, kTempHostBufferLen> StackBufferW;
+typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
 
 // Scans a host name and fills in the output flags according to what we find.
 // |has_non_ascii| will be true if there are any non-7-bit characters, and
 // |has_escaped| will be true if there is a percent sign.
 template<typename CHAR, typename UCHAR>
 void ScanHostname(const CHAR* spec, const url_parse::Component& host,
-                  bool* has_non_ascii, bool* has_escaped, bool* has_space) {
+                  bool* has_non_ascii, bool* has_escaped) {
   int end = host.end();
   *has_non_ascii = false;
   *has_escaped = false;
-  *has_space = false;
   for (int i = host.begin; i < end; i++) {
-    // This branch is normally taken and will be predicted very well. Testing
-    // shows that is is slightly faster to eliminate all the "normal" common
-    // characters here and fall through below to find out exactly which one
-    // failed.
-    if (static_cast<UCHAR>(spec[i]) < 0x80 && spec[i] != '%' && spec[i] != ' ')
-      continue;
-
     if (static_cast<UCHAR>(spec[i]) >= 0x80)
       *has_non_ascii = true;
     else if (spec[i] == '%')
       *has_escaped = true;
-    else if (spec[i] == ' ')
-      *has_space = true;
   }
 }
 
-// Considers the current contents of the output and sees if it looks like an
-// IP address. This is called because we canonicalize to the output assuming
-// that it's not an IP address, and now need to fix it if we produced one.
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
 //
-// The generated hostname is identified by |host|. The output will be fixed
-// with a canonical IP address if the host looks like one. Otherwise, there
-// will be no change.
-void InterpretIPAddress(const url_parse::Component& host,
-                        CanonOutput* output) {
-  // Canonicalize the IP address in the output to this temporary buffer.
-  // IP addresses are small, so this should not cause an allocation.
-  RawCanonOutput<64> canon_ip;
-  url_parse::Component out_host;  // Unused.
-  if (CanonicalizeIPAddress(output->data(), host, &canon_ip, &out_host)) {
-    // Looks like an IP address, overwrite the existing host with the newly
-    // canonicalized IP address.
-    output->set_length(host.begin);
-    output->Append(canon_ip.data(), canon_ip.length());
-  }
-}
-
-// Unescapes all escaped characters in the input, writing the result to
-// |*unescaped| and the output length in |*unescaped_len|.
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
 //
-// This does validity checking of 7-bit characters based on the above table,
-// and allows all characters with the high bit set (UTF-8, hopefully).
+// This function is used in two situations:
 //
-// Returns true on success. On failure, |*unescaped| and |*unescaped_len|
-// will still be consistent & valid, just the contents will be meaningless.
-// The caller should return failure in this case.
+//  * When the caller knows there are no non-ASCII or percent-escaped
+//    characters. This is what DoHost does. The result will be a completely
+//    canonicalized host since we know nothing weird can happen (escaped
+//    characters could be unescaped to non-7-bit, so they have to be treated
+//    with suspicion at this point). It does not use the |has_non_ascii| flag.
 //
-// |*has_non_ascii| will be set according to if there are any non-8-bit
-// values in the unescaped output.
-bool UnescapeAndValidateHost(const char* src, int src_len,
-                             CanonOutput* unescaped, bool* has_non_ascii) {
-  bool success = true;
+//  * When the caller has an 8-bit string that may need unescaping.
+//    DoComplexHost calls us in this situation to do unescaping and validation.
+//    After this, it may do other IDN operations depending on the value of the
+//    |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
   *has_non_ascii = false;
 
-  for (int i = 0; i < src_len; i++) {
-    char ch = static_cast<char>(src[i]);
-    if (ch == '%') {
-      if (!DecodeEscaped(src, &i, src_len, &ch)) {
-        // Invalid escaped character, there is nothing that can make this
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible.
+      // Source will be used only if decode operation was successful.
+      if (!DecodeEscaped(host, &i, host_len,
+                         reinterpret_cast<unsigned char*>(&source))) {
+        // Invalid escaped character. There is nothing that can make this
         // host valid. We append an escaped percent so the URL looks reasonable
         // and mark as failed.
-        AppendEscapedChar('%', unescaped);
+        AppendEscapedChar('%', output);
         success = false;
         continue;
       }
-      // The unescaped character will now be in |ch|.
     }
 
-    if (static_cast<unsigned char>(ch) >= 0x80) {
-      // Pass through all high-bit characters so we don't mangle UTF-8. Set the
-      // flag so the caller knows it should fix the non-ASCII characters.
-      unescaped->push_back(ch);
-      *has_non_ascii = true;
-    } else {
-      // Use the lookup table to canonicalize this ASCII value.
-      char replacement = kHostCharLookup[ch];
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
       if (!replacement) {
         // Invalid character, add it as percent-escaped and mark as failed.
-        AppendEscapedChar(ch, unescaped);
+        AppendEscapedChar(source, output);
         success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
       } else {
         // Common case, the given character is valid in a hostname, the lookup
         // table tells us the canonical representation of that character (lower
         // cased).
-        unescaped->push_back(replacement);
+        output->push_back(replacement);
       }
-    }
-  }
-  return success;
-}
-
-// Canonicalizes a host name assuming the input is 7-bit ASCII and requires
-// no unescaping. This is the most common case so it should be fast. We convert
-// to 8-bit by static_cast (input may be 16-bit) and check for validity.
-//
-// The return value will be false if there are invalid host characters.
-template<typename CHAR>
-bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output) {
-  // First check if the host name is an IP address.
-  url_parse::Component out_ip;  // Unused: we compute the size ourselves later.
-  if (CanonicalizeIPAddress(host, url_parse::Component(0, host_len),
-                            output, &out_ip))
-    return true;
-
-  bool success = true;
-  for (int i = 0; i < host_len; i++) {
-    // Find the replacement character (lower case for letters, the same as the
-    // input if no change is required).
-    char source = static_cast<char>(host[i]);
-    char replacement = kHostCharLookup[source];
-    if (!replacement) {
-      // Invalid character, add it as percent-escaped and mark as failed.
-      AppendEscapedChar(source, output);
-      success = false;
     } else {
-      // Common case, the given character is valid in a hostname, the lookup
-      // table tells us the canonical representation of that character (lower
-      // cased).
-      output->push_back(replacement);
+      // It's a non-ascii char. Just push it to the output.
+      // In the case where we have char16 input and char output, it's safe to
+      // cast char16->char only if the input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
     }
   }
+
   return success;
 }
 
-// Canonicalizes a host that requires IDN conversion. Returns true on success.
-bool DoIDNHost(const UTF16Char* src, int src_len, CanonOutput* output) {
+// Canonicalizes a host that requires IDN conversion. Returns true on success.
+bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+  // We need to escape URL before doing IDN conversion, since punycode strings
+  // cannot be escaped after they are created.
+  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
+  bool has_non_ascii;
+  DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
+
   StackBufferW wide_output;
-  if (!IDNToASCII(src, src_len, &wide_output)) {
+  if (!IDNToASCII(url_escaped_host.data(),
+                  url_escaped_host.length(),
+                  &wide_output)) {
     // Some error, give up. This will write some reasonable looking
     // representation of the string to the output.
     AppendInvalidNarrowString(src, 0, src_len, output);
     return false;
   }
 
-  // Now we check the ASCII output like a normal host. This will fail for any
-  // invalid characters, including most importantly "%". If somebody does %00
-  // as fullwidth, ICU will convert this to ASCII. We don't want to pass this
-  // on since it could be interpreted incorrectly.
-  //
-  // We could unescape at this point, that that could also produce percents
-  // or more UTF-8 input, and it gets too complicated. If people want to
-  // escape domain names, they will have to use ASCII instead of fullwidth.
-  return DoSimpleHost<UTF16Char>(wide_output.data(), wide_output.length(),
-                                 output);
+  // Now we check the ASCII output like a normal host. It will also handle
+  // unescaping. Although we unescaped everything before this function call, if
+  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
+  bool success = DoSimpleHost(wide_output.data(),
+                              wide_output.length(),
+                              output, &has_non_ascii);
+  DCHECK(!has_non_ascii);
+  return success;
 }
 
 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
@@ -268,20 +233,15 @@
     // save another huge stack buffer. It will be replaced below if it requires
     // IDN. This will also update our non-ASCII flag so we know whether the
     // unescaped input requires IDN.
-    if (!UnescapeAndValidateHost(host, host_len, output, &has_non_ascii)) {
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
       // Error with some escape sequence. We'll call the current output
-      // complete. UnescapeAndValidateHost will have written some
-      // "reasonable" output.
+      // complete. DoSimpleHost will have written some "reasonable" output.
       return false;
     }
 
     // Unescaping may have left us with ASCII input, in which case the
     // unescaped version we wrote to output is complete.
     if (!has_non_ascii) {
-      // Need to be sure to check for IP addresses in the newly unescaped
-      // output. This will fix the output if necessary.
-      InterpretIPAddress(url_parse::MakeRange(begin_length, output->length()),
-                         output);
       return true;
     }
 
@@ -320,7 +280,7 @@
 // UTF-16 convert host to its ASCII version. The set up is already ready for
 // the backend, so we just pass through. The has_escaped flag should be set if
 // the input string requires unescaping.
-bool DoComplexHost(const UTF16Char* host, int host_len,
+bool DoComplexHost(const char16* host, int host_len,
                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
   if (has_escaped) {
     // Yikes, we have escaped characters with wide input. The escaped
@@ -350,84 +310,56 @@
   return DoIDNHost(host, host_len, output);
 }
 
-// Takes an otherwise canonicalized hostname in the output buffer starting
-// at |host_begin| and ending at the end of |output|. This will do an in-place
-// conversion of any spaces to "%20" for IE compatability.
-void EscapeSpacesInHost(CanonOutput* output, int host_begin) {
-  // First count the number of spaces to see what needs to be done.
-  int num_spaces = 0;
-  int end = output->length();
-  for (int i = host_begin; i < end; i++) {
-    if (output->at(i) != ' ') {
-    } else {
-      num_spaces++;
-    }
-  }
-  if (num_spaces == 0)
-    return;  // Common case, nothing to do
-
-  // Resize the buffer so that there's enough room for all the inserted chars.
-  // "%20" takes 3 chars, but we delete one for the space we're replacing.
-  int num_inserted_characters = num_spaces * 2;
-  for (int i = 0; i < num_inserted_characters; i++)
-    output->push_back(0);
-
-  // Now do an in-place replacement from the end of the string of all spaces.
-  int src = end - 1;
-  int dest = src + num_inserted_characters;
-  // When src = dest, we're in sync and there are no more spaces.
-  while (src >= host_begin && src != dest) {
-    char src_char = output->at(src--);
-    if (src_char == ' ') {
-      output->set(dest--, '0');
-      output->set(dest--, '2');
-      output->set(dest--, '%');
-    } else {
-      output->set(dest--, src_char);
-    }
-  }
-}
-
 template<typename CHAR, typename UCHAR>
-bool DoHost(const CHAR* spec,
+void DoHost(const CHAR* spec,
             const url_parse::Component& host,
             CanonOutput* output,
-            url_parse::Component* out_host) {
-  bool success = true;
+            CanonHostInfo* host_info) {
   if (host.len <= 0) {
     // Empty hosts don't need anything.
-    *out_host = url_parse::Component(output->length(), 0);
-    return true;
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = url_parse::Component();
+    return;
   }
 
-  bool has_non_ascii, has_escaped, has_spaces;
-  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped,
-                            &has_spaces);
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
 
-  out_host->begin = output->length();
+  // Keep track of output's initial length, so we can rewind later.
+  const int output_begin = output->length();
 
+  bool success;
   if (!has_non_ascii && !has_escaped) {
-    success &= DoSimpleHost(&spec[host.begin], host.len, output);
-
-    // Don't look for spaces in the common case that we don't have any.
-    if (has_spaces)
-      EscapeSpacesInHost(output, out_host->begin);
+    success = DoSimpleHost(&spec[host.begin], host.len,
+                           output, &has_non_ascii);
+    DCHECK(!has_non_ascii);
   } else {
-    success &= DoComplexHost(&spec[host.begin], host.len,
-                             has_non_ascii, has_escaped, output);
-    // We could have had escaped numerals that should now be canonicalized as
-    // an IP address. This should be exceedingly rare, it's probably mostly
-    // used by scammers.
-
-    // Last, we need to fix up any spaces by escaping them. This must happen
-    // after we do everything so spaces get sent through IDN unescaped. We also
-    // can't rely on the has_spaces flag we computed above because unescaping
-    // could have produced new spaces.
-    EscapeSpacesInHost(output, out_host->begin);
+    success = DoComplexHost(&spec[host.begin], host.len,
+                            has_non_ascii, has_escaped, output);
   }
 
-  out_host->len = output->length() - out_host->begin;
-  return success;
+  if (!success) {
+    // Canonicalization failed.  Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  } else {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address.  IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    CanonicalizeIPAddress(output->data(),
+                          url_parse::MakeRange(output_begin, output->length()),
+                          &canon_ip, host_info);
+
+    // If we got an IPv4/IPv6 address, copy the canonical form back to the
+    // real buffer.  Otherwise, it's a hostname or broken IP, in which case
+    // we just leave it in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(output_begin);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  }
+
+  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
 }
 
 }  // namespace
@@ -436,14 +368,34 @@
                       const url_parse::Component& host,
                       CanonOutput* output,
                       url_parse::Component* out_host) {
-  return DoHost<char, unsigned char>(spec, host, output, out_host);
+  CanonHostInfo host_info;
+  DoHost<char, unsigned char>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
 }
 
-bool CanonicalizeHost(const UTF16Char* spec,
+bool CanonicalizeHost(const char16* spec,
                       const url_parse::Component& host,
                       CanonOutput* output,
                       url_parse::Component* out_host) {
-  return DoHost<UTF16Char, UTF16Char>(spec, host, output, out_host);
+  CanonHostInfo host_info;
+  DoHost<char16, char16>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
+}
+
+void CanonicalizeHostVerbose(const char* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo *host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+void CanonicalizeHostVerbose(const char16* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo *host_info) {
+  DoHost<char16, char16>(spec, host, output, host_info);
 }
 
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_icu.cc b/googleurl/src/url_canon_icu.cc
index 2f786b0..b06808c 100644
--- a/googleurl/src/url_canon_icu.cc
+++ b/googleurl/src/url_canon_icu.cc

@@ -103,7 +103,7 @@
     : converter_(converter) {
 }
 
-void ICUCharsetConverter::ConvertFromUTF16(const UTF16Char* input,
+void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
                                            int input_len,
                                            CanonOutput* output) {
   // Install our error handler. It will be called for character that can not
@@ -139,7 +139,7 @@
 // the length of the output will be set to the length of the new host name.
 //
 // On error, this will return false. The output in this case is undefined.
-bool IDNToASCII(const UTF16Char* src, int src_len, CanonOutputW* output) {
+bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
   DCHECK(output->length() == 0);  // Output buffer is assumed empty.
   while (true) {
     // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
@@ -161,4 +161,47 @@
   }
 }
 
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  int code_point;  // Avoids warning when U8_NEXT writes -1 to it.
+  U8_NEXT(str, *begin, length, code_point);
+  *code_point_out = static_cast<unsigned>(code_point);
+
+  // The ICU macro above moves to the next char, we want to point to the last
+  // char consumed.
+  (*begin)--;
+
+  // Validate the decoded value.
+  if (U_IS_UNICODE_CHAR(code_point))
+    return true;
+  *code_point_out = kUnicodeReplacementCharacter;
+  return false;
+}
+
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point) {
+  if (U16_IS_SURROGATE(str[*begin])) {
+    if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
+        !U16_IS_TRAIL(str[*begin + 1])) {
+      // Invalid surrogate pair.
+      *code_point = kUnicodeReplacementCharacter;
+      return false;
+    } else {
+      // Valid surrogate pair.
+      *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
+      (*begin)++;
+    }
+  } else {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = str[*begin];
+  }
+
+  if (U_IS_UNICODE_CHAR(*code_point))
+    return true;
+
+  // Invalid code point.
+  *code_point = kUnicodeReplacementCharacter;
+  return false;
+}
+
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h
index 09d64a8..6bc52c3 100644
--- a/googleurl/src/url_canon_icu.h
+++ b/googleurl/src/url_canon_icu.h

@@ -45,13 +45,13 @@
   // Constructs a converter using an already-existing ICU character set
   // converter. This converter is NOT owned by this object; the lifetime must
   // be managed by the creator such that it is alive as long as this is.
-  ICUCharsetConverter(UConverter* converter);
+  GURL_API ICUCharsetConverter(UConverter* converter);
 
-  virtual ~ICUCharsetConverter() {}
+  GURL_API virtual ~ICUCharsetConverter() {}
 
-  virtual void ConvertFromUTF16(const UTF16Char* input,
-                                int input_len,
-                                CanonOutput* output);
+  GURL_API virtual void ConvertFromUTF16(const char16* input,
+                                         int input_len,
+                                         CanonOutput* output);
 
  private:
   // The ICU converter, not owned by this class.

diff --git a/googleurl/src/url_canon_internal.cc b/googleurl/src/url_canon_internal.cc
index cb19599..6b776bc 100644
--- a/googleurl/src/url_canon_internal.cc
+++ b/googleurl/src/url_canon_internal.cc

@@ -39,20 +39,25 @@
 namespace {
 
 template<typename CHAR, typename UCHAR>
-static bool DoCanonicalizeEscaped(const CHAR* spec, int* begin, int end,
-                                  CanonOutput* output) {
-  char value;
-  if (DecodeEscaped<CHAR>(spec, begin, end, &value)) {
-    // Valid escape sequence, re-escape it so we normalize the case of the
-    // hex digits in the canonical form.
-    AppendEscapedChar(value, output);
-    return true;
+void DoAppendStringOfType(const CHAR* source, int length,
+                          SharedCharTypes type,
+                          CanonOutput* output) {
+  for (int i = 0; i < length; i++) {
+    if (static_cast<UCHAR>(source[i]) >= 0x80) {
+      // ReadUTFChar will fill the code point with kUnicodeReplacementCharacter
+      // when the input is invalid, which is what we want.
+      unsigned code_point;
+      ReadUTFChar(source, &i, length, &code_point);
+      AppendUTF8EscapedValue(code_point, output);
+    } else {
+      // Just append the 7-bit character, possibly escaping it.
+      unsigned char uch = static_cast<unsigned char>(source[i]);
+      if (!IsCharOfType(uch, type))
+        AppendEscapedChar(uch, output);
+      else
+        output->push_back(uch);
+    }
   }
-
-  // Invalid escaped value, don't copy anything. The caller will pick up on the
-  // next character after the percent and treat it normally.
-  output->push_back('%');
-  return false;
 }
 
 // This function assumes the input values are all contained in 8-bit,
@@ -89,17 +94,25 @@
   }
 }
 
-// Like DoOverrideComponent except that it takes a UTF-16 input. It is
-// converted to UTF-8 at the end of the given buffer as a temporary holding
-// place, and the output is set to reference that portion of the buffer.
-bool DoUTF16OverrideComponent(const UTF16Char* override_source,
-                              const url_parse::Component& override_component,
-                              CanonOutput* utf8_buffer,
-                              const char** dest,
-                              url_parse::Component* dest_component) {
+// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
+// not actually set the output character pointer.
+//
+// The input is converted to UTF-8 at the end of the given buffer as a temporary
+// holding place. The component identifying the portion of the buffer used in
+// the |utf8_buffer| will be specified in |*dest_component|.
+//
+// This will not actually set any |dest| pointer like DoOverrideComponent
+// does because all of the pointers will point into the |utf8_buffer|, which
+// may get resized while we're overriding a subsequent component. Instead, the
+// caller should use the beginning of the |utf8_buffer| as the string pointer
+// for all components once all overrides have been prepared.
+bool PrepareUTF16OverrideComponent(
+    const char16* override_source,
+    const url_parse::Component& override_component,
+    CanonOutput* utf8_buffer,
+    url_parse::Component* dest_component) {
   bool success = true;
   if (override_source) {
-    *dest = utf8_buffer->data();
     if (!override_component.is_valid()) {
       // Non-"valid" component (means delete), so we need to preserve that.
       *dest_component = url_parse::Component();
@@ -120,101 +133,101 @@
 const unsigned char kSharedCharTypeTable[0x100] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0f
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1f
-    0,           // 0x20  ' ' (escape spaces in queries)
-    CHAR_QUERY,  // 0x21  !
-    CHAR_QUERY,  // 0x22  "  (IE doesn't escape this in the query!)
-    0,           // 0x23  #  (invalid in query since it marks the ref)
-    CHAR_QUERY,  // 0x24  $
-    CHAR_QUERY,  // 0x25  %
-    CHAR_QUERY,  // 0x26  &
-    CHAR_QUERY,  // 0x27  '
-    CHAR_QUERY,  // 0x28  (
-    CHAR_QUERY,  // 0x29  )
-    CHAR_QUERY,  // 0x2a  *
-    CHAR_QUERY,  // 0x2b  +
-    CHAR_QUERY,  // 0x2c  ,
-    CHAR_QUERY,  // 0x2d  -
-    CHAR_QUERY | CHAR_IPV4,  // 0x2e  .
-    CHAR_QUERY,  // 0x2f  /
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x30  0
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x31  1
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x32  2
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x33  3
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x34  4
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x35  5
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x36  6
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x37  7
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x38  8
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x39  9
+    0,                           // 0x20  ' ' (escape spaces in queries)
+    CHAR_QUERY | CHAR_USERINFO,  // 0x21  !
+    0,                           // 0x22  "
+    0,                           // 0x23  #  (invalid in query since it marks the ref)
+    CHAR_QUERY | CHAR_USERINFO,  // 0x24  $
+    CHAR_QUERY | CHAR_USERINFO,  // 0x25  %
+    CHAR_QUERY | CHAR_USERINFO,  // 0x26  &
+    CHAR_QUERY | CHAR_USERINFO,  // 0x27  '
+    CHAR_QUERY | CHAR_USERINFO,  // 0x28  (
+    CHAR_QUERY | CHAR_USERINFO,  // 0x29  )
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2a  *
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2b  +
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2c  ,
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2d  -
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x2e  .
+    CHAR_QUERY,                              // 0x2f  /
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x30  0
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x31  1
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x32  2
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x33  3
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x34  4
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x35  5
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x36  6
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x37  7
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x38  8
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x39  9
     CHAR_QUERY,  // 0x3a  :
     CHAR_QUERY,  // 0x3b  ;
-    CHAR_QUERY,  // 0x3c  <
+    0,           // 0x3c  <  (Try to prevent certain types of XSS.)
     CHAR_QUERY,  // 0x3d  =
-    CHAR_QUERY,  // 0x3e  >
+    0,           // 0x3e  >  (Try to prevent certain types of XSS.)
     CHAR_QUERY,  // 0x3f  ?
     CHAR_QUERY,  // 0x40  @
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x41  A
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x42  B
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x43  C
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x44  D
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x45  E
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x46  F
-    CHAR_QUERY,  // 0x47  G
-    CHAR_QUERY,  // 0x48  H
-    CHAR_QUERY,  // 0x49  I
-    CHAR_QUERY,  // 0x4a  J
-    CHAR_QUERY,  // 0x4b  K
-    CHAR_QUERY,  // 0x4c  L
-    CHAR_QUERY,  // 0x4d  M
-    CHAR_QUERY,  // 0x4e  N
-    CHAR_QUERY,  // 0x4f  O
-    CHAR_QUERY,  // 0x50  P
-    CHAR_QUERY,  // 0x51  Q
-    CHAR_QUERY,  // 0x52  R
-    CHAR_QUERY,  // 0x53  S
-    CHAR_QUERY,  // 0x54  T
-    CHAR_QUERY,  // 0x55  U
-    CHAR_QUERY,  // 0x56  V
-    CHAR_QUERY,  // 0x57  W
-    CHAR_QUERY | CHAR_IPV4, // 0x58  X
-    CHAR_QUERY,  // 0x59  Y
-    CHAR_QUERY,  // 0x5a  Z
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x41  A
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x42  B
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x43  C
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x44  D
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x45  E
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x46  F
+    CHAR_QUERY | CHAR_USERINFO,  // 0x47  G
+    CHAR_QUERY | CHAR_USERINFO,  // 0x48  H
+    CHAR_QUERY | CHAR_USERINFO,  // 0x49  I
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4a  J
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4b  K
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4c  L
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4d  M
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4e  N
+    CHAR_QUERY | CHAR_USERINFO,  // 0x4f  O
+    CHAR_QUERY | CHAR_USERINFO,  // 0x50  P
+    CHAR_QUERY | CHAR_USERINFO,  // 0x51  Q
+    CHAR_QUERY | CHAR_USERINFO,  // 0x52  R
+    CHAR_QUERY | CHAR_USERINFO,  // 0x53  S
+    CHAR_QUERY | CHAR_USERINFO,  // 0x54  T
+    CHAR_QUERY | CHAR_USERINFO,  // 0x55  U
+    CHAR_QUERY | CHAR_USERINFO,  // 0x56  V
+    CHAR_QUERY | CHAR_USERINFO,  // 0x57  W
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58  X
+    CHAR_QUERY | CHAR_USERINFO,  // 0x59  Y
+    CHAR_QUERY | CHAR_USERINFO,  // 0x5a  Z
     CHAR_QUERY,  // 0x5b  [
     CHAR_QUERY,  // 0x5c  '\'
     CHAR_QUERY,  // 0x5d  ]
     CHAR_QUERY,  // 0x5e  ^
-    CHAR_QUERY,  // 0x5f  _
+    CHAR_QUERY | CHAR_USERINFO,  // 0x5f  _
     CHAR_QUERY,  // 0x60  `
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x61  a
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x62  b
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x63  c
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x64  d
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x65  e
-    CHAR_QUERY | CHAR_IPV4 | CHAR_HEX,  // 0x66  f
-    CHAR_QUERY,  // 0x67  g
-    CHAR_QUERY,  // 0x68  h
-    CHAR_QUERY,  // 0x69  i
-    CHAR_QUERY,  // 0x6a  j
-    CHAR_QUERY,  // 0x6b  k
-    CHAR_QUERY,  // 0x6c  l
-    CHAR_QUERY,  // 0x6d  m
-    CHAR_QUERY,  // 0x6e  n
-    CHAR_QUERY,  // 0x6f  o
-    CHAR_QUERY,  // 0x70  p
-    CHAR_QUERY,  // 0x71  q
-    CHAR_QUERY,  // 0x72  r
-    CHAR_QUERY,  // 0x73  s
-    CHAR_QUERY,  // 0x74  t
-    CHAR_QUERY,  // 0x75  u
-    CHAR_QUERY,  // 0x76  v
-    CHAR_QUERY,  // 0x77  w
-    CHAR_QUERY | CHAR_IPV4,  // 0x78  x
-    CHAR_QUERY,  // 0x79  y
-    CHAR_QUERY,  // 0x7a  z
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x61  a
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x62  b
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x63  c
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x64  d
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x65  e
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x66  f
+    CHAR_QUERY | CHAR_USERINFO,  // 0x67  g
+    CHAR_QUERY | CHAR_USERINFO,  // 0x68  h
+    CHAR_QUERY | CHAR_USERINFO,  // 0x69  i
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6a  j
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6b  k
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6c  l
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6d  m
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6e  n
+    CHAR_QUERY | CHAR_USERINFO,  // 0x6f  o
+    CHAR_QUERY | CHAR_USERINFO,  // 0x70  p
+    CHAR_QUERY | CHAR_USERINFO,  // 0x71  q
+    CHAR_QUERY | CHAR_USERINFO,  // 0x72  r
+    CHAR_QUERY | CHAR_USERINFO,  // 0x73  s
+    CHAR_QUERY | CHAR_USERINFO,  // 0x74  t
+    CHAR_QUERY | CHAR_USERINFO,  // 0x75  u
+    CHAR_QUERY | CHAR_USERINFO,  // 0x76  v
+    CHAR_QUERY | CHAR_USERINFO,  // 0x77  w
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x78  x
+    CHAR_QUERY | CHAR_USERINFO,  // 0x79  y
+    CHAR_QUERY | CHAR_USERINFO,  // 0x7a  z
     CHAR_QUERY,  // 0x7b  {
     CHAR_QUERY,  // 0x7c  |
     CHAR_QUERY,  // 0x7d  }
-    CHAR_QUERY,  // 0x7e  ~
+    CHAR_QUERY | CHAR_USERINFO,  // 0x7e  ~
     0,           // 0x7f
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8f
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9f
@@ -242,16 +255,18 @@
     0,         // 0xE0 - 0xFF
 };
 
-const UTF16Char kUnicodeReplacementCharacter = 0xfffd;
+const char16 kUnicodeReplacementCharacter = 0xfffd;
 
-bool CanonicalizeEscaped(const char* spec, int* begin, int end,
-                         CanonOutput* output) {
-  return DoCanonicalizeEscaped<char, unsigned char>(spec, begin, end, output);
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<char, unsigned char>(source, length, type, output);
 }
 
-bool CanonicalizeEscaped(const UTF16Char* spec, int* begin, int end,
-                         CanonOutput* output) {
-  return DoCanonicalizeEscaped<UTF16Char, UTF16Char>(spec, begin, end, output);
+void AppendStringOfType(const char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<char16, char16>(source, length, type, output);
 }
 
 void AppendInvalidNarrowString(const char* spec, int begin, int end,
@@ -259,12 +274,12 @@
   DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
 }
 
-void AppendInvalidNarrowString(const UTF16Char* spec, int begin, int end,
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
                                CanonOutput* output) {
-  DoAppendInvalidNarrowString<UTF16Char, UTF16Char>(spec, begin, end, output);
+  DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output);
 }
 
-bool ConvertUTF16ToUTF8(const UTF16Char* input, int input_len,
+bool ConvertUTF16ToUTF8(const char16* input, int input_len,
                         CanonOutput* output) {
   bool success = true;
   for (int i = 0; i < input_len; i++) {
@@ -276,7 +291,7 @@
 }
 
 bool ConvertUTF8ToUTF16(const char* input, int input_len,
-                        CanonOutputT<UTF16Char>* output) {
+                        CanonOutputT<char16>* output) {
   bool success = true;
   for (int i = 0; i < input_len; i++) {
     unsigned code_point;
@@ -318,63 +333,76 @@
 }
 
 bool SetupUTF16OverrideComponents(const char* base,
-                                  const Replacements<UTF16Char>& repl,
+                                  const Replacements<char16>& repl,
                                   CanonOutput* utf8_buffer,
                                   URLComponentSource<char>* source,
                                   url_parse::Parsed* parsed) {
   bool success = true;
 
   // Get the source and parsed structures of the things we are replacing.
-  const URLComponentSource<UTF16Char>& repl_source = repl.sources();
+  const URLComponentSource<char16>& repl_source = repl.sources();
   const url_parse::Parsed& repl_parsed = repl.components();
 
-  success &= DoUTF16OverrideComponent(repl_source.scheme, repl_parsed.scheme,
-                                      utf8_buffer,
-                                      &source->scheme, &parsed->scheme);
-  success &= DoUTF16OverrideComponent(repl_source.username,
-                                      repl_parsed.username, utf8_buffer,
-                                      &source->username, &parsed->username);
-  success &= DoUTF16OverrideComponent(repl_source.password,
-                                      repl_parsed.password, utf8_buffer,
-                                      &source->password, &parsed->password);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.scheme, repl_parsed.scheme,
+      utf8_buffer, &parsed->scheme);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.username, repl_parsed.username,
+      utf8_buffer, &parsed->username);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.password, repl_parsed.password,
+      utf8_buffer, &parsed->password);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.host, repl_parsed.host,
+      utf8_buffer, &parsed->host);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.port, repl_parsed.port,
+      utf8_buffer, &parsed->port);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.path, repl_parsed.path,
+      utf8_buffer, &parsed->path);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.query, repl_parsed.query,
+      utf8_buffer, &parsed->query);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.ref, repl_parsed.ref,
+      utf8_buffer, &parsed->ref);
 
-  // Our host should be empty if not present, so override the default setup.
-  success &= DoUTF16OverrideComponent(repl_source.host, repl_parsed.host,
-                                      utf8_buffer,
-                                      &source->host, &parsed->host);
-  if (parsed->host.len == -1)
-    parsed->host.len = 0;
+  // PrepareUTF16OverrideComponent will not have set the data pointer since the
+  // buffer could be resized, invalidating the pointers. We set the data
+  // pointers for affected components now that the buffer is finalized.
+  if (repl_source.scheme)   source->scheme = utf8_buffer->data();
+  if (repl_source.username) source->username = utf8_buffer->data();
+  if (repl_source.password) source->password = utf8_buffer->data();
+  if (repl_source.host)     source->host = utf8_buffer->data();
+  if (repl_source.port)     source->port = utf8_buffer->data();
+  if (repl_source.path)     source->path = utf8_buffer->data();
+  if (repl_source.query)    source->query = utf8_buffer->data();
+  if (repl_source.ref)      source->ref = utf8_buffer->data();
 
-  success &= DoUTF16OverrideComponent(repl_source.port, repl_parsed.port,
-                                      utf8_buffer,
-                                      &source->port, &parsed->port);
-  success &= DoUTF16OverrideComponent(repl_source.path, repl_parsed.path,
-                                      utf8_buffer,
-                                      &source->path, &parsed->path);
-  success &= DoUTF16OverrideComponent(repl_source.query, repl_parsed.query,
-                                      utf8_buffer,
-                                      &source->query, &parsed->query);
-  success &= DoUTF16OverrideComponent(repl_source.ref, repl_parsed.ref,
-                                      utf8_buffer,
-                                      &source->ref, &parsed->ref);
   return success;
 }
 
 #ifndef WIN32
 
 int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
-  if (radix != 10)
+  const char* format_str;
+  if (radix == 10)
+    format_str = "%d";
+  else if (radix == 16)
+    format_str = "%x";
+  else
     return EINVAL;
 
-  int written = snprintf(buffer, size_in_chars, "%d", value);
-  if (written >= size_in_chars) {
-    // Output was truncated
+  int written = snprintf(buffer, size_in_chars, format_str, value);
+  if (static_cast<size_t>(written) >= size_in_chars) {
+    // Output was truncated, or written was negative.
     return EINVAL;
   }
   return 0;
 }
 
-int _itow_s(int value, UTF16Char* buffer, size_t size_in_chars, int radix) {
+int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
   if (radix != 10)
     return EINVAL;
 
@@ -382,13 +410,13 @@
   // Add an extra byte for the terminating null.
   char temp[13];
   int written = snprintf(temp, sizeof(temp), "%d", value);
-  if (written >= size_in_chars) {
-    // Output was truncated
+  if (static_cast<size_t>(written) >= size_in_chars) {
+    // Output was truncated, or written was negative.
     return EINVAL;
   }
 
   for (int i = 0; i < written; ++i) {
-    buffer[i] = static_cast<UTF16Char>(temp[i]);
+    buffer[i] = static_cast<char16>(temp[i]);
   }
   buffer[written] = '\0';
   return 0;

diff --git a/googleurl/src/url_canon_internal.h b/googleurl/src/url_canon_internal.h
index b73eb92..4b1e45a 100644
--- a/googleurl/src/url_canon_internal.h
+++ b/googleurl/src/url_canon_internal.h

@@ -36,12 +36,13 @@
 #define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
 
 #include <stdlib.h>
-#include <unicode/utf.h>
 
 #include "googleurl/src/url_canon.h"
 
 namespace url_canon {
 
+// Character type handling -----------------------------------------------------
+
 // Bits that identify different character types. These types identify different
 // bits that are set for each 8-bit character in the kSharedCharTypeTable.
 enum SharedCharTypes {
@@ -49,17 +50,20 @@
   // not have this flag will be escaped, see url_canon_query.cc
   CHAR_QUERY = 1,
 
+  // Valid in the username/password field.
+  CHAR_USERINFO = 2,
+
   // Valid in a IPv4 address (digits plus dot and 'x' for hex).
-  CHAR_IPV4 = 2,
+  CHAR_IPV4 = 4,
 
   // Valid in an ASCII-representation of a hex digit (as in %-escaped).
-  CHAR_HEX = 4,
+  CHAR_HEX = 8,
 
   // Valid in an ASCII-representation of a decimal digit.
-  CHAR_DEC = 8,
+  CHAR_DEC = 16,
 
   // Valid in an ASCII-representation of an octal digit.
-  CHAR_OCT = 16,
+  CHAR_OCT = 32,
 };
 
 // This table contains the flags in SharedCharTypes for each 8-bit character.
@@ -85,6 +89,15 @@
   return IsCharOfType(c, CHAR_HEX);
 }
 
+// Appends the given string to the output, escaping characters that do not
+// match the given |type| in SharedCharTypes.
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output);
+void AppendStringOfType(const char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output);
+
 // Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit
 // that will be used to represent it.
 extern const char kHexCharLookup[0x10];
@@ -125,19 +138,22 @@
 // required for relative URL resolving to test for scheme equality.
 //
 // Returns 0 if the input character is not a valid scheme character.
-char CanonicalSchemeChar(UTF16Char ch);
+char CanonicalSchemeChar(char16 ch);
 
 // Write a single character, escaped, to the output. This always escapes: it
 // does no checking that thee character requires escaping.
-inline void AppendEscapedChar(unsigned char ch,
-                              CanonOutput* output) {
+// Escaping only makes sense for 8-bit chars, so this code works for all cases
+// of input parameters (8/16 bit).
+template<typename UINCHAR, typename OUTCHAR>
+inline void AppendEscapedChar(UINCHAR ch,
+                              CanonOutputT<OUTCHAR>* output) {
   output->push_back('%');
   output->push_back(kHexCharLookup[ch >> 4]);
   output->push_back(kHexCharLookup[ch & 0xf]);
 }
 
 // The character we'll substitute for undecodable or invalid characters.
-extern const UTF16Char kUnicodeReplacementCharacter;
+extern const char16 kUnicodeReplacementCharacter;
 
 // UTF-8 functions ------------------------------------------------------------
 
@@ -149,20 +165,10 @@
 // |*begin| will be updated to point to the last character consumed so it
 // can be incremented in a loop and will be ready for the next character.
 // (for a single-byte ASCII character, it will not be changed).
-inline bool ReadUTFChar(const char* str, int* begin, int length,
-                        unsigned* code_point) {
-  U8_NEXT(str, *begin, length, *code_point);
-
-  // The ICU macro above moves to the next char, we want to point to the last
-  // char consumed.
-  (*begin)--;
-
-  // Validate the decoded value.
-  if (U_IS_UNICODE_CHAR(*code_point))
-    return true;
-  *code_point = kUnicodeReplacementCharacter;
-  return false;
-}
+//
+// Implementation is in url_canon_icu.cc.
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out);
 
 // Generic To-UTF-8 converter. This will call the given append method for each
 // character that should be appended, with the given output method. Wrappers
@@ -244,40 +250,19 @@
 // |*begin| will be updated to point to the last character consumed so it
 // can be incremented in a loop and will be ready for the next character.
 // (for a single-16-bit-word character, it will not be changed).
-inline bool ReadUTFChar(const UTF16Char* str, int* begin, int length,
-                        unsigned* code_point) {
-  if (U16_IS_SURROGATE(str[*begin])) {
-    if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
-        !U16_IS_TRAIL(str[*begin + 1])) {
-      // Invalid surrogate pair.
-      *code_point = kUnicodeReplacementCharacter;
-      return false;
-    } else {
-      // Valid surrogate pair.
-      *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
-      (*begin)++;
-    }
-  } else {
-    // Not a surrogate, just one 16-bit word.
-    *code_point = str[*begin];
-  }
-
-  if (U_IS_UNICODE_CHAR(*code_point))
-    return true;
-
-  // Invalid code point.
-  *code_point = kUnicodeReplacementCharacter;
-  return false;
-}
+//
+// Implementation is in url_canon_icu.cc.
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point);
 
 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
 inline void AppendUTF16Value(unsigned code_point,
-                             CanonOutputT<UTF16Char>* output) {
+                             CanonOutputT<char16>* output) {
   if (code_point > 0xffff) {
-    output->push_back(static_cast<UTF16Char>((code_point >> 10) + 0xd7c0));
-    output->push_back(static_cast<UTF16Char>((code_point & 0x3ff) | 0xdc00));
+    output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0));
+    output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00));
   } else {
-    output->push_back(static_cast<UTF16Char>(code_point));
+    output->push_back(static_cast<char16>(code_point));
   }
 }
 
@@ -302,9 +287,9 @@
 //
 // Assumes that ch[begin] is within range in the array, but does not assume
 // that any following characters are.
-inline bool AppendUTF8EscapedChar(const UTF16Char* str, int* begin, int length,
+inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length,
                                   CanonOutput* output) {
-  // UTF-16 input. ReadUTF16Char will handle invalid characters for us and give
+  // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
   // us the kUnicodeReplacementCharacter, so we don't have to do special
   // checking after failure, just pass through the failure to the caller.
   unsigned char_value;
@@ -337,13 +322,13 @@
 inline bool Is8BitChar(char c) {
   return true;  // this case is specialized to avoid a warning
 }
-inline bool Is8BitChar(UTF16Char c) {
+inline bool Is8BitChar(char16 c) {
   return c <= 255;
 }
 
 template<typename CHAR>
 inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
-                          char* unescaped_value) {
+                          unsigned char* unescaped_value) {
   if (*begin + 3 > end ||
       !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
     // Invalid escape sequence because there's not enough room, or the
@@ -364,29 +349,6 @@
   return true;
 }
 
-// Given a '%' character at |*begin| in the string |spec|, this will copy
-// the appropriate characters to the output for it to be in canonical form.
-//
-// |*begin| will be updated to point to the last character of the escape
-// sequence so that when called with the index of a for loop, the next time
-// through it will point to the next character to be considered.
-//
-// On failure (return false) this will just do a literal copy of the percent
-// sign and, if possible, the following two characters, which would be
-// otherwise interpreted as the values. If the characters are wide, we will
-// stop copying since they we don't know the proper conversion rules.
-//
-// On failure, the caller should NOT accept the URL as valid, as it could be
-// a security problem. Imagine if the caller could be coaxed to produce a
-// valid escape sequence out of characters this function does not consider
-// to be vaild (maybe the caller is canonicalizing fullwidth to ASCII).
-// A URL could be constructed that the canonicalizer would treat differently
-// than the server, which could potentially be bad.
-bool CanonicalizeEscaped(const char* spec, int* begin, int end,
-                         CanonOutput* output);
-bool CanonicalizeEscaped(const UTF16Char* spec, int* begin, int end,
-                         CanonOutput* output);
-
 // Appends the given substring to the output, escaping "some" characters that
 // it feels may not be safe. It assumes the input values are all contained in
 // 8-bit although it allows any type.
@@ -396,7 +358,7 @@
 // the escaping rules are not guaranteed!
 void AppendInvalidNarrowString(const char* spec, int begin, int end,
                                CanonOutput* output);
-void AppendInvalidNarrowString(const UTF16Char* spec, int begin, int end,
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
                                CanonOutput* output);
 
 // Misc canonicalization helpers ----------------------------------------------
@@ -409,14 +371,14 @@
 // replacing the invalid characters with the "invalid character". It will
 // return false in the failure case, and the caller should not continue as
 // normal.
-bool ConvertUTF16ToUTF8(const UTF16Char* input, int input_len,
+bool ConvertUTF16ToUTF8(const char16* input, int input_len,
                         CanonOutput* output);
 bool ConvertUTF8ToUTF16(const char* input, int input_len,
-                        CanonOutputT<UTF16Char>* output);
+                        CanonOutputT<char16>* output);
 
 // Converts from UTF-16 to 8-bit using the character set converter. If the
 // converter is NULL, this will use UTF-8.
-void ConvertUTF16ToQueryEncoding(const UTF16Char* input,
+void ConvertUTF16ToQueryEncoding(const char16* input,
                                  const url_parse::Component& query,
                                  CharsetConverter* converter,
                                  CanonOutput* output);
@@ -444,11 +406,15 @@
 // no storage, so the buffer must have the same lifetime as the source
 // parameter owned by the caller.
 //
+// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
+// |source| will point into this buffer, which could be invalidated if
+// additional data is added and the CanonOutput resizes its buffer.
+//
 // Returns true on success. Fales means that the input was not valid UTF-16,
 // although we will have still done the override with "invalid characters" in
 // place of errors.
 bool SetupUTF16OverrideComponents(const char* base,
-                                  const Replacements<UTF16Char>& repl,
+                                  const Replacements<char16>& repl,
                                   CanonOutput* utf8_buffer,
                                   URLComponentSource<char>* source,
                                   url_parse::Parsed* parsed);
@@ -459,7 +425,7 @@
                              const url_parse::Component& path,
                              int path_begin_in_output,
                              CanonOutput* output);
-bool CanonicalizePartialPath(const UTF16Char* spec,
+bool CanonicalizePartialPath(const char16* spec,
                              const url_parse::Component& path,
                              int path_begin_in_output,
                              CanonOutput* output);
@@ -468,7 +434,7 @@
 
 // Implementations of Windows' int-to-string conversions
 int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
-int _itow_s(int value, UTF16Char* buffer, size_t size_in_chars, int radix);
+int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix);
 
 // Secure template overloads for these functions
 template<size_t N>
@@ -477,7 +443,7 @@
 }
 
 template<size_t N>
-inline int _itow_s(int value, UTF16Char (&buffer)[N], int radix) {
+inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
   return _itow_s(value, buffer, N, radix);
 }
 

diff --git a/googleurl/src/url_canon_internal_file.h b/googleurl/src/url_canon_internal_file.h
index 387f29b..63a9c5b 100644
--- a/googleurl/src/url_canon_internal_file.h
+++ b/googleurl/src/url_canon_internal_file.h

@@ -68,7 +68,7 @@
     output->push_back(spec[after_slashes] - 'a' + 'A');
   else
     output->push_back(static_cast<char>(spec[after_slashes]));
-  
+
   // Normalize the character following it to a colon rather than pipe.
   output->push_back(':');
   output->push_back('/');
@@ -131,7 +131,7 @@
   // for regular IP hosts.
   bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
       source.host, parsed.host, output, &new_parsed->host);
-  
+
   // Write a separator for the start of the path. We'll ignore any slashes
   // already at the beginning of the path.
   new_parsed->path.begin = output->length();

diff --git a/googleurl/src/url_canon_ip.cc b/googleurl/src/url_canon_ip.cc
index c61be7b..86f7c9c 100644
--- a/googleurl/src/url_canon_ip.cc
+++ b/googleurl/src/url_canon_ip.cc

@@ -1,4 +1,4 @@
-// Copyright 2007, Google Inc.
+// Copyright 2009, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -27,8 +27,12 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include "googleurl/src/url_canon_ip.h"
+
 #include <stdlib.h>
 
+#include "base/basictypes.h"
+#include "base/logging.h"
 #include "googleurl/src/url_canon_internal.h"
 
 namespace url_canon {
@@ -50,33 +54,18 @@
   }
 }
 
-// Searches the host name for the portions of the IPv4 address. On success,
-// each component will be placed into |components| and it will return true.
-// It will return false if the host can not be separated as an IPv4 address
-// or if there are any non-7-bit characters or other characters that can not
-// be in an IP address. (This is important so we fail as early as possible for
-// common non-IP hostnames.)
-//
-// Not all components may exist. If there are only 3 components, for example,
-// the last one will have a length of -1 or 0 to indicate it does not exist.
-//
-// Note that many platform's inet_addr will ignore everything after a space
-// in certain curcumstances if the stuff before the space looks like an IP
-// address. IE6 is included in this. We do NOT handle this case. In many cases,
-// the browser's canonicalization will get run before this which converts
-// spaces to %20 (in the case of IE7) or rejects them (in the case of
-// Mozilla), so this code path never gets hit. Our host canonicalization will
-// notice these spaces and escape them, which will make IP address finding
-// fail. This seems like better behavior than stripping after a space.
 template<typename CHAR, typename UCHAR>
-bool FindIPv4Components(const CHAR* spec,
-                        const url_parse::Component& host,
-                        url_parse::Component components[4]) {
+bool DoFindIPv4Components(const CHAR* spec,
+                          const url_parse::Component& host,
+                          url_parse::Component components[4]) {
+  if (!host.is_nonempty())
+    return false;
+
   int cur_component = 0;  // Index of the component we're working on.
   int cur_component_begin = host.begin;  // Start of the current component.
   int end = host.end();
   for (int i = host.begin; /* nothing */; i++) {
-    if (i == end || spec[i] == '.') {
+    if (i >= end || spec[i] == '.') {
       // Found the end of the current component.
       int component_len = i - cur_component_begin;
       components[cur_component] =
@@ -90,10 +79,10 @@
       // allow an empty component at the end (this would indicate that the
       // input ends in a dot). We also want to error if the component is
       // empty and it's the only component (cur_component == 1).
-      if (component_len == 0 && (i != end || cur_component == 1))
+      if (component_len == 0 && (i < end || cur_component == 1))
         return false;
 
-      if (i == end)
+      if (i >= end)
         break;  // End of the input.
 
       if (cur_component == 4) {
@@ -101,11 +90,11 @@
         // dot that would otherwise be treated as the end of input.
         if (spec[i] == '.' && i + 1 == end)
           break;
-        return false;  
+        return false;
       }
     } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
                !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
-      // Invalid character for an IP address.
+      // Invalid character for an IPv4 address.
       return false;
     }
   }
@@ -116,17 +105,21 @@
   return true;
 }
 
-// Converts an IPv4 component to a 32-bit number, returning true on success.
-// False means that the number is invalid and that the input can not be an
-// IP address. The number will be truncated to 32 bits.
+// Converts an IPv4 component to a 32-bit number, while checking for overflow.
+//
+// Possible return values:
+// - IPV4    - The number was valid, and did not overflow.
+// - BROKEN  - The input was numeric, but too large for a 32-bit field.
+// - NEUTRAL - Input was not numeric.
 //
 // The input is assumed to be ASCII. FindIPv4Components should have stripped
 // out any input that is greater than 7 bits. The components are assumed
 // to be non-empty.
 template<typename CHAR>
-bool IPv4ComponentToNumber(const CHAR* spec,
-                           const url_parse::Component& component,
-                           uint32_t* number) {
+CanonHostInfo::Family IPv4ComponentToNumber(
+    const CHAR* spec,
+    const url_parse::Component& component,
+    uint32* number) {
   // Figure out the base
   SharedCharTypes base;
   int base_prefix_len = 0;  // Size of the prefix for this base.
@@ -146,33 +139,46 @@
     base = CHAR_DEC;
   }
 
-  // Reject any components that are too long. This is generous, Windows
-  // allows at most 16 characters for the entire host name, and 12 per
-  // component, while Mac and Linux will take up to 10 per component.
-  const int kMaxComponentLen = 16;
-  if (component.len - base_prefix_len > kMaxComponentLen)
-    return false;
+  // Extend the prefix to consume all leading zeros.
+  while (base_prefix_len < component.len &&
+         spec[component.begin + base_prefix_len] == '0')
+    base_prefix_len++;
 
-  // Put the component, minus any base prefix, to a NULL-terminated buffer so
-  // we can call the standard library. We know the input is 7-bit, so convert
-  // to narrow (if this is the wide version of the template) by casting.
-  char buf[kMaxComponentLen + 1];
+  // Put the component, minus any base prefix, into a NULL-terminated buffer so
+  // we can call the standard library.  Because leading zeros have already been
+  // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
+  // overflow check.
+  const int kMaxComponentLen = 16;
+  char buf[kMaxComponentLen + 1];  // digits + '\0'
   int dest_i = 0;
-  for (int i = base_prefix_len; i < component.len; i++, dest_i++) {
-    char input = static_cast<char>(spec[component.begin + i]);
+  for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
+    // We know the input is 7-bit, so convert to narrow (if this is the wide
+    // version of the template) by casting.
+    char input = static_cast<char>(spec[i]);
 
     // Validate that this character is OK for the given base.
     if (!IsCharOfType(input, base))
-      return false;
-    buf[dest_i] = input;
+      return CanonHostInfo::NEUTRAL;
+
+    // Fill the buffer, if there's space remaining.  This check allows us to
+    // verify that all characters are numeric, even those that don't fit.
+    if (dest_i < kMaxComponentLen)
+      buf[dest_i++] = input;
   }
-  buf[dest_i] = 0;
+
+  buf[dest_i] = '\0';
 
   // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
-  // number can overflow a 64-bit number in <= 16 characters). Then cast to
-  // truncate down to a 32-bit number. This may be further truncated later.
-  *number = static_cast<uint32_t>(_strtoui64(buf, NULL, BaseForType(base)));
-  return true;
+  // number can overflow a 64-bit number in <= 16 characters).
+  uint64 num = _strtoui64(buf, NULL, BaseForType(base));
+
+  // Check for 32-bit overflow.
+  if (num > kuint32max)
+    return CanonHostInfo::BROKEN;
+
+  // No overflow.  Success!
+  *number = static_cast<uint32>(num);
+  return CanonHostInfo::IPV4;
 }
 
 // Writes the given address (with each character representing one dotted
@@ -195,67 +201,537 @@
   out_host->len = output->length() - out_host->begin;
 }
 
-template<typename CHAR, typename UCHAR>
-bool DoCanonicalizeIPv4Address(const CHAR* spec,
-                               const url_parse::Component& host,
-                               CanonOutput* output,
-                               url_parse::Component* out_host) {
+// See declaration of IPv4AddressToNumber for documentation.
+template<typename CHAR>
+CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
+                                            const url_parse::Component& host,
+                                            unsigned char address[4],
+                                            int* num_ipv4_components) {
   // The identified components. Not all may exist.
   url_parse::Component components[4];
-  if (!FindIPv4Components<CHAR, UCHAR>(spec, host, components))
-    return false;
+  if (!FindIPv4Components(spec, host, components))
+    return CanonHostInfo::NEUTRAL;
 
   // Convert existing components to digits. Values up to
   // |existing_components| will be valid.
-  uint32_t component_values[4];
+  uint32 component_values[4];
   int existing_components = 0;
   for (int i = 0; i < 4; i++) {
     if (components[i].len <= 0)
       continue;
-    if (!IPv4ComponentToNumber(spec, components[i],
-                               &component_values[existing_components]))
-      return false;
+    CanonHostInfo::Family family = IPv4ComponentToNumber(
+        spec, components[i], &component_values[existing_components]);
+
+    // Stop if we hit an invalid non-empty component.
+    if (family != CanonHostInfo::IPV4)
+      return family;
+
     existing_components++;
   }
 
   // Use that sequence of numbers to fill out the 4-component IP address.
-  unsigned char address[4];
 
-  // ...first fill all but the last component by truncating to one byte.
-  for (int i = 0; i < existing_components - 1; i++)
+  // First, process all components but the last, while making sure each fits
+  // within an 8-bit field.
+  for (int i = 0; i < existing_components - 1; i++) {
+    if (component_values[i] > kuint8max)
+      return CanonHostInfo::BROKEN;
     address[i] = static_cast<unsigned char>(component_values[i]);
+  }
 
-  // ...then fill out the rest of the bytes by filling them with the last
-  // component.
-  uint32_t last_value = component_values[existing_components - 1];
-  if (existing_components == 1)
-    address[0] = (last_value & 0xFF000000) >> 24;
-  if (existing_components <= 2)
-    address[1] = (last_value & 0x00FF0000) >> 16;
-  if (existing_components <= 3)
-    address[2] = (last_value & 0x0000FF00) >> 8;
-  address[3] = last_value & 0xFF;
+  // Next, consume the last component to fill in the remaining bytes.
+  uint32 last_value = component_values[existing_components - 1];
+  for (int i = 3; i >= existing_components - 1; i--) {
+    address[i] = static_cast<unsigned char>(last_value);
+    last_value >>= 8;
+  }
 
-  AppendIPv4Address(address, output, out_host);
+  // If the last component has residual bits, report overflow.
+  if (last_value != 0)
+    return CanonHostInfo::BROKEN;
+
+  // Tell the caller how many components we saw.
+  *num_ipv4_components = existing_components;
+
+  // Success!
+  return CanonHostInfo::IPV4;
+}
+
+// Return true if we've made a final IPV4/BROKEN decision, false if the result
+// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv4Address(const CHAR* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  unsigned char address[4];
+  host_info->family = IPv4AddressToNumber(
+      spec, host, address, &host_info->num_ipv4_components);
+
+  switch (host_info->family) {
+    case CanonHostInfo::IPV4:
+      // Definitely an IPv4 address.
+      AppendIPv4Address(address, output, &host_info->out_host);
+      return true;
+    case CanonHostInfo::BROKEN:
+      // Definitely broken.
+      return true;
+    default:
+      // Could be IPv6 or a hostname.
+      return false;
+  }
+}
+
+// Helper class that describes the main components of an IPv6 input string.
+// See the following examples to understand how it breaks up an input string:
+//
+// [Example 1]: input = "[::aa:bb]"
+//  ==> num_hex_components = 2
+//  ==> hex_components[0] = Component(3,2) "aa"
+//  ==> hex_components[1] = Component(6,2) "bb"
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 2]: input = "[1:2::3:4:5]"
+//  ==> num_hex_components = 5
+//  ==> hex_components[0] = Component(1,1) "1"
+//  ==> hex_components[1] = Component(3,1) "2"
+//  ==> hex_components[2] = Component(6,1) "3"
+//  ==> hex_components[3] = Component(8,1) "4"
+//  ==> hex_components[4] = Component(10,1) "5"
+//  ==> index_of_contraction = 2
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 3]: input = "[::ffff:192.168.0.1]"
+//  ==> num_hex_components = 1
+//  ==> hex_components[0] = Component(3,4) "ffff"
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(8, 11) "192.168.0.1"
+//
+// [Example 4]: input = "[1::]"
+//  ==> num_hex_components = 1
+//  ==> hex_components[0] = Component(1,1) "1"
+//  ==> index_of_contraction = 1
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 5]: input = "[::192.168.0.1]"
+//  ==> num_hex_components = 0
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(3, 11) "192.168.0.1"
+//
+struct IPv6Parsed {
+  // Zero-out the parse information.
+  void reset() {
+    num_hex_components = 0;
+    index_of_contraction = -1;
+    ipv4_component.reset();
+  }
+
+  // There can be up to 8 hex components (colon separated) in the literal.
+  url_parse::Component hex_components[8];
+
+  // The count of hex components present. Ranges from [0,8].
+  int num_hex_components;
+
+  // The index of the hex component that the "::" contraction precedes, or
+  // -1 if there is no contraction.
+  int index_of_contraction;
+
+  // The range of characters which are an IPv4 literal.
+  url_parse::Component ipv4_component;
+};
+
+// Parse the IPv6 input string. If parsing succeeded returns true and fills
+// |parsed| with the information. If parsing failed (because the input is
+// invalid) returns false.
+template<typename CHAR, typename UCHAR>
+bool DoParseIPv6(const CHAR* spec,
+                 const url_parse::Component& host,
+                 IPv6Parsed* parsed) {
+  // Zero-out the info.
+  parsed->reset();
+
+  if (!host.is_nonempty())
+    return false;
+
+  // The index for start and end of address range (no brackets).
+  int begin = host.begin;
+  int end = host.end();
+
+  int cur_component_begin = begin;  // Start of the current component.
+
+  // Scan through the input, searching for hex components, "::" contractions,
+  // and IPv4 components.
+  for (int i = begin; /* i <= end */; i++) {
+    bool is_colon = spec[i] == ':';
+    bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
+
+    // We reached the end of the current component if we encounter a colon
+    // (separator between hex components, or start of a contraction), or end of
+    // input.
+    if (is_colon || i == end) {
+      int component_len = i - cur_component_begin;
+
+      // A component should not have more than 4 hex digits.
+      if (component_len > 4)
+        return false;
+
+      // Don't allow empty components.
+      if (component_len == 0) {
+        // The exception is when contractions appear at beginning of the
+        // input or at the end of the input.
+        if (!((is_contraction && i == begin) || (i == end &&
+            parsed->index_of_contraction == parsed->num_hex_components)))
+          return false;
+      }
+
+      // Add the hex component we just found to running list.
+      if (component_len > 0) {
+        // Can't have more than 8 components!
+        if (parsed->num_hex_components >= 8)
+          return false;
+
+        parsed->hex_components[parsed->num_hex_components++] =
+            url_parse::Component(cur_component_begin, component_len);
+      }
+    }
+
+    if (i == end)
+      break;  // Reached the end of the input, DONE.
+
+    // We found a "::" contraction.
+    if (is_contraction) {
+      // There can be at most one contraction in the literal.
+      if (parsed->index_of_contraction != -1)
+        return false;
+      parsed->index_of_contraction = parsed->num_hex_components;
+      ++i;  // Consume the colon we peeked.
+    }
+
+    if (is_colon) {
+      // Colons are separators between components, keep track of where the
+      // current component started (after this colon).
+      cur_component_begin = i + 1;
+    } else {
+      if (static_cast<UCHAR>(spec[i]) >= 0x80)
+        return false;  // Not ASCII.
+
+      if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
+        // Regular components are hex numbers. It is also possible for
+        // a component to be an IPv4 address in dotted form.
+        if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+          // Since IPv4 address can only appear at the end, assume the rest
+          // of the string is an IPv4 address. (We will parse this separately
+          // later).
+          parsed->ipv4_component = url_parse::Component(
+              cur_component_begin, end - cur_component_begin);
+          break;
+        } else {
+          // The character was neither a hex digit, nor an IPv4 character.
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+// Verifies the parsed IPv6 information, checking that the various components
+// add up to the right number of bits (hex components are 16 bits, while
+// embedded IPv4 formats are 32 bits, and contractions are placeholders for
+// 16 or more bits). Returns true if sizes match up, false otherwise. On
+// success writes the length of the contraction (if any) to
+// |out_num_bytes_of_contraction|.
+bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
+                             int* out_num_bytes_of_contraction) {
+  // Each group of four hex digits contributes 16 bits.
+  int num_bytes_without_contraction = parsed.num_hex_components * 2;
+
+  // If an IPv4 address was embedded at the end, it contributes 32 bits.
+  if (parsed.ipv4_component.is_valid())
+    num_bytes_without_contraction += 4;
+
+  // If there was a "::" contraction, its size is going to be:
+  // MAX([16bits], [128bits] - num_bytes_without_contraction).
+  int num_bytes_of_contraction = 0;
+  if (parsed.index_of_contraction != -1) {
+    num_bytes_of_contraction = 16 - num_bytes_without_contraction;
+    if (num_bytes_of_contraction < 2)
+      num_bytes_of_contraction = 2;
+  }
+
+  // Check that the numbers add up.
+  if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
+    return false;
+
+  *out_num_bytes_of_contraction = num_bytes_of_contraction;
+  return true;
+}
+
+// Converts a hex component into a number. This cannot fail since the caller has
+// already verified that each character in the string was a hex digit, and
+// that there were no more than 4 characters.
+template<typename CHAR>
+uint16 IPv6HexComponentToNumber(const CHAR* spec,
+                                const url_parse::Component& component) {
+  DCHECK(component.len <= 4);
+
+  // Copy the hex string into a C-string.
+  char buf[5];
+  for (int i = 0; i < component.len; ++i)
+    buf[i] = static_cast<char>(spec[component.begin + i]);
+  buf[component.len] = '\0';
+
+  // Convert it to a number (overflow is not possible, since with 4 hex
+  // characters we can at most have a 16 bit number).
+  return static_cast<uint16>(_strtoui64(buf, NULL, 16));
+}
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+template<typename CHAR, typename UCHAR>
+bool DoIPv6AddressToNumber(const CHAR* spec,
+                           const url_parse::Component& host,
+                           unsigned char address[16]) {
+  // Make sure the component is bounded by '[' and ']'.
+  int end = host.end();
+  if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
+    return false;
+
+  // Exclude the square brackets.
+  url_parse::Component ipv6_comp(host.begin + 1, host.len - 2);
+
+  // Parse the IPv6 address -- identify where all the colon separated hex
+  // components are, the "::" contraction, and the embedded IPv4 address.
+  IPv6Parsed ipv6_parsed;
+  if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
+    return false;
+
+  // Do some basic size checks to make sure that the address doesn't
+  // specify more than 128 bits or fewer than 128 bits. This also resolves
+  // how many zero bytes the "::" contraction represents.
+  int num_bytes_of_contraction;
+  if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
+    return false;
+
+  int cur_index_in_address = 0;
+
+  // Loop through each hex component, and the contraction, in order.
+  for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
+    // Append the contraction if it appears before this component.
+    if (i == ipv6_parsed.index_of_contraction) {
+      for (int j = 0; j < num_bytes_of_contraction; ++j)
+        address[cur_index_in_address++] = 0;
+    }
+    // Append the hex component's value.
+    if (i != ipv6_parsed.num_hex_components) {
+      // Get the 16-bit value for this hex component.
+      uint16 number = IPv6HexComponentToNumber<CHAR>(
+          spec, ipv6_parsed.hex_components[i]);
+      // Append to |address|, in network byte order.
+      address[cur_index_in_address++] = (number & 0xFF00) >> 8;
+      address[cur_index_in_address++] = (number & 0x00FF);
+    }
+  }
+
+  // If there was an IPv4 section, convert it into a 32-bit number and append
+  // it to |address|.
+  if (ipv6_parsed.ipv4_component.is_valid()) {
+    // We only allow the embedded IPv4 syntax to be used for "compat" and
+    // "mapped" formats:
+    //     "mapped" ==>  0:0:0:0:0:ffff:<IPv4-literal>
+    //     "compat" ==>  0:0:0:0:0:0000:<IPv4-literal>
+    for (int j = 0; j < 10; ++j) {
+      if (address[j] != 0)
+        return false;
+    }
+    if (!((address[10] == 0 && address[11] == 0) ||
+          (address[10] == 0xFF && address[11] == 0xFF)))
+      return false;
+
+    // Append the 32-bit number to |address|.
+    int ignored_num_ipv4_components;
+    if (CanonHostInfo::IPV4 !=
+        IPv4AddressToNumber(spec,
+                            ipv6_parsed.ipv4_component,
+                            &address[cur_index_in_address],
+                            &ignored_num_ipv4_components))
+      return false;
+  }
+
+  return true;
+}
+
+// Searches for the longest sequence of zeros in |address|, and writes the
+// range into |contraction_range|. The run of zeros must exceed 16 bits,
+// and if there is a tie the first is chosen.
+void ChooseIPv6ContractionRange(const unsigned char address[16],
+                                url_parse::Component* contraction_range) {
+  // The longest run of zeros in |address| seen so far.
+  url_parse::Component max_range;
+
+  // The current run of zeros in |address| being iterated over.
+  url_parse::Component cur_range;
+
+  for (int i = 0; i < 16; i += 2) {
+    // Test for 16 bits worth of zero.
+    bool is_zero = (address[i] == 0 && address[i + 1] == 0);
+
+    if (is_zero) {
+      // Add the zero to the current range (or start a new one).
+      if (!cur_range.is_valid())
+        cur_range = url_parse::Component(i, 0);
+      cur_range.len += 2;
+    }
+
+    if (!is_zero || i == 14) {
+      // Just completed a run of zeros. If the run is greater than 16 bits,
+      // it is a candidate for the contraction.
+      if (cur_range.len > 2 && cur_range.len > max_range.len) {
+        max_range = cur_range;
+      }
+      cur_range.reset();
+    }
+  }
+  *contraction_range = max_range;
+}
+
+// Return true if we've made a final IPV6/BROKEN decision, false if the result
+// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv6Address(const CHAR* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  // Turn the IP address into a 128 bit number.
+  unsigned char address[16];
+  if (!IPv6AddressToNumber(spec, host, address)) {
+    // If it's not an IPv6 address, scan for characters that should *only*
+    // exist in an IPv6 address.
+    for (int i = host.begin; i < host.end(); i++) {
+      switch (spec[i]) {
+        case '[':
+        case ']':
+        case ':':
+          host_info->family = CanonHostInfo::BROKEN;
+          return true;
+      }
+    }
+
+    // No invalid characters.  Could still be IPv4 or a hostname.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    return false;
+  }
+
+  host_info->out_host.begin = output->length();
+  output->push_back('[');
+
+  // We will now output the address according to the rules in:
+  // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
+
+  // Start by finding where to place the "::" contraction (if any).
+  url_parse::Component contraction_range;
+  ChooseIPv6ContractionRange(address, &contraction_range);
+
+  for (int i = 0; i <= 14;) {
+    // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive.
+    DCHECK(i % 2 == 0);
+    if (i == contraction_range.begin && contraction_range.len > 0) {
+      // Jump over the contraction.
+      if (i == 0)
+        output->push_back(':');
+      output->push_back(':');
+      i = contraction_range.end();
+    } else {
+      // Consume the next 16 bits from |address|.
+      int x = address[i] << 8 | address[i + 1];
+
+      i += 2;
+
+      // Stringify the 16 bit number (at most requires 4 hex digits).
+      char str[5];
+      _itoa_s(x, str, 16);
+      for (int ch = 0; str[ch] != 0; ++ch)
+        output->push_back(str[ch]);
+
+      // Put a colon after each number, except the last.
+      if (i < 16)
+        output->push_back(':');
+    }
+  }
+
+  output->push_back(']');
+  host_info->out_host.len = output->length() - host_info->out_host.begin;
+
+  host_info->family = CanonHostInfo::IPV6;
   return true;
 }
 
 }  // namespace
 
-bool CanonicalizeIPAddress(const char* spec,
-                           const url_parse::Component& host,
-                           CanonOutput* output,
-                           url_parse::Component* out_host) {
-  return DoCanonicalizeIPv4Address<char, unsigned char>(
-      spec, host, output, out_host);
+bool FindIPv4Components(const char* spec,
+                        const url_parse::Component& host,
+                        url_parse::Component components[4]) {
+  return DoFindIPv4Components<char, unsigned char>(spec, host, components);
 }
 
-bool CanonicalizeIPAddress(const UTF16Char* spec,
+bool FindIPv4Components(const char16* spec,
+                        const url_parse::Component& host,
+                        url_parse::Component components[4]) {
+  return DoFindIPv4Components<char16, char16>(spec, host, components);
+}
+
+void CanonicalizeIPAddress(const char* spec,
                            const url_parse::Component& host,
                            CanonOutput* output,
-                           url_parse::Component* out_host) {
-  return DoCanonicalizeIPv4Address<UTF16Char, UTF16Char>(
-      spec, host, output, out_host);
+                           CanonHostInfo* host_info) {
+  if (DoCanonicalizeIPv4Address<char, unsigned char>(
+          spec, host, output, host_info))
+    return;
+  if (DoCanonicalizeIPv6Address<char, unsigned char>(
+          spec, host, output, host_info))
+    return;
 }
 
+void CanonicalizeIPAddress(const char16* spec,
+                           const url_parse::Component& host,
+                           CanonOutput* output,
+                           CanonHostInfo* host_info) {
+  if (DoCanonicalizeIPv4Address<char16, char16>(
+          spec, host, output, host_info))
+    return;
+  if (DoCanonicalizeIPv6Address<char16, char16>(
+          spec, host, output, host_info))
+    return;
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
+                                          const url_parse::Component& host,
+                                          unsigned char address[4],
+                                          int* num_ipv4_components) {
+  return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components);
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
+                                          const url_parse::Component& host,
+                                          unsigned char address[4],
+                                          int* num_ipv4_components) {
+  return DoIPv4AddressToNumber<char16>(
+      spec, host, address, num_ipv4_components);
+}
+
+bool IPv6AddressToNumber(const char* spec,
+                         const url_parse::Component& host,
+                         unsigned char address[16]) {
+  return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
+}
+
+bool IPv6AddressToNumber(const char16* spec,
+                         const url_parse::Component& host,
+                         unsigned char address[16]) {
+  return DoIPv6AddressToNumber<char16, char16>(spec, host, address);
+}
+
+
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h
new file mode 100644
index 0000000..0a01c9f
--- /dev/null
+++ b/googleurl/src/url_canon_ip.h

@@ -0,0 +1,101 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__
+#define GOOGLEURL_SRC_URL_CANON_IP_H__
+
+#include "base/string16.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+
+namespace url_canon {
+
+// Searches the host name for the portions of the IPv4 address. On success,
+// each component will be placed into |components| and it will return true.
+// It will return false if the host can not be separated as an IPv4 address
+// or if there are any non-7-bit characters or other characters that can not
+// be in an IP address. (This is important so we fail as early as possible for
+// common non-IP hostnames.)
+//
+// Not all components may exist. If there are only 3 components, for example,
+// the last one will have a length of -1 or 0 to indicate it does not exist.
+//
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
+// address. IE6 is included in this. We do NOT handle this case. In many cases,
+// the browser's canonicalization will get run before this which converts
+// spaces to %20 (in the case of IE7) or rejects them (in the case of
+// Mozilla), so this code path never gets hit. Our host canonicalization will
+// notice these spaces and escape them, which will make IP address finding
+// fail. This seems like better behavior than stripping after a space.
+GURL_API bool FindIPv4Components(const char* spec,
+                                 const url_parse::Component& host,
+                                 url_parse::Component components[4]);
+GURL_API bool FindIPv4Components(const char16* spec,
+                                 const url_parse::Component& host,
+                                 url_parse::Component components[4]);
+
+// Converts an IPv4 address to a 32-bit number (network byte order).
+//
+// Possible return values:
+//   IPV4    - IPv4 address was successfully parsed.
+//   BROKEN  - Input was formatted like an IPv4 address, but overflow occurred
+//             during parsing.
+//   NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
+//             It might be an IPv6 address, or a hostname.
+//
+// On success, |num_ipv4_components| will be populated with the number of
+// components in the IPv4 address.
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+    const char* spec,
+    const url_parse::Component& host,
+    unsigned char address[4],
+    int* num_ipv4_components);
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+    const char16* spec,
+    const url_parse::Component& host,
+    unsigned char address[4],
+    int* num_ipv4_components);
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+//
+// NOTE that |host| is expected to be surrounded by square brackets.
+// i.e. "[::1]" rather than "::1".
+GURL_API bool IPv6AddressToNumber(const char* spec,
+                                  const url_parse::Component& host,
+                                  unsigned char address[16]);
+GURL_API bool IPv6AddressToNumber(const char16* spec,
+                                  const url_parse::Component& host,
+                                  unsigned char address[16]);
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_IP_H__

diff --git a/googleurl/src/url_canon_mailtourl.cc b/googleurl/src/url_canon_mailtourl.cc
new file mode 100644
index 0000000..97868b8
--- /dev/null
+++ b/googleurl/src/url_canon_mailtourl.cc

@@ -0,0 +1,137 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "mailto:" URLs.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Writes the canonical mailto: URL to |output| and fills in |new_parsed|.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
+                             const url_parse::Parsed& parsed,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+
+  // mailto: only uses {scheme, path, query} -- clear the rest.
+  new_parsed->username = url_parse::Component();
+  new_parsed->password = url_parse::Component();
+  new_parsed->host = url_parse::Component();
+  new_parsed->port = url_parse::Component();
+  new_parsed->ref = url_parse::Component();
+
+  // Scheme (known, so we don't bother running it through the more
+  // complicated scheme canonicalizer).
+  new_parsed->scheme.begin = output->length();
+  output->Append("mailto:", 7);
+  new_parsed->scheme.len = 6;  // "mailto": the ':' is written but not counted.
+
+  bool success = true;
+
+  // Path
+  if (parsed.path.is_valid()) {
+    new_parsed->path.begin = output->length();
+
+    // Copy the path using path URL's more lax escaping rules.
+    // We convert to UTF-8 and escape non-ASCII and control characters
+    // (below 0x20), but leave all other ASCII characters alone.
+    int end = parsed.path.end();
+    for (int i = parsed.path.begin; i < end; ++i) {
+      UCHAR uch = static_cast<UCHAR>(source.path[i]);
+      if (uch < 0x20 || uch >= 0x80)
+        success &= AppendUTF8EscapedChar(source.path, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+
+    new_parsed->path.len = output->length() - new_parsed->path.begin;
+  } else {
+    // No path at all
+    new_parsed->path.reset();
+  }
+
+  // Query -- always use the default utf8 charset converter.
+  CanonicalizeQuery(source.query, parsed.query, NULL,
+                    output, &new_parsed->query);
+
+  return success;  // False only if AppendUTF8EscapedChar failed on the path.
+}
+
+} // namespace
+
+bool CanonicalizeMailtoURL(const char* spec,
+                           int spec_len,  // Not read by this overload.
+                           const url_parse::Parsed& parsed,
+                           CanonOutput* output,
+                           url_parse::Parsed* new_parsed) {
+  return DoCanonicalizeMailtoURL<char, unsigned char>(  // 8-bit input.
+      URLComponentSource<char>(spec), parsed, output, new_parsed);
+}
+
+bool CanonicalizeMailtoURL(const char16* spec,
+                           int spec_len,  // Not read by this overload.
+                           const url_parse::Parsed& parsed,
+                           CanonOutput* output,
+                           url_parse::Parsed* new_parsed) {
+  return DoCanonicalizeMailtoURL<char16, char16>(  // 16-bit input.
+      URLComponentSource<char16>(spec), parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+                      const url_parse::Parsed& base_parsed,
+                      const Replacements<char>& replacements,
+                      CanonOutput* output,
+                      url_parse::Parsed* new_parsed) {
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed(base_parsed);  // Copy: the setup call may edit it.
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+                      const url_parse::Parsed& base_parsed,
+                      const Replacements<char16>& replacements,
+                      CanonOutput* output,
+                      url_parse::Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;  // Presumably holds replacements as UTF-8.
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed(base_parsed);  // Copy: the setup call may edit it.
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+}  // namespace url_canon

diff --git a/googleurl/src/url_canon_path.cc b/googleurl/src/url_canon_path.cc
index 587f1a0..df97aad 100644
--- a/googleurl/src/url_canon_path.cc
+++ b/googleurl/src/url_canon_path.cc

@@ -82,15 +82,15 @@
 //   ' '      !        "        #        $        %        &        '        (        )        *        +        ,        -        .        /
      ESCAPE,  PASS,    ESCAPE,  ESCAPE,  PASS,    ESCAPE,  PASS,    PASS,    PASS,    PASS,    PASS,    PASS,    PASS,    UNESCAPE,SPECIAL, PASS,
 //   0        1        2        3        4        5        6        7        8        9        :        ;        <        =        >        ?
-     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS,    PASS,    ESCAPE,  UNESCAPE,ESCAPE,  ESCAPE,
+     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS,    PASS,    ESCAPE,  PASS,    ESCAPE,  ESCAPE,
 //   @        A        B        C        D        E        F        G        H        I        J        K        L        M        N        O
-     ESCAPE,  UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+     PASS,    UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
 //   P        Q        R        S        T        U        V        W        X        Y        Z        [        \        ]        ^        _
      UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS,    ESCAPE,  PASS,    ESCAPE,  UNESCAPE,
 //   `        a        b        c        d        e        f        g        h        i        j        k        l        m        n        o
      ESCAPE,  UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
 //   p        q        r        s        t        u        v        w        x        y        z        {        |        }        ~        <NBSP>
-     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE,  ESCAPE,  ESCAPE,  UNESCAPE,INVALID,
+     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE,  ESCAPE,  ESCAPE,  UNESCAPE,ESCAPE,
 //   ...all the high-bit characters are escaped
      ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
      ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
@@ -190,7 +190,7 @@
 
 // Appends the given path to the output. It assumes that if the input path
 // starts with a slash, it should be copied to the output. If no path has
-// have already been appended to the output (the case when not resolving
+// already been appended to the output (the case when not resolving
 // relative URLs), the path should begin with a slash.
 //
 // If there are already path components (this mode is used when appending
@@ -259,6 +259,7 @@
             // This dot is not preceeded by a slash, it is just part of some
             // file name.
             output->push_back('.');
+            i += dotlen - 1;
           }
 
         } else if (out_ch == '\\') {
@@ -267,20 +268,19 @@
 
         } else if (out_ch == '%') {
           // Handle escape sequences.
-          char unescaped_value;
+          unsigned char unescaped_value;
           if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
             // Valid escape sequence, see if we keep, reject, or unescape it.
-            unsigned char unsigned_unescaped =
-                static_cast<unsigned char>(unescaped_value);
-            char unescaped_flags = kPathCharLookup[unsigned_unescaped];
+            char unescaped_flags = kPathCharLookup[unescaped_value];
 
             if (unescaped_flags & UNESCAPE) {
               // This escaped value shouldn't be escaped, copy it.
               output->push_back(unescaped_value);
             } else if (unescaped_flags & INVALID_BIT) {
-              // Invalid escaped character, copy the percent and remember
-              // the error.
+              // Invalid escaped character, copy it and remember the error.
               output->push_back('%');
+              output->push_back(static_cast<char>(spec[i - 1]));
+              output->push_back(static_cast<char>(spec[i]));
               success = false;
             } else {
               // Valid escaped character but we should keep it escaped. We
@@ -325,7 +325,7 @@
             CanonOutput* output,
             url_parse::Component* out_path) {
   bool success = true;
-  if (path.len >= 0) {
+  if (path.len > 0) {
     out_path->begin = output->length();
 
     // Write out an initial slash if the input has none. If we just parse a URL
@@ -354,11 +354,11 @@
   return DoPath<char, unsigned char>(spec, path, output, out_path);
 }
 
-bool CanonicalizePath(const UTF16Char* spec,
+bool CanonicalizePath(const char16* spec,
                       const url_parse::Component& path,
                       CanonOutput* output,
                       url_parse::Component* out_path) {
-  return DoPath<UTF16Char, UTF16Char>(spec, path, output, out_path);
+  return DoPath<char16, char16>(spec, path, output, out_path);
 }
 
 bool CanonicalizePartialPath(const char* spec,
@@ -369,12 +369,12 @@
                                             output);
 }
 
-bool CanonicalizePartialPath(const UTF16Char* spec,
+bool CanonicalizePartialPath(const char16* spec,
                              const url_parse::Component& path,
                              int path_begin_in_output,
                              CanonOutput* output) {
-  return DoPartialPath<UTF16Char, UTF16Char>(spec, path, path_begin_in_output,
-                                             output);
+  return DoPartialPath<char16, char16>(spec, path, path_begin_in_output,
+                                       output);
 }
 
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_pathurl.cc b/googleurl/src/url_canon_pathurl.cc
index a0346c8..4a990c7 100644
--- a/googleurl/src/url_canon_pathurl.cc
+++ b/googleurl/src/url_canon_pathurl.cc

@@ -51,7 +51,7 @@
   // have -1 length.
   new_parsed->username.reset();
   new_parsed->password.reset();
-  new_parsed->host = url_parse::Component(output->length(), 0);
+  new_parsed->host.reset();
   new_parsed->port.reset();
 
   if (parsed.path.is_valid()) {
@@ -91,13 +91,13 @@
       URLComponentSource<char>(spec), parsed, output, new_parsed);
 }
 
-bool CanonicalizePathURL(const UTF16Char* spec,
+bool CanonicalizePathURL(const char16* spec,
                          int spec_len,
                          const url_parse::Parsed& parsed,
                          CanonOutput* output,
                          url_parse::Parsed* new_parsed) {
-  return DoCanonicalizePathURL<UTF16Char, UTF16Char>(
-      URLComponentSource<UTF16Char>(spec), parsed, output, new_parsed);
+  return DoCanonicalizePathURL<char16, char16>(
+      URLComponentSource<char16>(spec), parsed, output, new_parsed);
 }
 
 bool ReplacePathURL(const char* base,
@@ -114,7 +114,7 @@
 
 bool ReplacePathURL(const char* base,
                     const url_parse::Parsed& base_parsed,
-                    const Replacements<UTF16Char>& replacements,
+                    const Replacements<char16>& replacements,
                     CanonOutput* output,
                     url_parse::Parsed* new_parsed) {
   RawCanonOutput<1024> utf8;

diff --git a/googleurl/src/url_canon_query.cc b/googleurl/src/url_canon_query.cc
index f6b0e53..cee8774 100644
--- a/googleurl/src/url_canon_query.cc
+++ b/googleurl/src/url_canon_query.cc

@@ -76,23 +76,19 @@
   return true;
 }
 
-// Given an input character, appends to the output, escaping as needed for
-// query values.
-inline void Append8BitQueryChar(unsigned char ch, CanonOutput* output) {
-  if (ch <= 0x20 || ch >= 0x7f) 
-    AppendEscapedChar(ch, output);
-  else
-    output->push_back(ch);
-}
-
-// Given an input string that is represented in no more than 8-bits per
-// character, appends the characters to the output according to query rules.
+// Appends the given string to the output, escaping every character for which
+// IsQueryChar() is false. This version will accept 8 or 16 bit characters, but
+// each one is cast to unsigned char, so anything wider than 8 bits would be
+// truncated. It does no UTF-8 decoding or validation of the input.
 template<typename CHAR>
-void Append8BitQueryString(const CHAR* source,
-                           int length,
-                           CanonOutput* output) {
-  for (int i = 0; i < length; i++)
-    Append8BitQueryChar(static_cast<unsigned char>(source[i]), output);
+void AppendRaw8BitQueryString(const CHAR* source, int length,
+                              CanonOutput* output) {
+  for (int i = 0; i < length; i++) {
+    if (!IsQueryChar(static_cast<unsigned char>(source[i])))
+      AppendEscapedChar(static_cast<unsigned char>(source[i]), output);
+    else  // Doesn't need escaping.
+      output->push_back(static_cast<char>(source[i]));
+  }
 }
 
 // Runs the converter on the given UTF-8 input. Since the converter expects
@@ -111,7 +107,7 @@
 // Runs the converter with the given UTF-16 input. We don't have to do
 // anything, but this overriddden function allows us to use the same code
 // for both UTF-8 and UTF-16 input.
-void RunConverter(const UTF16Char* spec,
+void RunConverter(const char16* spec,
                   const url_parse::Component& query,
                   CharsetConverter* converter,
                   CanonOutput* output) {
@@ -123,10 +119,9 @@
                               const url_parse::Component& query,
                               CharsetConverter* converter,
                               CanonOutput* output) {
-  int end = query.end();
   if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
     // Easy: the input can just appended with no character set conversions.
-    Append8BitQueryString(&spec[query.begin], query.len, output);
+    AppendRaw8BitQueryString(&spec[query.begin], query.len, output);
 
   } else {
     // Harder: convert to the proper encoding first.
@@ -135,22 +130,11 @@
       // necessary values.
       RawCanonOutput<1024> eight_bit;
       RunConverter(spec, query, converter, &eight_bit);
-      Append8BitQueryString(eight_bit.data(), eight_bit.length(), output);
+      AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);
 
     } else {
       // No converter, do our own UTF-8 conversion.
-      for (int i = query.begin; i < end; i++) {
-        if (static_cast<UCHAR>(spec[i]) >= 0x80) {
-          // ReadChar will fill the code point with
-          // kUnicodeReplacementCharacter when the input is invalid, which is
-          // what we want.
-          unsigned code_point;
-          ReadUTFChar(spec, &i, end, &code_point);
-          AppendUTF8EscapedValue(code_point, output);
-        } else {
-          Append8BitQueryChar(static_cast<unsigned char>(spec[i]), output);
-        }
-      }
+      AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
     }
   }
 }
@@ -185,21 +169,21 @@
                                            output, out_query);
 }
 
-void CanonicalizeQuery(const UTF16Char* spec,
+void CanonicalizeQuery(const char16* spec,
                        const url_parse::Component& query,
                        CharsetConverter* converter,
                        CanonOutput* output,
                        url_parse::Component* out_query) {
-  DoCanonicalizeQuery<UTF16Char, UTF16Char>(spec, query, converter,
-                                            output, out_query);
+  DoCanonicalizeQuery<char16, char16>(spec, query, converter,
+                                      output, out_query);
 }
 
-void ConvertUTF16ToQueryEncoding(const UTF16Char* input,
+void ConvertUTF16ToQueryEncoding(const char16* input,
                                  const url_parse::Component& query,
                                  CharsetConverter* converter,
                                  CanonOutput* output) {
-  DoConvertToQueryEncoding<UTF16Char, UTF16Char>(input, query,
-                                                 converter, output);
+  DoConvertToQueryEncoding<char16, char16>(input, query,
+                                           converter, output);
 }
 
 }  // namespace url_canon

diff --git a/googleurl/src/url_canon_relative.cc b/googleurl/src/url_canon_relative.cc
index ee0cde0..6bcc72f 100644
--- a/googleurl/src/url_canon_relative.cc
+++ b/googleurl/src/url_canon_relative.cc

@@ -120,20 +120,12 @@
     return true;
 #endif  // WIN32
 
-  // Beginning with a slash means for sure this scheme is relative. We do
-  // this before checking the scheme in case we have input like "/foo:bar"
-  // that the scheme finder will think is a scheme.
-  if (begin < url_len && url_parse::IsURLSlash(url[begin])) {
-    *relative_component = url_parse::MakeRange(begin, url_len);
-    *is_relative = true;
-    return true;
-  }
-
   // See if we've got a scheme, if not, we know this is a relative URL.
   // BUT: Just because we have a scheme, doesn't make it absolute.
-  // "http:foo.html" is a relative URL with path "foo.html".
+  // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
+  // empty, we treat it as relative (":foo") like IE does.
   url_parse::Component scheme;
-  if (!url_parse::ExtractScheme(url, url_len, &scheme)) {
+  if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
     // Don't allow relative URLs if the base scheme doesn't support it.
     if (!is_base_hierarchical)
       return false;
@@ -143,6 +135,16 @@
     return true;
   }
 
+  // If the scheme isn't valid, then it's relative.
+  int scheme_end = scheme.end();
+  for (int i = scheme.begin; i < scheme_end; i++) {
+    if (!CanonicalSchemeChar(url[i])) {
+      *relative_component = url_parse::MakeRange(begin, url_len);
+      *is_relative = true;
+      return true;
+    }
+  }
+
   // If the scheme is not the same, then we can't count it as relative.
   if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
     return true;
@@ -250,7 +252,9 @@
 
   // The path should begin with a slash (as all canonical paths do). We check
   // if it is followed by a drive letter and copy it.
-  if (DoesBeginSlashWindowsDriveSpec(base_url, base_path_begin, base_path_end)) {
+  if (DoesBeginSlashWindowsDriveSpec(base_url,
+                                     base_path_begin,
+                                     base_path_end)) {
     // Copy the two-character drive spec to the output. It will now look like
     // "file:///C:" so the rest of it can be treated like a standard path.
     output->push_back('/');
@@ -280,7 +284,11 @@
   // We know the authority section didn't change, copy it to the output. We
   // also know we have a path so can copy up to there.
   url_parse::Component path, query, ref;
-  url_parse::ParsePathInternal(relative_url, relative_component, &path, &query, &ref);
+  url_parse::ParsePathInternal(relative_url,
+                               relative_component,
+                               &path,
+                               &query,
+                               &ref);
   // Canonical URLs always have a path, so we can use that offset.
   output->Append(base_url, base_parsed.path.begin);
 
@@ -343,7 +351,8 @@
   if (query.is_valid()) {
     // Just the query specified, replace the query and reference (ignore
     // failures for refs)
-    CanonicalizeQuery(relative_url, query, NULL, output, &out_parsed->query);
+    CanonicalizeQuery(relative_url, query, query_converter,
+                      output, &out_parsed->query);
     CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
     return success;
   }
@@ -411,7 +420,7 @@
   // Parse the file URL. The file URl parsing function uses the same logic
   // as we do for determining if the file is absolute, in which case it will
   // not bother to look for a scheme.
-  url_parse::Parsed relative_parsed; 
+  url_parse::Parsed relative_parsed;
   url_parse::ParseFileURL(&relative_url[relative_component.begin],
                           relative_component.len, &relative_parsed);
 
@@ -438,7 +447,7 @@
   // paths (even the default path of "/" is OK).
   //
   // We allow hosts with no length so we can handle file URLs, for example.
-  if (base_parsed.host.len < 0 || base_parsed.path.len <= 0) {
+  if (base_parsed.path.len <= 0) {
     // On error, return the input (resolving a relative URL on a non-relative
     // base = the base).
     int base_len = base_parsed.Length();
@@ -448,10 +457,11 @@
   }
 
   if (relative_component.len <= 0) {
-    // Empty relative URL, make no changes.
+    // Empty relative URL, leave unchanged, only removing the ref component.
     int base_len = base_parsed.Length();
-    for (int i = 0; i < base_len; i++)
-      output->push_back(base_url[i]);
+    base_len -= base_parsed.ref.len + 1;
+    out_parsed->ref.reset();
+    output->Append(base_url, base_len);
     return true;
   }
 
@@ -469,6 +479,9 @@
   // case (we reject anything like "/c:/foo") because that should be treated
   // as a path. For file URLs, we allow any number of slashes since that would
   // be setting the path.
+  //
+  // This assumes the absolute path resolver handles absolute URLs like this
+  // properly. url_util::DoCanonicalize does this.
   int after_slashes = relative_component.begin + num_slashes;
   if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin,
                                   relative_component.end(), !base_is_file) ||
@@ -478,6 +491,17 @@
     return DoResolveAbsoluteFile(relative_url, relative_component,
                                  query_converter, output, out_parsed);
   }
+#else
+  // Other platforms need explicit handling for file: URLs with multiple
+  // slashes because the generic scheme parsing always extracts a host, but a
+  // file: URL only has a host if it has exactly 2 slashes. This also
+  // handles the special case where the URL is only slashes, since that
+  // doesn't have a host part either.
+  if (base_is_file &&
+      (num_slashes > 2 || num_slashes == relative_component.len)) {
+    return DoResolveAbsoluteFile(relative_url, relative_component,
+                                 query_converter, output, out_parsed);
+  }
 #endif
 
   // Any other double-slashes mean that this is relative to the scheme.
@@ -509,12 +533,12 @@
 
 bool IsRelativeURL(const char* base,
                    const url_parse::Parsed& base_parsed,
-                   const UTF16Char* fragment,
+                   const char16* fragment,
                    int fragment_len,
                    bool is_base_hierarchical,
                    bool* is_relative,
                    url_parse::Component* relative_component) {
-  return DoIsRelativeURL<UTF16Char>(
+  return DoIsRelativeURL<char16>(
       base, base_parsed, fragment, fragment_len, is_base_hierarchical,
       is_relative, relative_component);
 }
@@ -535,12 +559,12 @@
 bool ResolveRelativeURL(const char* base_url,
                         const url_parse::Parsed& base_parsed,
                         bool base_is_file,
-                        const UTF16Char* relative_url,
+                        const char16* relative_url,
                         const url_parse::Component& relative_component,
                         CharsetConverter* query_converter,
                         CanonOutput* output,
                         url_parse::Parsed* out_parsed) {
-  return DoResolveRelativeURL<UTF16Char>(
+  return DoResolveRelativeURL<char16>(
       base_url, base_parsed, base_is_file, relative_url,
       relative_component, query_converter, output, out_parsed);
 }

diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h
index b8f0d11..c43b777 100644
--- a/googleurl/src/url_canon_stdstring.h
+++ b/googleurl/src/url_canon_stdstring.h

@@ -31,15 +31,15 @@
 // strings. Because the canonicalizer tries not to be dependent on the STL,
 // we have segregated it here.
 
-#ifndef GOOGLEURL_SRC_URL_CANON_STRING_H__
-#define GOOGLEURL_SRC_URL_CANON_STRING_H__
+#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
 
 #include <string>
 #include "googleurl/src/url_canon.h"
 
 namespace url_canon {
 
-// Write into a std::string given in the constructor. This object odes not own
+// Write into a std::string given in the constructor. This object does not own
 // the string itself, and the user must ensure that the string stays alive
 // throughout the lifetime of this object.
 //
@@ -82,7 +82,7 @@
   }
 
  protected:
-   std::string* str_;
+  std::string* str_;
 };
 
 // An extension of the Replacements class that allows the setters to use
@@ -90,43 +90,45 @@
 //
 // The strings passed as arguments are not copied and must remain valid until
 // this class goes out of scope.
-template<typename CHAR>
-class StdStringReplacements : public url_canon::Replacements<CHAR> {
+template<typename STR>  // STR: string type with value_type/data()/length().
+class StdStringReplacements :
+    public url_canon::Replacements<typename STR::value_type> {
  public:
-  void SetSchemeStr(const std::basic_string<CHAR>& s) {
-    SetScheme(s.data(),
-              url_parse::Component(0, static_cast<int>(s.length())));
+  void SetSchemeStr(const STR& s) {
+    this->SetScheme(s.data(),  // this-> is required: the base is dependent.
+                    url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetUsernameStr(const std::basic_string<CHAR>& s) {
-    SetUsername(s.data(),
-                url_parse::Component(0, static_cast<int>(s.length())));
+  void SetUsernameStr(const STR& s) {
+    this->SetUsername(s.data(),
+                      url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetPasswordStr(const std::basic_string<CHAR>& s) {
-    SetPassword(s.data(),
-                url_parse::Component(0, static_cast<int>(s.length())));
+  void SetPasswordStr(const STR& s) {
+    this->SetPassword(s.data(),
+                      url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetHostStr(const std::basic_string<CHAR>& s) {
-    SetHost(s.data(),
-            url_parse::Component(0, static_cast<int>(s.length())));
+  void SetHostStr(const STR& s) {
+    this->SetHost(s.data(),
+                  url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetPortStr(const std::basic_string<CHAR>& s) {
-    SetPort(s.data(),
-            url_parse::Component(0, static_cast<int>(s.length())));
+  void SetPortStr(const STR& s) {
+    this->SetPort(s.data(),
+                  url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetPathStr(const std::basic_string<CHAR>& s) {
-    SetPath(s.data(),
-            url_parse::Component(0, static_cast<int>(s.length())));
+  void SetPathStr(const STR& s) {
+    this->SetPath(s.data(),
+                  url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetQueryStr(const std::basic_string<CHAR>& s) {
-    SetQuery(s.data(),
-             url_parse::Component(0, static_cast<int>(s.length())));
+  void SetQueryStr(const STR& s) {
+    this->SetQuery(s.data(),
+                   url_parse::Component(0, static_cast<int>(s.length())));
   }
-  void SetRefStr(const std::basic_string<CHAR>& s) {
-    SetRef(s.data(),
-           url_parse::Component(0, static_cast<int>(s.length())));
+  void SetRefStr(const STR& s) {
+    this->SetRef(s.data(),
+                 url_parse::Component(0, static_cast<int>(s.length())));
   }
 };
 
 }  // namespace url_canon
 
-#endif  // GOOGLEURL_SRC_URL_CANON_STRING_H__
+#endif  // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+

diff --git a/googleurl/src/url_canon_stdurl.cc b/googleurl/src/url_canon_stdurl.cc
index 9a1a8eb..1e21a14 100644
--- a/googleurl/src/url_canon_stdurl.cc
+++ b/googleurl/src/url_canon_stdurl.cc

@@ -48,7 +48,7 @@
                                     output, &new_parsed->scheme);
 
   // Authority (username, password, host, port)
-  bool have_authority = false;
+  bool have_authority;
   if (parsed.username.is_valid() || parsed.password.is_valid() ||
       parsed.host.is_nonempty() || parsed.port.is_valid()) {
     have_authority = true;
@@ -65,15 +65,27 @@
                                     output,
                                     &new_parsed->username,
                                     &new_parsed->password);
- 
-    // Host: always write if we have an authority (may be empty).
+
     success &= CanonicalizeHost(source.host, parsed.host,
                                 output, &new_parsed->host);
 
-    // Port: the port canonicalizer will handle the colon
-    // FIXME(brettw) DO SOMETHING BETTER WITH THE PORT!!!
-    success &= CanonicalizePort(source.port, parsed.port, 80,
+    // Host must not be empty for standard URLs.
+    if (!parsed.host.is_nonempty())
+      success = false;
+
+    // Port: the port canonicalizer will handle the colon.
+    int default_port = DefaultPortForScheme(
+        &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
+    success &= CanonicalizePort(source.port, parsed.port, default_port,
                                 output, &new_parsed->port);
+  } else {
+    // No authority, clear the components.
+    have_authority = false;
+    new_parsed->host.reset();
+    new_parsed->username.reset();
+    new_parsed->password.reset();
+    new_parsed->port.reset();
+    success = false;  // Standard URLs must have an authority.
   }
 
   // Path
@@ -89,7 +101,7 @@
     output->push_back('/');
   } else {
     // No path at all
-    new_parsed->path = url_parse::Component();
+    new_parsed->path.reset();
   }
 
   // Query
@@ -104,6 +116,38 @@
 
 }  // namespace
 
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+int DefaultPortForScheme(const char* scheme, int scheme_len) {
+  int default_port = url_parse::PORT_UNSPECIFIED;
+  switch (scheme_len) {
+    case 4:
+      if (!strncmp(scheme, "http", scheme_len))
+        default_port = 80;
+      break;
+    case 5:
+      if (!strncmp(scheme, "https", scheme_len))
+        default_port = 443;
+      break;
+    case 3:
+      if (!strncmp(scheme, "ftp", scheme_len))
+        default_port = 21;
+      else if (!strncmp(scheme, "wss", scheme_len))
+        default_port = 443;
+      break;
+    case 6:
+      if (!strncmp(scheme, "gopher", scheme_len))
+        default_port = 70;
+      break;
+    case 2:
+      if (!strncmp(scheme, "ws", scheme_len))
+        default_port = 80;
+      break;
+  }
+  return default_port;
+}
+
 bool CanonicalizeStandardURL(const char* spec,
                              int spec_len,
                              const url_parse::Parsed& parsed,
@@ -115,17 +159,26 @@
       output, new_parsed);
 }
 
-bool CanonicalizeStandardURL(const UTF16Char* spec,
+bool CanonicalizeStandardURL(const char16* spec,
                              int spec_len,
                              const url_parse::Parsed& parsed,
                              CharsetConverter* query_converter,
                              CanonOutput* output,
                              url_parse::Parsed* new_parsed) {
-  return DoCanonicalizeStandardURL<UTF16Char, UTF16Char>(
-      URLComponentSource<UTF16Char>(spec), parsed, query_converter,
+  return DoCanonicalizeStandardURL<char16, char16>(
+      URLComponentSource<char16>(spec), parsed, query_converter,
       output, new_parsed);
 }
 
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
 bool ReplaceStandardURL(const char* base,
                         const url_parse::Parsed& base_parsed,
                         const Replacements<char>& replacements,
@@ -143,7 +196,7 @@
 // regular codepath can be used.
 bool ReplaceStandardURL(const char* base,
                         const url_parse::Parsed& base_parsed,
-                        const Replacements<UTF16Char>& replacements,
+                        const Replacements<char16>& replacements,
                         CharsetConverter* query_converter,
                         CanonOutput* output,
                         url_parse::Parsed* new_parsed) {

diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc
index 43cc38d..76b596c 100644
--- a/googleurl/src/url_canon_unittest.cc
+++ b/googleurl/src/url_canon_unittest.cc

@@ -45,12 +45,10 @@
 #define ARRAYSIZE ARRAYSIZE_UNSAFE
 #endif
 
-using url_canon::UTF16Char;
-using url_canon::UTF16String;
-
 using url_test_utils::WStringToUTF16;
 using url_test_utils::ConvertUTF8ToUTF16;
 using url_test_utils::ConvertUTF16ToUTF8;
+using url_canon::CanonHostInfo;
 
 namespace {
 
@@ -72,6 +70,19 @@
   bool expected_success;
 };
 
+// Test cases for CanonicalizeIPAddress().  The inputs are identical to
+// DualComponentCase, but the output has extra CanonHostInfo fields.
+struct IPAddressCase {
+  const char* input8;
+  const wchar_t* input16;
+  const char* expected;
+  url_parse::Component expected_component;
+
+  // CanonHostInfo fields, for verbose output.
+  CanonHostInfo::Family expected_family;
+  int expected_num_ipv4_components;
+};
+
 struct ReplaceCase {
   const char* base;
   const char* scheme;
@@ -88,7 +99,7 @@
 // Wrapper around a UConverter object that managers creation and destruction.
 class UConvScoper {
  public:
-  UConvScoper(const char* charset_name) {
+  explicit UConvScoper(const char* charset_name) {
     UErrorCode err = U_ZERO_ERROR;
     converter_ = ucnv_open(charset_name, &err);
   }
@@ -162,7 +173,7 @@
   };
 
   std::string out_str;
-  for (int i = 0; i < ARRAYSIZE(utf_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
     if (utf_cases[i].input8) {
       out_str.clear();
       url_canon::StdStringCanonOutput output(&out_str);
@@ -181,7 +192,7 @@
       out_str.clear();
       url_canon::StdStringCanonOutput output(&out_str);
 
-      UTF16String input_str(WStringToUTF16(utf_cases[i].input16));
+      string16 input_str(WStringToUTF16(utf_cases[i].input16));
       int input_len = static_cast<int>(input_str.length());
       bool success = true;
       for (int ch = 0; ch < input_len; ch++) {
@@ -199,7 +210,7 @@
 
       // UTF-16 -> UTF-8
       std::string input8_str(utf_cases[i].input8);
-      UTF16String input16_str(WStringToUTF16(utf_cases[i].input16));
+      string16 input16_str(WStringToUTF16(utf_cases[i].input16));
       EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str));
 
       // UTF-8 -> UTF-16
@@ -225,15 +236,15 @@
     {L"hello\x4f60\x06de\x597dworld", "big5", "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(icu_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
     UConvScoper conv(icu_cases[i].encoding);
-    ASSERT_TRUE(conv.converter());
+    ASSERT_TRUE(conv.converter() != NULL);
     url_canon::ICUCharsetConverter converter(conv.converter());
 
     std::string str;
     url_canon::StdStringCanonOutput output(&str);
 
-    UTF16String input_str(WStringToUTF16(icu_cases[i].input));
+    string16 input_str(WStringToUTF16(icu_cases[i].input));
     int input_len = static_cast<int>(input_str.length());
     converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
     output.Complete();
@@ -249,14 +260,14 @@
   url_canon::ICUCharsetConverter converter(conv.converter());
   for (int i = static_size - 2; i <= static_size + 2; i++) {
     // Make a string with the appropriate length.
-    UTF16String input;
+    string16 input;
     for (int ch = 0; ch < i; ch++)
       input.push_back('a');
 
     url_canon::RawCanonOutput<static_size> output;
     converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
                                &output);
-    EXPECT_TRUE(output.length() == input.length());
+    EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
   }
 }
 
@@ -275,11 +286,14 @@
     {" HTTP ", "%20http%20:", url_parse::Component(0, 10),false},
     {"htt: ", "htt%3A%20:", url_parse::Component(0, 9), false},
     {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", url_parse::Component(0, 22), false},
+      // Don't re-escape something already escaped. Note that it will
+      // "canonicalize" the 'A' to 'a', but that's OK.
+    {"ht%3Atp", "ht%3atp:", url_parse::Component(0, 7), false},
   };
 
   std::string out_str;
 
-  for (int i = 0; i < arraysize(scheme_cases); i++) {
+  for (size_t i = 0; i < arraysize(scheme_cases); i++) {
     int url_len = static_cast<int>(strlen(scheme_cases[i].input));
     url_parse::Component in_comp(0, url_len);
     url_parse::Component out_comp;
@@ -299,7 +313,7 @@
     out_str.clear();
     url_canon::StdStringCanonOutput output2(&out_str);
 
-    UTF16String wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input));
+    string16 wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input));
     in_comp.len = static_cast<int>(wide_input.length());
     success = url_canon::CanonicalizeScheme(wide_input.c_str(), in_comp,
                                             &output2, &out_comp);
@@ -327,65 +341,71 @@
 }
 
 TEST(URLCanonTest, Host) {
-  DualComponentCase host_cases[] = {
+  IPAddressCase host_cases[] = {
        // Basic canonicalization, uppercase should be converted to lowercase.
-    {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), true},
-      // Spaces should be escaped
-    {"Goo goo.com", L"Goo goo.com", "goo%20goo.com", url_parse::Component(0, 13), true},
-    {"Goo%20 goo.com", L"Goo%20 goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), true},
+    {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1},
+      // Spaces and some other characters should be escaped.
+    {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", url_parse::Component(0, 22), CanonHostInfo::NEUTRAL, -1},
       // Exciting different types of spaces!
-    {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), true},
+    {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), CanonHostInfo::NEUTRAL, -1},
       // Other types of space (no-break, zero-width, zero-width-no-break) are
       // name-prepped away to nothing.
-    {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), true},
+    {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1},
       // Ideographic full stop (full-width period for Chinese, etc.) should be
       // treated as a dot.
-    {NULL, L"www.foo\x3002"L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), true},
+    {NULL, L"www.foo\x3002"L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), CanonHostInfo::NEUTRAL, -1},
       // Invalid unicode characters should fail...
       // ...In wide input, ICU will barf and we'll end up with the input as
       //    escaped UTF-8 (the invalid character should be replaced with the
       //    replacement character).
-    {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), false},
+    {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1},
       // ...This is the same as previous but with with escaped.
-    {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), false},
+    {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1},
       // Test name prepping, fullwidth input should be converted to ASCII and NOT
       // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
-    {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), true},
+    {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), CanonHostInfo::NEUTRAL, -1},
       // Test that fullwidth escaped values are properly name-prepped,
       // then converted or rejected.
       // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
-    {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "%2541.com", url_parse::Component(0, 9), false},
-    {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "%2541.com", url_parse::Component(0, 9), false},
+    {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1},
+    {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1},
       // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
-    {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%2500.com", url_parse::Component(0, 9), false},
-    {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%2500.com", url_parse::Component(0, 9), false},
+    {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1},
+    {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1},
       // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
-    {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), true},
+    {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1},
       // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
       // UTF-8 (wide case). The output should be equivalent to the true wide
       // character input above).
-    {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), true},
+    {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1},
       // Invalid escaped characters should fail and the percents should be
       // escaped.
-    {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), false},
+    {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), CanonHostInfo::BROKEN, -1},
       // If we get an invalid character that has been escaped.
-    {"%25", L"%25", "%25", url_parse::Component(0, 3), false},
-    {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), false},
+    {"%25", L"%25", "%25", url_parse::Component(0, 3), CanonHostInfo::BROKEN, -1},
+    {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), CanonHostInfo::BROKEN, -1},
       // Escaped numbers should be treated like IP addresses if they are.
-    {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), true},
+    {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
+    {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
       // Invalid escaping should trigger the regular host error handling.
-    {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), false},
+    {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), CanonHostInfo::BROKEN, -1},
       // Something that isn't exactly an IP should get treated as a host and
       // spaces escaped.
-    {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), true},
+    {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1},
       // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
       // These are "0Xc0.0250.01" in fullwidth.
-    {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), true},
+    {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
+      // Broken IP addresses get marked as such.
+    {"192.168.0.257", L"192.168.0.257", "192.168.0.257", url_parse::Component(0, 13), CanonHostInfo::BROKEN, -1},
+    {"[google.com]", L"[google.com]", "[google.com]", url_parse::Component(0, 12), CanonHostInfo::BROKEN, -1},
+      // Cyrillic letter followed by '(' should return punycode with the '(' escaped before the punycode string was created, i.e.
+      // if '(' were escaped after the punycode is created we would get xn--%28-8tb (incorrect).
+    {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", url_parse::Component(0, 11), CanonHostInfo::NEUTRAL, -1},
   };
 
+  // CanonicalizeHost() non-verbose.
   std::string out_str;
-  for (int i = 0; i < arraysize(host_cases); i++) {
+  for (size_t i = 0; i < arraysize(host_cases); i++) {
     // Narrow version.
     if (host_cases[i].input8) {
       int host_len = static_cast<int>(strlen(host_cases[i].input8));
@@ -399,7 +419,8 @@
                                                  &output, &out_comp);
       output.Complete();
 
-      EXPECT_EQ(host_cases[i].expected_success, success);
+      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+                success);
       EXPECT_EQ(std::string(host_cases[i].expected), out_str);
       EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
       EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
@@ -407,7 +428,7 @@
 
     // Wide version.
     if (host_cases[i].input16) {
-      UTF16String input16(WStringToUTF16(host_cases[i].input16));
+      string16 input16(WStringToUTF16(host_cases[i].input16));
       int host_len = static_cast<int>(input16.length());
       url_parse::Component in_comp(0, host_len);
       url_parse::Component out_comp;
@@ -419,102 +440,348 @@
                                                  &output, &out_comp);
       output.Complete();
 
-      EXPECT_EQ(host_cases[i].expected_success, success);
+      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+                success);
       EXPECT_EQ(std::string(host_cases[i].expected), out_str);
       EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
       EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
     }
   }
+
+  // CanonicalizeHostVerbose()
+  for (size_t i = 0; i < arraysize(host_cases); i++) {
+    // Narrow version.
+    if (host_cases[i].input8) {
+      int host_len = static_cast<int>(strlen(host_cases[i].input8));
+      url_parse::Component in_comp(0, host_len);
+
+      out_str.clear();
+      url_canon::StdStringCanonOutput output(&out_str);
+      CanonHostInfo host_info;
+
+      url_canon::CanonicalizeHostVerbose(host_cases[i].input8, in_comp,
+                                         &output, &host_info);
+      output.Complete();
+
+      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+      EXPECT_EQ(host_cases[i].expected_component.begin,
+                host_info.out_host.begin);
+      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+                  host_info.num_ipv4_components);
+      }
+    }
+
+    // Wide version.
+    if (host_cases[i].input16) {
+      string16 input16(WStringToUTF16(host_cases[i].input16));
+      int host_len = static_cast<int>(input16.length());
+      url_parse::Component in_comp(0, host_len);
+
+      out_str.clear();
+      url_canon::StdStringCanonOutput output(&out_str);
+      CanonHostInfo host_info;
+
+      url_canon::CanonicalizeHostVerbose(input16.c_str(), in_comp,
+                                         &output, &host_info);
+      output.Complete();
+
+      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+      EXPECT_EQ(host_cases[i].expected_component.begin,
+                host_info.out_host.begin);
+      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+                  host_info.num_ipv4_components);
+      }
+    }
+  }
 }
 
 TEST(URLCanonTest, IPv4) {
-  DualComponentCase cases[] = {
+  IPAddressCase cases[] = {
       // Empty is not an IP address.
-    {"", L"", "", url_parse::Component(), false},
-    {".", L".", "", url_parse::Component(), false},
+    {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+    {".", L".", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
       // Regular IP addresses in different bases.
-    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), true},
+    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
+    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
+    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
       // Non-IP addresses due to invalid characters.
-    {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), false},
+    {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
       // Invalid characters for the base should be rejected.
-    {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), false},
-    {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), false},
-    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), false},
+    {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+    {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
       // If there are not enough components, the last one should fill them out.
-    {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), true},
-    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), true},
-    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), true},
+    {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), CanonHostInfo::IPV4, 1},
+    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
+    {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
+    {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1},
+    {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
+    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
+    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2},
+    {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
       // Too many components means not an IP address.
-    {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), false},
+    {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
       // We allow a single trailing dot.
-    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), false},
-    {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), false},
+    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4},
+    {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+    {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
       // Two dots in a row means not an IP address.
-    {"192.168..1", L"192.168..1", "", url_parse::Component(), false},
-      // Any non-first components get truncated to one byte.
-    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "20.0.162.255", url_parse::Component(0, 12), true},
-      // The last component should get truncated to however much space is
-      // remaining.
-    {"192.168.0.257", L"192.168.0.257", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"192.168.0xa20001", L"192.168.0xa20001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"192.015052000001", L"192.015052000001", "192.168.0.1", url_parse::Component(0, 11), true},
-    {"0X12C0a80001", L"0X12C0a80001", "192.168.0.1", url_parse::Component(0, 11), true},
+    {"192.168..1", L"192.168..1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+      // Any numerical overflow should be marked as BROKEN.
+    {"0x100.0", L"0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0x100.0.0", L"0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0x100.0.0.0", L"0x100.0.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0.0x100.0.0", L"0.0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0.0.0x100.0", L"0.0.0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0.0.0.0x100", L"0.0.0.0x100", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0.0.0x10000", L"0.0.0x10000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0.0x1000000", L"0.0x1000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0x100000000", L"0x100000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // Repeat the previous tests, minus 1, to verify boundaries.
+    {"0xFF.0", L"0xFF.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 2},
+    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 3},
+    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
+    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
+    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
+    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4},
+    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
+    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", url_parse::Component(0, 13), CanonHostInfo::IPV4, 2},
+    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", url_parse::Component(0, 15), CanonHostInfo::IPV4, 1},
+      // Old truncation tests.  They're all "BROKEN" now.
+    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"192.168.0.257", L"192.168.0.257", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"192.168.0xa20001", L"192.168.0xa20001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"192.015052000001", L"192.015052000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"0X12C0a80001", L"0X12C0a80001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"276.1.2", L"276.1.2", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
       // Spaces should be rejected.
-    {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), false},
-      // Truncation plus the last component missing.
-    {"276.1.2", L"276.1.2", "20.1.0.2", url_parse::Component(0, 8), true},
-      // Very large numbers. We support up to 16 characters per component
-      // before rejecting.
-    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "192.255.0.1", url_parse::Component(0, 11), true},
-    {"000000000000000300.168.1", L"000000000000000300.168.1", "", url_parse::Component(), false},
+    {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+      // Very large numbers.
+    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3},
+    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", url_parse::Component(0, 11), CanonHostInfo::BROKEN, -1},
+      // A number has no length limit, but long numbers can still overflow.
+    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", url_parse::Component(0, 7), CanonHostInfo::IPV4, 1},
+    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // If a long component is non-numeric, it's a hostname, *not* a broken IP.
+    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+      // Truncation of all zeros should still result in 0.
+    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", url_parse::Component(0, 7), CanonHostInfo::IPV4, 4},
   };
 
-  for (int i = 0; i < arraysize(cases); i++) {
+  for (size_t i = 0; i < arraysize(cases); i++) {
     // 8-bit version.
     url_parse::Component component(0,
                                    static_cast<int>(strlen(cases[i].input8)));
 
     std::string out_str1;
     url_canon::StdStringCanonOutput output1(&out_str1);
-    url_parse::Component out_ip;
-    bool success = url_canon::CanonicalizeIPAddress(cases[i].input8, component,
-                                                    &output1, &out_ip);
+    url_canon::CanonHostInfo host_info;
+    url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1,
+                                     &host_info);
     output1.Complete();
 
-    EXPECT_EQ(cases[i].expected_success, success);
-    if (success) {
+    EXPECT_EQ(cases[i].expected_family, host_info.family);
+    if (host_info.family == CanonHostInfo::IPV4) {
       EXPECT_STREQ(cases[i].expected, out_str1.c_str());
-      EXPECT_EQ(cases[i].expected_component.begin, out_ip.begin);
-      EXPECT_EQ(cases[i].expected_component.len, out_ip.len);
+      EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+      EXPECT_EQ(cases[i].expected_num_ipv4_components,
+                host_info.num_ipv4_components);
     }
 
     // 16-bit version.
-    UTF16String input16(WStringToUTF16(cases[i].input16));
+    string16 input16(WStringToUTF16(cases[i].input16));
     component = url_parse::Component(0, static_cast<int>(input16.length()));
 
     std::string out_str2;
     url_canon::StdStringCanonOutput output2(&out_str2);
-    success = url_canon::CanonicalizeIPAddress(input16.c_str(), component,
-                                               &output2, &out_ip);
+    url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2,
+                                     &host_info);
     output2.Complete();
 
-    EXPECT_EQ(cases[i].expected_success, success);
-    if (success) {
-      EXPECT_STREQ(cases[i].expected, out_str1.c_str());
-      EXPECT_EQ(cases[i].expected_component.begin, out_ip.begin);
-      EXPECT_EQ(cases[i].expected_component.len, out_ip.len);
+    EXPECT_EQ(cases[i].expected_family, host_info.family);
+    if (host_info.family == CanonHostInfo::IPV4) {
+      EXPECT_STREQ(cases[i].expected, out_str2.c_str());
+      EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+      EXPECT_EQ(cases[i].expected_num_ipv4_components,
+                host_info.num_ipv4_components);
     }
   }
 }
 
+TEST(URLCanonTest, IPv6) {
+  IPAddressCase cases[] = {
+      // Empty is not an IP address.
+    {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1},
+      // Non-IPs with [:] characters are marked BROKEN.
+    {":", L":", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[", L"[", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[:", L"[:", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"]", L"]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {":]", L":]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[]", L"[]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[:]", L"[:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // Regular IP address is invalid without bounding '[' and ']'.
+    {"2001:db8::1", L"2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[2001:db8::1", L"[2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"2001:db8::1]", L"2001:db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // Regular IP addresses.
+    {"[::]", L"[::]", "[::]", url_parse::Component(0,4), CanonHostInfo::IPV6, -1},
+    {"[::1]", L"[::1]", "[::1]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
+    {"[1::]", L"[1::]", "[1::]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
+    {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", url_parse::Component(0,10), CanonHostInfo::IPV6, -1},
+    {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
+
+    // Leading zeros should be stripped.
+    {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1},
+
+    // Upper case letters should be lowercased.
+    {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", url_parse::Component(0,20), CanonHostInfo::IPV6, -1},
+
+    // The same address can be written with different contractions, but should
+    // get canonicalized to the same thing.
+    {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1},
+    {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1},
+
+    // IPv4 addresses
+    // Only mapped and compat addresses can have IPv4 syntax embedded.
+    {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // IPv4 with last component missing.
+    {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
+
+    // IPv4 using hex.
+    // TODO(eroman): Should this format be disallowed?
+    {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1},
+
+    // There may be zeros surrounding the "::" contraction.
+    {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1},
+
+    {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", url_parse::Component(0,13), CanonHostInfo::IPV6, -1},
+
+      // Can only have one "::" contraction in an IPv6 string literal.
+    {"[2001::db8::1]", L"[2001::db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // No more than 2 consecutive ':'s.
+    {"[2001:db8:::1]", L"[2001:db8:::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[:::]", L"[:::]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // Non-IP addresses due to invalid characters.
+    {"[2001::.com]", L"[2001::.com]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // If there are not enough components, the last one should fill them out.
+    // ... omitted at this time ...
+      // Too many components means not an IP address.  Similarly with too few if using IPv4 compat or mapped addresses.
+    {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    // Too many bits (even though 8 components, the last one holds 32 bits).
+    {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // Too many bits specified -- the contraction would have to be zero-length
+    // to not exceed 128 bits.
+    {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // The contraction is for 16 bits of zero.
+    {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1},
+
+    // Cannot have a trailing colon.
+    {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // Cannot have negative numbers.
+    {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // Scope ID -- an IPv6 literal may carry an optional ["%" <scope_id>]
+    // section. Scope IDs are not accepted here: every case below that
+    // contains a '%' is expected to be BROKEN.
+
+    // Invalid because no ID was given after the percent.
+
+    // Don't allow scope-id
+    {"[1::%1]", L"[1::%1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[1::%eth0]", L"[1::%eth0]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[1::%]", L"[1::%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[%]", L"[%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[::%:]", L"[::%:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+    // Don't allow leading or trailing colons.
+    {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+    {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+
+      // We allow a single trailing dot.
+    // ... omitted at this time ...
+      // Two dots in a row means not an IP address.
+    {"[::192.168..1]", L"[::192.168..1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+      // Any non-first components get truncated to one byte.
+    // ... omitted at this time ...
+      // Spaces should be rejected.
+    {"[::1 hello]", L"[::1 hello]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1},
+  };
+
+  for (size_t i = 0; i < arraysize(cases); i++) {
+    // 8-bit version.
+    url_parse::Component component(0,
+                                   static_cast<int>(strlen(cases[i].input8)));
+
+    std::string out_str1;
+    url_canon::StdStringCanonOutput output1(&out_str1);
+    url_canon::CanonHostInfo host_info;
+    url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1,
+                                     &host_info);
+    output1.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info.family);
+    if (host_info.family == CanonHostInfo::IPV6) {
+      EXPECT_STREQ(cases[i].expected, out_str1.c_str());
+      EXPECT_EQ(cases[i].expected_component.begin,
+                host_info.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+    }
+
+    // 16-bit version.
+    string16 input16(WStringToUTF16(cases[i].input16));
+    component = url_parse::Component(0, static_cast<int>(input16.length()));
+
+    std::string out_str2;
+    url_canon::StdStringCanonOutput output2(&out_str2);
+    url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2,
+                                     &host_info);
+    output2.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info.family);
+    if (host_info.family == CanonHostInfo::IPV6) {
+      EXPECT_STREQ(cases[i].expected, out_str2.c_str());
+      EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+    }
+  }
+}
+
+TEST(URLCanonTest, IPEmpty) {
+  std::string out_str1;
+  url_canon::StdStringCanonOutput output1(&out_str1);
+  url_canon::CanonHostInfo host_info;
+
+  // Invalid and empty components should never be reported as IP addresses.
+  const char spec[] = "192.168.0.1";
+  url_canon::CanonicalizeIPAddress(spec, url_parse::Component(),
+                                   &output1, &host_info);
+  EXPECT_FALSE(host_info.IsIPAddress());
+
+  url_canon::CanonicalizeIPAddress(spec, url_parse::Component(0, 0),
+                                   &output1, &host_info);
+  EXPECT_FALSE(host_info.IsIPAddress());
+}
+
 TEST(URLCanonTest, UserInfo) {
   // Note that the canonicalizer should escape and treat empty components as
   // not being there.
@@ -532,14 +799,16 @@
     {"http://:@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
     {"http://foo:@host.com/", "foo@", url_parse::Component(0, 3), url_parse::Component(0, -1), true},
     {"http://:foo@host.com/", ":foo@", url_parse::Component(0, 0), url_parse::Component(1, 3), true},
-    {"http://^ :$\t@host.com/", "^%20:$%09@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
+    {"http://^ :$\t@host.com/", "%5E%20:$%09@", url_parse::Component(0, 6), url_parse::Component(7, 4), true},
+    {"http://user:pass@/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
+    {"http://%2540:bar@domain.com/", "%2540:bar@", url_parse::Component(0, 5), url_parse::Component(6, 3), true },
 
       // IE7 compatability: old versions allowed backslashes in usernames, but
       // IE7 does not. We disallow it as well.
     {"ftp://me\\mydomain:pass@foo.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
   };
 
-  for (int i = 0; i < ARRAYSIZE(user_info_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) {
     int url_len = static_cast<int>(strlen(user_info_cases[i].input));
     url_parse::Parsed parsed;
     url_parse::ParseStandardURL(user_info_cases[i].input, url_len, &parsed);
@@ -565,7 +834,7 @@
     // Now try the wide version
     out_str.clear();
     url_canon::StdStringCanonOutput output2(&out_str);
-    UTF16String wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input));
+    string16 wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input));
     success = url_canon::CanonicalizeUserInfo(wide_input.c_str(),
                                               parsed.username,
                                               wide_input.c_str(),
@@ -588,24 +857,32 @@
   //
   // Note that the CanonicalizePort will always prepend a colon to the output
   // to separate it from the colon that it assumes preceeds it.
-  const int default_port = 80;
-  ComponentCase port_cases[] = {
+  struct PortCase {
+    const char* input;
+    int default_port;
+    const char* expected;
+    url_parse::Component expected_component;
+    bool expected_success;
+  } port_cases[] = {
       // Invalid input should be copied w/ failure.
-    {"as df", ":as%20df", url_parse::Component(1, 7), false},
+    {"as df", 80, ":as%20df", url_parse::Component(1, 7), false},
+    {"-2", 80, ":-2", url_parse::Component(1, 2), false},
       // Default port should be omitted.
-    {"80", "", url_parse::Component(0, -1), true},
-    {"8080", ":8080", url_parse::Component(1, 4), true},
+    {"80", 80, "", url_parse::Component(0, -1), true},
+    {"8080", 80, ":8080", url_parse::Component(1, 4), true},
+      // PORT_UNSPECIFIED should mean always keep the port.
+    {"80", url_parse::PORT_UNSPECIFIED, ":80", url_parse::Component(1, 2), true},
   };
 
-  for (int i = 0; i < arraysize(port_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) {
     int url_len = static_cast<int>(strlen(port_cases[i].input));
     url_parse::Component in_comp(0, url_len);
     url_parse::Component out_comp;
     std::string out_str;
     url_canon::StdStringCanonOutput output1(&out_str);
     bool success = url_canon::CanonicalizePort(port_cases[i].input, in_comp,
-                                               default_port, &output1,
-                                               &out_comp);
+                                               port_cases[i].default_port,
+                                               &output1, &out_comp);
     output1.Complete();
 
     EXPECT_EQ(port_cases[i].expected_success, success);
@@ -616,9 +893,10 @@
     // Now try the wide version
     out_str.clear();
     url_canon::StdStringCanonOutput output2(&out_str);
-    UTF16String wide_input(ConvertUTF8ToUTF16(port_cases[i].input));
+    string16 wide_input(ConvertUTF8ToUTF16(port_cases[i].input));
     success = url_canon::CanonicalizePort(wide_input.c_str(), in_comp,
-                                          default_port, &output2, &out_comp);
+                                          port_cases[i].default_port,
+                                          &output2, &out_comp);
     output2.Complete();
 
     EXPECT_EQ(port_cases[i].expected_success, success);
@@ -671,6 +949,8 @@
       // Funny characters that are unescaped should be escaped
     {"/foo\x09\x91%91", NULL, "/foo%09%91%91", url_parse::Component(0, 13), true},
     {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", url_parse::Component(0, 16), true},
+      // Invalid characters that are escaped should cause a failure.
+    {"/foo%00%51", L"/foo%00%51", "/foo%00Q", url_parse::Component(0, 8), false},
       // Some characters should be passed through unchanged regardless of esc.
     {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", url_parse::Component(0, 13), true},
       // Characters that are properly escaped should not have the case changed
@@ -680,6 +960,14 @@
     {"/foo\tbar", L"/foo\tbar", "/foo%09bar", url_parse::Component(0, 10), true},
       // Backslashes should get converted to forward slashes
     {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", url_parse::Component(0, 8), true},
+      // Hashes found in paths (possibly only when the caller explicitly sets
+      // the path on an already-parsed URL) should be escaped.
+    {"/foo#bar", L"/foo#bar", "/foo%23bar", url_parse::Component(0, 10), true},
+      // %7f should be allowed and %3D should not be unescaped (these were wrong
+      // in a previous version).
+    {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true},
+      // @ should be passed through unchanged (escaped or unescaped).
+    {"/@asdf%40", L"/@asdf%40", "/@asdf%40", url_parse::Component(0, 9), true},
 
     // ----- encoding tests -----
       // Basic conversions
@@ -690,7 +978,7 @@
     {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", url_parse::Component(0, 13), false},
   };
 
-  for (int i = 0; i < arraysize(path_cases); i++) {
+  for (size_t i = 0; i < arraysize(path_cases); i++) {
     if (path_cases[i].input8) {
       int len = static_cast<int>(strlen(path_cases[i].input8));
       url_parse::Component in_comp(0, len);
@@ -708,7 +996,7 @@
     }
 
     if (path_cases[i].input16) {
-      UTF16String input16(WStringToUTF16(path_cases[i].input16));
+      string16 input16(WStringToUTF16(path_cases[i].input16));
       int len = static_cast<int>(input16.length());
       url_parse::Component in_comp(0, len);
       url_parse::Component out_comp;
@@ -725,6 +1013,20 @@
       EXPECT_EQ(path_cases[i].expected, out_str);
     }
   }
+
+  // Manual test: embedded NULLs should be escaped and the URL should be marked
+  // as invalid.
+  const char path_with_null[] = "/ab\0c";
+  url_parse::Component in_comp(0, 5);
+  url_parse::Component out_comp;
+
+  std::string out_str;
+  url_canon::StdStringCanonOutput output(&out_str);
+  bool success = url_canon::CanonicalizePath(path_with_null, in_comp,
+                                             &output, &out_comp);
+  output.Complete();
+  EXPECT_FALSE(success);
+  EXPECT_EQ("/ab%00c", out_str);
 }
 
 TEST(URLCanonTest, Query) {
@@ -741,6 +1043,8 @@
     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
       // Allow question marks in the query without escaping
     {"as?df", L"as?df", NULL, "?as?df"},
+      // Always escape '#' since it would mark the ref.
+    {"as#df", L"as#df", NULL, "?as%23df"},
       // Escape some questionable 8-bit characters, but never unescape.
     {"\x02hello\x7f bye", L"\x02hello\x7f bye", NULL, "?%02hello%7F%20bye"},
     {"%40%41123", L"%40%41123", NULL, "?%40%41123"},
@@ -754,9 +1058,14 @@
     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", "?q=Chinese%26%2365319%3B"},
       // Invalid UTF-8/16 input should be replaced with invalid characters.
     {"q=\xed\xed", L"q=\xd800\xd800", NULL, "?q=%EF%BF%BD%EF%BF%BD"},
+      // Don't allow < or > because sometimes they are used for XSS if the
+      // URL is echoed in content. Firefox does this, IE doesn't.
+    {"q=<asdf>", L"q=<asdf>", NULL, "?q=%3Casdf%3E"},
+      // Escape double quotemarks in the query.
+    {"q=\"asdf\"", L"q=\"asdf\"", NULL, "?q=%22asdf%22"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(query_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
     url_parse::Component out_comp;
 
     UConvScoper conv(query_cases[i].encoding);
@@ -782,7 +1091,7 @@
     }
 
     if (query_cases[i].input16) {
-      UTF16String input16(WStringToUTF16(query_cases[i].input16));
+      string16 input16(WStringToUTF16(query_cases[i].input16));
       int len = static_cast<int>(input16.length());
       url_parse::Component in_comp(0, len);
       std::string out_str;
@@ -818,11 +1127,16 @@
       // Escaping should be preserved unchanged, even invalid ones
     {"%41%a", L"%41%a", "#%41%a", url_parse::Component(1, 5), true},
       // Invalid UTF-8/16 input should be flagged and the input made valid
-    {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), false},
-    {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), false},
+    {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), true},
+    {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), true},
+      // Test a Unicode invalid character.
+    {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", url_parse::Component(1, 4), true},
+      // Refs can have # signs and we should preserve them.
+    {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", url_parse::Component(1, 9), true},
+    {"#asdf", L"#asdf", "##asdf", url_parse::Component(1, 5), true},
   };
 
-  for (int i = 0; i < arraysize(ref_cases); i++) {
+  for (size_t i = 0; i < arraysize(ref_cases); i++) {
     // 8-bit input
     if (ref_cases[i].input8) {
       int len = static_cast<int>(strlen(ref_cases[i].input8));
@@ -831,11 +1145,10 @@
 
       std::string out_str;
       url_canon::StdStringCanonOutput output(&out_str);
-      bool success = url_canon::CanonicalizeRef(ref_cases[i].input8, in_comp,
+      url_canon::CanonicalizeRef(ref_cases[i].input8, in_comp,
                                                 &output, &out_comp);
       output.Complete();
 
-      EXPECT_EQ(ref_cases[i].expected_success, success);
       EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin);
       EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len);
       EXPECT_EQ(ref_cases[i].expected, out_str);
@@ -843,28 +1156,41 @@
 
     // 16-bit input
     if (ref_cases[i].input16) {
-      UTF16String input16(WStringToUTF16(ref_cases[i].input16));
+      string16 input16(WStringToUTF16(ref_cases[i].input16));
       int len = static_cast<int>(input16.length());
       url_parse::Component in_comp(0, len);
       url_parse::Component out_comp;
 
       std::string out_str;
       url_canon::StdStringCanonOutput output(&out_str);
-      bool success = url_canon::CanonicalizeRef(input16.c_str(), in_comp,
-                                                &output, &out_comp);
+      url_canon::CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp);
       output.Complete();
 
-      EXPECT_EQ(ref_cases[i].expected_success, success);
       EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin);
       EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len);
       EXPECT_EQ(ref_cases[i].expected, out_str);
     }
   }
+
+  // Try one with an embedded NULL. It should be stripped.
+  const char null_input[5] = "ab\x00z";
+  url_parse::Component null_input_component(0, 4);
+  url_parse::Component out_comp;
+
+  std::string out_str;
+  url_canon::StdStringCanonOutput output(&out_str);
+  url_canon::CanonicalizeRef(null_input, null_input_component,
+                             &output, &out_comp);
+  output.Complete();
+
+  EXPECT_EQ(1, out_comp.begin);
+  EXPECT_EQ(3, out_comp.len);
+  EXPECT_EQ("#abz", out_str);
 }
 
 TEST(URLCanonTest, CanonicalizeStandardURL) {
   // The individual component canonicalize tests should have caught the cases
-  // for each of thost components. Here, we just need to test that the various
+  // for each of those components. Here, we just need to test that the various
   // parts are included or excluded properly, and have the correct separators.
   struct URLCase {
     const char* input;
@@ -872,19 +1198,44 @@
     bool expected_success;
   } cases[] = {
     {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true},
-    {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com/;p?#", false},
+    {"http://[www.google.com]/", "http://[www.google.com]/", false},
+    {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false},
     {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true},
     {"www.google.com", ":www.google.com/", true},
     {"http://192.0x00A80001", "http://192.168.0.1/", true},
+    {"http://www/foo%2Ehtml", "http://www/foo.html", true},
+    {"http://user:pass@/", "http://user:pass@/", false},
+    {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true},
 
       // Backslashes should get converted to forward slashes.
     {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
 
       // Busted refs shouldn't make the whole thing fail.
     {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true},
+
+      // Basic port tests.
+    {"http://foo:80/", "http://foo/", true},
+    {"http://foo:81/", "http://foo:81/", true},
+    {"httpa://foo:80/", "httpa://foo:80/", true},
+    {"http://foo:-80/", "http://foo:-80/", false},
+
+    {"https://foo:443/", "https://foo/", true},
+    {"https://foo:80/", "https://foo:80/", true},
+    {"ftp://foo:21/", "ftp://foo/", true},
+    {"ftp://foo:80/", "ftp://foo:80/", true},
+    {"gopher://foo:70/", "gopher://foo/", true},
+    {"gopher://foo:443/", "gopher://foo:443/", true},
+    {"ws://foo:80/", "ws://foo/", true},
+    {"ws://foo:81/", "ws://foo:81/", true},
+    {"ws://foo:443/", "ws://foo:443/", true},
+    {"ws://foo:815/", "ws://foo:815/", true},
+    {"wss://foo:80/", "wss://foo:80/", true},
+    {"wss://foo:81/", "wss://foo:81/", true},
+    {"wss://foo:443/", "wss://foo/", true},
+    {"wss://foo:815/", "wss://foo:815/", true},
   };
 
-  for (int i = 0; i < ARRAYSIZE(cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
     int url_len = static_cast<int>(strlen(cases[i].input));
     url_parse::Parsed parsed;
     url_parse::ParseStandardURL(cases[i].input, url_len, &parsed);
@@ -913,7 +1264,7 @@
     {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
   };
 
-  for (int i = 0; i < arraysize(replace_cases); i++) {
+  for (size_t i = 0; i < arraysize(replace_cases); i++) {
     const ReplaceCase& cur = replace_cases[i];
     int base_len = static_cast<int>(strlen(cur.base));
     url_parse::Parsed parsed;
@@ -942,6 +1293,34 @@
 
     EXPECT_EQ(replace_cases[i].expected, out_str);
   }
+
+  // The path pointer should be ignored if the address is invalid.
+  {
+    const char src[] = "http://www.google.com/here_is_the_path";
+    int src_len = static_cast<int>(strlen(src));
+
+    url_parse::Parsed parsed;
+    url_parse::ParseStandardURL(src, src_len, &parsed);
+
+    // Replace the path with a 0-length string. By using 1 as the string
+    // address, the test should get an access violation if it dereferences it.
+    url_canon::Replacements<char> r;
+    r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component(0, 0));
+    std::string out_str1;
+    url_canon::StdStringCanonOutput output1(&out_str1);
+    url_parse::Parsed new_parsed;
+    url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed);
+    output1.Complete();
+    EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
+
+    // Same with an "invalid" path.
+    r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component());
+    std::string out_str2;
+    url_canon::StdStringCanonOutput output2(&out_str2);
+    url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed);
+    output2.Complete();
+    EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
+  }
 }
 
 TEST(URLCanonTest, ReplaceFileURL) {
@@ -961,7 +1340,7 @@
     {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
   };
 
-  for (int i = 0; i < arraysize(replace_cases); i++) {
+  for (size_t i = 0; i < arraysize(replace_cases); i++) {
     const ReplaceCase& cur = replace_cases[i];
     int base_len = static_cast<int>(strlen(cur.base));
     url_parse::Parsed parsed;
@@ -1001,7 +1380,7 @@
     {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"},
   };
 
-  for (int i = 0; i < arraysize(replace_cases); i++) {
+  for (size_t i = 0; i < arraysize(replace_cases); i++) {
     const ReplaceCase& cur = replace_cases[i];
     int base_len = static_cast<int>(strlen(cur.base));
     url_parse::Parsed parsed;
@@ -1029,6 +1408,58 @@
   }
 }
 
+TEST(URLCanonTest, ReplaceMailtoURL) {
+  ReplaceCase replace_cases[] = {
+      // Replace everything
+    {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"},
+      // Replace nothing
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"},
+      // Replace the path
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"},
+      // Replace the query
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"},
+      // Replace the path and query
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"},
+      // Set the query to empty (should leave trailing question mark)
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"},
+      // Clear the query
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"},
+      // Clear the path
+    {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"},
+      // Clear the path + query
+    {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"},
+      // Setting the ref should have no effect
+    {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"},
+  };
+
+  for (size_t i = 0; i < arraysize(replace_cases); i++) {
+    const ReplaceCase& cur = replace_cases[i];
+    int base_len = static_cast<int>(strlen(cur.base));
+    url_parse::Parsed parsed;
+    url_parse::ParseMailtoURL(cur.base, base_len, &parsed);
+
+    url_canon::Replacements<char> r;
+    typedef url_canon::Replacements<char> R;
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
+
+    std::string out_str;
+    url_canon::StdStringCanonOutput output(&out_str);
+    url_parse::Parsed out_parsed;
+    url_canon::ReplaceMailtoURL(cur.base, parsed,
+                                r, &output, &out_parsed);
+    output.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+}
+
 TEST(URLCanonTest, CanonicalizeFileURL) {
   struct URLCase {
     const char* input;
@@ -1039,48 +1470,59 @@
   } cases[] = {
 #ifdef _WIN32
       // Windows-style paths
-    {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 16)},
-    {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 19)},
-    {"file:", "file:///", true, url_parse::Component(7, 0), url_parse::Component(7, 1)},
+    {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+    {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
+    {"file:", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
     {"file:UNChost/path", "file://unchost/path", true, url_parse::Component(7, 7), url_parse::Component(14, 5)},
       // CanonicalizeFileURL supports absolute Windows style paths for IE
       // compatability. Note that the caller must decide that this is a file
       // URL itself so it can call the file canonicalizer. This is usually
       // done automatically as part of relative URL resolving.
-    {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(7, 0), url_parse::Component(7, 11)},
-    {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(7, 0), url_parse::Component(7, 11)},
-    {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(7, 0), url_parse::Component(7, 11)},
-    {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(7, 0), url_parse::Component(7, 11)},
+    {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+    {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+    {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+    {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
     {"//server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
     {"\\\\server\\file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
     {"/\\server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
       // We should preserve the number of slashes after the colon for IE
       // compatability, except when there is none, in which case we should
       // add one.
-    {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 16)},
-    {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 19)},
+    {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+    {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
       // Three slashes should be non-UNC, even if there is no drive spec (IE
       // does this, which makes the resulting request invalid).
-    {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(7, 0), url_parse::Component(7, 12)},
+    {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(), url_parse::Component(7, 12)},
       // TODO(brettw) we should probably fail for invalid host names, which
-      // would change the expected result on this test.
-    {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7%3A////foo/bar.html", false, url_parse::Component(7, 4), url_parse::Component(11, 16)},
+      // would change the expected result on this test. We also currently allow
+      // colon even though it's probably invalid, because it's currently the
+      // "natural" result of the way the canonicalizer is written. There doesn't
+      // seem to be a strong argument for why allowing it here would be bad, so
+      // we just tolerate it and the load will fail later.
+    {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, url_parse::Component(7, 2), url_parse::Component(9, 16)},
     {"file:filer/home\\me", "file://filer/home/me", true, url_parse::Component(7, 5), url_parse::Component(12, 8)},
       // Make sure relative paths can't go above the "C:"
-    {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 12)},
+    {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(), url_parse::Component(7, 12)},
       // Busted refs shouldn't make the whole thing fail.
-    {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(7, 0), url_parse::Component(7, 8)},
+    {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(), url_parse::Component(7, 8)},
 #else
       // Unix-style paths
-    {"file:///home/me", "file:///home/me", true, url_parse::Component(7, 0), url_parse::Component(7, 8)},
+    {"file:///home/me", "file:///home/me", true, url_parse::Component(), url_parse::Component(7, 8)},
       // Windowsy ones should get still treated as Unix-style.
-    {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 16)},
-    {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(7, 0), url_parse::Component(7, 19)},
-      // TODO(brettw) there should be a "file://localhost/" example here.
+    {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+    {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
+      // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
+    {"//", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
+    {"///", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
+    {"///test", "file:///test", true, url_parse::Component(), url_parse::Component(7, 5)},
+    {"file://test", "file://test/", true, url_parse::Component(7, 4), url_parse::Component(11, 1)},
+    {"file://localhost",  "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
+    {"file://localhost/", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
+    {"file://localhost/test", "file://localhost/test", true, url_parse::Component(7, 9), url_parse::Component(16, 5)},
 #endif  // _WIN32
   };
 
-  for (int i = 0; i < ARRAYSIZE(cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
     int url_len = static_cast<int>(strlen(cases[i].input));
     url_parse::Parsed parsed;
     url_parse::ParseFileURL(cases[i].input, url_len, &parsed);
@@ -1120,7 +1562,7 @@
     {":\":This /is interesting;?#", ":\":This /is interesting;?#"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(path_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) {
     int url_len = static_cast<int>(strlen(path_cases[i].input));
     url_parse::Parsed parsed;
     url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed);
@@ -1136,8 +1578,8 @@
     EXPECT_TRUE(success);
     EXPECT_EQ(path_cases[i].expected, out_str);
 
-    // Hosts should be 0 length not -1 if they don't exist
-    EXPECT_EQ(0, out_parsed.host.len);
+    EXPECT_EQ(0, out_parsed.host.begin);
+    EXPECT_EQ(-1, out_parsed.host.len);
 
     // When we end with a colon at the end, there should be no path.
     if (path_cases[i].input[url_len - 1] == ':') {
@@ -1147,6 +1589,65 @@
   }
 }
 
+TEST(URLCanonTest, CanonicalizeMailtoURL) {
+  struct URLCase {
+    const char* input;
+    const char* expected;
+    bool expected_success;
+    url_parse::Component expected_path;
+    url_parse::Component expected_query;
+  } cases[] = {
+    {"mailto:addr1", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
+    {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, url_parse::Component(7, 13), url_parse::Component()},
+    // Trailing whitespace is stripped.
+    {"MaIlTo:addr1 \t ", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
+    {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, url_parse::Component(7, 5), url_parse::Component(13,6)},
+    {"mailto:addr1,addr2", "mailto:addr1,addr2", true, url_parse::Component(7, 11), url_parse::Component()},
+    {"mailto:addr1, addr2", "mailto:addr1, addr2", true, url_parse::Component(7, 12), url_parse::Component()},
+    {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, url_parse::Component(7, 13), url_parse::Component()},
+    {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, url_parse::Component(7, 12), url_parse::Component()},
+    // Null character should be escaped to %00
+    {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, url_parse::Component(7, 13), url_parse::Component(21, 3)},
+    // Invalid -- UTF-8 encoded surrogate value.
+    {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, url_parse::Component(7, 9), url_parse::Component()},
+    {"mailto:addr1?", "mailto:addr1?", true, url_parse::Component(7, 5), url_parse::Component(13, 0)},
+  };
+
+  // Define outside of loop to catch bugs where components aren't reset
+  url_parse::Parsed parsed;
+  url_parse::Parsed out_parsed;
+
+  for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+    int url_len = static_cast<int>(strlen(cases[i].input));
+    if (i == 8) {
+      // The 9th test case purposely has a '\0' in it -- don't count it
+      // as the string terminator.
+      url_len = 22;
+    }
+    url_parse::ParseMailtoURL(cases[i].input, url_len, &parsed);
+
+    std::string out_str;
+    url_canon::StdStringCanonOutput output(&out_str);
+    bool success = url_canon::CanonicalizeMailtoURL(cases[i].input, url_len,
+                                                    parsed, &output,
+                                                    &out_parsed);
+    output.Complete();
+
+    EXPECT_EQ(cases[i].expected_success, success);
+    EXPECT_EQ(cases[i].expected, out_str);
+
+    // Make sure the spec was properly identified
+    EXPECT_EQ(0, out_parsed.scheme.begin);
+    EXPECT_EQ(6, out_parsed.scheme.len);
+
+    EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
+    EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
+
+    EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
+    EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
+  }
+}
+
 #ifndef WIN32
 
 TEST(URLCanonTest, _itoa_s) {
@@ -1157,60 +1658,67 @@
   memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itoa_s(12, buf, sizeof(buf) - 1, 10));
   EXPECT_STREQ("12", buf);
-  EXPECT_EQ(0xff, buf[3]);
+  EXPECT_EQ('\xFF', buf[3]);
 
   // Test the edge cases - exactly the buffer size and one over
   memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 10));
   EXPECT_STREQ("1234", buf);
-  EXPECT_EQ(0xff, buf[5]);
+  EXPECT_EQ('\xFF', buf[5]);
 
   memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(EINVAL, url_canon::_itoa_s(12345, buf, sizeof(buf) - 1, 10));
-  EXPECT_EQ(0xff, buf[5]);  // should never write to this location
+  EXPECT_EQ('\xFF', buf[5]);  // should never write to this location
 
   // Test the template overload (note that this will see the full buffer)
   memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itoa_s(12, buf, 10));
   EXPECT_STREQ("12", buf);
-  EXPECT_EQ(0xff, buf[3]);
+  EXPECT_EQ('\xFF', buf[3]);
 
   memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itoa_s(12345, buf, 10));
   EXPECT_STREQ("12345", buf);
 
   EXPECT_EQ(EINVAL, url_canon::_itoa_s(123456, buf, 10));
+
+  // Test that radix 16 is supported.
+  memset(buf, 0xff, sizeof(buf));
+  EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 16));
+  EXPECT_STREQ("4d2", buf);
+  EXPECT_EQ('\xFF', buf[5]);
 }
 
 TEST(URLCanonTest, _itow_s) {
   // We fill the buffer with 0xff to ensure that it's getting properly
   // null-terminated.  We also allocate one byte more than what we tell
   // _itoa_s about, and ensure that the extra byte is untouched.
-  UTF16Char buf[6];
-  memset(buf, 0xff, sizeof(buf));
+  char16 buf[6];
+  const char fill_mem = 0xff;
+  const char16 fill_char = 0xffff;
+  memset(buf, fill_mem, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itow_s(12, buf, sizeof(buf) / 2 - 1, 10));
-  EXPECT_EQ(WStringToUTF16(L"12"), UTF16String(buf));
-  EXPECT_EQ(0xffff, buf[3]);
+  EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
+  EXPECT_EQ(fill_char, buf[3]);
 
   // Test the edge cases - exactly the buffer size and one over
-  memset(buf, 0xff, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itow_s(1234, buf, sizeof(buf) / 2 - 1, 10));
-  EXPECT_EQ(WStringToUTF16(L"1234"), UTF16String(buf));
-  EXPECT_EQ(0xffff, buf[5]);
+  EXPECT_EQ(WStringToUTF16(L"1234"), string16(buf));
+  EXPECT_EQ(fill_char, buf[5]);
 
-  memset(buf, 0xff, sizeof(buf));
+  memset(buf, fill_mem, sizeof(buf));
   EXPECT_EQ(EINVAL, url_canon::_itow_s(12345, buf, sizeof(buf) / 2 - 1, 10));
-  EXPECT_EQ(0xffff, buf[5]);  // should never write to this location
+  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location
 
   // Test the template overload (note that this will see the full buffer)
-  memset(buf, 0xff, sizeof(buf));
+  memset(buf, fill_mem, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itow_s(12, buf, 10));
-  EXPECT_EQ(WStringToUTF16(L"12"), UTF16String(buf));
-  EXPECT_EQ(0xffff, buf[3]);
+  EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
+  EXPECT_EQ(fill_char, buf[3]);
 
-  memset(buf, 0xff, sizeof(buf));
+  memset(buf, fill_mem, sizeof(buf));
   EXPECT_EQ(0, url_canon::_itow_s(12345, buf, 10));
-  EXPECT_EQ(WStringToUTF16(L"12345"), UTF16String(buf));
+  EXPECT_EQ(WStringToUTF16(L"12345"), string16(buf));
 
   EXPECT_EQ(EINVAL, url_canon::_itow_s(123456, buf, 10));
 }
@@ -1244,8 +1752,11 @@
       // Basic absolute input.
     {"http://host/a", true, false, "http://another/", true, false, false, NULL},
     {"http://host/a", true, false, "http:////another/", true, false, false, NULL},
-      // Empty relative URLs shouldn't change the input.
+      // Empty relative URLs should only remove the ref part of the URL,
+      // leaving the rest unchanged.
     {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
+    {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
+    {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
       // Spaces at the ends of the relative path should be ignored.
     {"http://foo/bar", true, false, "  another  ", true, true, true, "http://foo/another"},
     {"http://foo/bar", true, false, "  .  ", true, true, true, "http://foo/"},
@@ -1288,9 +1799,14 @@
     {"data:foobar", false, false, "baz.html", false, false, false, NULL},
     {"data:foobar", false, false, "data:baz", true, false, false, NULL},
     {"data:foobar", false, false, "data:/base", true, false, false, NULL},
-      // Non-hierarchical base: absolute input should succeed for
+      // Non-hierarchical base: absolute input should succeed.
     {"data:foobar", false, false, "http://host/", true, false, false, NULL},
     {"data:foobar", false, false, "http:host", true, false, false, NULL},
+      // Invalid schemes should be treated as relative.
+    {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
+    {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
+    {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"},
+    {"data:asdf", false, false, ":foo", false, false, false, NULL},
       // We should treat semicolons like any other character in URL resolving 
     {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"},
     {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"},
@@ -1302,6 +1818,7 @@
     {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"},
     {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"},
     {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"},
+    {"http://host/a", true, false, "//", true, true, false, "http:"},
       // IE will also allow one or the other to be a backslash to get the same
       // behavior.
     {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"},
@@ -1346,7 +1863,7 @@
     {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(rel_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) {
     const RelativeCase& cur_case = rel_cases[i];
 
     url_parse::Parsed parsed;
@@ -1366,9 +1883,10 @@
         cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
         &is_relative, &relative_component);
 
-    EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel);
-    EXPECT_EQ(cur_case.is_rel, is_relative);
-
+    EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
+        "succeed is rel failure on " << cur_case.test;
+    EXPECT_EQ(cur_case.is_rel, is_relative) <<
+        "is rel failure on " << cur_case.test;
     // Now resolve it.
     if (succeed_is_rel && is_relative && cur_case.is_rel) {
       std::string resolved;
@@ -1381,7 +1899,7 @@
       output.Complete();
 
       EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
-      EXPECT_EQ(cur_case.resolved, resolved);
+      EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
 
       // Verify that the output parsed structure is the same as parsing a
       // the URL freshly.
@@ -1397,3 +1915,41 @@
     }
   }
 }
+
+// It used to be when we did a replacement with a long buffer of UTF-16
+// characters, we would get invalid data in the URL. This is because the buffer
+// it used to hold the UTF-8 data was resized, while some pointers were still
+// kept to the old buffer that was removed.
+TEST(URLCanonTest, ReplacementOverflow) {
+  const char src[] = "file:///C:/foo/bar";
+  int src_len = static_cast<int>(strlen(src));
+  url_parse::Parsed parsed;
+  url_parse::ParseFileURL(src, src_len, &parsed);
+
+  // Override two components, the path with something short, and the query with
+  // something long enough to trigger the bug.
+  url_canon::Replacements<char16> repl;
+  string16 new_query;
+  for (int i = 0; i < 4800; i++)
+    new_query.push_back('a');
+
+  string16 new_path(WStringToUTF16(L"/foo"));
+  repl.SetPath(new_path.c_str(), url_parse::Component(0, 4));
+  repl.SetQuery(new_query.c_str(),
+                url_parse::Component(0, static_cast<int>(new_query.length())));
+
+  // Call ReplaceComponents on the string. It doesn't matter if we call it for
+  // standard URLs, file URLs, etc, since they will go to the same replacement
+  // function that was buggy.
+  url_parse::Parsed repl_parsed;
+  std::string repl_str;
+  url_canon::StdStringCanonOutput repl_output(&repl_str);
+  url_canon::ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed);
+  repl_output.Complete();
+
+  // Generate the expected string and check.
+  std::string expected("file:///foo?");
+  for (size_t i = 0; i < new_query.length(); i++)
+    expected.push_back('a');
+  EXPECT_TRUE(expected == repl_str);
+}

diff --git a/googleurl/src/url_common.h b/googleurl/src/url_common.h
new file mode 100644
index 0000000..7e7e27a
--- /dev/null
+++ b/googleurl/src/url_common.h

@@ -0,0 +1,48 @@
+// Copyright 2010, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_COMMON_H__
+#define GOOGLEURL_SRC_URL_COMMON_H__
+
+#if !defined(GURL_IMPLEMENTATION)
+#define GURL_IMPLEMENTATION 0
+#endif
+
+#if defined(WIN32) && defined(GURL_DLL)
+#if GURL_IMPLEMENTATION
+#define GURL_API __declspec(dllexport)
+#else
+#define GURL_API __declspec(dllimport)
+#endif
+#else
+#define GURL_API
+#endif
+
+#endif  // GOOGLEURL_SRC_URL_COMMON_H__
+

diff --git a/googleurl/src/url_file.h b/googleurl/src/url_file.h
index 0b71e7c..c1b8ac9 100644
--- a/googleurl/src/url_file.h
+++ b/googleurl/src/url_file.h

@@ -40,10 +40,10 @@
 #ifdef WIN32
 
 // We allow both "c:" and "c|" as drive identifiers.
-inline bool IsWindowsDriveSeparator(UTF16Char ch) {
+inline bool IsWindowsDriveSeparator(char16 ch) {
   return ch == ':' || ch == '|';
 }
-inline bool IsWindowsDriveLetter(UTF16Char ch) {
+inline bool IsWindowsDriveLetter(char16 ch) {
   return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
 }
 

diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc
index 762bc25..a08c4da 100644
--- a/googleurl/src/url_parse.cc
+++ b/googleurl/src/url_parse.cc

@@ -36,6 +36,8 @@
 
 #include "googleurl/src/url_parse.h"
 
+#include <stdlib.h>
+
 #include "base/logging.h"
 #include "googleurl/src/url_parse_internal.h"
 
@@ -44,7 +46,7 @@
 namespace {
 
 // Returns true if the given character is a valid digit to use in a port.
-inline bool IsPortDigit(UTF16Char ch) {
+inline bool IsPortDigit(char16 ch) {
   return ch >= '0' && ch <= '9';
 }
 
@@ -92,36 +94,42 @@
                      Component* port_num) {
   if (serverinfo.len == 0) {
     // No server info, host name is empty.
-    *hostname = Component(serverinfo.begin, 0);
-    *port_num = Component();
+    hostname->reset();
+    port_num->reset();
     return;
   }
 
-  // Search backwards for a ':' but stop on ']' (IPv6 address literal
-  // delimiter).
-  int i = serverinfo.begin + serverinfo.len - 1;
-  int colon = -1, bracket = -1;
-  while (i >= serverinfo.begin && colon < 0) {
+  // If the host starts with a left-bracket, assume the entire host is an
+  // IPv6 literal.  Otherwise, assume none of the host is an IPv6 literal.
+  // This assumption will be overridden if we find a right-bracket.
+  //
+  // Our IPv6 address canonicalization code requires both brackets to exist,
+  // but the ability to locate an incomplete address can still be useful.
+  int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1;
+  int colon = -1;
+
+  // Find the last right-bracket, and the last colon.
+  for (int i = serverinfo.begin; i < serverinfo.end(); i++) {
     switch (spec[i]) {
       case ']':
-        bracket = i;
+        ipv6_terminator = i;
         break;
       case ':':
-        if (bracket < 0)
-          colon = i;  // Will cause loop to terminate.
+        colon = i;
         break;
     }
-    i--;
   }
 
-  if (colon >= 0) {
+  if (colon > ipv6_terminator) {
     // Found a port number: <hostname>:<port>
     *hostname = MakeRange(serverinfo.begin, colon);
-    *port_num = MakeRange(colon + 1, serverinfo.begin + serverinfo.len);
+    if (hostname->len == 0)
+      hostname->reset();
+    *port_num = MakeRange(colon + 1, serverinfo.end());
   } else {
     // No port: <hostname>
     *hostname = serverinfo;
-    *port_num = Component();
+    port_num->reset();
   }
 }
 
@@ -130,18 +138,18 @@
 // filled into the given *port variable, or -1 if there is no port number or it
 // is invalid.
 template<typename CHAR>
-void ParseAuthority(const CHAR* spec,
-                    const Component& auth,
-                    Component* username,
-                    Component* password,
-                    Component* hostname,
-                    Component* port_num) {
+void DoParseAuthority(const CHAR* spec,
+                      const Component& auth,
+                      Component* username,
+                      Component* password,
+                      Component* hostname,
+                      Component* port_num) {
   DCHECK(auth.is_valid()) << "We should always get an authority";
   if (auth.len == 0) {
-    *username = Component();
-    *password = Component();
-    *hostname = Component(0, 0);
-    *port_num = Component();
+    username->reset();
+    password->reset();
+    hostname->reset();
+    port_num->reset();
     return;
   }
 
@@ -159,8 +167,8 @@
                     hostname, port_num);
   } else {
     // No user info, everything is server info.
-    *username = Component();
-    *password = Component();
+    username->reset();
+    password->reset();
     ParseServerInfo(spec, auth, hostname, port_num);
   }
 }
@@ -175,9 +183,9 @@
 
   // Special case when there is no path.
   if (path.len == -1) {
-    *filepath = Component();
-    *query = Component();
-    *ref = Component();
+    filepath->reset();
+    query->reset();
+    ref->reset();
     return;
   }
   DCHECK(path.len > 0) << "We should never have 0 length paths";
@@ -196,9 +204,9 @@
           query_separator = i;
         break;
       case '#':
-        // We want to find the LAST reference fragment, so overwrite any
-        // previous one.
-        ref_separator = i;
+        // Record the first # sign only.
+        if (ref_separator < 0)
+          ref_separator = i;
         break;
     }
   }
@@ -214,7 +222,7 @@
     *ref = MakeRange(ref_separator + 1, path_end);
   } else {
     file_end = query_end = path_end;
-    *ref = Component();
+    ref->reset();
   }
 
   // Query fragment: everything from the ? to the next boundary (either the end
@@ -223,14 +231,35 @@
     file_end = query_separator;
     *query = MakeRange(query_separator + 1, query_end);
   } else {
-    *query = Component();
+    query->reset();
   }
 
   // File path: treat an empty file path as no file path.
   if (file_end != path.begin)
     *filepath = MakeRange(path.begin, file_end);
   else
-    *filepath = Component();
+    filepath->reset();
+}
+
+template<typename CHAR>
+bool DoExtractScheme(const CHAR* url,
+                     int url_len,
+                     Component* scheme) {
+  // Skip leading whitespace and control characters.
+  int begin = 0;
+  while (begin < url_len && ShouldTrimFromURL(url[begin]))
+    begin++;
+  if (begin == url_len)
+    return false;  // Input is empty or all whitespace.
+
+  // Find the first colon character.
+  for (int i = begin; i < url_len; i++) {
+    if (url[i] == ':') {
+      *scheme = MakeRange(begin, i);
+      return true;
+    }
+  }
+  return false;  // No colon found: no scheme
 }
 
 // Fills in all members of the Parsed structure except for the scheme.
@@ -242,7 +271,7 @@
 // Compatability data points. I list "host", "path" extracted:
 // Input                IE6             Firefox                Us
 // -----                --------------  --------------         --------------
-// http://foo.com/      "foo.com", "/"  "foo.com", "/"         "foo.com", "/"                      
+// http://foo.com/      "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
 // http:foo.com/        "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
 // http:/foo.com/       fail(*)         "foo.com", "/"         "foo.com", "/"
 // http:\foo.com/       fail(*)         "\foo.com", "/"(fail)  "foo.com", "/"
@@ -276,37 +305,13 @@
     full_path = Component(end_auth, spec_len - end_auth);
 
   // Now parse those two sub-parts.
-  ParseAuthority(spec, authority, &parsed->username, &parsed->password,
-                 &parsed->host, &parsed->port);
+  DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
+                   &parsed->host, &parsed->port);
   ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
 }
 
-template<typename CHAR>
-bool DoExtractScheme(const CHAR* url,
-                     int url_len,
-                     Component* scheme) {
-  // Skip leading whitespace and control characters.
-  int begin = 0;
-  while (begin < url_len && ShouldTrimFromURL(url[begin]))
-    begin++;
-  if (begin == url_len)
-    return false;  // Input is empty or all whitespace.
-
-  // Find the first colon character.
-  for (int i = begin; i < url_len; i++) {
-    if (url[i] == ':') {
-      *scheme = MakeRange(begin, i);
-      return true;
-    } else if (IsAuthorityTerminator(url[i])) {
-      // An authority terminator was found before the end of the scheme, so we
-      // say that there is no scheme (for example "google.com/foo:bar").
-      return false;
-    }
-  }
-  return false;  // No colon found: no scheme
-}
-
-// The main parsing function for URLs, this is the backend for the 
+// The main parsing function for standard URLs. Standard URLs have a scheme,
+// host, path, etc.
 template<typename CHAR>
 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
   DCHECK(spec_len >= 0);
@@ -315,50 +320,14 @@
   int begin = 0;
   TrimURL(spec, &begin, &spec_len);
 
-  // Handle empty specs or ones that contain only whitespace or control chars.
-  if (begin == spec_len) {
-    // ParsedAfterScheme will fill in empty values if there is no more data.
-    parsed->scheme = Component();
-    DoParseAfterScheme(spec, spec_len, begin, parsed);
-    return;
-  }
-
-  // Find the first non-scheme character before the beginning of the path. This
-  // code handles URLs that may have empty schemes, which makes it different
-  // than the ExtractScheme code above, which can happily fail if it doesn't
-  // find a colon.
-  int scheme_colon = -1;  // Index of first colon that preceeds the authority
-  for (int i = begin; i < spec_len; i++) {
-    if (IsAuthorityTerminator(spec[i]) ||
-        spec[i] == '@' || spec[i] == '[') {
-      // Start of path, found a username ("@"), or start of an IPV6 address
-      // literal. This means there is no scheme found.
-      break;
-    }
-    if (spec[i] == ':') {
-      scheme_colon = i;
-      break;
-    }
-  }
-
   int after_scheme;
-  if (scheme_colon >= 0) {
-    //   spec = <scheme>:/<the-rest>
-    // or
-    //   spec = <scheme>:<authority>
-    //   spec = <scheme>:<path-no-slashes>
-    parsed->scheme = MakeRange(begin, scheme_colon);
-    after_scheme = scheme_colon + 1;  // Character following the colon.
+  if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
+    after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
   } else {
-    //   spec = <authority-no-port-or-password>/<path>
-    //   spec = <path>
-    // or
-    //   spec = <authority-no-port-or-password>/<path-with-colon>
-    //   spec = <path-with-colon>
-    // or
-    //   spec = <authority-no-port-or-password>
-    //   spec = <path-no-slashes-or-colon>
-    parsed->scheme = Component();
+    // Say there's no scheme when there is no colon. We could also say that
+    // everything is the scheme. Both would produce an invalid URL, but this way
+    // seems less wrong in more cases.
+    parsed->scheme.reset();
     after_scheme = begin;
   }
   DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
@@ -370,12 +339,12 @@
 void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
   // Get the non-path and non-scheme parts of the URL out of the way, we never
   // use them.
-  parsed->username = Component();
-  parsed->password = Component();
-  parsed->host = Component(0, 0);
-  parsed->port = Component();
-  parsed->query = Component();
-  parsed->ref = Component();
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->query.reset();
+  parsed->ref.reset();
 
   // Strip leading & trailing spaces and control characters.
   int begin = 0;
@@ -383,9 +352,8 @@
 
   // Handle empty specs or ones that contain only whitespace or control chars.
   if (begin == spec_len) {
-    // ParsedAfterScheme will fill in empty values if there is no more data.
-    parsed->scheme = Component();
-    parsed->path = Component();
+    parsed->scheme.reset();
+    parsed->path.reset();
     return;
   }
 
@@ -399,16 +367,78 @@
     // -1, rather than having a length of 0 (we normally wouldn't care so
     // much for these non-standard URLs).
     if (parsed->scheme.end() == spec_len - 1)
-      parsed->path = Component();
+      parsed->path.reset();
     else
       parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
   } else {
     // No scheme found, just path.
-    parsed->scheme = Component();
+    parsed->scheme.reset();
     parsed->path = MakeRange(begin, spec_len);
   }
 }
 
+template<typename CHAR>
+void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Get the non-path and non-scheme parts of the URL out of the way, we never
+  // use them.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->ref.reset();
+  parsed->query.reset();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  int path_begin = -1;
+  int path_end = -1;
+
+  // Extract the scheme, with the path being everything following. We also
+  // handle the case where there is no scheme.
+  if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    parsed->scheme.begin += begin;
+
+    if (parsed->scheme.end() != spec_len - 1) {
+      path_begin = parsed->scheme.end() + 1;
+      path_end = spec_len;
+    }
+  } else {
+    // No scheme found, just path.
+    parsed->scheme.reset();
+    path_begin = begin;
+    path_end = spec_len;
+  }
+
+  // Split [path_begin, path_end) into a path + query.
+  for (int i = path_begin; i < path_end; ++i) {
+    if (spec[i] == '?') {
+      parsed->query = MakeRange(i + 1, path_end);
+      path_end = i;
+      break;
+    }
+  }
+
+  // For compatibility with the standard URL parser, treat no path as
+  // -1, rather than having a length of 0
+  if (path_begin == path_end) {
+    parsed->path.reset();
+  } else {
+    parsed->path = MakeRange(path_begin, path_end);
+  }
+}
+
 // Converts a port number in a string to an integer. We'd like to just call
 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead,
 // we copy the digits to a small stack buffer (since we know the maximum number
@@ -416,7 +446,7 @@
 template<typename CHAR>
 int DoParsePort(const CHAR* spec, const Component& component) {
   // Easy success case when there is no port.
-  const int max_digits = 5;
+  const int kMaxDigits = 5;
   if (!component.is_nonempty())
     return PORT_UNSPECIFIED;
 
@@ -433,11 +463,11 @@
 
   // Verify we don't have too many digits (we'll be copying to our buffer so
   // we need to double-check).
-  if (digits_comp.len > max_digits)
+  if (digits_comp.len > kMaxDigits)
     return PORT_INVALID;
 
   // Copy valid digits to the buffer.
-  char digits[max_digits + 1];  // +1 for null terminator
+  char digits[kMaxDigits + 1];  // +1 for null terminator
   for (int i = 0; i < digits_comp.len; i++) {
     CHAR ch = spec[digits_comp.begin + i];
     if (!IsPortDigit(ch)) {
@@ -462,7 +492,7 @@
                        Component* file_name) {
   // Handle empty paths: they have no file names.
   if (!path.is_nonempty()) {
-    *file_name = Component();
+    file_name->reset();
     return;
   }
 
@@ -496,77 +526,125 @@
 }
 
 template<typename CHAR>
-void DoExtractQueryFragment(const CHAR* spec,
+bool DoExtractQueryKeyValue(const CHAR* spec,
                             Component* query,
                             Component* key,
                             Component* value) {
+  if (!query->is_nonempty())
+    return false;
+
   int start = query->begin;
-  int c = start;
-  int end = query->begin + query->len;
-  while (c < end && spec[c] != '&' && spec[c] != '=')
-    c++;
+  int cur = start;
+  int end = query->end();
 
-  if ((c - start) > 0) {
-    key->begin = start;
-    key->len = c - start;
-  }
+  // We assume the beginning of the input is the beginning of the "key" and we
+  // skip to the end of it.
+  key->begin = cur;
+  while (cur < end && spec[cur] != '&' && spec[cur] != '=')
+    cur++;
+  key->len = cur - key->begin;
 
-  // We have a key, skip the separator if any
-  if (c < end && spec[c] == '=')
-    ++c;
+  // Skip the separator after the key (if any).
+  if (cur < end && spec[cur] == '=')
+    cur++;
 
-  if (c < end) {
-    start = c;
-    while (c < end && spec[c] != '&')
-      c++;
-    if ((c - start) > 0) {
-      value->begin = start;
-      value->len = c - start;
-    }
-  }
+  // Find the value part.
+  value->begin = cur;
+  while (cur < end && spec[cur] != '&')
+    cur++;
+  value->len = cur - value->begin;
 
   // Finally skip the next separator if any
-  if (c < end && spec[c] == '&')
-    ++c;
+  if (cur < end && spec[cur] == '&')
+    cur++;
 
   // Save the new query
-  query->begin = c;
-  query->len = end - c;
+  *query = url_parse::MakeRange(cur, end);
+  return true;
 }
 
 }  // namespace
 
 int Parsed::Length() const {
-  if (ref.is_nonempty())
+  if (ref.is_valid())
     return ref.end();
-  if (query.is_nonempty())
-    return query.end();
-  if (path.is_nonempty())
-    return path.end();
-  if (port.is_nonempty())
-    return port.end();
-  if (host.is_nonempty())
-    return host.end();
-  if (password.is_nonempty())
-    return password.end();
-  if (username.is_nonempty())
-    return username.end();
-  if (scheme.is_nonempty())
-    return scheme.end();
-  return 0;
+  return CountCharactersBefore(REF, false);
+}
+
+int Parsed::CountCharactersBefore(ComponentType type,
+                                  bool include_delimiter) const {
+  if (type == SCHEME)
+    return scheme.begin;
+
+  // There will be some characters after the scheme like "://" and we don't
+  // know how many. Search forwards for the next thing until we find one.
+  int cur = 0;
+  if (scheme.is_valid())
+    cur = scheme.end() + 1;  // Advance over the ':' at the end of the scheme.
+
+  if (username.is_valid()) {
+    if (type <= USERNAME)
+      return username.begin;
+    cur = username.end() + 1;  // Advance over the '@' or ':' at the end.
+  }
+
+  if (password.is_valid()) {
+    if (type <= PASSWORD)
+      return password.begin;
+    cur = password.end() + 1;  // Advance over the '@' at the end.
+  }
+
+  if (host.is_valid()) {
+    if (type <= HOST)
+      return host.begin;
+    cur = host.end();
+  }
+
+  if (port.is_valid()) {
+    if (type < PORT || (type == PORT && include_delimiter))
+      return port.begin - 1;  // Back over delimiter.
+    if (type == PORT)
+      return port.begin;  // Don't want delimiter counted.
+    cur = port.end();
+  }
+
+  if (path.is_valid()) {
+    if (type <= PATH)
+      return path.begin;
+    cur = path.end();
+  }
+
+  if (query.is_valid()) {
+    if (type < QUERY || (type == QUERY && include_delimiter))
+      return query.begin - 1;  // Back over delimiter.
+    if (type == QUERY)
+      return query.begin;  // Don't want delimiter counted.
+    cur = query.end();
+  }
+
+  if (ref.is_valid()) {
+    if (type == REF && !include_delimiter)
+      return ref.begin;  // Don't want delimiter counted.
+
+    // When there is a ref and we get here, the component we wanted was before
+    // this and not found, so we always know the beginning of the ref is right.
+    return ref.begin - 1;  // Back over delimiter.
+  }
+
+  return cur;
 }
 
 bool ExtractScheme(const char* url, int url_len, Component* scheme) {
   return DoExtractScheme(url, url_len, scheme);
 }
 
-bool ExtractScheme(const UTF16Char* url, int url_len, Component* scheme) {
+bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
   return DoExtractScheme(url, url_len, scheme);
 }
 
 // This handles everything that may be an authority terminator, including
 // backslash. For special backslash handling see DoParseAfterScheme.
-bool IsAuthorityTerminator(UTF16Char ch) {
+bool IsAuthorityTerminator(char16 ch) {
   return IsURLSlash(ch) || ch == '?' || ch == '#' || ch == ';';
 }
 
@@ -576,31 +654,49 @@
   DoExtractFileName(url, path, file_name);
 }
 
-void ExtractFileName(const UTF16Char* url,
+void ExtractFileName(const char16* url,
                      const Component& path,
                      Component* file_name) {
   DoExtractFileName(url, path, file_name);
 }
 
-void ExtractQueryFragment(const char* url,
+bool ExtractQueryKeyValue(const char* url,
                           Component* query,
                           Component* key,
                           Component* value) {
-  DoExtractQueryFragment(url, query, key, value);
+  return DoExtractQueryKeyValue(url, query, key, value);
 }
 
-void ExtractQueryFragment(const UTF16Char* url,
+bool ExtractQueryKeyValue(const char16* url,
                           Component* query,
                           Component* key,
                           Component* value) {
-  DoExtractQueryFragment(url, query, key, value);
+  return DoExtractQueryKeyValue(url, query, key, value);
+}
+
+void ParseAuthority(const char* spec,
+                    const Component& auth,
+                    Component* username,
+                    Component* password,
+                    Component* hostname,
+                    Component* port_num) {
+  DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+void ParseAuthority(const char16* spec,
+                    const Component& auth,
+                    Component* username,
+                    Component* password,
+                    Component* hostname,
+                    Component* port_num) {
+  DoParseAuthority(spec, auth, username, password, hostname, port_num);
 }
 
 int ParsePort(const char* url, const Component& port) {
   return DoParsePort(url, port);
 }
 
-int ParsePort(const UTF16Char* url, const Component& port) {
+int ParsePort(const char16* url, const Component& port) {
   return DoParsePort(url, port);
 }
 
@@ -608,7 +704,7 @@
   DoParseStandardURL(url, url_len, parsed);
 }
 
-void ParseStandardURL(const UTF16Char* url, int url_len, Parsed* parsed) {
+void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) {
   DoParseStandardURL(url, url_len, parsed);
 }
 
@@ -616,10 +712,18 @@
   DoParsePathURL(url, url_len, parsed);
 }
 
-void ParsePathURL(const UTF16Char* url, int url_len, Parsed* parsed) {
+void ParsePathURL(const char16* url, int url_len, Parsed* parsed) {
   DoParsePathURL(url, url_len, parsed);
 }
 
+void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseMailtoURL(url, url_len, parsed);
+}
+
+void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParseMailtoURL(url, url_len, parsed);
+}
+
 void ParsePathInternal(const char* spec,
                        const Component& path,
                        Component* filepath,
@@ -628,7 +732,7 @@
   ParsePath(spec, path, filepath, query, ref);
 }
 
-void ParsePathInternal(const UTF16Char* spec,
+void ParsePathInternal(const char16* spec,
                        const Component& path,
                        Component* filepath,
                        Component* query,
@@ -643,7 +747,7 @@
   DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
 }
 
-void ParseAfterScheme(const UTF16Char* spec,
+void ParseAfterScheme(const char16* spec,
                       int spec_len,
                       int after_scheme,
                       Parsed* parsed) {

diff --git a/googleurl/src/url_parse.h b/googleurl/src/url_parse.h
index 281c6dc..134b445 100644
--- a/googleurl/src/url_parse.h
+++ b/googleurl/src/url_parse.h

@@ -33,17 +33,14 @@
 #include <string>
 
 #include "base/basictypes.h"
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
 
 namespace url_parse {
 
-// We represent UTF-16 data using a 2-byte character.  On platforms with
-// 2-byte wchar_t, we use that type directly.
-#ifdef WIN32
-typedef wchar_t UTF16Char;
-#else
-typedef uint16 UTF16Char;
-#endif
-typedef std::basic_string<UTF16Char> UTF16String;
+// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
+// KURLGoogle.cpp still rely on this type.
+typedef char16 UTF16Char;
 
 // Component ------------------------------------------------------------------
 
@@ -110,13 +107,57 @@
 //      url_parse::ParsePathURL(url, url_len, &parsed);
 //
 struct Parsed {
-  // Use a special initializer for the host since its length should be 0 when
-  // not present. The default constructor is sufficient for the rest.
-  Parsed() : host(0, 0) {
-  }
+  // Identifies different components.
+  enum ComponentType {
+    SCHEME,
+    USERNAME,
+    PASSWORD,
+    HOST,
+    PORT,
+    PATH,
+    QUERY,
+    REF,
+  };
+
+  // The default constructor is sufficient for the components.
+  Parsed() {}
 
   // Returns the length of the URL (the end of the last component).
-  int Length() const;
+  //
+  // Note that for some invalid, non-canonical URLs, this may not be the length
+  // of the string. For example "http://": the parsed structure will only
+  // contain an entry for the four-character scheme, and it doesn't know about
+  // the "://". For all other last-components, it will return the real length.
+  GURL_API int Length() const;
+
+  // Returns the number of characters before the given component if it exists,
+  // or where the component would be if it did exist. This will return the
+  // string length if the component would be appended to the end.
+  //
+  // Note that this can get a little funny for the port, query, and ref
+  // components which have a delimiter that is not counted as part of the
+  // component. The |include_delimiter| flag controls if you want this counted
+  // as part of the component or not when the component exists.
+  //
+  // This example shows the difference between the two flag values for two of
+  // these delimited components that are present (the port and query) and one
+  // that isn't (the reference). The components that this flag affects are
+  // marked with a *.
+  //                 0         1         2
+  //                 012345678901234567890
+  // Example input:  http://foo:80/?query
+  //              include_delim=true,  ...=false  ("<-" indicates different)
+  //      SCHEME: 0                    0
+  //    USERNAME: 5                    5
+  //    PASSWORD: 5                    5
+  //        HOST: 7                    7
+  //       *PORT: 10                   11 <-
+  //        PATH: 13                   13
+  //      *QUERY: 14                   15 <-
+  //        *REF: 20                   20
+  //
+  GURL_API int CountCharactersBefore(ComponentType type,
+                                     bool include_delimiter) const;
 
   // Scheme without the colon: "http://foo"/ would have a scheme of "http".
   // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
@@ -134,8 +175,7 @@
   // "http://me:secret@host/"
   Component password;
 
-  // Host name. The host name is never unspecified, and will have a length of
-  // zero if empty.
+  // Host name.
   Component host;
 
   // Port number.
@@ -177,20 +217,24 @@
 // StandardURL is for when the scheme is known to be one that has an
 // authority (host) like "http". This function will not handle weird ones
 // like "about:" and "javascript:", or do the right thing for "file:" URLs.
-void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
-void ParseStandardURL(const UTF16Char* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
 
 // PathURL is for when the scheme is known not to have an authority (host)
 // section but that aren't file URLs either. The scheme is parsed, and
 // everything after the scheme is considered as the path. This is used for
 // things like "about:" and "javascript:"
-void ParsePathURL(const char* url, int url_len, Parsed* parsed);
-void ParsePathURL(const UTF16Char* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
 
 // FileURL is for file URLs. There are some special rules for interpreting
 // these.
-void ParseFileURL(const char* url, int url_len, Parsed* parsed);
-void ParseFileURL(const UTF16Char* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
+
+// MailtoURL is for mailto: URLs. They are made up of a scheme, path, and query.
+GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
 
 // Helper functions -----------------------------------------------------------
 
@@ -214,12 +258,27 @@
 // end of the string).
 //
 // The 8-bit version requires UTF-8 encoding.
-bool ExtractScheme(const char* url, int url_len, Component* scheme);
-bool ExtractScheme(const UTF16Char* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
 
 // Returns true if ch is a character that terminates the authority segment
 // of a URL.
-bool IsAuthorityTerminator(UTF16Char ch);
+GURL_API bool IsAuthorityTerminator(char16 ch);
+
+// Does a best effort parse of input |spec|, in range |auth|. If a particular
+// component is not found, it will be set to invalid.
+GURL_API void ParseAuthority(const char* spec,
+                             const Component& auth,
+                             Component* username,
+                             Component* password,
+                             Component* hostname,
+                             Component* port_num);
+GURL_API void ParseAuthority(const char16* spec,
+                             const Component& auth,
+                             Component* username,
+                             Component* password,
+                             Component* hostname,
+                             Component* port_num);
 
 // Computes the integer port value from the given port component. The port
 // component should have been identified by one of the init functions on
@@ -228,8 +287,8 @@
 // The return value will be a positive integer between 0 and 64K, or one of
 // the two special values below.
 enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
-int ParsePort(const char* url, const Component& port);
-int ParsePort(const UTF16Char* url, const Component& port);
+GURL_API int ParsePort(const char* url, const Component& port);
+GURL_API int ParsePort(const char16* url, const Component& port);
 
 // Extracts the range of the file name in the given url. The path must
 // already have been computed by the parse function, and the matching URL
@@ -241,25 +300,36 @@
 // following the last slash.
 //
 // The 8-bit version requires UTF-8 encoding.
-void ExtractFileName(const char* url,
-                     const Component& path,
-                     Component* file_name);
-void ExtractFileName(const UTF16Char* url,
-                     const Component& path,
-                     Component* file_name);
+GURL_API void ExtractFileName(const char* url,
+                              const Component& path,
+                              Component* file_name);
+GURL_API void ExtractFileName(const char16* url,
+                              const Component& path,
+                              Component* file_name);
 
-// Extract the next key / value from the range defined by query.
-// Updates query to start at the end of the extracted key / value
-// pair. If no key or / and no value are found query and key are
-// unchanged.
-void ExtractQueryFragment(const char* url,
-                          Component* query,
-                          Component* key,
-                          Component* value);
-void ExtractQueryFragment(const UTF16Char* url,
-                          Component* query,
-                          Component* key,
-                          Component* value);
+// Extract the first key/value from the range defined by |*query|. Updates
+// |*query| to start at the end of the extracted key/value pair. This is
+// designed for use in a loop: you can keep calling it with the same query
+// object and it will iterate over all items in the query.
+//
+// Some key/value pairs may have the key, the value, or both be empty (for
+// example, the query string "?&"). These will be returned. Note that an empty
+// last parameter "foo.com?" or "foo.com?a&" will not be returned, this case
+// is the same as "done."
+//
+// The initial query component should not include the '?' (this is the default
+// for parsed URLs).
+//
+// If no key/value are found |*key| and |*value| will be unchanged and it will
+// return false.
+GURL_API bool ExtractQueryKeyValue(const char* url,
+                                   Component* query,
+                                   Component* key,
+                                   Component* value);
+GURL_API bool ExtractQueryKeyValue(const char16* url,
+                                   Component* query,
+                                   Component* key,
+                                   Component* value);
 
 }  // namespace url_parse
 

diff --git a/googleurl/src/url_parse_file.cc b/googleurl/src/url_parse_file.cc
index f98fb60..2e8429f 100644
--- a/googleurl/src/url_parse_file.cc
+++ b/googleurl/src/url_parse_file.cc

@@ -67,13 +67,11 @@
 
 namespace {
 
-#ifdef WIN32
-
-// A Windows-only subcomponent of DoInitFileURL, the input of this function
-// should be a UNC path name, with the index of the first character after the
-// slashes following the scheme given in |after_slashes|. This will initialize
-// the host, path, query, and ref, and leave the other output components
-// untouched (DoInitFileURL handles these for us).
+// A subcomponent of DoParseFileURL, the input of this function should be a UNC
+// path name, with the index of the first character after the slashes following
+// the scheme given in |after_slashes|. This will initialize the host, path,
+// query, and ref, and leave the other output components untouched
+// (DoParseFileURL handles these for us).
 template<typename CHAR>
 void DoParseUNC(const CHAR* spec,
                 int after_slashes,
@@ -83,39 +81,45 @@
   if (next_slash == spec_len) {
     // No additional slash found, as in "file://foo", treat the text as the
     // host with no path (this will end up being UNC to server "foo").
-    parsed->host = MakeRange(after_slashes, spec_len);
-    parsed->path = Component();
+    int host_len = spec_len - after_slashes;
+    if (host_len)
+      parsed->host = Component(after_slashes, host_len);
+    else
+      parsed->host.reset();
+    parsed->path.reset();
     return;
   }
 
+#ifdef WIN32
   // See if we have something that looks like a path following the first
   // component. As in "file://localhost/c:/", we get "c:/" out. We want to
   // treat this as a having no host but the path given. Works on Windows only.
   if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
-    parsed->host = Component(after_slashes, 0);
+    parsed->host.reset();
     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
                       &parsed->path, &parsed->query, &parsed->ref);
     return;
   }
+#endif
 
   // Otherwise, everything up until that first slash we found is the host name,
   // which will end up being the UNC host. For example "file://foo/bar.txt"
-  // will get a server name of "foo" and a path of "/bar". Later, this should
-  // be treated as the filename "\\foo\bar.txt" in proper UNC notation.
-  //
-  // This is Windows only. On Linux systems, this definition will have to
-  // be different, but so will our treatment of UNC filenames.
-  parsed->host = MakeRange(after_slashes, next_slash);
+  // will get a server name of "foo" and a path of "/bar.txt". Later, on
+  // Windows, this should be treated as the filename "\\foo\bar.txt" in proper
+  // UNC notation.
+  int host_len = next_slash - after_slashes;
+  if (host_len)
+    parsed->host = MakeRange(after_slashes, next_slash);
+  else
+    parsed->host.reset();
   if (next_slash < spec_len) {
     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
                       &parsed->path, &parsed->query, &parsed->ref);
   } else {
-    parsed->path = Component();
+    parsed->path.reset();
   }
 }
 
-#endif  // WIN32
-
 // A subcomponent of DoParseFileURL, the input should be a local file, with the
 // beginning of the path indicated by the index in |path_begin|. This will
 // initialize the host, path, query, and ref, and leave the other output
@@ -125,7 +129,7 @@
                       int path_begin,
                       int spec_len,
                       Parsed* parsed) {
-  parsed->host = Component(path_begin, 0);
+  parsed->host.reset();
   ParsePathInternal(spec, MakeRange(path_begin, spec_len),
                     &parsed->path, &parsed->query, &parsed->ref);
 }
@@ -138,26 +142,38 @@
   DCHECK(spec_len >= 0);
 
   // Get the parts we never use for file URLs out of the way.
-  parsed->username = Component();
-  parsed->password = Component();
-  parsed->port = Component();
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->port.reset();
 
   // Many of the code paths don't set these, so it's convenient to just clear
   // them. We'll write them in those cases we need them.
-  parsed->query = Component();
-  parsed->ref = Component();
+  parsed->query.reset();
+  parsed->ref.reset();
 
   // Strip leading & trailing spaces and control characters.
   int begin = 0;
   TrimURL(spec, &begin, &spec_len);
 
   // Find the scheme.
+  int num_slashes;
   int after_scheme;
+  int after_slashes;
 #ifdef WIN32
-  if (DoesBeginWindowsDriveSpec(spec, begin, spec_len) ||
-      DoesBeginUNCPath(spec, begin, spec_len, false)) {
+  // See how many slashes there are. We want to handle cases like UNC but also
+  // "/c:/foo". This is when there is no scheme, so we can allow pages to do
+  // links like "c:/foo/bar" or "//foo/bar". This is also called by the
+  // relative URL resolver when it determines there is an absolute URL, which
+  // may give us input like "/c:/foo".
+  num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
+  after_slashes = begin + num_slashes;
+  if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
     // Windows path, don't try to extract the scheme (for example, "c:\foo").
-    parsed->scheme = Component();
+    parsed->scheme.reset();
+    after_scheme = after_slashes;
+  } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
+    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
+    parsed->scheme.reset();
     after_scheme = begin;
   } else
 #endif
@@ -168,7 +184,7 @@
       after_scheme = parsed->scheme.end() + 1;
     } else {
       // No scheme found, remember that.
-      parsed->scheme = Component();
+      parsed->scheme.reset();
       after_scheme = begin;
     }
   }
@@ -176,15 +192,19 @@
   // Handle empty specs ones that contain only whitespace or control chars,
   // or that are just the scheme (for example "file:").
   if (after_scheme == spec_len) {
-    parsed->host = Component(begin, 0);
-    parsed->path = Component();
+    parsed->host.reset();
+    parsed->path.reset();
     return;
   }
 
-  int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+  num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
 
+  after_slashes = after_scheme + num_slashes;
 #ifdef WIN32
-  int after_slashes = after_scheme + num_slashes;
+  // Check whether the input is a drive again. We checked above for windows
+  // drive specs, but that's only at the very beginning to see if we have a
+  // scheme at all. This test will be duplicated in that case, but will
+  // additionally handle all cases with a real scheme such as "file:///C:/".
   if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
       num_slashes != 3) {
     // Anything not beginning with a drive spec ("c:\") on Windows is treated
@@ -193,6 +213,12 @@
     DoParseUNC(spec, after_slashes, spec_len, parsed);
     return;
   }
+#else
+  // file: URL with exactly 2 slashes is considered to have a host component.
+  if (num_slashes == 2) {
+    DoParseUNC(spec, after_slashes, spec_len, parsed);
+    return;
+  }
 #endif  // WIN32
 
   // Easy and common case, the full path immediately follows the scheme
@@ -210,7 +236,7 @@
   DoParseFileURL(url, url_len, parsed);
 }
 
-void ParseFileURL(const UTF16Char* url, int url_len, Parsed* parsed) {
+void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
   DoParseFileURL(url, url_len, parsed);
 }
 

diff --git a/googleurl/src/url_parse_internal.h b/googleurl/src/url_parse_internal.h
index f7c8ab8..61bd068 100644
--- a/googleurl/src/url_parse_internal.h
+++ b/googleurl/src/url_parse_internal.h

@@ -37,18 +37,21 @@
 namespace url_parse {
 
 // We treat slashes and backslashes the same for IE compatability.
-inline bool IsURLSlash(UTF16Char ch) {
+inline bool IsURLSlash(char16 ch) {
   return ch == '/' || ch == '\\';
 }
 
 // Returns true if we should trim this character from the URL because it is a
 // space or a control character.
-inline bool ShouldTrimFromURL(UTF16Char ch) {
+inline bool ShouldTrimFromURL(char16 ch) {
   return ch <= ' ';
 }
 
 // Given an already-initialized begin index and length, this shrinks the range
-// to eliminate "should-be-trimmed" characters.
+// to eliminate "should-be-trimmed" characters. Note that the length does *not*
+// indicate the length of untrimmed data from |*begin|, but rather the position
+// in the input string (so the string starts at character |*begin| in the spec,
+// and goes until |*len|).
 template<typename CHAR>
 inline void TrimURL(const CHAR* spec, int* begin, int* len) {
   // Strip leading whitespace and control characters.
@@ -85,7 +88,7 @@
                        Component* filepath,
                        Component* query,
                        Component* ref);
-void ParsePathInternal(const UTF16Char* spec,
+void ParsePathInternal(const char16* spec,
                        const Component& path,
                        Component* filepath,
                        Component* query,
@@ -99,7 +102,7 @@
                       int spec_len,
                       int after_scheme,
                       Parsed* parsed);
-void ParseAfterScheme(const UTF16Char* spec,
+void ParseAfterScheme(const char16* spec,
                       int spec_len,
                       int after_scheme,
                       Parsed* parsed);

diff --git a/googleurl/src/url_parse_unittest.cc b/googleurl/src/url_parse_unittest.cc
index 95e77f3..299488b 100644
--- a/googleurl/src/url_parse_unittest.cc
+++ b/googleurl/src/url_parse_unittest.cc

@@ -84,7 +84,7 @@
   const char* ref;
 };
 
-// Simpler version of the above for testing path URLs.
+// Simpler version of URLParseCase for testing path URLs.
 struct PathURLParseCase {
   const char* input;
 
@@ -92,6 +92,16 @@
   const char* path;
 };
 
+// Simpler version of URLParseCase for testing mailto URLs.
+struct MailtoURLParseCase {
+  const char* input;
+
+  const char* scheme;
+  const char* path;
+  const char* query;
+};
+
+
 bool ComponentMatches(const char* input,
                       const char* reference,
                       const url_parse::Component& component) {
@@ -107,16 +117,118 @@
   if (component.len < 0)
     return false;  // Reference is not NULL but we don't have anything
 
-  if (strlen(reference) != component.len)
+  if (strlen(reference) != static_cast<size_t>(component.len))
     return false;  // Lengths don't match
 
   // Now check the actual characters.
   return strncmp(reference, &input[component.begin], component.len) == 0;
 }
 
+void ExpectInvalidComponent(const url_parse::Component& component) {
+  EXPECT_EQ(0, component.begin);
+  EXPECT_EQ(-1, component.len);
+}
+
 }  // namespace
 
-// Standard -------------------------------------------------------------------
+// Parsed ----------------------------------------------------------------------
+
+TEST(URLParser, Length) {
+  const char* length_cases[] = {
+      // One with everything in it.
+    "http://user:pass@host:99/foo?bar#baz",
+      // One with nothing in it.
+    "",
+      // Working backwards, let's start taking off stuff from the full one.
+    "http://user:pass@host:99/foo?bar#",
+    "http://user:pass@host:99/foo?bar",
+    "http://user:pass@host:99/foo?",
+    "http://user:pass@host:99/foo",
+    "http://user:pass@host:99/",
+    "http://user:pass@host:99",
+    "http://user:pass@host:",
+    "http://user:pass@host",
+    "http://host",
+    "http://user@",
+    "http:",
+  };
+  for (size_t i = 0; i < arraysize(length_cases); i++) {
+    int true_length = static_cast<int>(strlen(length_cases[i]));
+
+    url_parse::Parsed parsed;
+    url_parse::ParseStandardURL(length_cases[i], true_length, &parsed);
+
+    EXPECT_EQ(true_length, parsed.Length());
+  }
+}
+
+TEST(URLParser, CountCharactersBefore) {
+  using namespace url_parse;
+  struct CountCase {
+    const char* url;
+    Parsed::ComponentType component;
+    bool include_delimiter;
+    int expected_count;
+  } count_cases[] = {
+      // Test each possibility in the case where all components are present.
+//    0         1         2
+//    0123456789012345678901
+    {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0},
+    {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0},
+    {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7},
+    {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7},
+    {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9},
+    {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9},
+    {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11},
+    {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11},
+    {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12},
+    {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13},
+    {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14},
+    {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14},
+    {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16},
+    {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17},
+    {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18},
+    {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19},
+      // Now test when the requested component is missing.
+    {"http://u:p@h:8/p?", Parsed::REF, true, 17},
+    {"http://u:p@h:8/p?q", Parsed::REF, true, 18},
+    {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16},
+    {"http://u:p@h:8#r", Parsed::PATH, true, 14},
+    {"http://u:p@h/", Parsed::PORT, true, 12},
+    {"http://u:p@/", Parsed::HOST, true, 11},
+      // This case is a little weird. It will report that the password would
+      // start where the host begins. This is arguably correct, although you
+      // could also argue that it should start at the '@' sign. Doing it
+      // starting with the '@' sign is actually harder, so we don't bother.
+    {"http://u@h/", Parsed::PASSWORD, true, 9},
+    {"http://h/", Parsed::USERNAME, true, 7},
+    {"http:", Parsed::USERNAME, true, 5},
+    {"", Parsed::SCHEME, true, 0},
+      // Make sure a random component still works when there's nothing there.
+    {"", Parsed::REF, true, 0},
+      // File URLs are special with no host, so we test those.
+    {"file:///c:/foo", Parsed::USERNAME, true, 7},
+    {"file:///c:/foo", Parsed::PASSWORD, true, 7},
+    {"file:///c:/foo", Parsed::HOST, true, 7},
+    {"file:///c:/foo", Parsed::PATH, true, 7},
+  };
+  for (size_t i = 0; i < ARRAYSIZE(count_cases); i++) {
+    int length = static_cast<int>(strlen(count_cases[i].url));
+
+    // Simple test to distinguish file and standard URLs.
+    url_parse::Parsed parsed;
+    if (length > 0 && count_cases[i].url[0] == 'f')
+      url_parse::ParseFileURL(count_cases[i].url, length, &parsed);
+    else
+      url_parse::ParseStandardURL(count_cases[i].url, length, &parsed);
+
+    int chars_before = parsed.CountCharactersBefore(
+        count_cases[i].component, count_cases[i].include_delimiter);
+    EXPECT_EQ(count_cases[i].expected_count, chars_before);
+  }
+}
+
+// Standard --------------------------------------------------------------------
 
 // Input                               Scheme  Usrname Passwd     Host         Port Path       Query        Ref
 // ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ -----
@@ -147,28 +259,27 @@
 {"http://f: 21 / b ? d # e ",           "http", NULL,  NULL,      "f",          -2, "/ b ",    " d ",       " e"},
 
   // Creative URLs missing key elements
-{"",                                    NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{"  \t",                                NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{"",                                    NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
+{"  \t",                                NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 {":foo.com/",                           "",     NULL,  NULL,      "foo.com",    -1, "/",       NULL,        NULL},
 {":foo.com\\",                          "",     NULL,  NULL,      "foo.com",    -1, "\\",      NULL,        NULL},
-{":",                                   "",     NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{":",                                   "",     NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 {":a",                                  "",     NULL,  NULL,      "a",          -1, NULL,      NULL,        NULL},
-{":/",                                  "",     NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{":\\",                                 "",     NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{":#",                                  "",     NULL,  NULL,      "",           -1, NULL,      NULL,        ""},
-{"#",                                   NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        ""},
-{"#/",                                  NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        "/"},
-{"#\\",                                 NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        "\\"},
-{"#;?",                                 NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        ";?"},
-{"?",                                   NULL,   NULL,  NULL,      "",           -1, NULL,      "",          NULL},
-{"/",                                   NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{":/",                                  "",     NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
+{":\\",                                 "",     NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
+{":#",                                  "",     NULL,  NULL,      NULL,         -1, NULL,      NULL,        ""},
+{"#",                                   NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        ""},
+{"#/",                                  NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        "/"},
+{"#\\",                                 NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        "\\"},
+{"#;?",                                 NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        ";?"},
+{"?",                                   NULL,   NULL,  NULL,      NULL,         -1, NULL,      "",          NULL},
+{"/",                                   NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 {":23",                                 "",     NULL,  NULL,      "23",         -1, NULL,      NULL,        NULL},
-{"/:23",                                NULL,   NULL,  NULL,      "",           23, NULL,      NULL,        NULL},
-{"//",                                  NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{"/:",                                  NULL,   NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{"::",                                  "",     NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
-{"::23",                                "",     NULL,  NULL,      "",           23, NULL,      NULL,        NULL},
-{"foo://",                              "foo",  NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{"/:23",                                "/",    NULL,  NULL,      "23",         -1, NULL,      NULL,        NULL},
+{"//",                                  NULL,   NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
+{"::",                                  "",     NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
+{"::23",                                "",     NULL,  NULL,      NULL,         23, NULL,      NULL,        NULL},
+{"foo://",                              "foo",  NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 
   // Username/passwords and things that look like them
 {"http://a:b@c:29/d",                   "http", "a",   "b",       "c",          29, "/d",      NULL,        NULL},
@@ -183,27 +294,35 @@
 {"http:\\\\a\\b:c\\d@foo.com\\",        "http", NULL,  NULL,      "a",          -1, "\\b:c\\d@foo.com\\", NULL,   NULL},
 
   // Tolerate different numbers of slashes.
-{"foo:/",                               "foo",  NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{"foo:/",                               "foo",  NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 {"foo:/bar.com/",                       "foo",  NULL,  NULL,      "bar.com",    -1, "/",       NULL,        NULL},
-{"foo://///////",                       "foo",  NULL,  NULL,      "",           -1, NULL,      NULL,        NULL},
+{"foo://///////",                       "foo",  NULL,  NULL,      NULL,         -1, NULL,      NULL,        NULL},
 {"foo://///////bar.com/",               "foo",  NULL,  NULL,      "bar.com",    -1, "/",       NULL,        NULL},
-{"foo:////://///",                      "foo",  NULL,  NULL,      "",           -1, "/////",   NULL,        NULL},
+{"foo:////://///",                      "foo",  NULL,  NULL,      NULL,         -1, "/////",   NULL,        NULL},
 
   // Raw file paths on Windows aren't handled by the parser.
 {"c:/foo",                              "c",    NULL,  NULL,      "foo",        -1, NULL,      NULL,        NULL},
 {"//foo/bar",                           NULL,   NULL,  NULL,      "foo",        -1, "/bar",    NULL,        NULL},
 
-  // Use the first question mark for the query, the last # for the ref.
-{"http://foo/path;a??e#f#g",            "http", NULL,  NULL,      "foo",        -1, "/path;a", "?e#f",      "g"},
+  // Use the first question mark for the query and the first # for the ref.
+{"http://foo/path;a??e#f#g",            "http", NULL,  NULL,      "foo",        -1, "/path;a", "?e",      "f#g"},
 {"http://foo/abcd?efgh?ijkl",           "http", NULL,  NULL,      "foo",        -1, "/abcd",   "efgh?ijkl", NULL},
-
-  // Use first question mark for the query.
 {"http://foo/abcd#foo?bar",             "http", NULL,  NULL,      "foo",        -1, "/abcd",   NULL,        "foo?bar"},
 
-  // IPV6, check also interesting uses of colons
-{"[61:24:74]:98",                       NULL,   NULL,  NULL,      "[61:24:74]", 98, NULL,      NULL,        NULL},
+  // IPv6, check also interesting uses of colons.
+{"[61:24:74]:98",                       "[61",  NULL,  NULL,      "24:74]",     98, NULL,      NULL,        NULL},
 {"http://[61:27]:98",                   "http", NULL,  NULL,      "[61:27]",    98, NULL,      NULL,        NULL},
 {"http:[61:27]/:foo",                   "http", NULL,  NULL,      "[61:27]",    -1, "/:foo",   NULL,        NULL},
+{"http://[1::2]:3:4",                   "http", NULL,  NULL,      "[1::2]:3",    4, NULL,      NULL,        NULL},
+
+  // Partially-complete IPv6 literals, and related cases.
+{"http://2001::1",                      "http", NULL,  NULL,      "2001:",       1, NULL,      NULL,        NULL},
+{"http://[2001::1",                     "http", NULL,  NULL,      "[2001::1",   -1, NULL,      NULL,        NULL},
+{"http://2001::1]",                     "http", NULL,  NULL,      "2001::1]",   -1, NULL,      NULL,        NULL},
+{"http://2001::1]:80",                  "http", NULL,  NULL,      "2001::1]",   80, NULL,      NULL,        NULL},
+{"http://[2001::1]",                    "http", NULL,  NULL,      "[2001::1]",  -1, NULL,      NULL,        NULL},
+{"http://[2001::1]:80",                 "http", NULL,  NULL,      "[2001::1]",  80, NULL,      NULL,        NULL},
+{"http://[[::]]",                       "http", NULL,  NULL,      "[[::]]",     -1, NULL,      NULL,        NULL},
 
 };
 
@@ -211,7 +330,7 @@
   // Declared outside for loop to try to catch cases in init() where we forget
   // to reset something that is reset by the construtor.
   url_parse::Parsed parsed;
-  for (int i = 0; i < arraysize(cases); i++) {
+  for (size_t i = 0; i < arraysize(cases); i++) {
     const char* url = cases[i].input;
     url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
     int port = url_parse::ParsePort(url, parsed.port);
@@ -246,31 +365,20 @@
   // Declared outside for loop to try to catch cases in init() where we forget
   // to reset something that is reset by the construtor.
   url_parse::Parsed parsed;
-  for (int i = 0; i < arraysize(path_cases); i++) {
+  for (size_t i = 0; i < arraysize(path_cases); i++) {
     const char* url = path_cases[i].input;
     url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed);
 
     EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme));
     EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path));
 
-    EXPECT_EQ(0, parsed.username.begin);
-    EXPECT_EQ(-1, parsed.username.len);
-
-    EXPECT_EQ(0, parsed.password.begin);
-    EXPECT_EQ(-1, parsed.password.len);
-
-    // Hosts have 0, not -1 when not present.
-    EXPECT_EQ(0, parsed.host.begin);
-    EXPECT_EQ(0, parsed.host.len);
-
-    EXPECT_EQ(0, parsed.port.begin);
-    EXPECT_EQ(-1, parsed.port.len);
-
-    EXPECT_EQ(0, parsed.query.begin);
-    EXPECT_EQ(-1, parsed.query.len);
-
-    EXPECT_EQ(0, parsed.ref.begin);
-    EXPECT_EQ(-1, parsed.ref.len);
+    // The remaining components are never used for path urls.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.host);
+    ExpectInvalidComponent(parsed.port);
+    ExpectInvalidComponent(parsed.query);
+    ExpectInvalidComponent(parsed.ref);
   }
 }
 
@@ -282,31 +390,31 @@
 static URLParseCase file_cases[] = {
 {"file:server",              "file", NULL, NULL, "server", -1, NULL,          NULL, NULL},
 {"  file: server  \t",       "file", NULL, NULL, " server",-1, NULL,          NULL, NULL},
-{"FiLe:c|",                  "FiLe", NULL, NULL, "",       -1, "c|",          NULL, NULL},
+{"FiLe:c|",                  "FiLe", NULL, NULL, NULL,     -1, "c|",          NULL, NULL},
 {"FILE:/\\\\/server/file",   "FILE", NULL, NULL, "server", -1, "/file",       NULL, NULL},
 {"file://server/",           "file", NULL, NULL, "server", -1, "/",           NULL, NULL},
-{"file://localhost/c:/",     "file", NULL, NULL, "",       -1, "/c:/",        NULL, NULL},
-{"file://127.0.0.1/c|\\",    "file", NULL, NULL, "",       -1, "/c|\\",       NULL, NULL},
-{"file:/",                   "file", NULL, NULL, "",       -1, NULL,          NULL, NULL},
-{"file:",                    "file", NULL, NULL, "",       -1, NULL,          NULL, NULL},
+{"file://localhost/c:/",     "file", NULL, NULL, NULL,     -1, "/c:/",        NULL, NULL},
+{"file://127.0.0.1/c|\\",    "file", NULL, NULL, NULL,     -1, "/c|\\",       NULL, NULL},
+{"file:/",                   "file", NULL, NULL, NULL,     -1, NULL,          NULL, NULL},
+{"file:",                    "file", NULL, NULL, NULL,     -1, NULL,          NULL, NULL},
   // If there is a Windows drive letter, treat any number of slashes as the
   // path part.
-{"file:c:\\fo\\b",           "file", NULL, NULL, "",       -1, "c:\\fo\\b",   NULL, NULL},
-{"file:/c:\\foo/bar",        "file", NULL, NULL, "",       -1, "/c:\\foo/bar",NULL, NULL},
-{"file://c:/f\\b",           "file", NULL, NULL, "",       -1, "/c:/f\\b",    NULL, NULL},
-{"file:///C:/foo",           "file", NULL, NULL, "",       -1, "/C:/foo",     NULL, NULL},
-{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, "",       -1, "/c:\\f\\b",   NULL, NULL},
+{"file:c:\\fo\\b",           "file", NULL, NULL, NULL,     -1, "c:\\fo\\b",   NULL, NULL},
+{"file:/c:\\foo/bar",        "file", NULL, NULL, NULL,     -1, "/c:\\foo/bar",NULL, NULL},
+{"file://c:/f\\b",           "file", NULL, NULL, NULL,     -1, "/c:/f\\b",    NULL, NULL},
+{"file:///C:/foo",           "file", NULL, NULL, NULL,     -1, "/C:/foo",     NULL, NULL},
+{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL,     -1, "/c:\\f\\b",   NULL, NULL},
   // If there is not a drive letter, we should treat is as UNC EXCEPT for
   // three slashes, which we treat as a Unix style path.
 {"file:server/file",         "file", NULL, NULL, "server", -1, "/file",       NULL, NULL},
 {"file:/server/file",        "file", NULL, NULL, "server", -1, "/file",       NULL, NULL},
 {"file://server/file",       "file", NULL, NULL, "server", -1, "/file",       NULL, NULL},
-{"file:///server/file",      "file", NULL, NULL, "",       -1, "/server/file",NULL, NULL},
-{"file://\\server/file",     "file", NULL, NULL, "",       -1, "\\server/file",NULL, NULL},
+{"file:///server/file",      "file", NULL, NULL, NULL,     -1, "/server/file",NULL, NULL},
+{"file://\\server/file",     "file", NULL, NULL, NULL,     -1, "\\server/file",NULL, NULL},
 {"file:////server/file",     "file", NULL, NULL, "server", -1, "/file",       NULL, NULL},
   // Queries and refs are valid for file URLs as well.
-{"file:///C:/foo.html?#",   "file", NULL, NULL, "",       -1, "/C:/foo.html",  "",   ""},
-{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, "", -1, "/C:/foo.html", "query=yes", "ref"},
+{"file:///C:/foo.html?#",   "file", NULL, NULL,  NULL,     -1, "/C:/foo.html",  "",   ""},
+{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"},
 };
 
 TEST(URLParser, WindowsFile) {
@@ -348,7 +456,7 @@
     {"http://www.google.com/foo/bar.html?query#ref", "bar.html"},
   };
 
-  for (int i = 0; i < ARRAYSIZE(file_cases); i++) {
+  for (size_t i = 0; i < ARRAYSIZE(file_cases); i++) {
     const char* url = file_cases[i].input;
     int len = static_cast<int>(strlen(url));
 
@@ -361,3 +469,115 @@
     EXPECT_TRUE(ComponentMatches(url, file_cases[i].expected, file_name));
   }
 }
+
+// Returns true if the parameter with index |parameter| in the given URL's
+// query string has the expected key and value. The expected key can be NULL
+// to indicate no such key index should exist. The parameter number is 1-based.
+static bool NthParameterIs(const char* url,
+                           int parameter,
+                           const char* expected_key,
+                           const char* expected_value) {
+  url_parse::Parsed parsed;
+  url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+
+  url_parse::Component query = parsed.query;
+
+  for (int i = 1; i <= parameter; i++) {
+    url_parse::Component key, value;
+    if (!url_parse::ExtractQueryKeyValue(url, &query, &key, &value)) {
+      if (parameter >= i && !expected_key)
+        return true;  // Expected nonexistent key, got none.
+      return false;  // Not enough keys.
+    }
+
+    if (i == parameter) {
+      if (!expected_key)
+        return false;
+
+      if (strncmp(&url[key.begin], expected_key, key.len) != 0)
+        return false;
+      if (strncmp(&url[value.begin], expected_value, value.len) != 0)
+        return false;
+      return true;
+    }
+  }
+  return expected_key == NULL;  // We didn't find that many parameters.
+}
+
+TEST(URLParser, ExtractQueryKeyValue) {
+  EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL));
+
+  // Basic case.
+  char a[] = "http://www.google.com?arg1=1&arg2=2&bar";
+  EXPECT_TRUE(NthParameterIs(a, 1, "arg1", "1"));
+  EXPECT_TRUE(NthParameterIs(a, 2, "arg2", "2"));
+  EXPECT_TRUE(NthParameterIs(a, 3, "bar", ""));
+  EXPECT_TRUE(NthParameterIs(a, 4, NULL, NULL));
+
+  // Empty param at the end.
+  char b[] = "http://www.google.com?foo=bar&";
+  EXPECT_TRUE(NthParameterIs(b, 1, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(b, 2, NULL, NULL));
+
+  // Empty param at the beginning.
+  char c[] = "http://www.google.com?&foo=bar";
+  EXPECT_TRUE(NthParameterIs(c, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(c, 2, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(c, 3, NULL, NULL));
+
+  // Empty key with value.
+  char d[] = "http://www.google.com?=foo";
+  EXPECT_TRUE(NthParameterIs(d, 1, "", "foo"));
+  EXPECT_TRUE(NthParameterIs(d, 2, NULL, NULL));
+
+  // Empty value with key.
+  char e[] = "http://www.google.com?foo=";
+  EXPECT_TRUE(NthParameterIs(e, 1, "foo", ""));
+  EXPECT_TRUE(NthParameterIs(e, 2, NULL, NULL));
+
+  // Empty key and values.
+  char f[] = "http://www.google.com?&&==&=";
+  EXPECT_TRUE(NthParameterIs(f, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(f, 2, "", ""));
+  EXPECT_TRUE(NthParameterIs(f, 3, "", "="));
+  EXPECT_TRUE(NthParameterIs(f, 4, "", ""));
+  EXPECT_TRUE(NthParameterIs(f, 5, NULL, NULL));
+}
+
+// MailtoURL --------------------------------------------------------------------
+
+static MailtoURLParseCase mailto_cases[] = {
+//|input                       |scheme   |path               |query
+{"mailto:foo@gmail.com",        "mailto", "foo@gmail.com",    NULL},
+{"  mailto: to  \t",            "mailto", " to",              NULL},
+{"mailto:addr1%2C%20addr2 ",    "mailto", "addr1%2C%20addr2", NULL},
+{"Mailto:addr1, addr2 ",        "Mailto", "addr1, addr2",     NULL},
+{"mailto:addr1:addr2 ",         "mailto", "addr1:addr2",      NULL},
+{"mailto:?to=addr1,addr2",      "mailto", NULL,               "to=addr1,addr2"},
+{"mailto:?to=addr1%2C%20addr2", "mailto", NULL,               "to=addr1%2C%20addr2"},
+{"mailto:addr1?to=addr2",       "mailto", "addr1",            "to=addr2"},
+{"mailto:?body=#foobar#",       "mailto", NULL,               "body=#foobar#",},
+{"mailto:#?body=#foobar#",      "mailto", "#",                "body=#foobar#"},
+};
+
+TEST(URLParser, MailtoUrl) {
+  // Declared outside for loop to try to catch cases in init() where we forget
+  // to reset something that is reset by the constructor.
+  url_parse::Parsed parsed;
+  for (size_t i = 0; i < arraysize(mailto_cases); ++i) {
+    const char* url = mailto_cases[i].input;
+    url_parse::ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed);
+    int port = url_parse::ParsePort(url, parsed.port);
+
+    EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query));
+    EXPECT_EQ(url_parse::PORT_UNSPECIFIED, port);
+
+    // The remaining components are never used for mailto urls.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.port);
+    ExpectInvalidComponent(parsed.ref);
+  }
+}

diff --git a/googleurl/src/url_test_utils.h b/googleurl/src/url_test_utils.h
index f2b97e9..fdadf7f 100644
--- a/googleurl/src/url_test_utils.h
+++ b/googleurl/src/url_test_utils.h

@@ -35,34 +35,35 @@
 
 #include <string>
 
+#include "base/string16.h"
 #include "googleurl/src/url_canon_internal.h"
 #include "testing/base/public/gunit.h"
 
 namespace url_test_utils {
 
-// Converts a UTF-16 string from native wchar_t format to
-// url_canon::UTF16Char, by truncating the high 32 bits.  This is not meant to
-// handle true UTF-32 encoded strings.
-inline url_canon::UTF16String WStringToUTF16(const wchar_t* src) {
-  url_canon::UTF16String str;
+// Converts a UTF-16 string from native wchar_t format to char16, by
+// truncating each character to the width of char16.  This is not meant to
+// handle true UTF-32 encoded strings.
+inline string16 WStringToUTF16(const wchar_t* src) {
+  string16 str;
   int length = static_cast<int>(wcslen(src));
   for (int i = 0; i < length; ++i) {
-    str.push_back(static_cast<url_canon::UTF16Char>(src[i]));
+    str.push_back(static_cast<char16>(src[i]));
   }
   return str;
 }
 
 // Converts a string from UTF-8 to UTF-16
-inline url_canon::UTF16String ConvertUTF8ToUTF16(const std::string& src) {
+inline string16 ConvertUTF8ToUTF16(const std::string& src) {
   int length = static_cast<int>(src.length());
   EXPECT_LT(length, 1024);
   url_canon::RawCanonOutputW<1024> output;
   EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output));
-  return url_canon::UTF16String(output.data(), output.length());
+  return string16(output.data(), output.length());
 }
 
 // Converts a string from UTF-16 to UTF-8
-inline std::string ConvertUTF16ToUTF8(const url_canon::UTF16String& src) {
+inline std::string ConvertUTF16ToUTF8(const string16& src) {
   std::string str;
   url_canon::StdStringCanonOutput output(&str);
   EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(),
@@ -74,17 +75,11 @@
 
 }  // namespace url_test_utils
 
-// This operator allows EXPECT_EQ(aUTF16String, anotherUTF16String); to work. It
-// has to be defined in the ::std namespace to allow it to be found via ADL for
-// UTF16String-type arguments as those are actually std::basic_string<...>. The
-// typedef doesn't contribute the url_canon namespace to the set of associated
-// namespaces searched during ADL. See C++'03 [basic.lookup.koenig] 3.4.2/2 for
-// more details.
-namespace std {
-inline ostream& operator<<(ostream& os, const ::url_canon::UTF16String& str) {
+// This operator allows EXPECT_EQ(astring16, anotherstring16); to work.
+inline std::ostream& operator<<(std::ostream& os,
+                                const string16& str) {
   // Convert to UTF-8 and print the string
-  return os << ::url_test_utils::ConvertUTF16ToUTF8(str);
+  return os << url_test_utils::ConvertUTF16ToUTF8(str);
 }
-}  // namespace std
 
 #endif  // GOOGLEURL_SRC_URL_TEST_UTILS_H__

diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc
index c7e1f8a..7e100aa 100644
--- a/googleurl/src/url_util.cc
+++ b/googleurl/src/url_util.cc

@@ -33,6 +33,8 @@
 #include "googleurl/src/url_util.h"
 
 #include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
 
 namespace url_util {
 
@@ -55,14 +57,17 @@
 }
 
 const char kFileScheme[] = "file";  // Used in a number of places.
+const char kMailtoScheme[] = "mailto";
 
-const int kNumStandardURLSchemes = 5;
+const int kNumStandardURLSchemes = 7;
 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
   "http",
   "https",
   kFileScheme,  // Yes, file urls can have a hostname!
   "ftp",
   "gopher",
+  "ws",  // WebSocket.
+  "wss",  // WebSocket secure.
 };
 
 // List of the currently installed standard schemes. This list is lazily
@@ -70,6 +75,9 @@
 // any destructors from being called that will slow us down or cause problems.
 std::vector<const char*>* standard_schemes = NULL;
 
+// See the LockStandardSchemes declaration in the header.
+bool standard_schemes_locked = false;
+
 // Ensures that the standard_schemes list is initialized, does nothing if it
 // already has values.
 void InitStandardSchemes() {
@@ -86,19 +94,23 @@
 inline bool CompareSchemeComponent(const CHAR* spec,
                                    const url_parse::Component& component,
                                    const char* compare_to) {
+  if (!component.is_nonempty())
+    return compare_to[0] == 0;  // When component is empty, match empty scheme.
   return LowerCaseEqualsASCII(&spec[component.begin],
                               &spec[component.end()],
                               compare_to);
 }
 
-// Returns true if the given scheme is one of the registered "standard"
-// schemes.
+// Returns true if the given scheme identified by |scheme| within |spec| is one
+// of the registered "standard" schemes.
 template<typename CHAR>
-bool DoIsStandardScheme(const CHAR* scheme,
-                        int scheme_len) {
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
+  if (!scheme.is_nonempty())
+    return false;  // Empty or invalid schemes are non-standard.
+
   InitStandardSchemes();
   for (size_t i = 0; i < standard_schemes->size(); i++) {
-    if (LowerCaseEqualsASCII(scheme, &scheme[scheme_len],
+    if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
                              standard_schemes->at(i)))
       return true;
   }
@@ -106,22 +118,19 @@
 }
 
 template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, int spec_len) {
-  // TODO(brettw) bug 772441: treat URLs with "://" and possible ":/" as
-  // standard.
-  url_parse::Component scheme;
-  if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
-    return false;
-  return IsStandardScheme(&spec[scheme.begin], scheme.len);
-}
-
-template<typename CHAR>
 bool DoFindAndCompareScheme(const CHAR* str,
                             int str_len,
                             const char* compare,
                             url_parse::Component* found_scheme) {
+  // Before extracting scheme, canonicalize the URL to remove any whitespace.
+  // This matches the canonicalization done in DoCanonicalize function.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(str, str_len,
+                                         &whitespace_buffer, &spec_len);
+
   url_parse::Component our_scheme;
-  if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {
+  if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
     // No scheme.
     if (found_scheme)
       *found_scheme = url_parse::Component();
@@ -129,33 +138,68 @@
   }
   if (found_scheme)
     *found_scheme = our_scheme;
-  return CompareSchemeComponent(str, our_scheme, compare);
+  return CompareSchemeComponent(spec, our_scheme, compare);
 }
 
 template<typename CHAR>
-bool DoCanonicalize(const CHAR* spec,
-                    int spec_len,
+bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
+                    url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed) {
+  // Remove any whitespace from the middle of the input URL, possibly
+  // copying to the new buffer.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
+                                         &whitespace_buffer, &spec_len);
+
+  url_parse::Parsed parsed_input;
+#ifdef WIN32
+  // For Windows, we allow things that look like absolute Windows paths to be
+  // fixed up magically to file URLs. This is done for IE compatibility. For
+  // example, this will change "c:/foo" into a file URL rather than treating
+  // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
+  // There is similar logic in url_canon_relative.cc.
+  //
+  // For Mac & Unix, we don't do this (the equivalent would be "/foo/bar" which
+  // has no meaning as an absolute path name). This is because browsers on Mac
+  // & Unix don't generally do this, so there is no compatibility reason for
+  // doing so.
+  if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
+      url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
+    url_parse::ParseFileURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
+                                           charset_converter,
+                                           output, output_parsed);
+  }
+#endif
+
   url_parse::Component scheme;
-  if(!url_parse::ExtractScheme(spec, spec_len, &scheme))
+  if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
     return false;
 
   // This is the parsed version of the input URL, we have to canonicalize it
   // before storing it in our object.
   bool success;
-  url_parse::Parsed parsed_input;
   if (CompareSchemeComponent(spec, scheme, kFileScheme)) {
     // File URLs are special.
     url_parse::ParseFileURL(spec, spec_len, &parsed_input);
     success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
-                                             NULL, output, output_parsed);
+                                             charset_converter,
+                                             output, output_parsed);
 
-  } else if (IsStandardScheme(&spec[scheme.begin], scheme.len)) {
+  } else if (DoIsStandard(spec, scheme)) {
     // All "normal" URLs.
     url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
     success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
-                                                 NULL, output, output_parsed);
+                                                 charset_converter,
+                                                 output, output_parsed);
+
+  } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {
+    // Mailto URLs are like standard URLs with only a scheme, path, and query.
+    url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
+    success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
+                                               output, output_parsed);
 
   } else {
     // "Weird" URLs like data: and javascript:
@@ -168,14 +212,25 @@
 
 template<typename CHAR>
 bool DoResolveRelative(const char* base_spec,
+                       int base_spec_len,
                        const url_parse::Parsed& base_parsed,
-                       const CHAR* relative,
-                       int relative_length,
+                       const CHAR* in_relative,
+                       int in_relative_length,
+                       url_canon::CharsetConverter* charset_converter,
                        url_canon::CanonOutput* output,
                        url_parse::Parsed* output_parsed) {
+  // Remove any whitespace from the middle of the relative URL, possibly
+  // copying to the new buffer.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int relative_length;
+  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
+                                             &whitespace_buffer,
+                                             &relative_length);
+
+  // See if our base URL should be treated as "standard".
   bool standard_base_scheme =
-      IsStandardScheme(&base_spec[base_parsed.scheme.begin],
-                       base_parsed.scheme.len);
+      base_parsed.scheme.is_nonempty() &&
+      DoIsStandard(base_spec, base_parsed.scheme);
 
   bool is_relative;
   url_parse::Component relative_component;
@@ -190,60 +245,132 @@
 
   if (is_relative) {
     // Relative, resolve and canonicalize.
-    bool file_base_scheme =
+    bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
         CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
     return url_canon::ResolveRelativeURL(base_spec, base_parsed,
                                          file_base_scheme, relative,
-                                         relative_component, NULL,
+                                         relative_component, charset_converter,
                                          output, output_parsed);
   }
 
   // Not relative, canonicalize the input.
-  return DoCanonicalize(relative, relative_length, output, output_parsed);
+  return DoCanonicalize(relative, relative_length, charset_converter,
+                        output, output_parsed);
 }
 
 template<typename CHAR>
 bool DoReplaceComponents(const char* spec,
+                         int spec_len,
                          const url_parse::Parsed& parsed,
                          const url_canon::Replacements<CHAR>& replacements,
+                         url_canon::CharsetConverter* charset_converter,
                          url_canon::CanonOutput* output,
                          url_parse::Parsed* out_parsed) {
-  // Note that we dispatch to the parser according the the scheme type of
-  // the OUTPUT URL. Normally, this is the same as our scheme, but if the
-  // scheme is being overridden, we need to test that.
+  // If the scheme is overridden, just do a simple string substitution and
+  // reparse the whole thing. There are lots of edge cases that we really don't
+  // want to deal with. Like what happens if I replace "http://e:8080/foo"
+  // with a file. Does it become "file:///E:/8080/foo" where the port number
+  // becomes part of the path? Parsing that string as a file URL says "yes"
+  // but almost no sane rule for dealing with the components individually would
+  // come up with that.
+  //
+  // Why allow these crazy cases at all? Programmatically, there is almost no
+  // case for replacing the scheme. The most common case for hitting this is
+  // in JS when building up a URL using the location object. In this case, the
+  // JS code expects the string substitution behavior:
+  //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+  if (replacements.IsSchemeOverridden()) {
+    // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+    // the existing spec.
+    url_canon::RawCanonOutput<128> scheme_replaced;
+    url_parse::Component scheme_replaced_parsed;
+    url_canon::CanonicalizeScheme(
+        replacements.sources().scheme,
+        replacements.components().scheme,
+        &scheme_replaced, &scheme_replaced_parsed);
 
-  if (// Either the scheme is not replaced and the old one is a file,
-      (!replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
-      // Or it is being replaced and the new one is a file.
-      (replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(replacements.sources().scheme,
-                              replacements.components().scheme,
-                              kFileScheme))) {
+    // We can assume that the input is canonicalized, which means it always has
+    // a colon after the scheme (or where the scheme would be).
+    int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+                                                    : 1;
+    if (spec_len - spec_after_colon > 0) {
+      scheme_replaced.Append(&spec[spec_after_colon],
+                             spec_len - spec_after_colon);
+    }
+
+    // We now need to completely re-parse the resulting string since its meaning
+    // may have changed with the different scheme.
+    url_canon::RawCanonOutput<128> recanonicalized;
+    url_parse::Parsed recanonicalized_parsed;
+    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+                   charset_converter,
+                   &recanonicalized, &recanonicalized_parsed);
+
+    // Recurse using the version with the scheme already replaced. This will now
+    // use the replacement rules for the new scheme.
+    //
+    // Warning: this code assumes that ReplaceComponents will re-check all
+    // components for validity. This is because we can't fail if DoCanonicalize
+    // failed above since theoretically the thing making it fail could be
+    // getting replaced here. If ReplaceComponents didn't re-check everything,
+    // we wouldn't know if something *not* getting replaced is a problem.
+    // If the scheme-specific replacers are made more intelligent so they don't
+    // re-check everything, we should instead recanonicalize the whole thing
+    // after this call to check validity (this assumes replacing the scheme is
+    // much much less common than other types of replacements, like clearing the
+    // ref).
+    url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+    replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+    return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+                               recanonicalized_parsed, replacements_no_scheme,
+                               charset_converter, output, out_parsed);
+  }
+
+  // If we get here, then we know the scheme doesn't need to be replaced, so can
+  // just key off the scheme in the spec to know how to do the replacements.
+  if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
     return url_canon::ReplaceFileURL(spec, parsed, replacements,
-                                     NULL, output, out_parsed);
+                                     charset_converter, output, out_parsed);
+  }
+  if (DoIsStandard(spec, parsed.scheme)) {
+    return url_canon::ReplaceStandardURL(spec, parsed, replacements,
+                                         charset_converter, output, out_parsed);
+  }
+  if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
+     return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
+                                        output, out_parsed);
   }
 
-  if (// Either the scheme is not replaced and the old one is standard,
-      (!replacements.IsSchemeOverridden() &&
-       IsStandardScheme(&spec[parsed.scheme.begin], parsed.scheme.len)) ||
-      // Or it is being replaced and the new one is standard.
-      (replacements.IsSchemeOverridden() &&
-       IsStandardScheme(&replacements.sources().scheme[
-                            replacements.components().scheme.begin],
-                        replacements.components().scheme.len))) {
-    // Standard URL with all parts.
-    return url_canon::ReplaceStandardURL(spec, parsed, replacements, NULL,
-                                         output, out_parsed);
-  }
-
+  // Default is a path URL.
   return url_canon::ReplacePathURL(spec, parsed, replacements,
                                    output, out_parsed);
 }
 
 }  // namespace
 
+void Initialize() {
+  InitStandardSchemes();
+}
+
+void Shutdown() {
+  if (standard_schemes) {
+    delete standard_schemes;
+    standard_schemes = NULL;
+  }
+}
+
 void AddStandardScheme(const char* new_scheme) {
+  // If this assert triggers, it means you've called AddStandardScheme after
+  // LockStandardSchemes has been called (see the header file for
+  // LockStandardSchemes for more).
+  //
+  // This normally means you're trying to set up a new standard scheme too late
+  // in your application's init process. Locate where your app does this
+  // initialization and calls LockStandardSchemes, and add your new standard
+  // scheme there.
+  DCHECK(!standard_schemes_locked) <<
+      "Trying to add a standard scheme after the list has been locked.";
+
   size_t scheme_len = strlen(new_scheme);
   if (scheme_len == 0)
     return;
@@ -257,20 +384,16 @@
   standard_schemes->push_back(dup_scheme);
 }
 
-bool IsStandardScheme(const char* scheme, int scheme_len) {
-  return DoIsStandardScheme(scheme, scheme_len);
+void LockStandardSchemes() {
+  standard_schemes_locked = true;
 }
 
-bool IsStandardScheme(const UTF16Char* scheme, int scheme_len) {
-  return DoIsStandardScheme(scheme, scheme_len);
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
 }
 
-bool IsStandard(const char* spec, int spec_len) {
-  return DoIsStandard(spec, spec_len);
-}
-
-bool IsStandard(const UTF16Char* spec, int spec_len) {
-  return DoIsStandard(spec, spec_len);
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
 }
 
 bool FindAndCompareScheme(const char* str,
@@ -280,7 +403,7 @@
   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
 }
 
-bool FindAndCompareScheme(const UTF16Char* str,
+bool FindAndCompareScheme(const char16* str,
                           int str_len,
                           const char* compare,
                           url_parse::Component* found_scheme) {
@@ -289,52 +412,68 @@
 
 bool Canonicalize(const char* spec,
                   int spec_len,
+                  url_canon::CharsetConverter* charset_converter,
                   url_canon::CanonOutput* output,
                   url_parse::Parsed* output_parsed) {
-  return DoCanonicalize(spec, spec_len, output, output_parsed);
+  return DoCanonicalize(spec, spec_len, charset_converter,
+                        output, output_parsed);
 }
 
-bool Canonicalize(const UTF16Char* spec,
+bool Canonicalize(const char16* spec,
                   int spec_len,
+                  url_canon::CharsetConverter* charset_converter,
                   url_canon::CanonOutput* output,
                   url_parse::Parsed* output_parsed) {
-  return DoCanonicalize(spec, spec_len, output, output_parsed);
+  return DoCanonicalize(spec, spec_len, charset_converter,
+                        output, output_parsed);
 }
 
 bool ResolveRelative(const char* base_spec,
+                     int base_spec_len,
                      const url_parse::Parsed& base_parsed,
                      const char* relative,
                      int relative_length,
+                     url_canon::CharsetConverter* charset_converter,
                      url_canon::CanonOutput* output,
                      url_parse::Parsed* output_parsed) {
-  return DoResolveRelative(base_spec, base_parsed, relative, relative_length,
-                           output, output_parsed);
+  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+                           relative, relative_length,
+                           charset_converter, output, output_parsed);
 }
 
 bool ResolveRelative(const char* base_spec,
+                     int base_spec_len,
                      const url_parse::Parsed& base_parsed,
-                     const UTF16Char* relative,
+                     const char16* relative,
                      int relative_length,
+                     url_canon::CharsetConverter* charset_converter,
                      url_canon::CanonOutput* output,
                      url_parse::Parsed* output_parsed) {
-  return DoResolveRelative(base_spec, base_parsed, relative, relative_length,
-                           output, output_parsed);
+  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+                           relative, relative_length,
+                           charset_converter, output, output_parsed);
 }
 
 bool ReplaceComponents(const char* spec,
+                       int spec_len,
                        const url_parse::Parsed& parsed,
                        const url_canon::Replacements<char>& replacements,
+                       url_canon::CharsetConverter* charset_converter,
                        url_canon::CanonOutput* output,
                        url_parse::Parsed* out_parsed) {
-  return DoReplaceComponents(spec, parsed, replacements, output, out_parsed);
+  return DoReplaceComponents(spec, spec_len, parsed, replacements,
+                             charset_converter, output, out_parsed);
 }
 
 bool ReplaceComponents(const char* spec,
+                       int spec_len,
                        const url_parse::Parsed& parsed,
-                       const url_canon::Replacements<UTF16Char>& replacements,
+                       const url_canon::Replacements<char16>& replacements,
+                       url_canon::CharsetConverter* charset_converter,
                        url_canon::CanonOutput* output,
                        url_parse::Parsed* out_parsed) {
-  return DoReplaceComponents(spec, parsed, replacements, output, out_parsed);
+  return DoReplaceComponents(spec, spec_len, parsed, replacements,
+                             charset_converter, output, out_parsed);
 }
 
 // Front-ends for LowerCaseEqualsASCII.
@@ -356,10 +495,59 @@
   return a_begin == a_end && b_begin == b_end;
 }
 
-bool LowerCaseEqualsASCII(const UTF16Char* a_begin,
-                          const UTF16Char* a_end,
+bool LowerCaseEqualsASCII(const char16* a_begin,
+                          const char16* a_end,
                           const char* b) {
   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 }
 
+void DecodeURLEscapeSequences(const char* input, int length,
+                              url_canon::CanonOutputW* output) {
+  url_canon::RawCanonOutputT<char> unescaped_chars;
+  for (int i = 0; i < length; i++) {
+    if (input[i] == '%') {
+      unsigned char ch;
+      if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
+        unescaped_chars.push_back(ch);
+      } else {
+        // Invalid escape sequence, copy the percent literal.
+        unescaped_chars.push_back('%');
+      }
+    } else {
+      // Regular non-escaped 8-bit character.
+      unescaped_chars.push_back(input[i]);
+    }
+  }
+
+  // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
+  // JavaScript URLs, but Firefox and Safari do.
+  for (int i = 0; i < unescaped_chars.length(); i++) {
+    unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+    if (uch < 0x80) {
+      // ASCII (single-byte) character, just append directly.
+      output->push_back(uch);
+    } else {
+      // next_character will point to the last character of the decoded
+      // character.
+      int next_character = i;
+      unsigned code_point;
+      if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
+                                 unescaped_chars.length(), &code_point)) {
+        // Valid UTF-8 character, convert to UTF-16.
+        url_canon::AppendUTF16Value(code_point, output);
+        i = next_character;
+      } else {
+        // If there are any sequences that are not valid UTF-8, we keep
+        // invalid code points and promote to UTF-16. We copy all characters
+        // from the current position to the end of the identified sequence.
+        while (i < next_character) {
+          output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+          i++;
+        }
+        output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+      }
+    }
+  }
+}
+
 }  // namespace url_util

diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h
index 98d3d40..ec4cf9e 100644
--- a/googleurl/src/url_util.h
+++ b/googleurl/src/url_util.h

@@ -32,64 +32,95 @@
 
 #include <string>
 
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
 #include "googleurl/src/url_parse.h"
 #include "googleurl/src/url_canon.h"
 
 namespace url_util {
 
-typedef url_parse::UTF16Char UTF16Char;
-typedef url_parse::UTF16String UTF16String;
+// Init ------------------------------------------------------------------------
+
+// Initialization is NOT required, it will be implicitly initialized when first
+// used. However, this implicit initialization is NOT threadsafe. If you are
+// using this library in a threaded environment and don't have a consistent
+// "first call" (an example might be calling "AddStandardScheme" with your
+// special application-specific schemes) then you will want to call Initialize
+// before spawning any threads.
+//
+// It is OK to call this function more than once, subsequent calls will simply
+// "noop", unless Shutdown() was called in the meantime. This will also be a
+// "noop" if other calls to the library have forced an initialization
+// beforehand.
+GURL_API void Initialize();
+
+// Cleanup is not required, except some strings may leak. For most user
+// applications, this is fine. If you're using it in a library that may get
+// loaded and unloaded, you'll want to call Shutdown() before unloading to
+// properly clean up your library.
+GURL_API void Shutdown();
 
 // Schemes --------------------------------------------------------------------
 
 // Adds an application-defined scheme to the internal list of "standard" URL
-// schemes.
-void AddStandardScheme(const char* new_scheme);
+// schemes. This function is not threadsafe and can not be called concurrently
+// with any other url_util function. It will assert if the list of standard
+// schemes has been locked (see LockStandardSchemes).
+GURL_API void AddStandardScheme(const char* new_scheme);
+
+// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
+//
+// This is designed to help prevent errors for multithreaded applications.
+// Normal usage would be to call AddStandardScheme for your custom schemes at
+// the beginning of program initialization, and then LockStandardSchemes. This
+// prevents future callers from mistakenly calling AddStandardScheme when the
+// program is running with multiple threads, where such usage would be
+// dangerous.
+//
+// We could have had AddStandardScheme use a lock instead, but that would add
+// some platform-specific dependencies we don't otherwise have now, and is
+// overkill considering the normal usage is so simple.
+GURL_API void LockStandardSchemes();
 
 // Locates the scheme in the given string and places it into |found_scheme|,
 // which may be NULL to indicate the caller does not care about the range.
+//
 // Returns whether the given |compare| scheme matches the scheme found in the
-// input (if any).
-bool FindAndCompareScheme(const char* str,
-                          int str_len,
-                          const char* compare,
-                          url_parse::Component* found_scheme);
-bool FindAndCompareScheme(const UTF16Char* str,
-                          int str_len,
-                          const char* compare,
-                          url_parse::Component* found_scheme);
+// input (if any). The |compare| scheme must be a valid canonical scheme or
+// the result of the comparison is undefined.
+GURL_API bool FindAndCompareScheme(const char* str,
+                                   int str_len,
+                                   const char* compare,
+                                   url_parse::Component* found_scheme);
+GURL_API bool FindAndCompareScheme(const char16* str,
+                                   int str_len,
+                                   const char* compare,
+                                   url_parse::Component* found_scheme);
 inline bool FindAndCompareScheme(const std::string& str,
                                  const char* compare,
                                  url_parse::Component* found_scheme) {
   return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
                               compare, found_scheme);
 }
-inline bool FindAndCompareScheme(const UTF16String& str,
+inline bool FindAndCompareScheme(const string16& str,
                                  const char* compare,
                                  url_parse::Component* found_scheme) {
   return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
                               compare, found_scheme);
 }
 
-// Returns true if the given string corresponds to a known scheme in the
-// database.
-bool IsStandardScheme(const char* scheme, int scheme_len);
-bool IsStandardScheme(const UTF16Char* scheme, int scheme_len);
-inline bool IsStandardScheme(const std::string& scheme) {
-  return IsStandardScheme(scheme.data(), static_cast<int>(scheme.size()));
-}
-inline bool IsStandardScheme(const UTF16String& scheme) {
-  return IsStandardScheme(scheme.data(), static_cast<int>(scheme.size()));
-}
+// Returns true if the given string represents a standard URL. This means that
+// the scheme is in the list of known standard schemes.
+GURL_API bool IsStandard(const char* spec,
+                         const url_parse::Component& scheme);
+GURL_API bool IsStandard(const char16* spec,
+                         const url_parse::Component& scheme);
 
-// Returns true if the given string represents a standard URL.
-bool IsStandard(const char* spec, int spec_len);
-bool IsStandard(const UTF16Char* spec, int spec_len);
-inline bool IsStandard(const std::string& spec) {
-  return IsStandard(spec.data(), static_cast<int>(spec.size()));
-}
-inline bool IsStandard(const UTF16String& spec) {
-  return IsStandard(spec.data(), static_cast<int>(spec.size()));
+// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
+// breaking the WebKit build when this version is synced via Chrome.
+inline bool IsStandard(const char* spec, int spec_len,
+                       const url_parse::Component& scheme) {
+  return IsStandard(spec, scheme);
 }
 
 // URL library wrappers -------------------------------------------------------
@@ -98,54 +129,70 @@
 // should use the URL object, although this may be useful if performance is
 // critical and you don't want to do the heap allocation for the std::string.
 //
+// As with the url_canon::Canonicalize* functions, the charset converter can
+// be NULL to use UTF-8 (it will be faster in this case).
+//
 // Returns true if a valid URL was produced, false if not. On failure, the
 // output and parsed structures will still be filled and will be consistent,
 // but they will not represent a loadable URL.
-bool Canonicalize(const char* spec,
-                  int spec_len,
-                  url_canon::CanonOutput* output,
-                  url_parse::Parsed* output_parsed);
-bool Canonicalize(const UTF16Char* spec,
-                  int spec_len,
-                  url_canon::CanonOutput* output,
-                  url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char* spec,
+                           int spec_len,
+                           url_canon::CharsetConverter* charset_converter,
+                           url_canon::CanonOutput* output,
+                           url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char16* spec,
+                           int spec_len,
+                           url_canon::CharsetConverter* charset_converter,
+                           url_canon::CanonOutput* output,
+                           url_parse::Parsed* output_parsed);
 
 // Resolves a potentially relative URL relative to the given parsed base URL.
 // The base MUST be valid. The resulting canonical URL and parsed information
 // will be placed in to the given out variables.
 //
 // The relative need not be relative. If we discover that it's absolute, this
-// will produce a canonical version of that URL.
+// will produce a canonical version of that URL. See Canonicalize() for more
+// about the charset_converter.
 //
 // Returns true if the output is valid, false if the input could not produce
 // a valid URL.
-bool ResolveRelative(const char* base_spec,
-                     const url_parse::Parsed& base_parsed,
-                     const char* relative,
-                     int relative_length,
-                     url_canon::CanonOutput* output,
-                     url_parse::Parsed* output_parsed);
-bool ResolveRelative(const char* base_spec,
-                     const url_parse::Parsed& base_parsed,
-                     const UTF16Char* relative,
-                     int relative_length,
-                     url_canon::CanonOutput* output,
-                     url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+                              int base_spec_len,
+                              const url_parse::Parsed& base_parsed,
+                              const char* relative,
+                              int relative_length,
+                              url_canon::CharsetConverter* charset_converter,
+                              url_canon::CanonOutput* output,
+                              url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+                              int base_spec_len,
+                              const url_parse::Parsed& base_parsed,
+                              const char16* relative,
+                              int relative_length,
+                              url_canon::CharsetConverter* charset_converter,
+                              url_canon::CanonOutput* output,
+                              url_parse::Parsed* output_parsed);
 
 // Replaces components in the given VALID input url. The new canonical URL info
 // is written to output and out_parsed.
 //
 // Returns true if the resulting URL is valid.
-bool ReplaceComponents(const char* spec,
-                       const url_parse::Parsed& parsed,
-                       const url_canon::Replacements<char>& replacements,
-                       url_canon::CanonOutput* output,
-                       url_parse::Parsed* out_parsed);
-bool ReplaceComponents(const char* spec,
-                       const url_parse::Parsed& parsed,
-                       const url_canon::Replacements<UTF16Char>& replacements,
-                       url_canon::CanonOutput* output,
-                       url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+    const char* spec,
+    int spec_len,
+    const url_parse::Parsed& parsed,
+    const url_canon::Replacements<char>& replacements,
+    url_canon::CharsetConverter* charset_converter,
+    url_canon::CanonOutput* output,
+    url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+    const char* spec,
+    int spec_len,
+    const url_parse::Parsed& parsed,
+    const url_canon::Replacements<char16>& replacements,
+    url_canon::CharsetConverter* charset_converter,
+    url_canon::CanonOutput* output,
+    url_parse::Parsed* out_parsed);
 
 // String helper functions ----------------------------------------------------
 
@@ -155,16 +202,20 @@
 //
 // The versions of this function that don't take a b_end assume that the b
 // string is NULL terminated.
-bool LowerCaseEqualsASCII(const char* a_begin,
-                          const char* a_end,
-                          const char* b);
-bool LowerCaseEqualsASCII(const char* a_begin,
-                          const char* a_end,
-                          const char* b_begin,
-                          const char* b_end);
-bool LowerCaseEqualsASCII(const UTF16Char* a_begin,
-                          const UTF16Char* a_end,
-                          const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+                                   const char* a_end,
+                                   const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+                                   const char* a_end,
+                                   const char* b_begin,
+                                   const char* b_end);
+GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
+                                   const char16* a_end,
+                                   const char* b);
+
+// Unescapes the given string using URL escaping rules.
+GURL_API void DecodeURLEscapeSequences(const char* input, int length,
+                                       url_canon::CanonOutputW* output);
 
 }  // namespace url_util
 

diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc
new file mode 100644
index 0000000..bb04905
--- /dev/null
+++ b/googleurl/src/url_util_unittest.cc

@@ -0,0 +1,222 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_test_utils.h"
+#include "googleurl/src/url_util.h"
+#include "testing/base/public/gunit.h"
+
+// Yields the number of elements in the array |a|. The second divisor
+// evaluates to zero, and therefore forces a division by zero, whenever
+// sizeof(a) is not an exact multiple of sizeof(*(a)).
+// From googleurl/base/basictypes.h
+#define ARRAYSIZE_UNSAFE(a) \
+  ((sizeof(a) / sizeof(*(a))) / \
+   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+
+// Exercises url_util::FindAndCompareScheme: both the boolean result and the
+// scheme component it reports through its last argument.
+TEST(URLUtilTest, FindAndCompareScheme) {
+  url_parse::Component scheme;
+
+  // A URL whose scheme is present and equal to the one compared against.
+  // NULL is passed as the output component on the first call.
+  const char kHttpUrl[] = "http://www.com/";
+  const int kHttpUrlLen = static_cast<int>(strlen(kHttpUrl));
+  EXPECT_TRUE(
+      url_util::FindAndCompareScheme(kHttpUrl, kHttpUrlLen, "http", NULL));
+  EXPECT_TRUE(
+      url_util::FindAndCompareScheme(kHttpUrl, kHttpUrlLen, "http", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 4));
+
+  // The scheme is located but differs from the one compared against; its
+  // position is still reported.
+  EXPECT_FALSE(
+      url_util::FindAndCompareScheme(kHttpUrl, kHttpUrlLen, "https", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 4));
+
+  // Input without any scheme.
+  const char kNoScheme[] = "httpfoobar";
+  EXPECT_FALSE(url_util::FindAndCompareScheme(
+      kNoScheme, static_cast<int>(strlen(kNoScheme)), "http", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component());
+
+  // An empty scheme in the input matches an empty comparison scheme.
+  const char kEmptyScheme[] = ":foo.com/";
+  EXPECT_TRUE(url_util::FindAndCompareScheme(
+      kEmptyScheme, static_cast<int>(strlen(kEmptyScheme)), "", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 0));
+
+  // Empty input has no scheme at all, so even "" must not match.
+  EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component());
+
+  // Whitespace characters inside the scheme: the URL should be canonicalized
+  // before the comparison is made.
+  const char kWhitespaceUrl[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
+  EXPECT_TRUE(url_util::FindAndCompareScheme(
+      kWhitespaceUrl, static_cast<int>(strlen(kWhitespaceUrl)), "javascript",
+      &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(1, 10));
+
+  // Control characters are stripped from the ends but kept in the middle, so
+  // the scheme found here is not "javascript".
+  const char kControlUrl[] = "\02jav\02scr\03ipt:alert(1)";
+  EXPECT_FALSE(url_util::FindAndCompareScheme(
+      kControlUrl, static_cast<int>(strlen(kControlUrl)), "javascript",
+      &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(1, 11));
+}
+
+// Calls url_util::ReplaceComponents first with a NULL spec and then with an
+// empty spec, both with a length of zero and no charset converter. The return
+// values are ignored: the calls only have to complete without crashing.
+static void ReplaceOnNullAndEmptySpec(
+    const url_parse::Parsed& parsed,
+    const url_canon::Replacements<char>& replacements,
+    url_canon::CanonOutput* output,
+    url_parse::Parsed* new_parsed) {
+  url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, output,
+                              new_parsed);
+  url_util::ReplaceComponents("", 0, parsed, replacements, NULL, output,
+                              new_parsed);
+}
+
+// Crash test only: nothing is asserted about the produced output. The same
+// output buffer and Parsed structures are reused for every call.
+TEST(URLUtilTest, ReplaceComponents) {
+  url_parse::Parsed parsed;
+  url_canon::RawCanonOutputT<char> output;
+  url_parse::Parsed new_parsed;
+  url_canon::Replacements<char> replacements;
+
+  // Only the ref is replaced.
+  replacements.SetRef("test", url_parse::Component(0, 4));
+  ReplaceOnNullAndEmptySpec(parsed, replacements, &output, &new_parsed);
+
+  // Only the host is replaced.
+  replacements.ClearRef();
+  replacements.SetHost("test", url_parse::Component(0, 4));
+  ReplaceOnNullAndEmptySpec(parsed, replacements, &output, &new_parsed);
+
+  // Ref and host are both cleared again. The pair of calls is issued twice.
+  replacements.ClearHost();
+  ReplaceOnNullAndEmptySpec(parsed, replacements, &output, &new_parsed);
+  ReplaceOnNullAndEmptySpec(parsed, replacements, &output, &new_parsed);
+}
+
+// Canonicalizes |base_url|, replaces its scheme with |scheme| through
+// url_util::ReplaceComponents, and returns the spec that was written out.
+// The boolean results of Canonicalize and ReplaceComponents are ignored, so a
+// string is returned whether or not the resulting URL is valid.
+static std::string CheckReplaceScheme(const char* base_url,
+                                      const char* scheme) {
+  // The lengths handed to url_util / url_parse are ints, so the size_t results
+  // of strlen() are converted explicitly, as the other tests in this file do.
+  const int base_url_len = static_cast<int>(strlen(base_url));
+  const int scheme_len = static_cast<int>(strlen(scheme));
+
+  // Make sure the input is canonicalized.
+  url_canon::RawCanonOutput<32> original;
+  url_parse::Parsed original_parsed;
+  url_util::Canonicalize(base_url, base_url_len, NULL,
+                         &original, &original_parsed);
+
+  // Replace only the scheme; the component covers the whole |scheme| string.
+  url_canon::Replacements<char> replacements;
+  replacements.SetScheme(scheme, url_parse::Component(0, scheme_len));
+
+  std::string output_string;
+  url_canon::StdStringCanonOutput output(&output_string);
+  url_parse::Parsed output_parsed;
+  url_util::ReplaceComponents(original.data(), original.length(),
+                              original_parsed, replacements, NULL,
+                              &output, &output_parsed);
+
+  // NOTE(review): Complete() presumably finalizes |output_string| before it is
+  // read; confirm against url_canon_stdstring.h.
+  output.Complete();
+  return output_string;
+}
+
+// Checks the spec produced by CheckReplaceScheme for a list of base URL and
+// replacement scheme pairs.
+TEST(URLUtilTest, ReplaceScheme) {
+  struct ReplaceSchemeCase {
+    const char* expected;
+    const char* base_url;
+    const char* scheme;
+  } replace_cases[] = {
+    {"https://google.com/", "http://google.com/", "https"},
+    {"file://google.com/", "http://google.com/", "file"},
+    {"http://home/Build", "file:///Home/Build", "http"},
+    {"javascript:foo", "about:foo", "javascript"},
+    {"://google.com/", "http://google.com/", ""},
+    {"http://google.com/", "about:google.com", "http"},
+    {"http:", "", "http"},
+#ifdef WIN32
+    // Magic Windows drive letter behavior when converting to a file URL.
+    {"file:///E:/foo/", "http://localhost/e:foo/", "file"},
+#endif
+    // The original note here said this result "will probably change to
+    // "about://google.com/" when we fix http://crbug.com/160", which is the
+    // value already expected below.
+    // NOTE(review): confirm whether that note is stale.
+    {"about://google.com/", "http://google.com/", "about"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(replace_cases); i++) {
+    EXPECT_EQ(replace_cases[i].expected,
+              CheckReplaceScheme(replace_cases[i].base_url,
+                                 replace_cases[i].scheme));
+  }
+}
+
+// Checks url_util::DecodeURLEscapeSequences, which writes UTF-16 output, on
+// plain text, on every escape from %01 through %7f, on escaped UTF-8, on %00
+// and on an escaped byte sequence that is not valid UTF-8.
+TEST(URLUtilTest, DecodeURLEscapeSequences) {
+  // |output| is the expected result after converting the decoded UTF-16 back
+  // to UTF-8. Each escape row ends in a literal '/' that is passed through.
+  struct DecodeCase {
+    const char* input;
+    const char* output;
+  } decode_cases[] = {
+    {"hello, world", "hello, world"},
+    {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/",
+     "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"},
+    {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/",
+     "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"},
+    {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/",
+     " !\"#$%&'()*+,-.//"},
+    {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/",
+     "0123456789:;<=>?/"},
+    {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/",
+     "@ABCDEFGHIJKLMNO/"},
+    {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/",
+     "PQRSTUVWXYZ[\\]^_/"},
+    {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/",
+     "`abcdefghijklmno/"},
+    {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/",
+     "pqrstuvwxyz{|}~\x7f/"},
+    // Test un-UTF-8-ization: the escaped bytes form valid UTF-8, so the
+    // round trip through UTF-16 gives back the same bytes.
+    {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+  };
+
+  // DecodeURLEscapeSequences takes an int length, so the size_t returned by
+  // strlen() is converted explicitly at every call site below.
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(decode_cases); i++) {
+    const char* input = decode_cases[i].input;
+    url_canon::RawCanonOutputT<char16> output;
+    url_util::DecodeURLEscapeSequences(
+        input, static_cast<int>(strlen(input)), &output);
+    EXPECT_EQ(decode_cases[i].output,
+              url_test_utils::ConvertUTF16ToUTF8(
+                string16(output.data(), output.length())));
+  }
+
+  // Our decode should decode %00, i.e. the output must not be the literal
+  // text "%00".
+  const char zero_input[] = "%00";
+  url_canon::RawCanonOutputT<char16> zero_output;
+  url_util::DecodeURLEscapeSequences(
+      zero_input, static_cast<int>(strlen(zero_input)), &zero_output);
+  EXPECT_NE("%00",
+            url_test_utils::ConvertUTF16ToUTF8(
+              string16(zero_output.data(), zero_output.length())));
+
+  // Test the error behavior for invalid UTF-8. The expected output keeps the
+  // two bytes that do not form a valid sequence (0xe4, 0xa0) as individual
+  // code units and decodes the valid trailing sequence to U+597D. The final 0
+  // only terminates the array for the string16 constructor.
+  const char invalid_input[] = "%e4%a0%e5%a5%bd";
+  const char16 invalid_expected[4] = {0x00e4, 0x00a0, 0x597d, 0};
+  url_canon::RawCanonOutputT<char16> invalid_output;
+  url_util::DecodeURLEscapeSequences(
+      invalid_input, static_cast<int>(strlen(invalid_input)), &invalid_output);
+  EXPECT_EQ(string16(invalid_expected),
+            string16(invalid_output.data(), invalid_output.length()));
+}

diff --git a/googleurl/third_party/icu/build/using_icu.vsprops b/googleurl/third_party/icu/build/using_icu.vsprops
new file mode 100644
index 0000000..a3989ef
--- /dev/null
+++ b/googleurl/third_party/icu/build/using_icu.vsprops

@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioPropertySheet
+	ProjectType="Visual C++"
+	Version="8.00"
+	Name="using_icu"
+	>
+	<Tool
+		Name="VCCLCompilerTool"
+		AdditionalIncludeDirectories="&quot;$(SolutionDir)..\..\third_party\icu\public\common&quot;;&quot;$(SolutionDir)..\..\third_party\icu\public\i18n&quot;"
+	/>
+</VisualStudioPropertySheet>

diff --git a/googleurl/third_party/icu36/build/using_icu.vsprops b/googleurl/third_party/icu36/build/using_icu.vsprops
deleted file mode 100644
index 876532f..0000000
--- a/googleurl/third_party/icu36/build/using_icu.vsprops
+++ /dev/null

@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioPropertySheet
-	ProjectType="Visual C++"
-	Version="8.00"
-	Name="using_icu"
-	>
-	<Tool
-		Name="VCCLCompilerTool"
-		AdditionalIncludeDirectories="&quot;$(SolutionDir)..\third_party\icu36\source\common&quot;;&quot;$(SolutionDir)..\third_party\icu36\source\i18n&quot;"
-	/>
-</VisualStudioPropertySheet>
commit	6ee28c9143d800380b6ab2efe45a682651e8e815	[log] [tgz]
author	Devany Sandoval <sandovad@google.com>	Thu Jul 29 08:56:43 2010 -0700
committer	sandovad <sandovad@google.com>	Tue Sep 03 12:51:02 2019 -0700
tree	67720a304b2340a49d098d619a7483d2fc15d803
parent	2199655e9cb5804f00eaeeccc7be22de10012f3d [diff]