// src/base/strings/string_util.cc - googleurl - Git at Google

 // Copyright 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "base/strings/string_util.h"

 #include <ctype.h>
 #include <errno.h>
 #include <math.h>
 #include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <wchar.h>
 #include <wctype.h>

 #include <algorithm>
 #include <limits>
 #include <vector>

 namespace url {
 namespace base {

 namespace {

 // A pointer is assumed to occupy exactly one "machine word", which makes
 // uintptr_t an integer type of machine-word width.
 typedef uintptr_t MachineWord;
 // sizeof(MachineWord) - 1: the low-order address bits that must all be zero
 // for an address to count as word-aligned.
 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;

 // Returns true when the address held in |pointer| has none of the
 // kMachineWordAlignmentMask bits set.
 inline bool IsAlignedToMachineWord(const void* pointer) {
   const MachineWord address = reinterpret_cast<MachineWord>(pointer);
   return (address & kMachineWordAlignmentMask) == 0;
 }

 // Returns |pointer| with the kMachineWordAlignmentMask bits of its address
 // cleared, so the result is never above |pointer|.
 template <typename T>
 inline T* AlignToMachineWord(T* pointer) {
   const MachineWord address = reinterpret_cast<MachineWord>(pointer);
   const MachineWord word_start = address & ~kMachineWordAlignmentMask;
   return reinterpret_cast<T*>(word_start);
 }

 // NonASCIIMask<word size in bytes, character type>::value() yields a
 // word-sized constant. ANDing it with a word of packed characters gives zero
 // only when every packed character is below 0x80.
 template <size_t size, typename CharacterType>
 struct NonASCIIMask;

 // char: bit 7 of every byte.
 template <>
 struct NonASCIIMask<4, char> {
     static uint32_t value() { return 0x80808080U; }
 };
 template <>
 struct NonASCIIMask<8, char> {
     static uint64_t value() { return 0x8080808080808080ULL; }
 };

 // char16: bits 7..15 of every 16-bit lane (the 0xFF80 pattern presumes
 // char16 is a 16-bit type).
 template <>
 struct NonASCIIMask<4, char16> {
     static uint32_t value() { return 0xFF80FF80U; }
 };
 template <>
 struct NonASCIIMask<8, char16> {
     static uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
 };

 #if defined(WCHAR_T_IS_UTF32)
 // wchar_t when it is 32 bits wide: bits 7..31 of every 32-bit lane.
 template <>
 struct NonASCIIMask<4, wchar_t> {
     static uint32_t value() { return 0xFFFFFF80U; }
 };
 template <>
 struct NonASCIIMask<8, wchar_t> {
     static uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; }
 };
 #endif  // WCHAR_T_IS_UTF32

 // Builds a new StringType holding, in order, the result of calling the
 // per-character ToLowerASCII() on each element of |str|.
 template <typename StringType>
 StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
   const size_t length = str.size();
   StringType lowered;
   lowered.reserve(length);  // Output has exactly as many elements as input.
   for (size_t pos = 0; pos != length; ++pos) {
     lowered.push_back(ToLowerASCII(str[pos]));
   }
   return lowered;
 }

 // Builds a new StringType holding, in order, the result of calling the
 // per-character ToUpperASCII() on each element of |str|.
 template <typename StringType>
 StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
   const size_t length = str.size();
   StringType raised;
   raised.reserve(length);  // Output has exactly as many elements as input.
   for (size_t pos = 0; pos != length; ++pos) {
     raised.push_back(ToUpperASCII(str[pos]));
   }
   return raised;
 }

 }  // namespace

 // Returns a std::string produced by running every character of |str| through
 // the per-character ToLowerASCII() (via ToLowerASCIIImpl).
 std::string ToLowerASCII(StringPiece str) {
   std::string lowered = ToLowerASCIIImpl<std::string>(str);
   return lowered;
 }

 // Returns a string16 produced by running every character of |str| through
 // the per-character ToLowerASCII() (via ToLowerASCIIImpl).
 string16 ToLowerASCII(StringPiece16 str) {
   string16 lowered = ToLowerASCIIImpl<string16>(str);
   return lowered;
 }

 template <class Char>
 inline bool DoIsStringASCII(const Char* characters, size_t length) {
   MachineWord all_char_bits = 0;
   const Char* end = characters + length;

   // Prologue: align the input.
   while (!IsAlignedToMachineWord(characters) && characters != end) {
     all_char_bits |= *characters;
     ++characters;
   }

   // Compare the values of CPU word size.
   const Char* word_end = AlignToMachineWord(end);
   const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
   while (characters < word_end) {
     all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
     characters += loop_increment;
   }

   // Process the remaining bytes.
   while (characters != end) {
     all_char_bits |= *characters;
     ++characters;
   }

   MachineWord non_ascii_bit_mask =
       NonASCIIMask<sizeof(MachineWord), Char>::value();
   return !(all_char_bits & non_ascii_bit_mask);
 }

 // Runs DoIsStringASCII over the characters viewed by |str| and returns its
 // verdict.
 bool IsStringASCII(const StringPiece& str) {
   const size_t count = str.length();
   return DoIsStringASCII(str.data(), count);
 }

 // Runs DoIsStringASCII over the 16-bit-string view |str| and returns its
 // verdict.
 bool IsStringASCII(const StringPiece16& str) {
   const size_t count = str.length();
   return DoIsStringASCII(str.data(), count);
 }

 // Runs DoIsStringASCII over the contents of the string16 |str| and returns
 // its verdict.
 bool IsStringASCII(const string16& str) {
   const size_t count = str.length();
   return DoIsStringASCII(str.data(), count);
 }

 #if defined(WCHAR_T_IS_UTF32)
 // Runs DoIsStringASCII over the contents of the wide string |str| and
 // returns its verdict. Only compiled when WCHAR_T_IS_UTF32 is defined.
 bool IsStringASCII(const std::wstring& str) {
   const size_t count = str.length();
   return DoIsStringASCII(str.data(), count);
 }
 #endif

 // Implementation note: Normally this function will be called with a hardcoded
 // constant for the lowercase_ascii parameter. Constructing a StringPiece from
 // a C constant requires running strlen, so the result will be two passes
 // through the buffers, one to find the length of lowercase_ascii, and one to
 // compare each letter.
 //
 // This function could have taken a const char* to avoid this and only do one
 // pass through the string. But the strlen is faster than the case-insensitive
 // compares and lets us early-exit in the case that the strings are different
 // lengths (will often be the case for non-matches). So whether one approach or
 // the other will be faster depends on the case.
 //
 // The hardcoded strings are typically very short so it doesn't matter, and the
 // string piece gives additional flexibility for the caller (doesn't have to be
 // null terminated) so we choose the StringPiece route.
 //
 // Returns true when |str| and |lowercase_ascii| have the same size and every
 // element of |str|, after the per-character ToLowerASCII(), equals the
 // element of |lowercase_ascii| at the same index.
 template <typename Str>
 static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
                                           StringPiece lowercase_ascii) {
   const size_t length = str.size();
   // Different sizes can never match; bail out before looking at any element.
   if (length != lowercase_ascii.size())
     return false;

   // Advance while elements agree; a full walk means every element matched.
   size_t pos = 0;
   while (pos < length && ToLowerASCII(str[pos]) == lowercase_ascii[pos])
     ++pos;
   return pos == length;
 }

 // 8-bit overload: forwards to DoLowerCaseEqualsASCII<std::string> and returns
 // its result.
 bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) {
   const bool matches =
       DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
   return matches;
 }

 // 16-bit overload: forwards to DoLowerCaseEqualsASCII<string16> and returns
 // its result.
 bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) {
   const bool matches = DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii);
   return matches;
 }

 }  // namespace base
 }  // namespace url
	// Copyright 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "base/strings/string_util.h"

	#include <ctype.h>
	#include <errno.h>
	#include <math.h>
	#include <stdarg.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>
	#include <wchar.h>
	#include <wctype.h>

	#include <algorithm>
	#include <limits>
	#include <vector>

	namespace url {
	namespace base {

	namespace {

	// A pointer is assumed to occupy exactly one "machine word", which makes
	// uintptr_t an integer type of machine-word width.
	typedef uintptr_t MachineWord;
	// sizeof(MachineWord) - 1: the low-order address bits that must all be zero
	// for an address to count as word-aligned.
	const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;

	// Returns true when the address held in |pointer| has none of the
	// kMachineWordAlignmentMask bits set.
	inline bool IsAlignedToMachineWord(const void* pointer) {
	  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
	  return (address & kMachineWordAlignmentMask) == 0;
	}

	// Returns |pointer| with the kMachineWordAlignmentMask bits of its address
	// cleared, so the result is never above |pointer|.
	template <typename T>
	inline T* AlignToMachineWord(T* pointer) {
	  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
	  const MachineWord word_start = address & ~kMachineWordAlignmentMask;
	  return reinterpret_cast<T*>(word_start);
	}

	// NonASCIIMask<word size in bytes, character type>::value() yields a
	// word-sized constant. ANDing it with a word of packed characters gives zero
	// only when every packed character is below 0x80.
	template <size_t size, typename CharacterType>
	struct NonASCIIMask;

	// char: bit 7 of every byte.
	template <>
	struct NonASCIIMask<4, char> {
	  static uint32_t value() { return 0x80808080U; }
	};
	template <>
	struct NonASCIIMask<8, char> {
	  static uint64_t value() { return 0x8080808080808080ULL; }
	};

	// char16: bits 7..15 of every 16-bit lane (the 0xFF80 pattern presumes
	// char16 is a 16-bit type).
	template <>
	struct NonASCIIMask<4, char16> {
	  static uint32_t value() { return 0xFF80FF80U; }
	};
	template <>
	struct NonASCIIMask<8, char16> {
	  static uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
	};

	#if defined(WCHAR_T_IS_UTF32)
	// wchar_t when it is 32 bits wide: bits 7..31 of every 32-bit lane.
	template <>
	struct NonASCIIMask<4, wchar_t> {
	  static uint32_t value() { return 0xFFFFFF80U; }
	};
	template <>
	struct NonASCIIMask<8, wchar_t> {
	  static uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; }
	};
	#endif // WCHAR_T_IS_UTF32

	// Builds a new StringType holding, in order, the result of calling the
	// per-character ToLowerASCII() on each element of |str|.
	template <typename StringType>
	StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
	  const size_t length = str.size();
	  StringType lowered;
	  lowered.reserve(length);  // Output has exactly as many elements as input.
	  for (size_t pos = 0; pos != length; ++pos) {
	    lowered.push_back(ToLowerASCII(str[pos]));
	  }
	  return lowered;
	}

	// Builds a new StringType holding, in order, the result of calling the
	// per-character ToUpperASCII() on each element of |str|.
	template <typename StringType>
	StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
	  const size_t length = str.size();
	  StringType raised;
	  raised.reserve(length);  // Output has exactly as many elements as input.
	  for (size_t pos = 0; pos != length; ++pos) {
	    raised.push_back(ToUpperASCII(str[pos]));
	  }
	  return raised;
	}

	} // namespace

	// Returns a std::string produced by running every character of |str| through
	// the per-character ToLowerASCII() (via ToLowerASCIIImpl).
	std::string ToLowerASCII(StringPiece str) {
	  std::string lowered = ToLowerASCIIImpl<std::string>(str);
	  return lowered;
	}

	// Returns a string16 produced by running every character of |str| through
	// the per-character ToLowerASCII() (via ToLowerASCIIImpl).
	string16 ToLowerASCII(StringPiece16 str) {
	  string16 lowered = ToLowerASCIIImpl<string16>(str);
	  return lowered;
	}

	template <class Char>
	inline bool DoIsStringASCII(const Char* characters, size_t length) {
	MachineWord all_char_bits = 0;
	const Char* end = characters + length;

	// Prologue: align the input.
	while (!IsAlignedToMachineWord(characters) && characters != end) {
	all_char_bits \|= *characters;
	++characters;
	}

	// Compare the values of CPU word size.
	const Char* word_end = AlignToMachineWord(end);
	const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
	while (characters < word_end) {
	all_char_bits \|= (reinterpret_cast<const MachineWord>(characters));
	characters += loop_increment;
	}

	// Process the remaining bytes.
	while (characters != end) {
	all_char_bits \|= *characters;
	++characters;
	}

	MachineWord non_ascii_bit_mask =
	NonASCIIMask<sizeof(MachineWord), Char>::value();
	return !(all_char_bits & non_ascii_bit_mask);
	}

	// Runs DoIsStringASCII over the characters viewed by |str| and returns its
	// verdict.
	bool IsStringASCII(const StringPiece& str) {
	  const size_t count = str.length();
	  return DoIsStringASCII(str.data(), count);
	}

	// Runs DoIsStringASCII over the 16-bit-string view |str| and returns its
	// verdict.
	bool IsStringASCII(const StringPiece16& str) {
	  const size_t count = str.length();
	  return DoIsStringASCII(str.data(), count);
	}

	// Runs DoIsStringASCII over the contents of the string16 |str| and returns
	// its verdict.
	bool IsStringASCII(const string16& str) {
	  const size_t count = str.length();
	  return DoIsStringASCII(str.data(), count);
	}

	#if defined(WCHAR_T_IS_UTF32)
	// Runs DoIsStringASCII over the contents of the wide string |str| and
	// returns its verdict. Only compiled when WCHAR_T_IS_UTF32 is defined.
	bool IsStringASCII(const std::wstring& str) {
	  const size_t count = str.length();
	  return DoIsStringASCII(str.data(), count);
	}
	#endif

	// Implementation note: Normally this function will be called with a hardcoded
	// constant for the lowercase_ascii parameter. Constructing a StringPiece from
	// a C constant requires running strlen, so the result will be two passes
	// through the buffers, one to find the length of lowercase_ascii, and one to
	// compare each letter.
	//
	// This function could have taken a const char* to avoid this and only do one
	// pass through the string. But the strlen is faster than the case-insensitive
	// compares and lets us early-exit in the case that the strings are different
	// lengths (will often be the case for non-matches). So whether one approach or
	// the other will be faster depends on the case.
	//
	// The hardcoded strings are typically very short so it doesn't matter, and the
	// string piece gives additional flexibility for the caller (doesn't have to be
	// null terminated) so we choose the StringPiece route.
	//
	// Returns true when |str| and |lowercase_ascii| have the same size and every
	// element of |str|, after the per-character ToLowerASCII(), equals the
	// element of |lowercase_ascii| at the same index.
	template <typename Str>
	static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
	                                          StringPiece lowercase_ascii) {
	  const size_t length = str.size();
	  // Different sizes can never match; bail out before looking at any element.
	  if (length != lowercase_ascii.size())
	    return false;

	  // Advance while elements agree; a full walk means every element matched.
	  size_t pos = 0;
	  while (pos < length && ToLowerASCII(str[pos]) == lowercase_ascii[pos])
	    ++pos;
	  return pos == length;
	}

	// 8-bit overload: forwards to DoLowerCaseEqualsASCII<std::string> and returns
	// its result.
	bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) {
	  const bool matches =
	      DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
	  return matches;
	}

	// 16-bit overload: forwards to DoLowerCaseEqualsASCII<string16> and returns
	// its result.
	bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) {
	  const bool matches = DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii);
	  return matches;
	}

	} // namespace base
	} // namespace url