blob: 095fbfa3b501ccab4661c53ffeaec03dd6153efa [file] [log] [blame]
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/strings/string_util.h"
#include <ctype.h>
#include <errno.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <wchar.h>
#include <wctype.h>
#include <algorithm>
#include <limits>
#include <vector>
namespace url {
namespace base {
namespace {
// MachineWord is an unsigned integer type with the same width as a pointer
// (uintptr_t); the code below treats that width as the CPU's natural word
// size.
typedef uintptr_t MachineWord;

// Low-order address bits that must all be zero for an address to sit on a
// MachineWord boundary.
const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;

// Returns true when the address held in |pointer| is a multiple of
// sizeof(MachineWord).
inline bool IsAlignedToMachineWord(const void* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  return (address & kMachineWordAlignmentMask) == 0;
}

// Rounds |pointer| down to the closest MachineWord boundary that is not
// above it, and returns the result as the same pointer type.
template <typename T>
inline T* AlignToMachineWord(T* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  const MachineWord rounded_down = address & ~kMachineWordAlignmentMask;
  return reinterpret_cast<T*>(rounded_down);
}
// NonASCIIMask<N, C>::value() is an N-byte-wide constant to be AND-ed with a
// machine word that holds packed characters of type C. Within each character
// slot, every bit from bit 7 upwards is set, i.e. every bit that is clear in
// a value below 0x80. Only the specializations below are defined.
template <size_t size, typename CharacterType>
struct NonASCIIMask;

// 4-byte word, 16-bit characters: bits 7..15 of each 16-bit slot.
template <>
struct NonASCIIMask<4, char16> {
  static inline uint32_t value() {
    return 0xFF80FF80U;
  }
};

// 4-byte word, 8-bit characters: bit 7 of each byte.
template <>
struct NonASCIIMask<4, char> {
  static inline uint32_t value() {
    return 0x80808080U;
  }
};

// 8-byte word, 16-bit characters: bits 7..15 of each 16-bit slot.
template <>
struct NonASCIIMask<8, char16> {
  static inline uint64_t value() {
    return 0xFF80FF80FF80FF80ULL;
  }
};

// 8-byte word, 8-bit characters: bit 7 of each byte.
template <>
struct NonASCIIMask<8, char> {
  static inline uint64_t value() {
    return 0x8080808080808080ULL;
  }
};

#if defined(WCHAR_T_IS_UTF32)
// 4-byte word, 32-bit wchar_t: bits 7..31 of the single slot.
template <>
struct NonASCIIMask<4, wchar_t> {
  static inline uint32_t value() {
    return 0xFFFFFF80U;
  }
};

// 8-byte word, 32-bit wchar_t: bits 7..31 of each 32-bit slot.
template <>
struct NonASCIIMask<8, wchar_t> {
  static inline uint64_t value() {
    return 0xFFFFFF80FFFFFF80ULL;
  }
};
#endif  // WCHAR_T_IS_UTF32
// Builds and returns a new StringType of the same length as |str| in which
// each element is the result of calling the single-character ToLowerASCII()
// on the corresponding element of |str|.
template <typename StringType>
StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
  const size_t length = str.size();
  StringType lowered;
  // The output length is known up front, so allocate once.
  lowered.reserve(length);
  for (size_t pos = 0; pos < length; ++pos) {
    lowered.push_back(ToLowerASCII(str[pos]));
  }
  return lowered;
}
// Builds and returns a new StringType of the same length as |str| in which
// each element is the result of calling the single-character ToUpperASCII()
// on the corresponding element of |str|.
template <typename StringType>
StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
  const size_t length = str.size();
  StringType uppered;
  // The output length is known up front, so allocate once.
  uppered.reserve(length);
  for (size_t pos = 0; pos < length; ++pos) {
    uppered.push_back(ToUpperASCII(str[pos]));
  }
  return uppered;
}
} // namespace
// 8-bit overload: returns a std::string produced by ToLowerASCIIImpl from
// the characters of |str|.
std::string ToLowerASCII(StringPiece str) {
  std::string lowered = ToLowerASCIIImpl<std::string>(str);
  return lowered;
}
// 16-bit overload: returns a string16 produced by ToLowerASCIIImpl from the
// characters of |str|.
string16 ToLowerASCII(StringPiece16 str) {
  string16 lowered = ToLowerASCIIImpl<string16>(str);
  return lowered;
}
template <class Char>
inline bool DoIsStringASCII(const Char* characters, size_t length) {
MachineWord all_char_bits = 0;
const Char* end = characters + length;
// Prologue: align the input.
while (!IsAlignedToMachineWord(characters) && characters != end) {
all_char_bits |= *characters;
++characters;
}
// Compare the values of CPU word size.
const Char* word_end = AlignToMachineWord(end);
const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
while (characters < word_end) {
all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
characters += loop_increment;
}
// Process the remaining bytes.
while (characters != end) {
all_char_bits |= *characters;
++characters;
}
MachineWord non_ascii_bit_mask =
NonASCIIMask<sizeof(MachineWord), Char>::value();
return !(all_char_bits & non_ascii_bit_mask);
}
// Returns the result of DoIsStringASCII over the 8-bit characters viewed by
// |str|.
bool IsStringASCII(const StringPiece& str) {
  const char* const chars = str.data();
  const size_t count = str.length();
  return DoIsStringASCII(chars, count);
}
bool IsStringASCII(const StringPiece16& str) {
return DoIsStringASCII(str.data(), str.length());
}
bool IsStringASCII(const string16& str) {
return DoIsStringASCII(str.data(), str.length());
}
#if defined(WCHAR_T_IS_UTF32)
// Returns the result of DoIsStringASCII over the wchar_t characters stored in
// |str|. Only compiled when WCHAR_T_IS_UTF32 is defined.
bool IsStringASCII(const std::wstring& str) {
  const wchar_t* const chars = str.data();
  const size_t count = str.length();
  return DoIsStringASCII(chars, count);
}
#endif
// Implementation note: Normally this function will be called with a hardcoded
// constant for the lowercase_ascii parameter. Constructing a StringPiece from
// a C constant requires running strlen, so the result will be two passes
// through the buffers, one to find the length of lowercase_ascii, and one to
// compare each letter.
//
// This function could have taken a const char* to avoid this and only do one
// pass through the string. But the strlen is faster than the case-insensitive
// compares and lets us early-exit in the case that the strings are different
// lengths (will often be the case for non-matches). So whether one approach or
// the other will be faster depends on the case.
//
// The hardcoded strings are typically very short so it doesn't matter, and the
// string piece gives additional flexibility for the caller (doesn't have to be
// null terminated) so we choose the StringPiece route.
// Returns true when |str| and |lowercase_ascii| have the same length and, at
// every index, ToLowerASCII() of the character in |str| equals the character
// in |lowercase_ascii|. |lowercase_ascii| is compared as given; it is not
// lower-cased here.
template <typename Str>
static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
                                          StringPiece lowercase_ascii) {
  const size_t length = str.size();
  // Different lengths can never match; bail out before touching the data.
  if (length != lowercase_ascii.size())
    return false;

  // Advance past the matching prefix; the strings are equal exactly when
  // that prefix covers the whole input.
  size_t pos = 0;
  while (pos < length && ToLowerASCII(str[pos]) == lowercase_ascii[pos])
    ++pos;
  return pos == length;
}
// 8-bit overload: forwards to DoLowerCaseEqualsASCII to compare the
// lower-cased characters of |str| against |lowercase_ascii|.
bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) {
  const bool matches =
      DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
  return matches;
}
// 16-bit overload: forwards to DoLowerCaseEqualsASCII to compare the
// lower-cased characters of |str| against |lowercase_ascii|.
bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) {
  const bool matches = DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii);
  return matches;
}
} // namespace base
} // namespace url