blob: 04116b1e4a98fcf463cba9aed4133ab5b1ed95e8 [file] [log] [blame]
///////////////////////////////////////////////////////////////////////////////
//
// The contents of this file are subject to the Mozilla Public License
// Version 1.1 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS"
// basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
// License for the specific language governing rights and limitations
// under the License.
//
// The Original Code is MP4v2.
//
// The Initial Developer of the Original Code is David Byron.
// Portions created by David Byron are Copyright (C) 2010.
// All Rights Reserved.
//
// Contributors:
// David Byron, dbyron@dbyron.com
//
///////////////////////////////////////////////////////////////////////////////
#include "src/impl.h"
#include "libplatform/impl.h" /* for platform_win32_impl.h which declares Utf8ToFilename */
#include <algorithm> /* for replace */
#include <windows.h>
namespace mp4v2 {
using namespace impl;
}
/**
 * @def EXTRA_DEBUG
 *
 * Set this to 1 to compile in extra debugging output (see
 * LOG_PRINTF).  Leave it at 0 to compile that output out.
 */
#define EXTRA_DEBUG 0
/**
 * @def LOG_PRINTF
 *
 * Call log.printf with the argument list @p X if EXTRA_DEBUG
 * is non-zero.  Expand to nothing otherwise, in which case
 * the arguments are not evaluated at all.
 *
 * @p X is the complete, parenthesized argument list, so
 * call sites use a double set of parentheses:
 * LOG_PRINTF((level,"format",arg1,arg2));
 */
#if EXTRA_DEBUG
#define LOG_PRINTF(X) log.printf X
#else
#define LOG_PRINTF(X)
#endif
/**
 * @def REPLACEMENT_CHAR
 *
 * The UTF-16 code unit stored in place of any UTF-8 input
 * that can't be decoded.
 *
 * Section 2.13 "Special Characters and Noncharacters" of
 * _The Unicode Standard, Version 5.0_
 * (http://www.unicode.org/versions/Unicode5.0.0/bookmarks.html)
 * defines "The Replacement Character" U+FFFD as the
 * "general substitute character" that "can be substituted
 * for any 'unknown' character in another encoding that can
 * not be mapped in terms of known Unicode characters".
 *
 * See also section D.7 of ISO/IEC 10646.
 */
#define REPLACEMENT_CHAR 0xFFFD
namespace mp4v2 { namespace platform { namespace win32 {
/**
 * One row of the table that maps a range of UCS-4 values to
 * the number of bytes needed to encode any value in that
 * range as UTF-8.
 */
struct utf8_len_info
{
/**
 * Inclusive lower bound: this row applies to a value
 * >= @p range_min.
 */
UINT32 range_min;
/**
 * Inclusive upper bound: this row applies to a value
 * <= @p range_max.
 */
UINT32 range_max;
/**
 * The number of bytes (UTF-8 code units, not characters)
 * required to encode a value in
 * [@p range_min,@p range_max] as UTF-8.
 */
size_t num_chars;
};
/**
* A structure to store the number of characters required to
* encode a particular UCS-4 character in UTF-8. For now
* we're using wide characters (which according to
* http://msdn.microsoft.com/en-us/library/ms776414.aspx
* means UTF-16 since Windows 2000) so we're only using up
* to 4-byte UTF-8 sequences. Parts of the range aren't
* valid (e.g. [U+D800,U+DFFF] but that's handled elsewhere.
*/
static struct utf8_len_info s_len_info[] =
{
{ 0x00000000, 0x0000007F, 1 },
{ 0x00000080, 0x000007FF, 2 },
{ 0x00000800, 0x0000FFFF, 3 },
{ 0x00010000, 0x001FFFFF, 4 },
{ 0x00200000, 0x03FFFFFF, 5 },
{ 0x04000000, 0x7FFFFFFF, 6 }
};
/**
 * Utf8ToFilename constructor
 *
 * Builds the UTF-16 form of @p utf8string that is suitable
 * for the wide-character Windows file APIs (e.g.
 * CreateFileW).
 *
 * See
 * http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
 * for notes about path lengths, prefixes, etc.  The goal is
 * to support the longest path possible.  Relative paths are
 * limited to 260 characters, but absolute paths can be up
 * to about 32767 characters if properly prefixed.  So:
 *
 * - a relative path is converted to UTF-16 as is
 * - an absolute path has its forward slashes turned into
 *   backslashes (the prefix uses backslashes) and is then
 *   prefixed with \\\\?\\ (or \\\\?\\UNC\\ for a UNC path)
 *   before being converted
 *
 * @param utf8string a UTF-8 encoded string that does not
 * begin with \\\\?\\ nor \\\\?\\UNC\\
 *
 * @see IsUTF16Valid to see whether the constructor
 * succeeded
 */
Utf8ToFilename::Utf8ToFilename( const string &utf8string )
    : _wideCharString( NULL )
    , utf8( _utf8 )
{
    if (IsAbsolute(utf8string))
    {
        // Work on a copy with every forward slash replaced
        // by a backslash to keep Windows happy once the
        // prefix is in place
        string normalized(utf8string);
        std::replace(normalized.begin(),normalized.end(),'/','\\');
        ASSERT(!normalized.empty());

        string prefixed("\\\\?\\");
        if (IsUncPath(normalized))
        {
            // A UNC path (\\host\path) already begins with
            // two backslashes, but after the prefix only
            // one is wanted, so drop the first one:
            // \\?\UNC + \host\path
            prefixed += "UNC";
            prefixed += normalized.substr(1);
        }
        else
        {
            prefixed += normalized;
        }

        _wideCharString = ConvertToUTF16(prefixed);
    }
    else
    {
        // Relative paths can't take a prefix
        _wideCharString = ConvertToUTF16(utf8string);
    }
}
/**
 * Utf8ToFilename destructor
 *
 * Releases the UTF-16 string created by the constructor,
 * if there is one.
 */
Utf8ToFilename::~Utf8ToFilename( )
{
    // free() does nothing when handed NULL, so no check is
    // needed for the case where the conversion failed
    free(_wideCharString);
    _wideCharString = NULL;
}
/**
* Convert a UTF-8 encoded string to a UTF-16 string
*
* @param utf8 the NUL-terminated UTF-8 string to decode
*
* @retval NULL error allocating memory for UTF-16 string
*
* @retval non-NULL NUL-terminated UTF-16 version of @p
* utf8. Invalid portions of UTF-8 are represented by a
* replacement character U+FFFD. The caller is
* responsible for freeing this memory.
*/
wchar_t *
Utf8ToFilename::ConvertToUTF16 ( const string &utf8string )
{
int num_bytes;
size_t num_chars;
wchar_t *retval;
ASSERT(sizeof(wchar_t) == 2);
// Store the utf8 string in our member variable so it's
// available
_utf8 = utf8string;
// We need to find out how many characters we're dealing
// with so we know how much memory to allocate. At the
// same time, it's possible that the string we've been
// given isn't valid UTF-8. So, just use the length of
// the string we've been given as the number of
// characters to allocate. The decoded string can't be
// longer than this, even taking into account surrogate
// pairs since they require 4 UTF-8 characters but only
// two UTF-16 character elements.
num_chars = utf8string.length();
LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: entry point (%d character string)",
__FUNCTION__,num_chars));
/*
** Allocate space for the decoded string. Add one
** for the NUL terminator.
*/
num_bytes = (num_chars + 1) * sizeof(wchar_t);
retval = (wchar_t *)malloc(num_bytes);
if (!retval)
{
log.errorf("%s: error allocating memory for %d byte(s)",__FUNCTION__,num_bytes);
return NULL;
}
/*
** ConvertToUTF16Buf zeroes out the memory so don't
** do it here
*/
// ConvertToUTF16Buf shouldn't fail if we allocated
// enough memory for the entire string. Check
// anyway just to be safe.
if (!ConvertToUTF16Buf(utf8string.c_str(),retval,num_bytes))
{
// But ASSERT so we can find the problem and fix
// it.
ASSERT(0);
free(retval);
retval = NULL;
return NULL;
}
return retval;
}
/**
 * Convert a UTF-8 encoded string to a UTF-16 string in
 * a previously allocated buffer.
 *
 * @param utf8 the NUL-terminated UTF-8 string to decode
 *
 * @param utf16_buf the buffer in which to place the
 * UTF-16 version of @p utf8.  The whole buffer is zeroed
 * before decoding, so if the decoded text leaves at least
 * one element unused, @p utf16_buf is NUL terminated.  If
 * the decoded text fills the buffer, @p utf16_buf is not
 * NUL terminated.
 *
 * @param num_bytes the number of bytes that @p utf16_buf
 * points to (must be a multiple of sizeof(wchar_t))
 *
 * @retval 0 error converting @p utf8 to UTF-16,
 * including when @p num_bytes is too small to hold even
 * one UTF-16 element, and when @p utf8 requires more
 * space to encode in UTF-16 than indicated by @p
 * num_bytes.  In the latter case, @p utf16_buf contains
 * the UTF-16 encoding of as much of @p utf8 as possible.
 *
 * @retval 1 successfully converted all of @p utf8 to
 * UTF-16 in @p utf16_buf.  Invalid portions of UTF-8 are
 * represented by a replacement character U+FFFD.
 * NOTE(review): 1 is also returned when the decoded text
 * exactly fills @p utf16_buf, leaving no room for a NUL
 * terminator; callers that need termination must supply
 * one extra element, as ConvertToUTF16 does.
 */
int
Utf8ToFilename::ConvertToUTF16Buf ( const char      *utf8,
                                    wchar_t         *utf16_buf,
                                    size_t          num_bytes )
{
    ASSERT(utf8);
    ASSERT(utf16_buf || (num_bytes == 0));
    ASSERT(sizeof(wchar_t) == 2);
    ASSERT(num_bytes % sizeof(wchar_t) == 0);

    LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: converting \"%s\"",__FUNCTION__,utf8));

    // If the input is NUL-terminated (which it better be),
    // the NUL-terminator is a valid input byte as well
    size_t num_input_bytes = strlen(utf8) + 1;

    // Make sure there's room for at least one UTF-16
    // element in the output.  Checking num_bytes == 0
    // would probably be enough, but this way the error
    // message can tell the caller more about the input
    // string.
    //
    // The size_t values are cast for logging because the
    // format strings use %u, which expects unsigned int.
    if (num_bytes < sizeof(wchar_t))
    {
        log.errorf("%s: %u byte(s) is not enough to transform a %u byte UTF-8 string "
                   "to NUL-terminated UTF-16",__FUNCTION__,
                   static_cast<unsigned int>(num_bytes),
                   static_cast<unsigned int>(num_input_bytes));
        return 0;
    }

    ASSERT(num_bytes > 0);
    ASSERT(utf16_buf);
    memset(utf16_buf,0,num_bytes);

    // The number of UTF-16 elements we've got space for in
    // utf16_buf
    const size_t num_utf16_chars = num_bytes / sizeof(wchar_t);

    const UINT8 *p = reinterpret_cast<const UINT8 *>(utf8);
    size_t i = 0;
    while (*p && (i < num_utf16_chars))
    {
        LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: decoding first UTF-8 byte 0x%02X (UTF-16 "
                    "character %d of at most %d)",__FUNCTION__,*p,
                    static_cast<int>(i + 1),
                    static_cast<int>(num_utf16_chars)));

        // Utf8DecodeChar only writes the second element
        // for a surrogate pair, so start from zero to be
        // able to tell whether it did
        wchar_t this_utf16[2] = { 0, 0 };

        // This function decodes illegal bytes/sequences
        // with a replacement character and returns the
        // pointer to the next character to decode.  Pass
        // NULL since we don't care about detecting invalid
        // characters here.
        const UINT8 * const next_char =
            Utf8DecodeChar(p,num_input_bytes,this_utf16,NULL);

        // We've always got one element to assign
        utf16_buf[i++] = this_utf16[0];

        // If we're dealing with a surrogate pair, assign
        // the low half too
        if (this_utf16[1])
        {
            // We may not have any more room in the UTF-16
            // buffer.  Fail explicitly here: if this was
            // the last character of the input, the check
            // after the loop wouldn't notice that half of
            // the pair is missing.
            if (i >= num_utf16_chars)
            {
                log.errorf("%s: out of space in %u byte output string to store surrogate "
                           "pair low half (0x%04X)",__FUNCTION__,
                           static_cast<unsigned int>(num_bytes),this_utf16[1]);
                return 0;
            }

            utf16_buf[i++] = this_utf16[1];
        }

        // Put this here to make it brutally clear that the
        // cast is safe
        ASSERT(next_char >= p);
        num_input_bytes -= static_cast<size_t>(next_char - p);
        p = next_char;
    }

    if (*p)
    {
        // We ran out of output space with input left over.
        // Since num_input_bytes includes 1 for the
        // NUL-terminator, it's got to be bigger than one
        // here.
        ASSERT(num_input_bytes > 1);
        log.errorf("%s: %u byte(s) of input string remain(s) undecoded (%s): out of space in "
                   "%u byte output string",__FUNCTION__,
                   static_cast<unsigned int>(num_input_bytes - 1),p,
                   static_cast<unsigned int>(num_bytes));
        return 0;
    }

    return 1;
}
/**
* Accessor for the length of a prefix (i.e. \\\?\\ or
* \\\?\\UNC\\) that begins a filename
*
* @param utf8string the UTF-8 encoded filename to
* examine
*
* @return the length of the prefix of @p utf8string in
* characters
*/
int
Utf8ToFilename::GetPrefixLen ( const string &utf8string )
{
if (utf8string.find("\\\\?\\") == 0)
{
return strlen("\\\\?\\");
}
if (utf8string.find("\\\\?\\UNC\\") == 0)
{
return strlen("\\\\?\\UNC\\");
}
return 0;
}
/**
* Determine if a path is absolute or not
*
* @param utf8string the UTF-8 encoded path to examine
* that does not begin with \\\?\\ nor \\\?\\UNC\\
*
* @retval 0 @p utf8string is not an absolute path
* @retval 1 @p utf8string is an absolute path
*/
int
Utf8ToFilename::IsAbsolute ( const string &utf8string )
{
// Assume utf8string doesn't already start with a
// long filename prefix (i.e. \\?\ or \\?\UNC\)
// since the logic here depends on that.
ASSERT(GetPrefixLen(utf8string) == 0);
// Is an empty string absolute or relative? It's
// not absolute since we can't tell what
// drive/volume it's for so say it's relative.
if (utf8string.length() == 0)
{
return 0;
}
// Here we're looking for:
// x: drive relative
// x:\ absolute path
if (utf8string[1] == ':')
{
// It starts with x:, but is it x:/ ?
if ((utf8string.length() >= 2) && IsPathSeparator(utf8string[2]))
{
// Yup -- it's absolute
return 1;
}
// Nope, not x:/, just x:something
return 0;
}
// UNC paths are absolute paths too
return IsUncPath(utf8string);
}
/**
 * Determine if a character is a valid path separator,
 * i.e. a backslash or a forward slash
 *
 * @param c the character to check
 *
 * @retval 0 @p c is not a valid path separator
 * @retval 1 @p c is a valid path separator
 */
int
Utf8ToFilename::IsPathSeparator ( char c )
{
    switch (c)
    {
        case '\\':
        case '/':
            return 1;

        default:
            return 0;
    }
}
/**
 * Determine if a path is a UNC path
 *
 * A UNC path is recognized as exactly two path separators,
 * a host name, exactly one path separator, and then at
 * least one more character:
 *
 *  //host/path
 *  \\\\host\\path
 *
 * Anything else that starts with two separators is
 * reported through the log and is not treated as a UNC
 * path.
 *
 * @param utf8string the UTF-8 encoded path to examine
 * that does not begin with \\\\?\\ nor \\\\?\\UNC\\
 *
 * @retval 0 @p utf8string is not a UNC path
 * @retval 1 @p utf8string is a UNC path
 */
int
Utf8ToFilename::IsUncPath ( const string &utf8string )
{
    // The logic here depends on utf8string not already
    // starting with a long filename prefix (i.e. \\?\ or
    // \\?\UNC\)
    ASSERT(GetPrefixLen(utf8string) == 0);

    // Walk the NUL-terminated form rather than making a
    // copy to tokenize.  The terminator is not a path
    // separator, so the separator scans below stop at the
    // end of the string on their own.
    const char *cursor = utf8string.c_str();

    // Count the separators the path starts with
    int leading = 0;
    for (; IsPathSeparator(*cursor); ++cursor)
    {
        ++leading;
    }

    if (leading == 0)
    {
        // An empty string, or one that doesn't start with
        // a separator, is not a UNC path
        return 0;
    }

    if ((leading > 2) || (*cursor == '\0'))
    {
        // More than two slashes, or nothing but slashes
        // (///foo or //)...who knows how the OS will handle
        // it, but it's not a UNC path.
        log.errorf("%s: don't understand path(%s)",__FUNCTION__,utf8string.c_str());
        return 0;
    }

    if (leading != 2)
    {
        // A single slash looks like a drive relative path.
        // If it's something like /foo//bar it's not clear
        // how the OS handles it, but that's someone else's
        // problem.  It's not a UNC path.
        return 0;
    }

    // Two slashes followed by a non-slash, something like
    // //foo.  To be a proper UNC path there has to be a
    // host name (e.g. foo) and then another slash.
    const char * const host = cursor;
    while ((*cursor != '\0') && !IsPathSeparator(*cursor))
    {
        ++cursor;
    }

    // The scan above stopped on a non-separator, so the
    // host name can't be empty
    ASSERT(cursor != host);

    if (*cursor == '\0')
    {
        // No separator after the host:
        //
        //  //foobar
        //
        // which isn't a UNC path.
        log.warningf("%s: incomplete UNC path: host only(%s)",__FUNCTION__,
                     utf8string.c_str());
        return 0;
    }

    // cursor is on a separator, so we've got one of:
    //  //host//
    //  //host//blah
    //  //host/bar
    //
    // Of these, only the last is a proper UNC path.
    int after_host = 0;
    for (; IsPathSeparator(*cursor); ++cursor)
    {
        ++after_host;
    }

    // We better have at least one slash or our logic is
    // broken
    ASSERT(after_host >= 1);

    if (*cursor == '\0')
    {
        // //host/ (or //host//, etc.), but no path part
        // after the host
        log.warningf("%s: incomplete UNC path: no path after host(%s)",
                     __FUNCTION__,utf8string.c_str());
        return 0;
    }

    if (after_host != 1)
    {
        // Another busted case: //host//blah or
        // //host///blah, etc.
        log.warningf("%s: invalid UNC path: too many slashes after host(%s)",
                     __FUNCTION__,utf8string.c_str());
        return 0;
    }

    // Exactly one slash after the host and something after
    // that: //host/something.  Call that a UNC path.
    return 1;
}
/**
 * Accessor for whether the UTF-16 encoded string is valid,
 * i.e. whether this object holds a converted string
 *
 * @retval false there is no UTF-16 encoded string
 * (_wideCharString is NULL)
 * @retval true the UTF-16 encoded string is valid
 */
bool
Utf8ToFilename::IsUTF16Valid( ) const
{
    return _wideCharString != NULL;
}
/**
 * Decode one UTF-8 encoded character into a UTF-16
 * character.  The trouble here is that UTF-16 is really a
 * variable length encoding to handle surrogate pairs
 * (0xD800 --> 0xDFFF).  This way UTF-16 can handle more
 * than 2^16 characters.  So we need to be careful.  UCS-2
 * is a fixed width (16-bit) encoding that we could use, but
 * then we can only handle 2^16 characters (the BMP).  To
 * handle all 2^21 characters, we need UTF-16.
 *
 * What does Windows really use?  UTF-16.  See
 * http://unicode.org/iuc/iuc17/b2/slides.ppt for a
 * discussion.
 * http://discuss.fogcreek.com/joelonsoftware5/default.asp?cmd=show&ixPost=168543
 * also has some info.
 *
 * Anything that can't be decoded (an invalid first byte, a
 * truncated sequence, an invalid continuation byte, an
 * overlong encoding, a code point above U+10FFFF, a code
 * point in the surrogate range, U+FFFE or U+FFFF) is
 * logged and decoded as the replacement character U+FFFD.
 *
 * @param utf8_char the UTF-8 character to decode, possibly
 * occupying multiple bytes, not necessarily NUL terminated
 *
 * @param num_bytes the number of bytes that @p utf8_char
 * points to (must be > 0)
 *
 * @param utf16 populated with the UTF-16 equivalent of @p
 * utf8_char.  Note that this must point to at least 2
 * wchar_t's of memory so there's room to hold a surrogate
 * pair.  The second element is only written when the
 * result is a surrogate pair, so a caller that wants to
 * detect a pair must zero it beforehand.
 *
 * @param invalid if not NULL, populated with 1 if @p
 * utf8_char doesn't point to a valid UTF-8 encoded
 * character, 0 if @p utf8_char is valid.
 *
 * @return the next byte to examine for subsequent decoding
 * (some number of bytes after @p utf8_char).  This may not
 * be valid to dereference depending on the value of @p
 * num_bytes.
 */
const UINT8 *
Utf8ToFilename::Utf8DecodeChar ( const UINT8    *utf8_char,
                                 size_t         num_bytes,
                                 wchar_t        *utf16,
                                 int            *invalid )
{
    ASSERT(utf8_char);
    ASSERT(num_bytes > 0);
    ASSERT(utf16);

    LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: decoding UTF-8 string at address 0x%p",
                __FUNCTION__,utf8_char));

    /*
    ** Assume utf8_char is invalid until we learn otherwise
    */
    if (invalid)
    {
        *invalid = 1;
    }

    const UINT8 *p = utf8_char;

    /*
    ** This is the number of bytes we expect based on the
    ** first octet.  If subsequent bytes are NUL or invalid,
    ** then it may not be the same as the actual length.
    */
    const UINT8 len = Utf8NumOctets(*p);
    if (len == 0)
    {
        log.errorf("%s: 0x%02X is not a valid first byte of a UTF-8 encoded character",__FUNCTION__,*p);

        /*
        ** Use the replacement character and advance past
        ** the invalid byte
        */
        *utf16 = REPLACEMENT_CHAR;
        return p + 1;
    }

    /*
    ** One byte encodings are a special case since the first
    ** byte mask formula below doesn't apply to them.
    ** There's no intermediate UCS-4 step: the byte value is
    ** the UTF-16 value.
    */
    if (len == 1)
    {
        LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: one byte UTF-16 encoding: 0x%02X",
                    __FUNCTION__,*p));
        *utf16 = *p;
        if (invalid)
        {
            *invalid = 0;
        }
        return p + 1;
    }

    /*
    ** Make sure we've got enough bytes in our input string
    ** to form a valid UTF-8 character.  This is also what
    ** keeps the continuation byte loop below inside the
    ** input.
    */
    if (len > num_bytes)
    {
        log.errorf("%s: first byte 0x%02X indicates a %d byte "
                   "UTF-8 character, but we only have %u valid byte(s)",
                   __FUNCTION__,*p,len,static_cast<unsigned int>(num_bytes));
        *utf16 = REPLACEMENT_CHAR;
        return p + 1;
    }

    /*
    ** As we traverse each byte, we mask off the
    ** appropriate number of bits and include them in the
    ** overall result.
    **
    ** 1 byte encoding [U+00000000,U+0000007F]: 7 bits (7 bits total) (handled above)
    ** 2 byte encoding [U+00000080,U+000007FF]: 5 bits, 6 bits (11 bits total)
    ** 3 byte encoding [U+00000800,U+0000FFFF]: 4 bits, 6 bits, 6 bits (16 bits total)
    ** 4 byte encoding [U+00010000,U+001FFFFF]: 3 bits, 6 bits, 6 bits, 6 bits (21 bits total)
    ** 5 byte encoding [U+00200000,U+03FFFFFF]: 2 bits, 6 bits, 6 bits, 6 bits, 6 bits (26 bits total)
    ** 6 byte encoding [U+04000000,U+7FFFFFFF]: 1 bit, 6 bits, 6 bits, 6 bits, 6 bits, 6 bits (31 bits total)
    **
    ** So, mask the initial byte appropriately, then take
    ** the bottom 6 bits from the remaining bytes.  For the
    ** multi-byte lengths, the first byte mask is
    ** (1 << (7 - len)) - 1:
    **
    ** 2 byte encoding: 0x1F
    ** 3 byte encoding: 0x0F
    ** 4 byte encoding: 0x07
    ** 5 byte encoding: 0x03
    ** 6 byte encoding: 0x01
    */
    ASSERT(len > 1);
    ASSERT(len <= 6);
    const UINT8 mask = static_cast<UINT8>((1 << (7 - len)) - 1);
    UINT32 ucs4 = *p & mask;
    p++;

    /*
    ** Now handle the remaining (continuation) bytes
    */
    for (int i = 1; i < len; i++)
    {
        if ((*p < 0x80) || (*p > 0xBF))
        {
            log.errorf("%s: 0x%02X is not a valid continuation character in a UTF-8 encoding",
                       __FUNCTION__,*p);

            /*
            ** Use the replacement character.  The invalid
            ** continuation byte could be the beginning of
            ** the next valid sequence, so return it as the
            ** place to resume decoding instead of skipping
            ** it.
            */
            *utf16 = REPLACEMENT_CHAR;
            return p;
        }

        /*
        ** Shift over what we've already got by 6 bits, and
        ** then OR in the bottom 6 bits of the current byte.
        */
        ucs4 = (ucs4 << 6) | (*p & 0x3F);
        p++;
    }

    /*
    ** p is now pointing to the beginning of the next UTF-8
    ** sequence to decode...
    */

    /*
    ** Detect overlong encodings.  For example, a line feed
    ** (U+000A) should be encoded as 0x0A but could in
    ** theory be encoded in UTF-8 as 0xC0 0x8A.  Similarly
    ** the forward slash (/) (U+002F) should be encoded as
    ** 0x2F but could in theory be encoded as 0xC0 0xAF or
    ** 0xE0 0x80 0xAF.
    **
    ** Utf8NumOctets accepts every byte in [0xC0,0xDF] as
    ** the start of a 2 byte sequence, so the 0xC0 and 0xC1
    ** forms are caught here, not there.
    **
    ** The check is to compare the length we decoded against
    ** the length the decoded value should have taken.
    */
    const size_t valid_len = Utf8LenFromUcs4(ucs4);
    if (valid_len == 0)
    {
        /*
        ** This should never happen
        */
        log.errorf("%s: decoded a character that we can't encode again (0x%08X)",__FUNCTION__,ucs4);
        ASSERT(0);

        /*
        ** If it does, use the replacement character
        */
        *utf16 = REPLACEMENT_CHAR;
        return p;
    }

    if (len != valid_len)
    {
        ASSERT(len > valid_len);

        /*
        ** utf8_char isn't necessarily NUL terminated, so
        ** limit what gets printed to the len bytes of this
        ** sequence (all of which were validated above)
        ** instead of reading until a NUL turns up.
        */
        log.errorf("%s: overlong encoding(%.*s)...should be %d byte(s), not %d",__FUNCTION__,
                   static_cast<int>(len),reinterpret_cast<const char *>(utf8_char),
                   static_cast<int>(valid_len),len);
        *utf16 = REPLACEMENT_CHAR;
        return p;
    }

    /*
    ** UTF-16 can only represent code points up to U+10FFFF,
    ** but 4, 5 and 6 byte sequences can hold bigger values.
    */
    if (ucs4 > 0x0010FFFF)
    {
        log.errorf("%s: code point 0x%08X is too big",__FUNCTION__,ucs4);
        *utf16 = REPLACEMENT_CHAR;
        return p;
    }

    /*
    ** Check to make sure we're not working with a "code
    ** point" that is in the range used to indicate
    ** surrogate pairs.
    */
    if ((ucs4 >= 0x0000D800) && (ucs4 <= 0x0000DFFF))
    {
        log.errorf("%s: code point 0x%08X is in the range used to indicate surrogate pairs",
                   __FUNCTION__,ucs4);
        *utf16 = REPLACEMENT_CHAR;
        return p;
    }

    /*
    ** To (try to) be complete, check for a couple more
    ** invalid code points
    */
    if ((ucs4 == 0x0000FFFF) || (ucs4 == 0x0000FFFE))
    {
        log.errorf("%s: invalid code point (0x%08X)",__FUNCTION__,ucs4);
        *utf16 = REPLACEMENT_CHAR;
        return p;
    }

    /*
    ** Finally, convert from UCS-4 to UTF-16.  Within the
    ** BMP this is a straightforward assignment.
    */
    if (ucs4 <= 0x0000FFFF)
    {
        *utf16 = static_cast<wchar_t>(ucs4 & 0xFFFF);
        LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: UTF-16 encoding of 0x%08X is 0x%04X",
                    __FUNCTION__,ucs4,*utf16));
        if (invalid)
        {
            *invalid = 0;
        }
        return p;
    }

    /*
    ** Beyond the BMP, transform UCS-4 into a UTF-16
    ** surrogate pair.  The high half carries the top 10
    ** bits of (ucs4 - 0x10000).  The low half carries the
    ** bottom 10 bits, which are the same as the bottom 10
    ** bits of ucs4 since 0x10000 has none of them set.
    */
    utf16[0] = static_cast<wchar_t>(0xD800 + ((ucs4 - 0x00010000) >> 10));
    utf16[1] = static_cast<wchar_t>(0xDC00 + (ucs4 & 0x03FF));
    LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: UTF-16 encoding of 0x%08X is 0x%04X:0x%04X",
                __FUNCTION__,ucs4,utf16[0],utf16[1]));
    if (invalid)
    {
        *invalid = 0;
    }
    return p;
}
/**
 * Determine the number of bytes required to hold the UTF-8
 * encoding of a UCS-4 code point by looking it up in
 * s_len_info
 *
 * @param ucs4 the code point
 *
 * @retval 0 @p ucs4 is not covered by any range in
 * s_len_info (an error is logged)
 *
 * @retval [1,6] the number of bytes required to hold the
 * UTF-8 encoding of @p ucs4
 */
size_t
Utf8ToFilename::Utf8LenFromUcs4 ( UINT32 ucs4 )
{
    LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: processing UCS-4 code point 0x%08X",
                __FUNCTION__,ucs4));

    const size_t num_ranges = sizeof(s_len_info) / sizeof(s_len_info[0]);
    for (size_t idx = 0; idx != num_ranges; ++idx)
    {
        const struct utf8_len_info &range = s_len_info[idx];

        // Skip rows whose range doesn't contain ucs4
        if ((ucs4 < range.range_min) || (ucs4 > range.range_max))
        {
            continue;
        }

        return range.num_chars;
    }

    log.errorf("%s: 0x%08X is an invalid code point",__FUNCTION__,ucs4);
    return 0;
}
/**
 * Determine the number of octets that a UTF-8 encoded
 * character should occupy based on its first byte
 *
 * @param utf8_first_byte the byte to examine
 *
 * @retval 0 @p utf8_first_byte is not a valid first byte of
 * a UTF-8 encoded character
 *
 * @retval [1,6] the number of octets that the character
 * starting with @p utf8_first_byte should occupy
 */
UINT8
Utf8ToFilename::Utf8NumOctets ( UINT8 utf8_first_byte )
{
    /**
     * Here's a mapping from the first byte of a UTF-8
     * character to the number of bytes it should contain
     * based on information from
     * http://www.unicode.org/versions/corrigendum1.html as
     * well as
     * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     *
     * [0x00,0x7F]: 1       (128 possible values)
     * [0x80,0xBF]: invalid (64 possible values)
     *              (these are continuation bytes)
     * [0xC0,0xDF]: 2       (32 possible values) (see below)
     * [0xE0,0xEF]: 3       (16 possible values)
     * [0xF0,0xF7]: 4       (8 possible values)
     * [0xF8,0xFB]: 5       (4 possible values)
     * [0xFC,0xFD]: 6       (2 possible values)
     * [0xFE,0xFF]: invalid (2 possible values)
     *
     * There's some gray area about 0xC0 and 0xC1.  They
     * can only start overlong encodings, so they are
     * invalid first bytes, but the question is how to
     * handle them.  Rejecting them here would replace just
     * that one byte with the replacement character.
     * Accepting them here as the start of a 2 byte
     * sequence means this byte and the subsequent one
     * likely get replaced together with only one
     * replacement character.  That is what
     * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     * assumes in sections 4.1.1, 4.2.1 and 4.3.1, so they
     * are accepted here.
     *
     * The ranges are contiguous, so each test below only
     * needs the upper end of its range: the tests before
     * it have already ruled out everything smaller.
     */
    if (utf8_first_byte < 0x80)
    {
        return 1;
    }
    if (utf8_first_byte < 0xC0)
    {
        return 0;
    }
    if (utf8_first_byte < 0xE0)
    {
        return 2;
    }
    if (utf8_first_byte < 0xF0)
    {
        return 3;
    }
    if (utf8_first_byte < 0xF8)
    {
        return 4;
    }
    if (utf8_first_byte < 0xFC)
    {
        return 5;
    }
    if (utf8_first_byte < 0xFE)
    {
        return 6;
    }

    // Only 0xFE and 0xFF are left
    return 0;
}
///////////////////////////////////////////////////////////////////////////////
}}} // namespace mp4v2::platform::win32