// libplatform/platform_win32.cpp - mp4v2 - Git at Google

 ///////////////////////////////////////////////////////////////////////////////
 //
 //  The contents of this file are subject to the Mozilla Public License
 //  Version 1.1 (the "License"); you may not use this file except in
 //  compliance with the License. You may obtain a copy of the License at
 //  http://www.mozilla.org/MPL/
 //
 //  Software distributed under the License is distributed on an "AS IS"
 //  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 //  License for the specific language governing rights and limitations
 //  under the License.
 //
 //  The Original Code is MP4v2.
 //
 //  The Initial Developer of the Original Code is David Byron.
 //  Portions created by David Byron are Copyright (C) 2010.
 //  All Rights Reserved.
 //
 //  Contributors:
 //      David Byron, dbyron@dbyron.com
 //
 ///////////////////////////////////////////////////////////////////////////////

 #include "src/impl.h"
 #include "libplatform/impl.h" /* for platform_win32_impl.h which declares Utf8ToFilename */
 #include <algorithm> /* for replace */
 #include <windows.h>

 // Make the names declared in mp4v2::impl visible to name lookup in
 // mp4v2, and therefore in the namespaces nested inside it (such as
 // mp4v2::platform::win32 below).
 namespace mp4v2 {
     using namespace impl;
 }

 /**
  * Set this to 1 to compile in extra debugging.  When it is
  * 0 every LOG_PRINTF invocation expands to nothing.
  */
 #define EXTRA_DEBUG 0

 /**
  * @def LOG_PRINTF
  *
  * Call log.printf if EXTRA_DEBUG is defined to 1.  Do
  * nothing otherwise.
  *
  * @param X the complete, parenthesized argument list to
  * hand to log.printf, e.g. LOG_PRINTF((level,"fmt",arg)).
  * The extra set of parentheses lets a single macro
  * parameter carry any number of arguments.
  */
 #if EXTRA_DEBUG
 #define LOG_PRINTF(X) log.printf X
 #else
 #define LOG_PRINTF(X)
 #endif

 /**
  * Section 2.13 "Special Characters and Noncharacters" of
  * _The Unicode Standard, Version 5.0_
  * (http://www.unicode.org/versions/Unicode5.0.0/bookmarks.html)
  * defines "The Replacement Character" U+FFFD as the
  * "general substitute character" that "can be substituted
  * for any 'unknown' character in another encoding that can
  * not be mapped in terms of known Unicode characters"
  *
  * See also section D.7 of 10646.
  *
  * Utf8DecodeChar stores this value in its UTF-16 output
  * whenever the UTF-8 input can't be decoded.
  */
 #define REPLACEMENT_CHAR    0xFFFD

 namespace mp4v2 { namespace platform { namespace win32 {

 /**
  * One row of a lookup table that maps an inclusive range
  * of UCS-4 code points to the number of bytes needed to
  * encode any code point in that range as UTF-8.
  *
  * The members are initialized positionally (see
  * s_len_info), so their order must not change.
  */
 struct utf8_len_info
 {
     /**
      * Lower bound (inclusive): this row applies to a code
      * point >= @p range_min.
      */
     UINT32      range_min;

     /**
      * Upper bound (inclusive): this row applies to a code
      * point <= @p range_max.
      */
     UINT32      range_max;

     /**
      * The number of bytes (UTF-8 code units) required to
      * encode a code point in [@p range_min,@p range_max]
      * as UTF-8.
      */
     size_t      num_chars;
 };

 /**
  * A structure to store the number of characters required to
  * encode a particular UCS-4 character in UTF-8.  For now
  * we're using wide characters (which according to
  * http://msdn.microsoft.com/en-us/library/ms776414.aspx
  * means UTF-16 since Windows 2000) so we're only using up
  * to 4-byte UTF-8 sequences.  Parts of the range aren't
  * valid (e.g. [U+D800,U+DFFF] but that's handled elsewhere.
  */
 static struct utf8_len_info s_len_info[] =
 {
     { 0x00000000, 0x0000007F, 1 },
     { 0x00000080, 0x000007FF, 2 },
     { 0x00000800, 0x0000FFFF, 3 },
     { 0x00010000, 0x001FFFFF, 4 },
     { 0x00200000, 0x03FFFFFF, 5 },
     { 0x04000000, 0x7FFFFFFF, 6 }
 };

 /**
  * Utf8ToFilename constructor
  *
  * Builds the UTF-16 form of a UTF-8 encoded filename.  A
  * relative path is converted as is.  An absolute path has
  * its forward slashes turned into backslashes and gets a
  * long filename prefix (\\\?\\ for a drive path,
  * \\\?\\UNC\\ for a UNC path) before it is converted.
  *
  * @param utf8string a UTF-8 encoded string that does not
  * begin with \\\?\\ nor \\\?\\UNC\\
  *
  * @see IsUTF16Valid to see whether the constructor
  * succeeded
  */
 Utf8ToFilename::Utf8ToFilename( const string &utf8string )
     : _wideCharString( NULL )
       , utf8( _utf8 )
 {
     // Background on path lengths and prefixes is at
     // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
     // The aim is the longest path the OS allows: per that
     // page relative paths stop at 260 characters, while a
     // properly prefixed absolute path can reach about
     // 32767 characters.

     // Nothing to prefix for a relative path: convert it
     // and we're finished.
     if (!IsAbsolute(utf8string))
     {
         _wideCharString = ConvertToUTF16(utf8string);
         return;
     }

     // The prefix is written with backslashes, so make the
     // rest of the path match.  This works on a private copy
     // and is a no-op when there are no forward slashes.
     string nativePath(utf8string);
     std::replace(nativePath.begin(),nativePath.end(),'/','\\');
     ASSERT(nativePath.length() > 0);

     // Every prefixed path starts with \\?\ ...
     string prefixedPath("\\\\?\\");

     if (IsUncPath(nativePath))
     {
         // ...and a UNC path (\\host\path) becomes
         // \\?\UNC\host\path, so drop one of its two leading
         // backslashes rather than simply prepending.
         prefixedPath += "UNC";
         prefixedPath += nativePath.substr(1);
     }
     else
     {
         prefixedPath += nativePath;
     }

     // UTF-16 is the form CreateFileW wants
     _wideCharString = ConvertToUTF16(prefixedPath);
 }

 /**
  * Utf8ToFilename destructor
  *
  * Releases the UTF-16 string that the constructor obtained
  * from ConvertToUTF16, if there is one.
  */
 Utf8ToFilename::~Utf8ToFilename( )
 {
     // free(NULL) does nothing, so there's no need to test
     // the pointer first
     free(_wideCharString);
     _wideCharString = NULL;
 }

 /**
  * Convert a UTF-8 encoded string to a UTF-16 string
  *
  * @param utf8 the NUL-terminated UTF-8 string to decode
  *
  * @retval NULL error allocating memory for UTF-16 string
  *
  * @retval non-NULL NUL-terminated UTF-16 version of @p
  * utf8.  Invalid portions of UTF-8 are represented by a
  * replacement character U+FFFD.  The caller is
  * responsible for freeing this memory.
  */
 wchar_t *
 Utf8ToFilename::ConvertToUTF16 ( const string &utf8string )
 {
     int         num_bytes;
     size_t      num_chars;
     wchar_t     *retval;

     ASSERT(sizeof(wchar_t) == 2);

     // Store the utf8 string in our member variable so it's
     // available
     _utf8 = utf8string;

     // We need to find out how many characters we're dealing
     // with so we know how much memory to allocate.  At the
     // same time, it's possible that the string we've been
     // given isn't valid UTF-8.  So, just use the length of
     // the string we've been given as the number of
     // characters to allocate.  The decoded string can't be
     // longer than this, even taking into account surrogate
     // pairs since they require 4 UTF-8 characters but only
     // two UTF-16 character elements.
     num_chars = utf8string.length();

     LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: entry point (%d character string)",
                 __FUNCTION__,num_chars));

     /*
     ** Allocate space for the decoded string.  Add one
     ** for the NUL terminator.
     */
     num_bytes = (num_chars + 1) * sizeof(wchar_t);
     retval = (wchar_t *)malloc(num_bytes);
     if (!retval)
     {
         log.errorf("%s: error allocating memory for %d byte(s)",__FUNCTION__,num_bytes);
         return NULL;
     }

     /*
     ** ConvertToUTF16Buf zeroes out the memory so don't
     ** do it here
     */

     // ConvertToUTF16Buf shouldn't fail if we allocated
     // enough memory for the entire string.  Check
     // anyway just to be safe.
     if (!ConvertToUTF16Buf(utf8string.c_str(),retval,num_bytes))
     {
         // But ASSERT so we can find the problem and fix
         // it.
         ASSERT(0);
         free(retval);
         retval = NULL;
         return NULL;
     }

     return retval;
 }

 /**
  * Convert a UTF-8 encoded string to a UTF-16 string in
  * a previously allocated buffer.
  *
  * @param utf8 the NUL-terminated UTF-8 string to decode
  *
  * @param utf16_buf the buffer in which to place the
  * UTF-16 version of @p utf8.  The buffer is zeroed before
  * decoding starts, so the result is NUL terminated
  * whenever at least one element of the buffer is left
  * unused.  If the decoded string fills the buffer
  * exactly, @p utf16_buf is not NUL terminated.
  *
  * @param num_bytes the number of bytes that @p utf16_buf
  * points to.  Must be a multiple of sizeof(wchar_t).
  *
  * @retval 0 error converting @p utf8 to UTF-16: @p
  * num_bytes is smaller than one wchar_t, or @p utf8
  * requires more space to encode in UTF-16 than indicated
  * by @p num_bytes.  In the latter case, @p utf16_buf
  * contains the UTF-16 encoding of as much of @p utf8 as
  * possible.
  *
  * @retval 1 all of @p utf8 was converted to UTF-16 in @p
  * utf16_buf.  Invalid portions of UTF-8 are represented
  * by a replacement character U+FFFD.  Note that this is
  * returned even when the output fills the buffer exactly
  * and so has no NUL terminator.
  */
 int
 Utf8ToFilename::ConvertToUTF16Buf ( const char      *utf8,
                                     wchar_t         *utf16_buf,
                                     size_t          num_bytes )
 {
     size_t      i;
     const UINT8 *next_char;
     size_t      num_utf16_chars;
     size_t      num_input_bytes;
     const UINT8 *p;
     wchar_t     this_utf16[2];

     ASSERT(utf8);
     ASSERT(utf16_buf || (num_bytes == 0));
     ASSERT(sizeof(wchar_t) == 2);

     ASSERT(num_bytes % sizeof(wchar_t) == 0);

     LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: converting \"%s\"",__FUNCTION__,utf8));

     // If the input is NUL-terminated (which it better
     // be), the NUL-terminator is a valid input byte as
     // well
     num_input_bytes = strlen(utf8) + 1;

     // Make sure there's room for at least a NUL
     // terminator in the output.  Checking num_bytes == 0
     // would probably do, but checking against the size
     // of a whole element lets the message tell the
     // caller more about the input string.
     //
     // The size_t values are cast to match the %u
     // conversions: size_t is wider than unsigned int on
     // 64-bit Windows and variadic arguments are not
     // converted for us.
     if (num_bytes < sizeof(wchar_t))
     {
         log.errorf("%s: %u byte(s) is not enough to transform a %u byte UTF-8 string "
                    "to NUL-terminated UTF-16",__FUNCTION__,(unsigned int)num_bytes,
                    (unsigned int)num_input_bytes);
         return 0;
     }

     ASSERT(num_bytes > 0);
     ASSERT(utf16_buf);
     memset(utf16_buf,0,num_bytes);

     // The number of UTF-16 characters we've got space for
     // in utf16_buf
     num_utf16_chars = num_bytes / sizeof(wchar_t);

     p = (const UINT8 *)utf8;
     i = 0;
     while (*p && (i < num_utf16_chars))
     {
         LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: decoding first UTF-8 byte 0x%02X (UTF-16 "
                     "character %d of at most %d)",__FUNCTION__,*p,(int)(i + 1),
                     (int)num_utf16_chars));

         // Utf8DecodeChar only writes the second element
         // for a surrogate pair, so clear both to be able
         // to tell afterwards whether we got one
         memset(this_utf16,0,sizeof(this_utf16));

         // This function decodes illegal bytes/sequences
         // with a replacement character and returns the
         // pointer to the next character to decode.  Pass
         // NULL since we don't care about detecting invalid
         // characters here.
         next_char = Utf8DecodeChar(p,num_input_bytes,this_utf16,NULL);

         // We've always got one character to assign
         utf16_buf[i++] = this_utf16[0];

         // If we're dealing with a surrogate pair,
         // assign the low half too
         if (this_utf16[1])
         {
             // The high half may have used the last
             // element of the buffer.  Fail explicitly
             // here: if this was the last character of
             // the input, the check after the loop
             // wouldn't notice that anything is missing.
             if (i >= num_utf16_chars)
             {
                 log.errorf("%s: out of space in %u  byte output string to store surrogate "
                            "pair low half (0x%04X)",__FUNCTION__,(unsigned int)num_bytes,
                            (unsigned int)this_utf16[1]);
                 return 0;
             }

             utf16_buf[i++] = this_utf16[1];
         }

         // Put this here to make it brutally clear that
         // the cast is safe
         ASSERT(next_char >= p);
         num_input_bytes -= (size_t)(next_char - p);
         p = next_char;
     }

     if (*p)
     {
         // We ran out of output space before reaching the
         // end of the input.  Since num_input_bytes
         // includes 1 for the NUL-terminator, it's got to
         // be bigger than one here.
         ASSERT(num_input_bytes > 1);
         log.errorf("%s: %u byte(s) of input string remain(s) undecoded (%s): out of space in "
                    "%u byte output string",__FUNCTION__,(unsigned int)(num_input_bytes - 1),
                    (const char *)p,(unsigned int)num_bytes);
         return 0;
     }

     return 1;
 }

 /**
  * Accessor for the length of a prefix (i.e. \\\?\\ or
  * \\\?\\UNC\\) that begins a filename
  *
  * @param utf8string the UTF-8 encoded filename to
  * examine
  *
  * @return the length of the prefix of @p utf8string in
  * characters
  */
 int
 Utf8ToFilename::GetPrefixLen ( const string &utf8string )
 {
     if (utf8string.find("\\\\?\\") == 0)
     {
         return strlen("\\\\?\\");
     }

     if (utf8string.find("\\\\?\\UNC\\") == 0)
     {
         return strlen("\\\\?\\UNC\\");
     }

     return 0;
 }

 /**
  * Determine if a path is absolute or not
  *
  * A path is absolute if it has the form x:\ or x:/
  * followed by anything, or if IsUncPath accepts it.  A
  * drive relative path (x: or x:something) and the empty
  * string are not absolute.
  *
  * @param utf8string the UTF-8 encoded path to examine
  * that does not begin with \\\?\\ nor \\\?\\UNC\\
  *
  * @retval 0 @p utf8string is not an absolute path
  * @retval 1 @p utf8string is an absolute path
  */
 int
 Utf8ToFilename::IsAbsolute ( const string &utf8string )
 {
     const size_t    len = utf8string.length();

     // Assume utf8string doesn't already start with a
     // long filename prefix (i.e. \\?\ or \\?\UNC\)
     // since the logic here depends on that.
     ASSERT(GetPrefixLen(utf8string) == 0);

     // Is an empty string absolute or relative?  It's
     // not absolute since we can't tell what
     // drive/volume it's for so say it's relative.
     if (len == 0)
     {
         return 0;
     }

     // Here we're looking for:
     //  x:   drive relative
     //  x:\  absolute path
     //
     // Check the length before each index so we only read
     // characters that are really part of the string.
     if ((len >= 2) && (utf8string[1] == ':'))
     {
         // It starts with x:, but is it x:/ ?
         if ((len >= 3) && IsPathSeparator(utf8string[2]))
         {
             // Yup -- it's absolute
             return 1;
         }

         // Nope, not x:/, just x: or x:something
         return 0;
     }

     // UNC paths are absolute paths too
     return IsUncPath(utf8string);
 }

 /**
  * Determine if a character is a valid path separator,
  * i.e. a backslash or a forward slash
  *
  * @param c the character to check
  *
  * @retval 0 @p c is not a valid path separator
  * @retval 1 @p c is a valid path separator
  */
 int
 Utf8ToFilename::IsPathSeparator ( char c )
 {
     switch (c)
     {
         case '\\':
         case '/':
             return 1;

         default:
             return 0;
     }
 }

 /**
  * Determine if a path is a UNC path
  *
  * @param utf8string the UTF-8 encoded path to examine
  * that does not begin with \\\?\\ nor \\\?\\UNC\\
  *
  * @retval 0 @p utf8string is not a UNC path
  * @retval 1 @p utf8string is a UNC path
  */
 int
 Utf8ToFilename::IsUncPath ( const string &utf8string )
 {
     const char  *host;
     int         num_slashes;
     const char  *p;

     // Assume utf8string doesn't already start with a
     // long filename prefix (i.e. \\?\ or \\?\UNC\)
     // since the logic here depends on that.
     ASSERT(GetPrefixLen(utf8string) == 0);

     // Is an empty string a UNC path?  No.
     if (utf8string.length() == 0)
     {
         return 0;
     }

     //  Recognize:
     //    //volume/path
     //    \\volume\path
     if (!IsPathSeparator(utf8string[0]))
     {
         // If it doesn't start with a path separator, it's
         // not a UNC path.
         return 0;
     }

     // The path starts with a slash, so it could be a UNC
     // path.  See if it starts with two slashes...Be careful
     // though, it might have more than 2 slashes.
     p = utf8string.c_str();
     num_slashes = 0;
     while (*p && IsPathSeparator(*p))
     {
         num_slashes++;
         p++;
     }

     // We found a slash at the beginning so we better have
     // at least one here
     ASSERT(num_slashes >= 1);
     if ((num_slashes > 2) || !(*p))
     {
         // If we've got more than two slashes or we've
         // run off the end of the string (///foo or
         // //)...who knows how the OS will handle it,
         // but it's not a UNC path.
         log.errorf("%s: don't understand path(%s)",__FUNCTION__,utf8string.c_str());
         return 0;
     }

     // If we've only got one slash, it looks like a
     // drive relative path.  If it's something like
     // /foo//bar it's not clear how the OS handles it,
     // but that's someone else's problem.  It's not a
     // UNC path.
     if (num_slashes == 1)
     {
         return 0;
     }

     // If we're here, we've got two slashes followed by
     // a non-slash.  Something like //foo.  To be a
     // proper UNC path, we need to see a hostname
     // (e.g. foo), and then another slash.  If not, it's
     // not a UNC path.
     ASSERT(num_slashes == 2);

     // Tempting to use STRTOK_R here, but that modifies
     // the original string.  Instead of making a copy,
     // search manually.
     host = p;
     while (*p && !IsPathSeparator(*p))
     {
         p++;
     }

     // We checked for separators above, so we better
     // have moved on at least a bit
     ASSERT(host != p);
     if (!(*p))
     {
         // We ran off the end of the string without finding
         // another separator.  So, we've got something like
         //
         //  //foobar
         //
         // which isn't a UNC path.
         log.warningf("%s: incomplete UNC path: host only(%s)",__FUNCTION__,
                      utf8string.c_str());
         return 0;
     }

     // p points to a separator, so...we've got one of:
     //  //host//
     //  //host//blah
     //  //host/bar
     //
     // Of these, only the last is a proper UNC path.  See
     // what we've got after p.
     num_slashes = 0;
     while (*p && IsPathSeparator(*p))
     {
         num_slashes++;
         p++;
     }

     // We better have at least one slash or our logic is
     // broken
     ASSERT(num_slashes >= 1);
     if (!(*p))
     {
         // //host// (or maybe //host///), but no path
         // part after the host
         log.warningf("%s: incomplete UNC path: no path after host(%s)",
                      __FUNCTION__,utf8string.c_str());
         return 0;
     }

     if (num_slashes > 1)
     {
         // Another busted case //host//blah or
         // //host///blah, etc.
         log.warningf("%s: invalid UNC path: too many slashes after host(%s)",
                      __FUNCTION__,utf8string.c_str());
         return 0;
     }

     // If we're here it means num_slashes is exactly 1
     // so we've got //host/something so we're calling
     // that a UNC path.
     return 1;
 }

 /**
  * Accessor for whether the UTF-16 encoded string is
  * valid, which is the case whenever this object holds a
  * (non-NULL) UTF-16 string
  *
  * @retval false the UTF-16 encoded string is not valid
  * @retval true the UTF-16 encoded string is valid
  */
 bool
 Utf8ToFilename::IsUTF16Valid( ) const
 {
     return _wideCharString != NULL;
 }

 /**
  * Decode one UTF-8 encoded character into a UTF-16
  * character.  The trouble here is that UTF-16 is really a
  * variable length encoding to handle surrogate pairs
  * (0xD800 --> 0xDFFF).  This way UTF-16 can handle more
  * than 2^16 characters.  So we need to be careful.  UCS-2
  * is a fixed width (16-bit) encoding that we could use, but
  * then we can only handle 2^16 characters (the BMP).  To
  * handle all 2^21 characters, we need UTF-16.
  *
  * What does Windows really use?  UTF-16.  See
  * http://unicode.org/iuc/iuc17/b2/slides.ppt for a
  * discussion.
  * http://discuss.fogcreek.com/joelonsoftware5/default.asp?cmd=show&ixPost=168543
  * also has some info.
  *
  * Anything that can't be decoded (an invalid first byte,
  * a sequence longer than the available input, an invalid
  * continuation byte, an overlong encoding, a code point
  * above U+10FFFF, a code point in the surrogate range, or
  * U+FFFE/U+FFFF) is logged and decoded as the replacement
  * character U+FFFD.
  *
  * @param utf8_char the UTF-8 character to decode, possibly
  * occupying multiple bytes, not necessarily NUL terminated
  *
  * @param num_bytes the number of bytes that @p utf8_char
  * points to (must be > 0)
  *
  * @param utf16 populated with the UTF-16 equivalent of @p
  * utf8_char.  Note that this must point to at least 2
  * wchar_t's of memory so there's room to hold a surrogate
  * pair.  The second element is written only for a
  * surrogate pair; it is left untouched otherwise, so a
  * caller that uses it to detect a pair must clear it
  * first.
  *
  * @param invalid if not NULL, populated with 1 if @p
  * utf8_char doesn't point to a valid UTF-8 encoded
  * character, 0 if @p utf8_char is valid.
  *
  * @return the next byte to examine for subsequent decoding
  * (some number of bytes after @p utf8_char): one byte on
  * for an invalid first byte or a sequence that doesn't fit
  * in @p num_bytes, the offending byte for an invalid
  * continuation byte, and the byte after the whole sequence
  * otherwise.  This may not be valid to dereference
  * depending on the value of @p num_bytes.
  */
 const UINT8 *
 Utf8ToFilename::Utf8DecodeChar ( const UINT8    *utf8_char,
                                  size_t         num_bytes,
                                  wchar_t        *utf16,
                                  int            *invalid )

 {
     wchar_t     high_half;
     int         i;
     UINT8       len;
     wchar_t     low_half;
     UINT8       mask;
     const UINT8 *p;
     UINT32      ucs4;
     int         valid_len;

     ASSERT(utf8_char);
     ASSERT(num_bytes > 0);
     ASSERT(utf16);

     LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: decoding UTF-8 string at address 0x%p",
                 __FUNCTION__,utf8_char));

     /*
     ** Assume utf8_char is invalid until we learn otherwise
     */
     if (invalid)
     {
         *invalid = 1;
     }

     /*
     ** Traverse the UTF-8 encoding and figure out what we've
     ** got.
     */
     p = utf8_char;

     /*
     ** This is the number of bytes we expect based on the
     ** first octet.  If subsequent bytes are NUL or invalid,
     ** then it may not be the same as the actual length.
     */
     len = Utf8NumOctets(*p);
     if (len == 0)
     {
         log.errorf("%s: 0x%02X is not a valid first byte of a UTF-8 encoded character",__FUNCTION__,*p);

         /*
         ** Use the replacement character and advance past
         ** the invalid byte
         */
         *utf16 = REPLACEMENT_CHAR;
         return p + 1;
     }

     /*
     ** Handle one byte encodings in a special case.  See
     ** below for an explanation of how we mask successive
     ** bytes of an encoding to see why.  We're depending on
     ** the validation in Utf8NumOctets here to make this OK.
     */
     if (len == 1)
     {
         /*
         ** There's no intermediate UCS-4 step here.  We go
         ** straight to UTF-16 since they're the same.
         */
         LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: one byte UTF-16 encoding: 0x%02X",
                     __FUNCTION__,*p));
         *utf16 = *p;
         if (invalid)
         {
             *invalid = 0;
         }
         return p + 1;
     }

     /*
     ** Make sure we've got enough bytes in our input string
     ** to form a valid UTF-8 character.  This is also what
     ** keeps the continuation byte loop below inside the
     ** input.
     */
     if (len > num_bytes)
     {
         /*
         ** num_bytes is a size_t, which is wider than the
         ** unsigned int that %u consumes on 64-bit Windows,
         ** so convert it explicitly.
         */
         log.errorf("%s: first byte 0x%02X indicates a %d byte "
                    "UTF-8 character, but we only have %u valid byte(s)",
                    __FUNCTION__,*p,len,(unsigned int)num_bytes);
         *utf16 = REPLACEMENT_CHAR;
         return p + 1;
     }

     /*
     ** Traverse the bytes that should be part of this UTF-8
     ** encoded character and make sure we don't have an
     ** overlength encoding, and make sure that each
     ** character is valid.
     */

     /*
     ** As we traverse each character, we mask off the
     ** appropriate number of bits and include them in the
     ** overall result.
     **
     ** 1 byte encoding [U+00000000,U+0000007F]: 7 bits (7 bits total) (handled above)
     ** 2 byte encoding [U+00000080,U+000007FF]: 5 bits, 6 bits (11 bits total)
     ** 3 byte encoding [U+00000800,U+0000FFFF]: 4 bits, 6 bits, 6 bits (16 bits total)
     ** 4 byte encoding [U+00010000,U+001FFFFF]: 3 bits, 6 bits, 6 bits, 6 bits (21 bits total)
     ** 5 byte encoding [U+00200000,U+03FFFFFF]: 2 bits, 6 bits, 6 bits, 6 bits, 6 bits (26 bits total)
     ** 6 byte encoding [U+04000000,U+7FFFFFFF]: 1 bit, 6 bits, 6 bits, 6 bits, 6 bits, 6 bits (31 bits total)
     **
     ** So, mask the initial byte appropriately, then take
     ** the bottom 6 bits from the remaining bytes.  To be
     ** brutally explicit, the first byte mask is:
     **
     ** 1 byte encoding: 0x7F (or 0x80 - 1) (or (1 << 7) - 1)
     ** 2 byte encoding: 0x1F (or 0x20 - 1) (or (1 << 5) - 1)
     ** 3 byte encoding: 0x0F (or 0x10 - 1) (or (1 << 4) - 1)
     ** 4 byte encoding: 0x07 (or 0x08 - 1) (or (1 << 3) - 1)
     ** 5 byte encoding: 0x03 (or 0x04 - 1) (or (1 << 2) - 1)
     ** 6 byte encoding: 0x01 (or 0x02 - 1) (or (1 << 1) - 1)
     **
     ** So, the one byte encoding is a special case (again,
     ** handled above), but for the other lengths, the mask
     ** is (1 << (7 - len)) - 1.
     */

     /*
     ** Handle the first byte of multi-byte encodings since
     ** it's special
     */
     ASSERT(len > 1);
     ASSERT(len <= 6);
     mask = (UINT8)((1 << (7 - len)) - 1);
     ucs4 = *p & mask;
     p++;

     /*
     ** Now handle the remaining bytes
     */
     for (i = 1;(i < len);i++)
     {
         if ((*p < 0x80) || (*p > 0xBF))
         {
             log.errorf("%s: 0x%02X is not a valid continuation character in a UTF-8 encoding",
                        __FUNCTION__,*p);

             /*
             ** Use the replacement character and return the
             ** next byte after the invalid sequence as the
             ** place for subsequent decoding operations.  In
             ** this case the invalid continuation character
             ** could be the beginning of the next valid
             ** sequence, so return that.
             */
             *utf16 = REPLACEMENT_CHAR;
             return p;
         }

         /*
         ** For the remainder of the bytes, shift over what
         ** we've already got by 6 bits, and then OR in the
         ** bottom 6 bits of the current byte.
         */
         ucs4 = (ucs4 << 6) | (*p & 0x3F);
         p++;
     }

     /*
     ** p is now pointing to the beginning of the next UTF-8
     ** sequence to decode...
     */

     /*
     ** Finally, detect overlong encodings.  For example, a
     ** line feed (U+000A) should be encoded as 0x0A
     ** (0b00001010) but could in theory be encoded in UTF-8
     ** as 0xC0 0x8A (0b10001010).
     **
     ** Another example is the forward slash (/) (U+002F).
     ** It should be encoded as 0x2F, but could in theory be
     ** encoded in UTF-8 as 0xC0 0xAF or as 0xE0 0x80 0xAF.
     **
     ** I can't see any reasonable way to do this other than
     ** to check the decoded character against its expected
     ** length
     */
     valid_len = (int)Utf8LenFromUcs4(ucs4);
     if (valid_len == 0)
     {
         /*
         ** This should never happen
         */
         log.errorf("%s: decoded a character that we can't encode again (0x%08X)",__FUNCTION__,ucs4);
         ASSERT(0);

         /*
         ** If it does, use the replacement character
         */
         *utf16 = REPLACEMENT_CHAR;
         return p;
     }

     if (len != valid_len)
     {
         ASSERT(len > valid_len);

         /*
         ** NOTE(review): %s prints up to the next NUL byte,
         ** so this relies on the caller's buffer being NUL
         ** terminated even though that isn't required of
         ** utf8_char in general -- confirm for new callers.
         */
         log.errorf("%s: overlong encoding(%s)...should be %d byte(s), not %d",__FUNCTION__,
                    (const char *)utf8_char,valid_len,len);
         *utf16 = REPLACEMENT_CHAR;
         return p;
     }

     /*
     ** UTF-16 can only hold 21 bits.  As of now (21-dec-10),
     ** there's no Unicode code point bigger than 2^21.  To
     ** be safe, check...
     */
     if (ucs4 > 0x0010FFFF)
     {
         log.errorf("%s: code point 0x%08X is too big",__FUNCTION__,ucs4);
         *utf16 = REPLACEMENT_CHAR;
         return p;
     }

     /*
     ** Check to make sure we're not working with a "code
     ** point" that is in the range used to indicate
     ** surrogate pairs.
     */
     if ((ucs4 >= 0x0000D800) && (ucs4 <= 0x0000DFFF))
     {
         log.errorf("%s: code point 0x%08X is in the range used to indicate surrogate pairs",
                    __FUNCTION__,ucs4);
         *utf16 = REPLACEMENT_CHAR;
         return p;
     }

     /*
     ** To (try to) be complete, check for a couple more
     ** invalid code points
     */
     if ((ucs4 == 0x0000FFFF) || (ucs4 == 0x0000FFFE))
     {
         log.errorf("%s: invalid code point (0x%08X)",__FUNCTION__,ucs4);
         *utf16 = REPLACEMENT_CHAR;
         return p;
     }

     /*
     ** Finally, convert from UCS-4 to UTF-16.  This may be a
     ** straightforward assignment, but we have to deal with
     ** surrogate pairs
     */
     if (ucs4 <= 0x0000FFFF)
     {
         *utf16 = (wchar_t)(ucs4 & 0xFFFF);
         LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: UTF-16 encoding of 0x%08X is 0x%04X",
                     __FUNCTION__,ucs4,*utf16));
         if (invalid)
         {
             *invalid = 0;
         }
         return p;
     }

     /*
     ** Transform UCS-4 into a UTF-16 surrogate pair
     */

     /*
     ** Subtract 0x10000 and take bits [10,19] (where bit 0
     ** is the LSB) of the result, shifted down
     */
     high_half = (wchar_t)(0xD800 + ((ucs4 - 0x00010000) >> 10));

     /*
     ** And the bottom 10 bits [0,9], which the subtraction
     ** doesn't affect
     */
     low_half = (wchar_t)(0xDC00 + (ucs4 & 0x03FF));

     utf16[0] = high_half;
     utf16[1] = low_half;

     LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: UTF-16 encoding of 0x%08X is 0x%04X:0x%04X",
                 __FUNCTION__,ucs4,utf16[0],utf16[1]));

     if (invalid)
     {
         *invalid = 0;
     }

     return p;
 }

 /**
  * Determine the number of bytes required to hold the UTF-8
  * encoding of a UCS-4 code point, by looking up the range
  * of s_len_info that contains it
  *
  * @param ucs4 the code point
  *
  * @retval 0 @p ucs4 is not in any range of s_len_info
  * (logged as an invalid code point)
  *
  * @retval [1,6] the number of bytes required to hold the
  * UTF-8 encoding of @p ucs4
  */
 size_t
 Utf8ToFilename::Utf8LenFromUcs4 ( UINT32 ucs4 )
 {
     const size_t    num_ranges = sizeof(s_len_info) / sizeof(s_len_info[0]);
     size_t          idx = 0;

     LOG_PRINTF((MP4_LOG_VERBOSE4,"%s: processing UCS-4 code point 0x%08X",
                 __FUNCTION__,ucs4));

     // Walk the ranges in order and stop at the first one
     // that ucs4 doesn't fall outside of
     while (idx < num_ranges)
     {
         const struct utf8_len_info &range = s_len_info[idx];

         if (!((ucs4 < range.range_min) || (range.range_max < ucs4)))
         {
             return range.num_chars;
         }

         ++idx;
     }

     log.errorf("%s: 0x%08X is an invalid code point",__FUNCTION__,ucs4);

     return 0;
 }

 /**
  * Determine the number of octets that a UTF-8 encoded
  * character should occupy based on its first byte
  *
  * @param utf8_first_byte the byte to examine
  *
  * @retval 0 @p utf8_first_byte is not a valid first byte of
  * a UTF-8 encoded character
  *
  * @retval [1,6] the number of octets that the character
  * starting with @p utf8_first_byte should occupy
  */
 UINT8
 Utf8ToFilename::Utf8NumOctets ( UINT8 utf8_first_byte )
 {
     /**
      * The mapping from first byte to sequence length,
      * based on information from
      * http://www.unicode.org/versions/corrigendum1.html as
      * well as
      * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      *
      * [0x00,0x7F]: 1       (128 possible values)
      * [0x80,0xBF]: invalid (64 possible values)
      *              (these are continuation bytes)
      * [0xC0,0xDF]: 2       (32 possible values) (see below)
      * [0xE0,0xEF]: 3       (16 possible values)
      * [0xF0,0xF7]: 4       (8 possible values)
      * [0xF8,0xFB]: 5       (4 possible values)
      * [0xFC,0xFD]: 6       (2 possible values)
      * [0xFE,0xFF]: invalid (2 possible values)
      *
      * 0xC0 and 0xC1 are invalid first bytes, but they are
      * deliberately reported as the start of a 2 byte
      * sequence.  Rejecting them here would replace the
      * first byte alone; accepting them makes it likely
      * that the byte and the one after it are replaced
      * together by a single replacement character, which is
      * what
      * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      * assumes in sections 4.1.1, 4.2.1 and 4.3.1.
      *
      * The ranges are contiguous and tested in ascending
      * order, so each test only needs the upper bound: the
      * lower bound is whatever the previous tests ruled
      * out.
      */
     if (utf8_first_byte < 0x80)
     {
         return 1;
     }

     if (utf8_first_byte < 0xC0)
     {
         return 0;
     }

     if (utf8_first_byte < 0xE0)
     {
         return 2;
     }

     if (utf8_first_byte < 0xF0)
     {
         return 3;
     }

     if (utf8_first_byte < 0xF8)
     {
         return 4;
     }

     if (utf8_first_byte < 0xFC)
     {
         return 5;
     }

     if (utf8_first_byte < 0xFE)
     {
         return 6;
     }

     ASSERT((utf8_first_byte == 0xFE) || (utf8_first_byte == 0xFF));
     return 0;
 }

 ///////////////////////////////////////////////////////////////////////////////

 }}} // namespace mp4v2::platform::win32