blob: fde703000d4a3e5ab201c37e42086783abb44452 [file] [log] [blame] [edit]
/*
* BRLTTY - A background process providing access to the console screen (when in
* text mode) for a blind person using a refreshable braille display.
*
* Copyright (C) 1995-2023 by The BRLTTY Developers.
*
* BRLTTY comes with ABSOLUTELY NO WARRANTY.
*
* This is free software, placed under the terms of the
* GNU Lesser General Public License, as published by the Free Software
* Foundation; either version 2.1 of the License, or (at your option) any
* later version. Please see the file LICENSE-LGPL for details.
*
* Web Page: http://brltty.app/
*
* This software is maintained by Dave Mielke <dave@mielke.cc>.
*/
#include "prologue.h"
#include <stdio.h>
#include <string.h>
#include "log.h"
#include "utf8.h"
#include "unicode.h"
/* Allocate an uninitialized array of wide characters.
 *
 * count: the number of wchar_t elements wanted.
 *
 * Returns storage obtained from malloc() (so it is released with free()),
 * or NULL - after calling logMallocError() - if the request cannot be
 * satisfied. A count whose size in bytes cannot be represented as a size_t
 * is treated as an allocation failure instead of being allowed to wrap
 * around into an allocation that is too small.
 */
wchar_t *
allocateCharacters (size_t count) {
  /* (size_t)-1 is the largest value a size_t can hold */
  const size_t maximumCount = ((size_t)-1) / sizeof(wchar_t);

  if (count <= maximumCount) {
    wchar_t *characters = malloc(count * sizeof(*characters));
    if (characters) return characters;
  }

  logMallocError();
  return NULL;
}
/* Encode a code point as a NUL-terminated UTF-8 byte sequence.
 *
 * codepoint: the value to encode.
 * utf8: the buffer that receives the bytes followed by a NUL.
 *
 * Returns the number of bytes written, not counting the NUL.
 *
 * The encoding is not restricted to valid Unicode scalar values: values
 * below 0X80 use one byte, and each wider range adds a byte (11, 16, 21,
 * 26, and 31 significant bits for 2, 3, 4, 5, and 6 bytes respectively).
 *
 * NOTE(review): a value with bit 31 set yields seven bytes plus the NUL -
 * confirm that Utf8Buffer is large enough for that case.
 */
size_t
convertCodepointToUtf8 (uint32_t codepoint, Utf8Buffer utf8) {
  size_t length;

  if (codepoint <= 0X7F) {
    /* plain ASCII is stored as is */
    utf8[0] = codepoint;
    length = 1;
  } else {
    /* two bytes hold 11 bits - each additional byte holds five more */
    uint32_t excess = codepoint >> 11;
    length = 2;

    while (excess) {
      excess >>= 5;
      length += 1;
    }

    /* fill from the last byte backward, six bits per byte, giving every
     * byte (including the first one for now) the continuation marker
     */
    {
      uint32_t bits = codepoint;

      for (size_t index=length; index-- > 0; ) {
        utf8[index] = (bits & 0X3F) | 0X80;
        bits >>= 6;
      }
    }

    /* turn the first byte into a lead byte by setting its top length bits */
    utf8[0] |= ~((1 << (8 - length)) - 1);
  }

  utf8[length] = 0;
  return length;
}
/* Decode one code point from a UTF-8 byte sequence.
 *
 * codepoint: where the decoded value is stored on success.
 * utf8: the address of the pointer to the next input byte - it is advanced
 *       past every byte that is consumed.
 * utfs: the address of the number of input bytes available - it is reduced
 *       by the number of bytes consumed.
 *
 * Returns 1 if a complete sequence was decoded, 0 if not (in which case
 * *codepoint is left untouched).
 *
 * - A continuation byte where a sequence should start, or a sequence that
 *   is cut short by the end of the input, is a failure.
 * - An ASCII or lead byte in the middle of a sequence is a failure, and
 *   that byte is left unconsumed so that the caller can resume there.
 * - Continuation bytes immediately following what has been consumed are
 *   also consumed, and they turn an otherwise good result into a failure.
 * - The decoded value isn't validated (overlong forms, surrogates, and
 *   values beyond the Unicode range are all accepted).
 */
int
convertUtf8ToCodepoint (uint32_t *codepoint, const char **utf8, size_t *utfs) {
  uint32_t value = 0;
  int complete = 0; /* a whole sequence has been decoded */
  int started = 0;  /* a lead byte has been seen */
  int pending = 0;  /* continuation bytes still expected */

  while (*utfs) {
    unsigned char byte = (unsigned char)**utf8;
    *utf8 += 1;
    *utfs -= 1;

    if ((byte & 0XC0) == 0X80) {
      /* a continuation byte - it's only meaningful after a lead byte */
      if (!started) break;
      value = (value << 6) | (byte & 0X3F);

      pending -= 1;
      if (pending == 0) {
        complete = 1;
        break;
      }

      continue;
    }

    /* an ASCII or lead byte - it may only begin a sequence */
    if (started) {
      /* give it back so that the next call begins with it */
      *utf8 -= 1;
      *utfs += 1;
      return 0;
    }

    if (byte < 0X80) {
      value = byte;
      complete = 1;
      break;
    }

    /* the one bits following the top two of a lead byte tell how many
     * more continuation bytes there are (only bits 5 through 1 count)
     */
    pending = 1;
    for (unsigned char bit=0X20; (bit > 1) && (byte & bit); bit >>= 1) {
      pending += 1;
    }

    /* the remaining low bits of the lead byte are payload */
    value = byte & ((1 << (6 - pending)) - 1);
    started = 1;
  }

  /* skip stray continuation bytes - their presence means failure */
  while (*utfs && ((**utf8 & 0XC0) == 0X80)) {
    complete = 0;
    *utf8 += 1;
    *utfs -= 1;
  }

  if (!complete) return 0;
  *codepoint = value;
  return 1;
}
/* Encode a wide character as a NUL-terminated UTF-8 byte sequence.
 *
 * The character is converted to a 32-bit unsigned code point and handed to
 * convertCodepointToUtf8(), whose result (the number of bytes written, not
 * counting the NUL) is returned.
 */
size_t
convertWcharToUtf8 (wchar_t character, Utf8Buffer utf8) {
  const uint32_t codepoint = character;
  return convertCodepointToUtf8(codepoint, utf8);
}
/* Decode one wide character from a UTF-8 byte sequence.
 *
 * utf8 and utfs are passed through to convertUtf8ToCodepoint(), which
 * updates them to reflect the bytes that were consumed.
 *
 * Returns WEOF if the decoding fails. A decoded value that is greater than
 * WCHAR_MAX is replaced by UNICODE_REPLACEMENT_CHARACTER.
 */
wint_t
convertUtf8ToWchar (const char **utf8, size_t *utfs) {
  uint32_t codepoint;

  if (!convertUtf8ToCodepoint(&codepoint, utf8, utfs)) return WEOF;

  {
    const uint32_t substitute = UNICODE_REPLACEMENT_CHARACTER;
    return (codepoint > WCHAR_MAX)? substitute: codepoint;
  }
}
/* Convert a NUL-terminated UTF-8 string into wide characters.
 *
 * utf8: the address of the pointer to the input - it is advanced as bytes
 *       are consumed.
 * characters: the address of the pointer to the output - it is advanced
 *             past each character that is stored.
 * count: the number of elements available in the output - one of them is
 *        kept for the terminator.
 *
 * Conversion stops at the end of the input, when only the terminator's
 * element is left, or when a character can't be decoded. If count isn't
 * zero then a terminating 0 is stored at the final output position (the
 * output pointer is left pointing at it).
 *
 * Each decoding step is offered UTF8_LEN_MAX bytes rather than the real
 * remaining length, so the input must be NUL-terminated.
 */
void
convertUtf8ToWchars (const char **utf8, wchar_t **characters, size_t count) {
  for (; **utf8 && (count > 1); count -= 1) {
    size_t available = UTF8_LEN_MAX;
    wint_t character = convertUtf8ToWchar(utf8, &available);

    if (character == WEOF) break;
    **characters = character;
    *characters += 1;
  }

  if (count > 0) **characters = 0;
}
/* Convert wide characters into a NUL-terminated UTF-8 string.
 *
 * characters: the wide characters to convert.
 * count: how many of them there are.
 * buffer: where the UTF-8 bytes are stored.
 * size: the size of the buffer in bytes, including the terminating NUL.
 *
 * Returns the number of bytes stored, not counting the NUL. Conversion
 * stops early, at a character boundary, if the next character's bytes and
 * the NUL wouldn't both fit.
 *
 * If size is zero then nothing at all is written (there isn't even room
 * for the NUL) and zero is returned.
 */
size_t
makeUtf8FromWchars (const wchar_t *characters, unsigned int count, char *buffer, size_t size) {
  if (!size) return 0;

  char *byte = buffer;
  size_t available = size - 1; /* one byte is kept for the NUL */

  for (unsigned int i=0; i<count; i+=1) {
    Utf8Buffer utf8;
    size_t utfs = convertWcharToUtf8(characters[i], utf8);

    /* compare lengths rather than pointers so that no pointer beyond
     * the end of the buffer is ever formed
     */
    if (utfs > available) break;

    memcpy(byte, utf8, utfs);
    byte += utfs;
    available -= utfs;
  }

  *byte = 0;
  return byte - buffer;
}
/* Convert wide characters into a newly allocated UTF-8 string.
 *
 * characters: the wide characters to convert.
 * count: how many of them there are.
 * length: if not NULL, where the number of bytes in the result (not
 *         counting the terminating NUL) is stored on success.
 *
 * Returns a NUL-terminated string which the caller must release with
 * free(), or NULL - after calling logMallocError() - if memory couldn't be
 * allocated (in which case *length isn't touched).
 *
 * The work buffer is allocated from the heap, rather than being a
 * variable-length array on the stack, so that a large count can't exhaust
 * the stack. The returned block always holds all of the bytes reported via
 * length, even when the input contains a zero character.
 */
char *
getUtf8FromWchars (const wchar_t *characters, unsigned int count, size_t *length) {
  /* widen before multiplying so the product isn't truncated to unsigned int */
  size_t size = ((size_t)count * UTF8_LEN_MAX) + 1;
  char *text = malloc(size);

  if (!text) {
    logMallocError();
    return NULL;
  }

  size_t len = makeUtf8FromWchars(characters, count, text, size);

  if ((len + 1) < size) {
    /* give back the unused space - if that fails then the original
     * (larger) block is still valid so just keep using it
     */
    char *trimmed = realloc(text, len + 1);
    if (trimmed) text = trimmed;
  }

  if (length) *length = len;
  return text;
}
/* Convert a NUL-terminated UTF-8 string into wide characters.
 *
 * text: the UTF-8 string.
 * characters: where the wide characters are stored - if NULL then they're
 *             only counted.
 * size: the number of elements in characters.
 *
 * Returns the number of characters converted (or counted). Conversion
 * stops at the end of the text, at a sequence that can't be decoded, at a
 * character whose value is zero, or when the output array is full.
 *
 * A terminating 0 is stored only if there's an element left for it, so a
 * completely filled array is not terminated.
 */
size_t
makeWcharsFromUtf8 (const char *text, wchar_t *characters, size_t size) {
  size_t remaining = strlen(text);
  size_t count = 0;

  while (remaining) {
    /* text and remaining are only used again if this character is kept */
    wint_t character = convertUtf8ToWchar(&text, &remaining);

    if (character == WEOF) break;
    if (character == 0) break;

    if (characters) {
      if (count == size) break;
      characters[count] = character;
    }

    count += 1;
  }

  if (characters && (count < size)) characters[count] = 0;
  return count;
}
/* Count the characters in a NUL-terminated UTF-8 string.
 *
 * This is makeWcharsFromUtf8() without an output array, so counting stops
 * wherever that function would stop converting.
 */
size_t
countUtf8Characters (const char *text) {
  wchar_t *const noCharacters = NULL;
  const size_t noSize = 0;

  return makeWcharsFromUtf8(text, noCharacters, noSize);
}
/* Write a wide character to a stream as UTF-8.
 *
 * Returns 1 if all of the character's bytes were written, 0 if not. A
 * failure is logged - either as an invalid character (the conversion
 * produced no bytes) or as an fwrite system error.
 */
int
writeUtf8Character (FILE *stream, wchar_t character) {
  Utf8Buffer utf8;
  const size_t utfs = convertWcharToUtf8(character, utf8);

  if (!utfs) {
    logBytes(LOG_ERR, "invalid Unicode character", &character, sizeof(character));
    return 0;
  }

  if (fwrite(utf8, 1, utfs, stream) != utfs) {
    logSystemError("fwrite");
    return 0;
  }

  return 1;
}
/* Write an array of wide characters to a stream as UTF-8.
 *
 * characters: the wide characters to write.
 * count: how many of them there are.
 *
 * Returns 1 if every character was written, or 0 as soon as one of them
 * can't be (the rest aren't attempted).
 */
int
writeUtf8Characters (FILE *stream, const wchar_t *characters, size_t count) {
  for (size_t index=0; index<count; index+=1) {
    if (!writeUtf8Character(stream, characters[index])) return 0;
  }

  return 1;
}
/* Write the Unicode byte order mark to a stream as UTF-8.
 *
 * If the byte order mark can't be represented by a wchar_t on this
 * platform then nothing is written and success is reported.
 *
 * Returns 1 on success, 0 if the write failed.
 */
int
writeUtf8ByteOrderMark (FILE *stream) {
  int written = 1;

#if UNICODE_BYTE_ORDER_MARK <= WCHAR_MAX
  if (!writeUtf8Character(stream, UNICODE_BYTE_ORDER_MARK)) written = 0;
#endif /* UNICODE_BYTE_ORDER_MARK <= WCHAR_MAX */

  return written;
}
/* Test whether a character set name denotes UTF-8.
 *
 * The name must be "utf" (in any mix of upper and lower case), optionally
 * followed by one hyphen, followed by "8" and nothing else.
 *
 * Returns 1 if the name matches, 0 if it doesn't.
 */
int
isCharsetUTF8 (const char *name) {
  static const char prefix[] = "utf";
  const size_t prefixLength = sizeof(prefix) - 1;

  if (strncasecmp(name, prefix, prefixLength) != 0) return 0;

  const char *suffix = name + prefixLength;
  if (*suffix == '-') suffix += 1;

  return (suffix[0] == '8') && (suffix[1] == '\0');
}