blob: 1892267afbbbee48b9a83533e63db74e5e1bf502 [file] [log] [blame]
/* Handling strings that are given partially in the source encoding and
partially in Unicode.
Copyright (C) 2001-2018 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
/* Specification. */
#include "xg-mixed-string.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "error.h"
#include "error-progname.h"
#include "flexmember.h"
#include "msgl-ascii.h"
#include "po-charset.h"
#include "unistr.h"
#include "xalloc.h"
#include "xg-pos.h"
#include "gettext.h"
#define _(str) gettext (str)
/* Allocates a single segment. */
static inline struct mixed_string_segment *
segment_alloc (enum segment_type type, const char *string, size_t length)
{
struct mixed_string_segment *segment =
(struct mixed_string_segment *)
xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, length));
segment->type = type;
segment->length = length;
memcpy (segment->contents, string, length);
return segment;
}
/* Clones a single segment. */
static inline struct mixed_string_segment *
segment_clone (const struct mixed_string_segment *segment)
{
return segment_alloc (segment->type, segment->contents, segment->length);
}
mixed_string_ty *
mixed_string_alloc_simple (const char *string,
lexical_context_ty lcontext,
const char *logical_file_name,
int line_number)
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
if (*string == '\0')
{
/* An empty string. */
ms->segments = NULL;
ms->nsegments = 0;
}
else
{
ms->segments = XNMALLOC (1, struct mixed_string_segment *);
if ((xgettext_current_source_encoding == po_charset_ascii
|| xgettext_current_source_encoding == po_charset_utf8)
&& is_ascii_string (string))
/* An optimization. */
ms->segments[0] =
segment_alloc (utf8_encoded, string, strlen (string));
else
/* The general case. */
ms->segments[0] =
segment_alloc (source_encoded, string, strlen (string));
ms->nsegments = 1;
}
ms->lcontext = lcontext;
ms->logical_file_name = logical_file_name;
ms->line_number = line_number;
return ms;
}
mixed_string_ty *
mixed_string_alloc_utf8 (const char *string,
lexical_context_ty lcontext,
const char *logical_file_name,
int line_number)
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
if (*string == '\0')
{
/* An empty string. */
ms->segments = NULL;
ms->nsegments = 0;
}
else
{
ms->segments = XNMALLOC (1, struct mixed_string_segment *);
ms->segments[0] = segment_alloc (utf8_encoded, string, strlen (string));
ms->nsegments = 1;
}
ms->lcontext = lcontext;
ms->logical_file_name = logical_file_name;
ms->line_number = line_number;
return ms;
}
mixed_string_ty *
mixed_string_clone (const mixed_string_ty *ms1)
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
size_t nsegments = ms1->nsegments;
if (nsegments == 0)
{
ms->segments = NULL;
ms->nsegments = 0;
}
else
{
size_t i;
ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
for (i = 0; i < nsegments; i++)
ms->segments[i] = segment_clone (ms1->segments[i]);
ms->nsegments = nsegments;
}
ms->lcontext = ms1->lcontext;
ms->logical_file_name = ms1->logical_file_name;
ms->line_number = ms1->line_number;
return ms;
}
char *
mixed_string_contents (const mixed_string_ty *ms)
{
size_t nsegments = ms->nsegments;
/* Trivial cases. */
if (nsegments == 0)
return xstrdup ("");
if (nsegments == 1 && ms->segments[0]->type == utf8_encoded)
{
/* Return the segment, with a NUL at the end. */
size_t len = ms->segments[0]->length;
char *string = XNMALLOC (len + 1, char);
memcpy (string, ms->segments[0]->contents, len);
string[len] = '\0';
return string;
}
/* General case. */
{
size_t i;
for (i = 0; i < nsegments - 1; i++)
if (memchr (ms->segments[i]->contents, '\0', ms->segments[i]->length)
!= NULL)
{
/* Segment i contains a NUL character. Ignore the remaining
segments. */
nsegments = i + 1;
break;
}
}
{
char **converted_segments = XNMALLOC (nsegments, char *);
size_t length;
length = 0;
{
size_t i;
for (i = 0; i < nsegments; i++)
if (ms->segments[i]->type == source_encoded)
{
char *source_encoded_string;
char *utf8_encoded_string;
/* Copy the segment's contents, with a NUL at the end. */
{
size_t len = ms->segments[i]->length;
source_encoded_string = XNMALLOC (len + 1, char);
memcpy (source_encoded_string, ms->segments[i]->contents, len);
source_encoded_string[len] = '\0';
}
/* Convert it to UTF-8 encoding. */
utf8_encoded_string =
from_current_source_encoding (source_encoded_string,
ms->lcontext,
ms->logical_file_name,
ms->line_number);
if (utf8_encoded_string != source_encoded_string)
free (source_encoded_string);
converted_segments[i] = utf8_encoded_string;
length += strlen (utf8_encoded_string);
}
else
length += ms->segments[i]->length;
}
{
char *string = XNMALLOC (length + 1, char);
{
char *p;
size_t i;
p = string;
for (i = 0; i < nsegments; i++)
if (ms->segments[i]->type == source_encoded)
{
p = stpcpy (p, converted_segments[i]);
free (converted_segments[i]);
}
else
{
memcpy (p, ms->segments[i]->contents, ms->segments[i]->length);
p += ms->segments[i]->length;
}
assert (p == string + length);
*p = '\0';
}
free (converted_segments);
return string;
}
}
}
void
mixed_string_free (mixed_string_ty *ms)
{
struct mixed_string_segment **segments = ms->segments;
size_t nsegments = ms->nsegments;
if (nsegments > 0)
{
size_t i;
for (i = 0; i < nsegments; i++)
free (segments[i]);
}
free (segments);
free (ms);
}
char *
mixed_string_contents_free1 (mixed_string_ty *ms)
{
char *contents = mixed_string_contents (ms);
mixed_string_free (ms);
return contents;
}
mixed_string_ty *
mixed_string_concat (const mixed_string_ty *ms1,
const mixed_string_ty *ms2)
{
/* Trivial cases. */
if (ms2->nsegments == 0)
return mixed_string_clone (ms1);
if (ms1->nsegments == 0)
return mixed_string_clone (ms2);
/* General case. */
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
size_t nsegments = ms1->nsegments + ms2->nsegments;
size_t j;
if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
{
/* Combine the last segment of ms1 with the first segment of ms2. */
size_t i;
nsegments -= 1;
ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
j = 0;
for (i = 0; i < ms1->nsegments - 1; i++)
ms->segments[j++] = segment_clone (ms1->segments[i]);
{
size_t len1 = ms1->segments[i]->length;
size_t len2 = ms2->segments[0]->length;
struct mixed_string_segment *newseg =
(struct mixed_string_segment *)
xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
len1 + len2));
newseg->type = ms2->segments[0]->type;
newseg->length = len1 + len2;
memcpy (newseg->contents, ms1->segments[i]->contents, len1);
memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
ms->segments[j++] = newseg;
}
for (i = 1; i < ms2->nsegments; i++)
ms->segments[j++] = segment_clone (ms2->segments[i]);
}
else
{
size_t i;
ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
j = 0;
for (i = 0; i < ms1->nsegments; i++)
ms->segments[j++] = segment_clone (ms1->segments[i]);
for (i = 0; i < ms2->nsegments; i++)
ms->segments[j++] = segment_clone (ms2->segments[i]);
}
assert (j == nsegments);
ms->nsegments = nsegments;
ms->lcontext = ms1->lcontext;
ms->logical_file_name = ms1->logical_file_name;
ms->line_number = ms1->line_number;
return ms;
}
}
mixed_string_ty *
mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2)
{
/* Trivial cases. */
if (ms2->nsegments == 0)
return ms1;
if (ms1->nsegments == 0)
{
mixed_string_free (ms1);
return mixed_string_clone (ms2);
}
/* General case. */
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
size_t nsegments = ms1->nsegments + ms2->nsegments;
size_t j;
if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
{
/* Combine the last segment of ms1 with the first segment of ms2. */
size_t i;
nsegments -= 1;
ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
j = 0;
for (i = 0; i < ms1->nsegments - 1; i++)
ms->segments[j++] = ms1->segments[i];
{
size_t len1 = ms1->segments[i]->length;
size_t len2 = ms2->segments[0]->length;
struct mixed_string_segment *newseg =
(struct mixed_string_segment *)
xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
len1 + len2));
newseg->type = ms2->segments[0]->type;
newseg->length = len1 + len2;
memcpy (newseg->contents, ms1->segments[i]->contents, len1);
memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
ms->segments[j++] = newseg;
}
free (ms1->segments[i]);
for (i = 1; i < ms2->nsegments; i++)
ms->segments[j++] = segment_clone (ms2->segments[i]);
}
else
{
size_t i;
ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
j = 0;
for (i = 0; i < ms1->nsegments; i++)
ms->segments[j++] = ms1->segments[i];
for (i = 0; i < ms2->nsegments; i++)
ms->segments[j++] = segment_clone (ms2->segments[i]);
}
assert (j == nsegments);
free (ms1->segments);
ms->nsegments = nsegments;
ms->lcontext = ms1->lcontext;
ms->logical_file_name = ms1->logical_file_name;
ms->line_number = ms1->line_number;
free (ms1);
return ms;
}
}
void
mixed_string_buffer_init (struct mixed_string_buffer *bp,
lexical_context_ty lcontext,
const char *logical_file_name,
int line_number)
{
bp->segments = NULL;
bp->nsegments = 0;
bp->nsegments_allocated = 0;
bp->curr_type = -1;
bp->curr_buffer = NULL;
bp->curr_buflen = 0;
bp->curr_allocated = 0;
bp->utf16_surr = 0;
bp->lcontext = lcontext;
bp->logical_file_name = logical_file_name;
bp->line_number = line_number;
}
bool
mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp)
{
return (bp->nsegments == 0 && bp->curr_buflen == 0);
}
/* Auxiliary function: Ensure count more bytes are available in
bp->curr_buffer. */
static inline void
mixed_string_buffer_grow_curr_buffer (struct mixed_string_buffer *bp,
size_t count)
{
if (bp->curr_buflen + count > bp->curr_allocated)
{
size_t new_allocated = 2 * bp->curr_allocated + 10;
if (new_allocated < bp->curr_buflen + count)
new_allocated = bp->curr_buflen + count;
bp->curr_allocated = new_allocated;
bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated);
}
}
/* Auxiliary function: Append a byte to bp->curr. */
static inline void
mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp,
unsigned char c)
{
if (bp->curr_buflen == bp->curr_allocated)
{
bp->curr_allocated = 2 * bp->curr_allocated + 10;
bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
}
bp->curr_buffer[bp->curr_buflen++] = c;
}
/* Auxiliary function: Assuming bp->curr_type == utf8_encoded, append a
Unicode character to bp->curr_buffer. uc must be < 0x110000. */
static inline void
mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp,
ucs4_t uc)
{
unsigned char utf8buf[6];
int count = u8_uctomb (utf8buf, uc, 6);
if (count < 0)
/* The caller should have ensured that uc is not out-of-range. */
abort ();
mixed_string_buffer_grow_curr_buffer (bp, count);
memcpy (bp->curr_buffer + bp->curr_buflen, utf8buf, count);
bp->curr_buflen += count;
}
/* Auxiliary function: Assuming bp->curr_type == utf8_encoded, handle the
attempt to append a lone surrogate to bp->curr_buffer. */
static void
mixed_string_buffer_append_lone_surrogate (struct mixed_string_buffer *bp,
ucs4_t uc)
{
/* A half surrogate is invalid, therefore use U+FFFD instead.
It may be valid in a particular programming language.
But a half surrogate is invalid in UTF-8:
- RFC 3629 says
"The definition of UTF-8 prohibits encoding character
numbers between U+D800 and U+DFFF".
- Unicode 4.0 chapter 3
<http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
section 3.9, p.77, says
"Because surrogate code points are not Unicode scalar
values, any UTF-8 byte sequence that would otherwise
map to code points D800..DFFF is ill-formed."
and in table 3-6, p. 78, does not mention D800..DFFF.
- The unicode.org FAQ question "How do I convert an unpaired
UTF-16 surrogate to UTF-8?" has the answer
"By representing such an unpaired surrogate on its own
as a 3-byte sequence, the resulting UTF-8 data stream
would become ill-formed."
So use U+FFFD instead. */
error_with_progname = false;
error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
logical_file_name, line_number, uc);
error_with_progname = true;
mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd);
}
/* Auxiliary function: Assuming bp->curr_type == utf8_encoded, flush
bp->utf16_surr into bp->curr_buffer. */
static inline void
mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
{
if (bp->utf16_surr != 0)
{
mixed_string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
bp->utf16_surr = 0;
}
}
/* Auxiliary function: Append a segment to bp->segments. */
static inline void
mixed_string_buffer_add_segment (struct mixed_string_buffer *bp,
struct mixed_string_segment *newseg)
{
if (bp->nsegments == bp->nsegments_allocated)
{
size_t new_allocated =
bp->nsegments_allocated = 2 * bp->nsegments_allocated + 1;
bp->segments =
(struct mixed_string_segment **)
xrealloc (bp->segments,
new_allocated * sizeof (struct mixed_string_segment *));
}
bp->segments[bp->nsegments++] = newseg;
}
/* Auxiliary function: Flush bp->curr_buffer and bp->utf16_surr into
bp->segments. */
static void
mixed_string_buffer_flush_curr (struct mixed_string_buffer *bp)
{
if (bp->curr_type == utf8_encoded)
mixed_string_buffer_flush_utf16_surr (bp);
if (bp->curr_type != -1)
{
if (bp->curr_buflen > 0)
{
struct mixed_string_segment *segment =
segment_alloc (bp->curr_type, bp->curr_buffer, bp->curr_buflen);
mixed_string_buffer_add_segment (bp, segment);
}
bp->curr_buflen = 0;
}
}
void
mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c)
{
/* Switch to multibyte character mode. */
if (bp->curr_type != source_encoded)
{
mixed_string_buffer_flush_curr (bp);
bp->curr_type = source_encoded;
}
mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c);
}
void
mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c)
{
/* Switch to Unicode character mode. */
if (bp->curr_type != utf8_encoded)
{
mixed_string_buffer_flush_curr (bp);
bp->curr_type = utf8_encoded;
assert (bp->utf16_surr == 0);
}
/* Test whether this character and the previous one form a Unicode
surrogate character pair. */
if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000))
{
unsigned short utf16buf[2];
ucs4_t uc;
utf16buf[0] = bp->utf16_surr;
utf16buf[1] = c;
if (u16_mbtouc (&uc, utf16buf, 2) != 2)
abort ();
mixed_string_buffer_append_to_utf8_buffer (bp, uc);
bp->utf16_surr = 0;
}
else
{
mixed_string_buffer_flush_utf16_surr (bp);
if (c >= 0xd800 && c < 0xdc00)
bp->utf16_surr = c;
else if (c >= 0xdc00 && c < 0xe000)
mixed_string_buffer_append_lone_surrogate (bp, c);
else
mixed_string_buffer_append_to_utf8_buffer (bp, c);
}
}
void
mixed_string_buffer_destroy (struct mixed_string_buffer *bp)
{
struct mixed_string_segment **segments = bp->segments;
size_t nsegments = bp->nsegments;
if (nsegments > 0)
{
size_t i;
for (i = 0; i < nsegments; i++)
free (segments[i]);
}
free (segments);
free (bp->curr_buffer);
}
mixed_string_ty *
mixed_string_buffer_result (struct mixed_string_buffer *bp)
{
mixed_string_buffer_flush_curr (bp);
{
struct mixed_string *ms = XMALLOC (struct mixed_string);
size_t nsegments = bp->nsegments;
if (nsegments > 0)
ms->segments =
(struct mixed_string_segment **)
xrealloc (bp->segments,
nsegments * sizeof (struct mixed_string_segment *));
else
{
assert (bp->segments == NULL);
ms->segments = NULL;
}
ms->nsegments = nsegments;
ms->lcontext = bp->lcontext;
ms->logical_file_name = bp->logical_file_name;
ms->line_number = bp->line_number;
free (bp->curr_buffer);
return ms;
}
}