| /* Handling strings that are given partially in the source encoding and |
| partially in Unicode. |
| Copyright (C) 2001-2018 Free Software Foundation, Inc. |
| |
| This program is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
| |
| #ifdef HAVE_CONFIG_H |
| # include <config.h> |
| #endif |
| |
| /* Specification. */ |
| #include "xg-mixed-string.h" |
| |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "error.h" |
| #include "error-progname.h" |
| #include "flexmember.h" |
| #include "msgl-ascii.h" |
| #include "po-charset.h" |
| #include "unistr.h" |
| #include "xalloc.h" |
| |
| #include "xg-pos.h" |
| |
| #include "gettext.h" |
| #define _(str) gettext (str) |
| |
| |
| /* Allocates a single segment. */ |
| static inline struct mixed_string_segment * |
| segment_alloc (enum segment_type type, const char *string, size_t length) |
| { |
| struct mixed_string_segment *segment = |
| (struct mixed_string_segment *) |
| xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, length)); |
| segment->type = type; |
| segment->length = length; |
| memcpy (segment->contents, string, length); |
| return segment; |
| } |
| |
| /* Clones a single segment. */ |
| static inline struct mixed_string_segment * |
| segment_clone (const struct mixed_string_segment *segment) |
| { |
| return segment_alloc (segment->type, segment->contents, segment->length); |
| } |
| |
| mixed_string_ty * |
| mixed_string_alloc_simple (const char *string, |
| lexical_context_ty lcontext, |
| const char *logical_file_name, |
| int line_number) |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| |
| if (*string == '\0') |
| { |
| /* An empty string. */ |
| ms->segments = NULL; |
| ms->nsegments = 0; |
| } |
| else |
| { |
| ms->segments = XNMALLOC (1, struct mixed_string_segment *); |
| if ((xgettext_current_source_encoding == po_charset_ascii |
| || xgettext_current_source_encoding == po_charset_utf8) |
| && is_ascii_string (string)) |
| /* An optimization. */ |
| ms->segments[0] = |
| segment_alloc (utf8_encoded, string, strlen (string)); |
| else |
| /* The general case. */ |
| ms->segments[0] = |
| segment_alloc (source_encoded, string, strlen (string)); |
| ms->nsegments = 1; |
| } |
| ms->lcontext = lcontext; |
| ms->logical_file_name = logical_file_name; |
| ms->line_number = line_number; |
| |
| return ms; |
| } |
| |
| mixed_string_ty * |
| mixed_string_alloc_utf8 (const char *string, |
| lexical_context_ty lcontext, |
| const char *logical_file_name, |
| int line_number) |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| |
| if (*string == '\0') |
| { |
| /* An empty string. */ |
| ms->segments = NULL; |
| ms->nsegments = 0; |
| } |
| else |
| { |
| ms->segments = XNMALLOC (1, struct mixed_string_segment *); |
| ms->segments[0] = segment_alloc (utf8_encoded, string, strlen (string)); |
| ms->nsegments = 1; |
| } |
| ms->lcontext = lcontext; |
| ms->logical_file_name = logical_file_name; |
| ms->line_number = line_number; |
| |
| return ms; |
| } |
| |
| mixed_string_ty * |
| mixed_string_clone (const mixed_string_ty *ms1) |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| size_t nsegments = ms1->nsegments; |
| |
| if (nsegments == 0) |
| { |
| ms->segments = NULL; |
| ms->nsegments = 0; |
| } |
| else |
| { |
| size_t i; |
| |
| ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); |
| for (i = 0; i < nsegments; i++) |
| ms->segments[i] = segment_clone (ms1->segments[i]); |
| ms->nsegments = nsegments; |
| } |
| ms->lcontext = ms1->lcontext; |
| ms->logical_file_name = ms1->logical_file_name; |
| ms->line_number = ms1->line_number; |
| |
| return ms; |
| } |
| |
| char * |
| mixed_string_contents (const mixed_string_ty *ms) |
| { |
| size_t nsegments = ms->nsegments; |
| /* Trivial cases. */ |
| if (nsegments == 0) |
| return xstrdup (""); |
| if (nsegments == 1 && ms->segments[0]->type == utf8_encoded) |
| { |
| /* Return the segment, with a NUL at the end. */ |
| size_t len = ms->segments[0]->length; |
| char *string = XNMALLOC (len + 1, char); |
| memcpy (string, ms->segments[0]->contents, len); |
| string[len] = '\0'; |
| return string; |
| } |
| /* General case. */ |
| { |
| size_t i; |
| |
| for (i = 0; i < nsegments - 1; i++) |
| if (memchr (ms->segments[i]->contents, '\0', ms->segments[i]->length) |
| != NULL) |
| { |
| /* Segment i contains a NUL character. Ignore the remaining |
| segments. */ |
| nsegments = i + 1; |
| break; |
| } |
| } |
| { |
| char **converted_segments = XNMALLOC (nsegments, char *); |
| size_t length; |
| |
| length = 0; |
| { |
| size_t i; |
| |
| for (i = 0; i < nsegments; i++) |
| if (ms->segments[i]->type == source_encoded) |
| { |
| char *source_encoded_string; |
| char *utf8_encoded_string; |
| |
| /* Copy the segment's contents, with a NUL at the end. */ |
| { |
| size_t len = ms->segments[i]->length; |
| source_encoded_string = XNMALLOC (len + 1, char); |
| memcpy (source_encoded_string, ms->segments[i]->contents, len); |
| source_encoded_string[len] = '\0'; |
| } |
| /* Convert it to UTF-8 encoding. */ |
| utf8_encoded_string = |
| from_current_source_encoding (source_encoded_string, |
| ms->lcontext, |
| ms->logical_file_name, |
| ms->line_number); |
| if (utf8_encoded_string != source_encoded_string) |
| free (source_encoded_string); |
| converted_segments[i] = utf8_encoded_string; |
| length += strlen (utf8_encoded_string); |
| } |
| else |
| length += ms->segments[i]->length; |
| } |
| |
| { |
| char *string = XNMALLOC (length + 1, char); |
| { |
| char *p; |
| size_t i; |
| |
| p = string; |
| for (i = 0; i < nsegments; i++) |
| if (ms->segments[i]->type == source_encoded) |
| { |
| p = stpcpy (p, converted_segments[i]); |
| free (converted_segments[i]); |
| } |
| else |
| { |
| memcpy (p, ms->segments[i]->contents, ms->segments[i]->length); |
| p += ms->segments[i]->length; |
| } |
| assert (p == string + length); |
| *p = '\0'; |
| } |
| |
| free (converted_segments); |
| return string; |
| } |
| } |
| } |
| |
| void |
| mixed_string_free (mixed_string_ty *ms) |
| { |
| struct mixed_string_segment **segments = ms->segments; |
| size_t nsegments = ms->nsegments; |
| if (nsegments > 0) |
| { |
| size_t i; |
| for (i = 0; i < nsegments; i++) |
| free (segments[i]); |
| } |
| free (segments); |
| free (ms); |
| } |
| |
| char * |
| mixed_string_contents_free1 (mixed_string_ty *ms) |
| { |
| char *contents = mixed_string_contents (ms); |
| mixed_string_free (ms); |
| return contents; |
| } |
| |
| mixed_string_ty * |
| mixed_string_concat (const mixed_string_ty *ms1, |
| const mixed_string_ty *ms2) |
| { |
| /* Trivial cases. */ |
| if (ms2->nsegments == 0) |
| return mixed_string_clone (ms1); |
| if (ms1->nsegments == 0) |
| return mixed_string_clone (ms2); |
| /* General case. */ |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| size_t nsegments = ms1->nsegments + ms2->nsegments; |
| size_t j; |
| if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type) |
| { |
| /* Combine the last segment of ms1 with the first segment of ms2. */ |
| size_t i; |
| |
| nsegments -= 1; |
| ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); |
| j = 0; |
| for (i = 0; i < ms1->nsegments - 1; i++) |
| ms->segments[j++] = segment_clone (ms1->segments[i]); |
| { |
| size_t len1 = ms1->segments[i]->length; |
| size_t len2 = ms2->segments[0]->length; |
| struct mixed_string_segment *newseg = |
| (struct mixed_string_segment *) |
| xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, |
| len1 + len2)); |
| newseg->type = ms2->segments[0]->type; |
| newseg->length = len1 + len2; |
| memcpy (newseg->contents, ms1->segments[i]->contents, len1); |
| memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2); |
| ms->segments[j++] = newseg; |
| } |
| for (i = 1; i < ms2->nsegments; i++) |
| ms->segments[j++] = segment_clone (ms2->segments[i]); |
| } |
| else |
| { |
| size_t i; |
| |
| ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); |
| j = 0; |
| for (i = 0; i < ms1->nsegments; i++) |
| ms->segments[j++] = segment_clone (ms1->segments[i]); |
| for (i = 0; i < ms2->nsegments; i++) |
| ms->segments[j++] = segment_clone (ms2->segments[i]); |
| } |
| assert (j == nsegments); |
| ms->nsegments = nsegments; |
| ms->lcontext = ms1->lcontext; |
| ms->logical_file_name = ms1->logical_file_name; |
| ms->line_number = ms1->line_number; |
| |
| return ms; |
| } |
| } |
| |
| mixed_string_ty * |
| mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2) |
| { |
| /* Trivial cases. */ |
| if (ms2->nsegments == 0) |
| return ms1; |
| if (ms1->nsegments == 0) |
| { |
| mixed_string_free (ms1); |
| return mixed_string_clone (ms2); |
| } |
| /* General case. */ |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| size_t nsegments = ms1->nsegments + ms2->nsegments; |
| size_t j; |
| if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type) |
| { |
| /* Combine the last segment of ms1 with the first segment of ms2. */ |
| size_t i; |
| |
| nsegments -= 1; |
| ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); |
| j = 0; |
| for (i = 0; i < ms1->nsegments - 1; i++) |
| ms->segments[j++] = ms1->segments[i]; |
| { |
| size_t len1 = ms1->segments[i]->length; |
| size_t len2 = ms2->segments[0]->length; |
| struct mixed_string_segment *newseg = |
| (struct mixed_string_segment *) |
| xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, |
| len1 + len2)); |
| newseg->type = ms2->segments[0]->type; |
| newseg->length = len1 + len2; |
| memcpy (newseg->contents, ms1->segments[i]->contents, len1); |
| memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2); |
| ms->segments[j++] = newseg; |
| } |
| free (ms1->segments[i]); |
| for (i = 1; i < ms2->nsegments; i++) |
| ms->segments[j++] = segment_clone (ms2->segments[i]); |
| } |
| else |
| { |
| size_t i; |
| |
| ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *); |
| j = 0; |
| for (i = 0; i < ms1->nsegments; i++) |
| ms->segments[j++] = ms1->segments[i]; |
| for (i = 0; i < ms2->nsegments; i++) |
| ms->segments[j++] = segment_clone (ms2->segments[i]); |
| } |
| assert (j == nsegments); |
| free (ms1->segments); |
| ms->nsegments = nsegments; |
| ms->lcontext = ms1->lcontext; |
| ms->logical_file_name = ms1->logical_file_name; |
| ms->line_number = ms1->line_number; |
| free (ms1); |
| |
| return ms; |
| } |
| } |
| |
| |
| void |
| mixed_string_buffer_init (struct mixed_string_buffer *bp, |
| lexical_context_ty lcontext, |
| const char *logical_file_name, |
| int line_number) |
| { |
| bp->segments = NULL; |
| bp->nsegments = 0; |
| bp->nsegments_allocated = 0; |
| bp->curr_type = -1; |
| bp->curr_buffer = NULL; |
| bp->curr_buflen = 0; |
| bp->curr_allocated = 0; |
| bp->utf16_surr = 0; |
| bp->lcontext = lcontext; |
| bp->logical_file_name = logical_file_name; |
| bp->line_number = line_number; |
| } |
| |
| bool |
| mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp) |
| { |
| return (bp->nsegments == 0 && bp->curr_buflen == 0); |
| } |
| |
| /* Auxiliary function: Ensure count more bytes are available in |
| bp->curr_buffer. */ |
| static inline void |
| mixed_string_buffer_grow_curr_buffer (struct mixed_string_buffer *bp, |
| size_t count) |
| { |
| if (bp->curr_buflen + count > bp->curr_allocated) |
| { |
| size_t new_allocated = 2 * bp->curr_allocated + 10; |
| if (new_allocated < bp->curr_buflen + count) |
| new_allocated = bp->curr_buflen + count; |
| bp->curr_allocated = new_allocated; |
| bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated); |
| } |
| } |
| |
| /* Auxiliary function: Append a byte to bp->curr. */ |
| static inline void |
| mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp, |
| unsigned char c) |
| { |
| if (bp->curr_buflen == bp->curr_allocated) |
| { |
| bp->curr_allocated = 2 * bp->curr_allocated + 10; |
| bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); |
| } |
| bp->curr_buffer[bp->curr_buflen++] = c; |
| } |
| |
| /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, append a |
| Unicode character to bp->curr_buffer. uc must be < 0x110000. */ |
| static inline void |
| mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp, |
| ucs4_t uc) |
| { |
| unsigned char utf8buf[6]; |
| int count = u8_uctomb (utf8buf, uc, 6); |
| |
| if (count < 0) |
| /* The caller should have ensured that uc is not out-of-range. */ |
| abort (); |
| |
| mixed_string_buffer_grow_curr_buffer (bp, count); |
| memcpy (bp->curr_buffer + bp->curr_buflen, utf8buf, count); |
| bp->curr_buflen += count; |
| } |
| |
| /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, handle the |
| attempt to append a lone surrogate to bp->curr_buffer. */ |
| static void |
| mixed_string_buffer_append_lone_surrogate (struct mixed_string_buffer *bp, |
| ucs4_t uc) |
| { |
| /* A half surrogate is invalid, therefore use U+FFFD instead. |
| It may be valid in a particular programming language. |
| But a half surrogate is invalid in UTF-8: |
| - RFC 3629 says |
| "The definition of UTF-8 prohibits encoding character |
| numbers between U+D800 and U+DFFF". |
| - Unicode 4.0 chapter 3 |
| <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf> |
| section 3.9, p.77, says |
| "Because surrogate code points are not Unicode scalar |
| values, any UTF-8 byte sequence that would otherwise |
| map to code points D800..DFFF is ill-formed." |
| and in table 3-6, p. 78, does not mention D800..DFFF. |
| - The unicode.org FAQ question "How do I convert an unpaired |
| UTF-16 surrogate to UTF-8?" has the answer |
| "By representing such an unpaired surrogate on its own |
| as a 3-byte sequence, the resulting UTF-8 data stream |
| would become ill-formed." |
| So use U+FFFD instead. */ |
| error_with_progname = false; |
| error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), |
| logical_file_name, line_number, uc); |
| error_with_progname = true; |
| mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd); |
| } |
| |
| /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, flush |
| bp->utf16_surr into bp->curr_buffer. */ |
| static inline void |
| mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) |
| { |
| if (bp->utf16_surr != 0) |
| { |
| mixed_string_buffer_append_lone_surrogate (bp, bp->utf16_surr); |
| bp->utf16_surr = 0; |
| } |
| } |
| |
| /* Auxiliary function: Append a segment to bp->segments. */ |
| static inline void |
| mixed_string_buffer_add_segment (struct mixed_string_buffer *bp, |
| struct mixed_string_segment *newseg) |
| { |
| if (bp->nsegments == bp->nsegments_allocated) |
| { |
| size_t new_allocated = |
| bp->nsegments_allocated = 2 * bp->nsegments_allocated + 1; |
| bp->segments = |
| (struct mixed_string_segment **) |
| xrealloc (bp->segments, |
| new_allocated * sizeof (struct mixed_string_segment *)); |
| } |
| bp->segments[bp->nsegments++] = newseg; |
| } |
| |
| /* Auxiliary function: Flush bp->curr_buffer and bp->utf16_surr into |
| bp->segments. */ |
| static void |
| mixed_string_buffer_flush_curr (struct mixed_string_buffer *bp) |
| { |
| if (bp->curr_type == utf8_encoded) |
| mixed_string_buffer_flush_utf16_surr (bp); |
| if (bp->curr_type != -1) |
| { |
| if (bp->curr_buflen > 0) |
| { |
| struct mixed_string_segment *segment = |
| segment_alloc (bp->curr_type, bp->curr_buffer, bp->curr_buflen); |
| mixed_string_buffer_add_segment (bp, segment); |
| } |
| bp->curr_buflen = 0; |
| } |
| } |
| |
| void |
| mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c) |
| { |
| /* Switch to multibyte character mode. */ |
| if (bp->curr_type != source_encoded) |
| { |
| mixed_string_buffer_flush_curr (bp); |
| bp->curr_type = source_encoded; |
| } |
| |
| mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c); |
| } |
| |
| void |
| mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c) |
| { |
| /* Switch to Unicode character mode. */ |
| if (bp->curr_type != utf8_encoded) |
| { |
| mixed_string_buffer_flush_curr (bp); |
| bp->curr_type = utf8_encoded; |
| assert (bp->utf16_surr == 0); |
| } |
| |
| /* Test whether this character and the previous one form a Unicode |
| surrogate character pair. */ |
| if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000)) |
| { |
| unsigned short utf16buf[2]; |
| ucs4_t uc; |
| |
| utf16buf[0] = bp->utf16_surr; |
| utf16buf[1] = c; |
| if (u16_mbtouc (&uc, utf16buf, 2) != 2) |
| abort (); |
| |
| mixed_string_buffer_append_to_utf8_buffer (bp, uc); |
| bp->utf16_surr = 0; |
| } |
| else |
| { |
| mixed_string_buffer_flush_utf16_surr (bp); |
| |
| if (c >= 0xd800 && c < 0xdc00) |
| bp->utf16_surr = c; |
| else if (c >= 0xdc00 && c < 0xe000) |
| mixed_string_buffer_append_lone_surrogate (bp, c); |
| else |
| mixed_string_buffer_append_to_utf8_buffer (bp, c); |
| } |
| } |
| |
| void |
| mixed_string_buffer_destroy (struct mixed_string_buffer *bp) |
| { |
| struct mixed_string_segment **segments = bp->segments; |
| size_t nsegments = bp->nsegments; |
| if (nsegments > 0) |
| { |
| size_t i; |
| for (i = 0; i < nsegments; i++) |
| free (segments[i]); |
| } |
| free (segments); |
| free (bp->curr_buffer); |
| } |
| |
| mixed_string_ty * |
| mixed_string_buffer_result (struct mixed_string_buffer *bp) |
| { |
| mixed_string_buffer_flush_curr (bp); |
| |
| { |
| struct mixed_string *ms = XMALLOC (struct mixed_string); |
| size_t nsegments = bp->nsegments; |
| |
| if (nsegments > 0) |
| ms->segments = |
| (struct mixed_string_segment **) |
| xrealloc (bp->segments, |
| nsegments * sizeof (struct mixed_string_segment *)); |
| else |
| { |
| assert (bp->segments == NULL); |
| ms->segments = NULL; |
| } |
| ms->nsegments = nsegments; |
| ms->lcontext = bp->lcontext; |
| ms->logical_file_name = bp->logical_file_name; |
| ms->line_number = bp->line_number; |
| |
| free (bp->curr_buffer); |
| |
| return ms; |
| } |
| } |