/* blob: 5ae010b542b6d2e29e9cccea8dd9ce8888557a9c [file] [log] [blame] */
/* Reading Java .properties files.
Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2003.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
/* Specification. */
#include "read-properties.h"
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "error.h"
#include "error-progname.h"
#include "message.h"
#include "read-catalog-abstract.h"
#include "xalloc.h"
#include "xvasprintf.h"
#include "po-xerror.h"
#include "msgl-ascii.h"
#include "read-file.h"
#include "unistr.h"
#include "gettext.h"
#define _(str) gettext (str)
/* For compiling this file in C++ mode. */
#ifdef __cplusplus
# define this thiss
#endif
/* The format of the Java .properties files is documented in the JDK
documentation for class java.util.Properties. In the case of .properties
files for PropertyResourceBundle, each non-comment line contains a
key/value pair in the form "key = value" or "key : value" or "key value",
where the key is the msgid and the value is the msgstr. Messages with
plurals are not supported in this format.
The encoding of Java .properties files is:
- ASCII with Java \uxxxx escape sequences,
- ISO-8859-1 if non-ASCII bytes are encountered,
- UTF-8 if non-ASCII bytes are encountered and the entire file is
valid UTF-8 (in Java 9 or newer), see
https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */
/* Handling of comments: We copy all comments from the .properties file to
the PO file. This is not really needed; it's a service for translators
who don't like PO files and prefer to maintain the .properties file. */
/* Real filename, used in error messages about the input file. */
static const char *real_file_name;
/* File name and line number. */
extern lex_pos_ty gram_pos;
/* The contents of the input file, held as one in-memory buffer. */
static char *contents;
static size_t contents_length;

/* True if the input file is assumed to be in UTF-8 encoding.
   False if it is assumed to be in ISO-8859-1 encoding. */
static bool assume_utf8;

/* Offset into contents of the next byte to be delivered. */
static size_t position;

/* Phase 1: Deliver the input one byte at a time.
   Max. 1 pushback byte. */

/* Returns the byte at the current position as a value in 0..255 and
   advances the position, or returns EOF when the position has reached
   contents_length. */
static int
phase1_getc ()
{
  int result = EOF;

  if (position != contents_length)
    {
      result = (unsigned char) contents[position];
      position++;
    }
  return result;
}

/* Steps the position back by one byte, unless C is EOF, in which case
   nothing was consumed and nothing is undone.  The value of C is
   otherwise ignored. */
static inline void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;
  position--;
}
/* Phase 2: Read an input byte, treating CR/LF like a single LF.
   Max. 2 pushback bytes. */

/* Pushback storage, used as a stack: the byte pushed last is returned
   first. */
static unsigned char phase2_pushback[2];
static int phase2_pushback_length;

/* Returns the next byte, or EOF.  Pushed-back bytes are delivered before
   anything new is taken from phase 1.  A CR immediately followed by LF
   is delivered as a single LF; a CR followed by anything else is
   delivered unchanged.  Every LF that is returned, whether fresh or
   pushed back, increments gram_pos.line_number. */
static int
phase2_getc ()
{
  int ch;

  if (phase2_pushback_length != 0)
    {
      phase2_pushback_length--;
      ch = phase2_pushback[phase2_pushback_length];
    }
  else
    {
      ch = phase1_getc ();
      if (ch == '\r')
        {
          int following = phase1_getc ();

          if (following != '\n')
            /* Lone CR: give the peeked byte back and deliver the CR. */
            phase1_ungetc (following);
          else
            ch = following;
        }
    }

  if (ch == '\n')
    gram_pos.line_number++;

  return ch;
}
/* Pushes C back so that the next phase2_getc call returns it again.
   EOF is never stored.  Pushing back an LF also takes back the line
   count that phase2_getc added when it delivered that LF.
   The pushback array is not bounds-checked here; callers must respect
   the 2-byte limit. */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    gram_pos.line_number--;

  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}
/* Phase 3: Read an input byte, treating CR/LF like a single LF,
   with handling of continuation lines.
   Max. 1 pushback character. */

/* Returns the next byte of the logical line, or EOF.  A backslash
   directly followed by a newline is dropped together with the blanks
   (space, tab, CR, form feed) that start the next physical line.  A
   backslash followed by anything else is returned as '\\', and the byte
   after it is pushed back to phase 2. */
static int
phase3_getc ()
{
  int c = phase2_getc ();

  while (c == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* Not a line continuation: the backslash is data. */
          phase2_ungetc (next);
          return '\\';
        }

      /* Skip the backslash-newline and all whitespace that follows it. */
      do
        c = phase2_getc ();
      while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
    }

  return c;
}
/* Pushes back C, a value previously obtained from phase3_getc, by
forwarding it unchanged to phase2_ungetc; phase 3 keeps no pushback
storage of its own. */
static inline void
phase3_ungetc (int c)
{
phase2_ungetc (c);
}
/* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.
   If STRING consists of ASCII only, STRING itself is returned.
   Otherwise the result is a buffer newly allocated with XNMALLOC, and
   STRING is left unmodified. */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t length;
  size_t i;
  unsigned char *utf8_string;
  unsigned char *out;

  /* Nothing to convert if the string is pure ASCII. */
  if (is_ascii_string (string))
    return string;

  length = strlen (string);

  /* Each ISO-8859-1 character needs 2 bytes at worst. */
  utf8_string = XNMALLOC (2 * length + 1, unsigned char);
  out = utf8_string;

  for (i = 0; i < length; i++)
    {
      unsigned int uc = (unsigned char) string[i];
      int n = u8_uctomb (out, uc, 6);

      assert (n > 0);
      out += n;
    }
  *out = '\0';

  assert (out - utf8_string <= 2 * length);
  return (char *) utf8_string;
}
/* Parses exactly four hexadecimal digits starting at S.
   On success stores their value in *VALUEP and returns true.
   Returns false at the first character that is not a hexadecimal digit;
   since a NUL terminator is such a character, no byte after the end of
   the string is ever examined. */
static bool
hex4_value (const char *s, unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c = (unsigned char) s[i];
      unsigned int digit;

      if (c >= '0' && c <= '9')
        digit = c - '0';
      else if (c >= 'A' && c <= 'F')
        digit = c - 'A' + 10;
      else if (c >= 'a' && c <= 'f')
        digit = c - 'a' + 10;
      else
        return false;
      value = (value << 4) + digit;
    }
  *valuep = value;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Destructively modifies the argument string and returns it.

   A well-formed \uxxxx escape is replaced by the UTF-8 encoding of the
   character; a high surrogate escape immediately followed by a low
   surrogate escape is combined into one character.  Everything else is
   copied through unchanged, including:
     - malformed escapes,
     - a high surrogate escape that is not followed by a low surrogate
       escape,
     - a low surrogate escape that is not preceded by a high surrogate
       escape.
   The last case must not be handed to u8_uctomb: surrogates cannot be
   encoded in UTF-8, u8_uctomb reports that with a negative return value,
   and adding that value to the output pointer would move it backwards.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size
     (6 input bytes yield at most 3 output bytes, 12 input bytes yield 4).
     So there is no need to xmalloc the result freshly.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      unsigned int n;

      if (p[0] == '\\' && p[1] == 'u' && hex4_value (p + 2, &n))
        {
          unsigned int uc = n;
          size_t consumed = 6;
          bool encodable = true;

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: only usable together with a directly
                 following low surrogate escape.  */
              unsigned int m;

              if (p[6] == '\\' && p[7] == 'u'
                  && hex4_value (p + 8, &m)
                  && m >= 0xdc00 && m < 0xe000)
                {
                  /* Combine two UTF-16 words to a character.  */
                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                  consumed = 12;
                }
              else
                encodable = false;
            }
          else if (n >= 0xdc00 && n < 0xe000)
            /* Lone low surrogate.  */
            encodable = false;

          if (encodable)
            {
              /* The escape has been parsed completely, so it is fine that
                 the write at q may overlap the bytes at p.  */
              int count = u8_uctomb (q, uc, 6);

              if (count > 0)
                {
                  q += count;
                  p += consumed;
                  continue;
                }
            }
        }

      /* Copy one byte literally.  */
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
/* Phase 4: Read the next single byte or UTF-16 code point,
treating CR/LF like a single LF, with handling of continuation lines
and of \uxxxx sequences. */
/* Phase 4 results are plain ints partitioned into three ranges:
     0 .. 0xff             a single input byte,
     0xffff                end of input (P4_EOF),
     0x10000 .. 0x1ffff    a UTF-16 code unit from a \uxxxx escape,
                           offset by 0x10000. */

/* Value returned by phase 4 once EOF is reached. */
#define P4_EOF 0xFFFF

/* Tag an UTF-16 code point so that it can be told apart from a
   single-byte return value. */
#define UNICODE(code) ((code) + 0x10000)

/* True if a phase 4 return value carries an UTF-16 code point. */
#define IS_UNICODE(p4_result) ((p4_result) > 0xFFFF)

/* Recover the UTF-16 code from a return value for which IS_UNICODE
   holds. */
#define UTF16_VALUE(p4_result) (-0x10000 + (p4_result))
/* Reads the next unit of the logical line and interprets backslash
   escapes.
   Returns:
     - P4_EOF at the end of the input,
     - UNICODE (n) for a well-formed \uxxxx escape with value n,
     - the control character for \t, \n, \r, \f,
     - for a backslash followed by any other byte, that byte itself,
     - otherwise the byte that was read.
   A \u that is not followed by four hexadecimal digits is reported
   through po_xerror; the offending character is pushed back and 'u' is
   returned (hexadecimal digits consumed before it are not restored).  */
static int
phase4_getuc ()
{
  int c = phase3_getc ();

  if (c == EOF)
    return P4_EOF;
  if (c != '\\')
    return c;

  {
    int c2 = phase3_getc ();

    /* A backslash that is the very last byte of the input escapes
       nothing.  Report the end of input with the phase 4 sentinel; raw
       EOF (a negative value) is not a valid phase 4 result and the caller
       would go on to treat it as a data byte.  */
    if (c2 == EOF)
      return P4_EOF;

    switch (c2)
      {
      case 't':
        return '\t';
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 'f':
        return '\f';
      case 'u':
        {
          unsigned int n = 0;
          int i;

          for (i = 0; i < 4; i++)
            {
              int c1 = phase3_getc ();

              if (c1 >= '0' && c1 <= '9')
                n = (n << 4) + (c1 - '0');
              else if (c1 >= 'A' && c1 <= 'F')
                n = (n << 4) + (c1 - 'A' + 10);
              else if (c1 >= 'a' && c1 <= 'f')
                n = (n << 4) + (c1 - 'a' + 10);
              else
                {
                  phase3_ungetc (c1);
                  po_xerror (PO_SEVERITY_ERROR, NULL,
                             real_file_name, gram_pos.line_number, (size_t)(-1),
                             false, _("warning: invalid \\uxxxx syntax for Unicode character"));
                  return 'u';
                }
            }
          return UNICODE (n);
        }
      default:
        /* Any other escaped byte stands for itself.  */
        return c2;
      }
  }
}
/* Reads a key or value string.
Returns the string in UTF-8 encoding, or NULL if the end of the logical
line is reached.
Parsing ends:
- when returning NULL, after the end of the logical line,
- otherwise, if in_key is true, after the whitespace and possibly the
separator that follows after the string,
- otherwise, if in_key is false, after the end of the logical line.
A non-NULL result is a fresh copy allocated with XNMALLOC; the internal
accumulation buffer is static and is reused by the next call.
\uxxxx escapes are decoded as UTF-16: a high surrogate followed by a low
surrogate yields one character, while an unpaired surrogate is reported
through po_xerror and replaced by U+FFFD.  Plain bytes are copied as-is
when assume_utf8 is true and converted from ISO-8859-1 otherwise. */
static char *
read_escaped_string (bool in_key)
{
/* The part of the string that has already been converted to UTF-8. */
/* These are static, so the allocation persists across calls; only
utf8_buflen is reset when a new string is started. */
static unsigned char *utf8_buffer;
static size_t utf8_buflen;
static size_t utf8_allocated;
/* The first half of an UTF-16 surrogate character. */
/* 0 means that no high surrogate is pending. */
unsigned short utf16_surr;
/* Line in which this surrogate character occurred. */
size_t utf16_surr_line;
/* Ensures utf8_buffer has room for N bytes. N must be <= 10. */
/* The buffer grows in a single step to 2 * utf8_allocated + 10 bytes,
which is why N is limited to 10. */
#define utf8_buffer_ensure_available(n) \
do \
{ \
if (utf8_buflen + (n) > utf8_allocated) \
{ \
utf8_allocated = 2 * utf8_allocated + 10; \
utf8_buffer = \
(unsigned char *) xrealloc (utf8_buffer, utf8_allocated); \
} \
} \
while (0)
/* Appends a lone surrogate to utf8_buffer. */
/* Note: A half surrogate is invalid in UTF-8:
- RFC 3629 says
"The definition of UTF-8 prohibits encoding character
numbers between U+D800 and U+DFFF".
- Unicode 4.0 chapter 3
<https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
section 3.9, p.77, says
"Because surrogate code points are not Unicode scalar
values, any UTF-8 byte sequence that would otherwise
map to code points D800..DFFF is ill-formed."
and in table 3-6, p. 78, does not mention D800..DFFF.
- The unicode.org FAQ question "How do I convert an unpaired
UTF-16 surrogate to UTF-8?" has the answer
"By representing such an unpaired surrogate on its own
as a 3-byte sequence, the resulting UTF-8 data stream
would become ill-formed."
So use U+FFFD instead. */
/* The macro reports the surrogate UC, attributed to line LINE, through
po_xerror and then appends the bytes EF BF BD, the UTF-8 encoding of
U+FFFD. */
#define utf8_buffer_append_lone_surrogate(uc, line) \
do \
{ \
error_with_progname = false; \
po_xerror (PO_SEVERITY_ERROR, NULL, \
real_file_name, (line), (size_t)(-1), false, \
xasprintf (_("warning: lone surrogate U+%04X"), (uc))); \
error_with_progname = true; \
utf8_buffer_ensure_available (3); \
utf8_buffer[utf8_buflen++] = 0xef; \
utf8_buffer[utf8_buflen++] = 0xbf; \
utf8_buffer[utf8_buflen++] = 0xbd; \
} \
while (0)
int c;
/* Skip whitespace before the string. */
do
c = phase3_getc ();
while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
if (c == EOF || c == '\n')
/* Empty string. */
return NULL;
/* Start accumulating the string. */
utf8_buflen = 0;
utf16_surr = 0;
utf16_surr_line = 0;
/* At the top of each iteration, c is the next byte as delivered by
phase3_getc, i.e. before any escape processing. */
for (;;)
{
/* In a key, an unescaped separator or whitespace character ends the
string.  An escaped one shows up here as a backslash and is therefore
not matched; it is decoded by phase4_getuc below. */
if (in_key && (c == '=' || c == ':'
|| c == ' ' || c == '\t' || c == '\r' || c == '\f'))
{
/* Skip whitespace after the string. */
while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
c = phase3_getc ();
/* Skip '=' or ':' separator. */
if (!(c == '=' || c == ':'))
phase3_ungetc (c);
break;
}
/* Give c back, so that phase4_getuc reads it again and can interpret
it as the start of an escape sequence. */
phase3_ungetc (c);
/* Read the next byte or UTF-16 code point. */
c = phase4_getuc ();
if (c == P4_EOF)
break;
/* Append it to the buffer. */
if (IS_UNICODE (c))
{
/* Append an UTF-16 code point. */
/* Test whether this character and the previous one form a Unicode
surrogate pair. */
if (utf16_surr != 0
&& (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
{
unsigned short utf16buf[2];
ucs4_t uc;
int len;
utf16buf[0] = utf16_surr;
utf16buf[1] = UTF16_VALUE (c);
if (u16_mbtouc (&uc, utf16buf, 2) != 2)
abort ();
utf8_buffer_ensure_available (6);
len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
/* If the character cannot be encoded, it is reported and nothing is
appended for it. */
if (len < 0)
{
error_with_progname = false;
po_xerror (PO_SEVERITY_ERROR, NULL,
real_file_name, gram_pos.line_number, (size_t)(-1),
false, _("warning: invalid Unicode character"));
error_with_progname = true;
}
else
utf8_buflen += len;
utf16_surr = 0;
}
else
{
/* The pending high surrogate, if any, is not followed by a low
surrogate: flush it as a lone surrogate first. */
if (utf16_surr != 0)
{
utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
utf16_surr = 0;
}
if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
{
/* High surrogate: remember it and wait for the next unit. */
utf16_surr = UTF16_VALUE (c);
utf16_surr_line = gram_pos.line_number;
}
else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
/* Low surrogate without a preceding high surrogate. */
utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
else
{
/* A code point outside the surrogate range; it needs at most 3
bytes in UTF-8. */
ucs4_t uc = UTF16_VALUE (c);
int len;
utf8_buffer_ensure_available (3);
len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
if (len < 0)
{
error_with_progname = false;
po_xerror (PO_SEVERITY_ERROR, NULL,
real_file_name, gram_pos.line_number, (size_t)(-1),
false, _("warning: invalid Unicode character"));
error_with_progname = true;
}
else
utf8_buflen += len;
}
}
}
else
{
/* Append a single byte. */
/* A plain byte cannot complete a surrogate pair either. */
if (utf16_surr != 0)
{
utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
utf16_surr = 0;
}
if (assume_utf8)
{
/* No conversion needed. */
utf8_buffer_ensure_available (1);
utf8_buffer[utf8_buflen++] = c;
}
else
{
/* Convert the byte from ISO-8859-1 to UTF-8 on the fly. */
ucs4_t uc = c;
int len;
utf8_buffer_ensure_available (2);
len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
if (len < 0)
abort ();
utf8_buflen += len;
}
}
c = phase3_getc ();
if (c == EOF || c == '\n')
{
/* After a key, the end of the line is pushed back, so that whatever
is read next starts at it.  After a value it stays consumed. */
if (in_key)
phase3_ungetc (c);
break;
}
}
/* A high surrogate still pending at the end of the string is unpaired. */
if (utf16_surr != 0)
utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
/* Return the result. */
/* The accumulated bytes are copied into a new NUL-terminated buffer. */
{
unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
if (utf8_buflen > 0)
memcpy (utf8_string, utf8_buffer, utf8_buflen);
utf8_string[utf8_buflen] = '\0';
return (char *) utf8_string;
}
#undef utf8_buffer_append_lone_surrogate
#undef utf8_buffer_ensure_available
}
/* Read a .properties file from a stream, and dispatch to the various
   abstract_catalog_reader_class_ty methods.

   FILE is read completely into memory first.  If that fails, a fatal
   error naming REAL_FILENAME is reported through po_xerror.  The input
   is treated as UTF-8 if all of it is valid UTF-8, and as ISO-8859-1
   otherwise.

   Each physical line that starts with '#', or with '!' followed by a
   space, a newline or the end of the input, is a comment; its text is
   passed to po_callback_comment_dispatcher.  Every other line is parsed
   as a key/value pair and passed to po_callback_message with the key as
   msgid and the value as msgstr.  A pair introduced by '!' not followed
   by a space is treated as hidden and is forced fuzzy when neither its
   key nor its value is empty.

   THIS and LOGICAL_FILENAME are not used by this function.  */
static void
properties_parse (abstract_catalog_reader_ty *this, FILE *file,
                  const char *real_filename, const char *logical_filename)
{
  /* Read the file into memory.  */
  contents = fread_file (file, 0, &contents_length);
  if (contents == NULL)
    {
      const char *errno_description = strerror (errno);
      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
                 xasprintf ("%s: %s",
                            xasprintf (_("error while reading \"%s\""),
                                       real_filename),
                            errno_description));
      return;
    }

  /* Test whether it's valid UTF-8.  */
  assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);

  position = 0;
  real_file_name = real_filename;
  gram_pos.file_name = xstrdup (real_file_name);
  gram_pos.line_number = 1;

  for (;;)
    {
      int c;
      bool comment;
      bool hidden;

      c = phase2_getc ();
      if (c == EOF)
        break;

      comment = false;
      hidden = false;
      if (c == '#')
        comment = true;
      else if (c == '!')
        {
          /* For compatibility with write-properties.c, we treat '!' not
             followed by space as a fuzzy or untranslated message.  */
          int c2 = phase2_getc ();
          if (c2 == ' ' || c2 == '\n' || c2 == EOF)
            comment = true;
          else
            hidden = true;
          phase2_ungetc (c2);
        }
      else
        phase2_ungetc (c);

      if (comment)
        {
          /* A comment line.  */
          static char *buffer;
          static size_t bufmax;
          static size_t buflen;
          char *converted;

          buflen = 0;
          for (;;)
            {
              c = phase2_getc ();
              /* Grow before testing for the end of the line, so that
                 there is always room for the terminating NUL too.  */
              if (buflen >= bufmax)
                {
                  bufmax += 100;
                  buffer = xrealloc (buffer, bufmax);
                }
              if (c == EOF || c == '\n')
                break;
              buffer[buflen++] = c;
            }
          buffer[buflen] = '\0';

          converted = assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer);
          /* conv_from_java converts in place and returns its argument.  */
          po_callback_comment_dispatcher (conv_from_java (converted));
          /* conv_from_iso_8859_1 allocates a new string when the comment
             contains non-ASCII bytes.  The dispatcher is otherwise handed
             the static buffer, which the next comment overwrites, so it
             cannot be keeping the pointer: release the copy here.  */
          if (converted != buffer)
            free (converted);
        }
      else
        {
          /* A key/value pair.  */
          char *msgid;
          lex_pos_ty msgid_pos;

          msgid_pos = gram_pos;
          msgid = read_escaped_string (true);
          if (msgid == NULL)
            /* Skip blank line.  */
            ;
          else
            {
              char *msgstr;
              lex_pos_ty msgstr_pos;
              bool force_fuzzy;

              msgstr_pos = gram_pos;
              msgstr = read_escaped_string (false);
              if (msgstr == NULL)
                msgstr = xstrdup ("");

              /* Be sure to make the message fuzzy if it was commented out
                 and if it is not already header/fuzzy/untranslated.  */
              force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');

              po_callback_message (NULL, msgid, &msgid_pos, NULL,
                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                   NULL, NULL, NULL,
                                   force_fuzzy, false);
            }
        }
    }

  free (contents);
  contents = NULL;
  real_file_name = NULL;
  gram_pos.line_number = 0;
}
/* Describes the Java .properties input format: the parser entry point is
properties_parse, and the format is flagged as producing UTF-8. */
const struct catalog_input_format input_format_properties =
{
properties_parse, /* parse */
true /* produces_utf8 */
};