| /* Convert using charmaps and possibly iconv(). |
| Copyright (C) 2001-2014 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| Contributed by Ulrich Drepper <drepper@redhat.com>, 2001. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published |
| by the Free Software Foundation; version 2 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, see <http://www.gnu.org/licenses/>. */ |
| |
| #include <assert.h> |
| #include <errno.h> |
| #include <error.h> |
| #include <fcntl.h> |
| #include <iconv.h> |
| #include <libintl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <stdint.h> |
| #include <sys/mman.h> |
| #include <sys/stat.h> |
| |
| #include "iconv_prog.h" |
| |
| |
| /* Prototypes for a few program-wide used functions. */ |
| #include <programs/xmalloc.h> |
| |
| |
/* One node of the byte-indexed lookup tree used for the conversion.

   Each node has one slot per possible input byte value.  For a byte IDX
   the TERM bitmap says how VAL[IDX] is to be read:

   - bit set:   an input sequence ends with this byte and VAL[IDX].out
                is the sequence to emit for it;
   - bit clear: VAL[IDX].sub is the node for the following input byte,
                or NULL when no known sequence continues with IDX.

   The helpers in this file store eight flags in each element of TERM,
   i.e. only the low eight bits of every `int' are used.  */
struct convtable
{
  int term[256 / 8];
  union
  {
    struct convtable *sub;      /* Next level of the tree.  */
    struct charseq *out;        /* Replacement for a complete sequence.  */
  } val[256];
};
| |
| |
| static inline struct convtable * |
| allocate_table (void) |
| { |
| return (struct convtable *) xcalloc (1, sizeof (struct convtable)); |
| } |
| |
| |
| static inline int |
| is_term (struct convtable *tbl, unsigned int idx) |
| { |
| return tbl->term[idx / 8] & (1 << (idx % 8)); |
| } |
| |
| |
| static inline void |
| clear_term (struct convtable *tbl, unsigned int idx) |
| { |
| tbl->term[idx / 8] &= ~(1 << (idx % 8)); |
| } |
| |
| |
| static inline void |
| set_term (struct convtable *tbl, unsigned int idx) |
| { |
| tbl->term[idx / 8] |= 1 << (idx % 8); |
| } |
| |
| |
/* Builders for the conversion table.  Which one is used depends on
   which of the two charmaps is available.  */
static struct convtable *
use_from_charmap (struct charmap_t *from_charmap, const char *to_code);

static struct convtable *
use_to_charmap (const char *from_code, struct charmap_t *to_charmap);

static struct convtable *
use_both_charmaps (struct charmap_t *from_charmap,
                   struct charmap_t *to_charmap);

/* Workers which run the input through a finished table: a memory
   block, a file descriptor, and a stdio stream respectively.  */
static int
process_block (struct convtable *tbl, char *addr, size_t len, FILE *output);

static int
process_fd (struct convtable *tbl, int fd, FILE *output);

static int
process_file (struct convtable *tbl, FILE *input, FILE *output);
| |
| |
| int |
| charmap_conversion (const char *from_code, struct charmap_t *from_charmap, |
| const char *to_code, struct charmap_t *to_charmap, |
| int argc, int remaining, char *argv[], |
| const char *output_file) |
| { |
| struct convtable *cvtbl; |
| int status = EXIT_SUCCESS; |
| |
| /* We have three different cases to handle: |
| |
| - both, from_charmap and to_charmap, are available. This means we |
| can assume that the symbolic names match and use them to create |
| the mapping. |
| |
| - only from_charmap is available. In this case we can only hope that |
| the symbolic names used are of the <Uxxxx> form in which case we |
| can use a UCS4->"to_code" iconv() conversion for the second step. |
| |
| - only to_charmap is available. This is similar, only that we would |
| use iconv() for the "to_code"->UCS4 conversion. |
| |
| We first create a table which maps input bytes into output bytes. |
| Once this is done we can handle all three of the cases above |
| equally. */ |
| if (from_charmap != NULL) |
| { |
| if (to_charmap == NULL) |
| cvtbl = use_from_charmap (from_charmap, to_code); |
| else |
| cvtbl = use_both_charmaps (from_charmap, to_charmap); |
| } |
| else |
| { |
| assert (to_charmap != NULL); |
| cvtbl = use_to_charmap (from_code, to_charmap); |
| } |
| |
| /* If we couldn't generate a table stop now. */ |
| if (cvtbl == NULL) |
| return EXIT_FAILURE; |
| |
| /* Determine output file. */ |
| FILE *output; |
| if (output_file != NULL && strcmp (output_file, "-") != 0) |
| { |
| output = fopen (output_file, "w"); |
| if (output == NULL) |
| error (EXIT_FAILURE, errno, _("cannot open output file")); |
| } |
| else |
| output = stdout; |
| |
| /* We can now start the conversion. */ |
| if (remaining == argc) |
| { |
| if (process_file (cvtbl, stdin, output) != 0) |
| status = EXIT_FAILURE; |
| } |
| else |
| do |
| { |
| int fd; |
| |
| if (verbose) |
| printf ("%s:\n", argv[remaining]); |
| if (strcmp (argv[remaining], "-") == 0) |
| fd = 0; |
| else |
| { |
| fd = open (argv[remaining], O_RDONLY); |
| |
| if (fd == -1) |
| { |
| error (0, errno, _("cannot open input file `%s'"), |
| argv[remaining]); |
| status = EXIT_FAILURE; |
| continue; |
| } |
| } |
| |
| #ifdef _POSIX_MAPPED_FILES |
| struct stat64 st; |
| char *addr; |
| /* We have possibilities for reading the input file. First try |
| to mmap() it since this will provide the fastest solution. */ |
| if (fstat64 (fd, &st) == 0 |
| && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, |
| fd, 0)) != MAP_FAILED)) |
| { |
| /* Yes, we can use mmap(). The descriptor is not needed |
| anymore. */ |
| if (close (fd) != 0) |
| error (EXIT_FAILURE, errno, |
| _("error while closing input `%s'"), argv[remaining]); |
| |
| if (process_block (cvtbl, addr, st.st_size, output) < 0) |
| { |
| /* Something went wrong. */ |
| status = EXIT_FAILURE; |
| |
| /* We don't need the input data anymore. */ |
| munmap ((void *) addr, st.st_size); |
| |
| /* We cannot go on with producing output since it might |
| lead to problem because the last output might leave |
| the output stream in an undefined state. */ |
| break; |
| } |
| |
| /* We don't need the input data anymore. */ |
| munmap ((void *) addr, st.st_size); |
| } |
| else |
| #endif /* _POSIX_MAPPED_FILES */ |
| { |
| /* Read the file in pieces. */ |
| if (process_fd (cvtbl, fd, output) != 0) |
| { |
| /* Something went wrong. */ |
| status = EXIT_FAILURE; |
| |
| /* We don't need the input file anymore. */ |
| close (fd); |
| |
| /* We cannot go on with producing output since it might |
| lead to problem because the last output might leave |
| the output stream in an undefined state. */ |
| break; |
| } |
| |
| /* Now close the file. */ |
| close (fd); |
| } |
| } |
| while (++remaining < argc); |
| |
| /* All done. */ |
| return status; |
| } |
| |
| |
| static void |
| add_bytes (struct convtable *tbl, struct charseq *in, struct charseq *out) |
| { |
| int n = 0; |
| unsigned int byte; |
| |
| assert (in->nbytes > 0); |
| |
| byte = ((unsigned char *) in->bytes)[n]; |
| while (n + 1 < in->nbytes) |
| { |
| if (is_term (tbl, byte) || tbl->val[byte].sub == NULL) |
| { |
| /* Note that we simply ignore a definition for a byte sequence |
| which is also the prefix for a longer one. */ |
| clear_term (tbl, byte); |
| tbl->val[byte].sub = |
| (struct convtable *) xcalloc (1, sizeof (struct convtable)); |
| } |
| |
| tbl = tbl->val[byte].sub; |
| |
| byte = ((unsigned char *) in->bytes)[++n]; |
| } |
| |
| /* Only add the new sequence if there is none yet and the byte sequence |
| is not part of an even longer one. */ |
| if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL) |
| { |
| set_term (tbl, byte); |
| tbl->val[byte].out = out; |
| } |
| } |
| |
| |
| static struct convtable * |
| use_from_charmap (struct charmap_t *from_charmap, const char *to_code) |
| { |
| /* We iterate over all entries in the from_charmap and for those which |
| have a known UCS4 representation we use an iconv() call to determine |
| the mapping to the to_code charset. */ |
| struct convtable *rettbl; |
| iconv_t cd; |
| void *ptr = NULL; |
| const void *key; |
| size_t keylen; |
| void *data; |
| |
| cd = iconv_open (to_code, "WCHAR_T"); |
| if (cd == (iconv_t) -1) |
| /* We cannot do anything. */ |
| return NULL; |
| |
| rettbl = allocate_table (); |
| |
| while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data) |
| >= 0) |
| { |
| struct charseq *in = (struct charseq *) data; |
| |
| if (in->ucs4 != UNINITIALIZED_CHAR_VALUE) |
| { |
| /* There is a chance. Try the iconv module. */ |
| wchar_t inbuf[1] = { in->ucs4 }; |
| unsigned char outbuf[64]; |
| char *inptr = (char *) inbuf; |
| size_t inlen = sizeof (inbuf); |
| char *outptr = (char *) outbuf; |
| size_t outlen = sizeof (outbuf); |
| |
| (void) iconv (cd, &inptr, &inlen, &outptr, &outlen); |
| |
| if (outptr != (char *) outbuf) |
| { |
| /* We got some output. Good, use it. */ |
| struct charseq *newp; |
| |
| outlen = sizeof (outbuf) - outlen; |
| assert ((char *) outbuf + outlen == outptr); |
| |
| newp = (struct charseq *) xmalloc (sizeof (struct charseq) |
| + outlen); |
| newp->name = in->name; |
| newp->ucs4 = in->ucs4; |
| newp->nbytes = outlen; |
| memcpy (newp->bytes, outbuf, outlen); |
| |
| add_bytes (rettbl, in, newp); |
| } |
| |
| /* Clear any possible state left behind. */ |
| (void) iconv (cd, NULL, NULL, NULL, NULL); |
| } |
| } |
| |
| iconv_close (cd); |
| |
| return rettbl; |
| } |
| |
| |
| static struct convtable * |
| use_to_charmap (const char *from_code, struct charmap_t *to_charmap) |
| { |
| /* We iterate over all entries in the to_charmap and for those which |
| have a known UCS4 representation we use an iconv() call to determine |
| the mapping to the from_code charset. */ |
| struct convtable *rettbl; |
| iconv_t cd; |
| void *ptr = NULL; |
| const void *key; |
| size_t keylen; |
| void *data; |
| |
| /* Note that the conversion we use here is the reverse direction. Without |
| exhaustive search we cannot figure out which input yields the UCS4 |
| character we are looking for. Therefore we determine it the other |
| way round. */ |
| cd = iconv_open (from_code, "WCHAR_T"); |
| if (cd == (iconv_t) -1) |
| /* We cannot do anything. */ |
| return NULL; |
| |
| rettbl = allocate_table (); |
| |
| while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data) |
| >= 0) |
| { |
| struct charseq *out = (struct charseq *) data; |
| |
| if (out->ucs4 != UNINITIALIZED_CHAR_VALUE) |
| { |
| /* There is a chance. Try the iconv module. */ |
| wchar_t inbuf[1] = { out->ucs4 }; |
| unsigned char outbuf[64]; |
| char *inptr = (char *) inbuf; |
| size_t inlen = sizeof (inbuf); |
| char *outptr = (char *) outbuf; |
| size_t outlen = sizeof (outbuf); |
| |
| (void) iconv (cd, &inptr, &inlen, &outptr, &outlen); |
| |
| if (outptr != (char *) outbuf) |
| { |
| /* We got some output. Good, use it. */ |
| union |
| { |
| struct charseq seq; |
| struct |
| { |
| const char *name; |
| uint32_t ucs4; |
| int nbytes; |
| unsigned char bytes[outlen]; |
| } mem; |
| } new; |
| |
| outlen = sizeof (outbuf) - outlen; |
| assert ((char *) outbuf + outlen == outptr); |
| |
| new.mem.name = out->name; |
| new.mem.ucs4 = out->ucs4; |
| new.mem.nbytes = outlen; |
| memcpy (new.mem.bytes, outbuf, outlen); |
| |
| add_bytes (rettbl, &new.seq, out); |
| } |
| |
| /* Clear any possible state left behind. */ |
| (void) iconv (cd, NULL, NULL, NULL, NULL); |
| } |
| } |
| |
| iconv_close (cd); |
| |
| return rettbl; |
| } |
| |
| |
| static struct convtable * |
| use_both_charmaps (struct charmap_t *from_charmap, |
| struct charmap_t *to_charmap) |
| { |
| /* In this case we iterate over all the entries in the from_charmap, |
| determine the internal name, and find an appropriate entry in the |
| to_charmap (if it exists). */ |
| struct convtable *rettbl = allocate_table (); |
| void *ptr = NULL; |
| const void *key; |
| size_t keylen; |
| void *data; |
| |
| while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data) |
| >= 0) |
| { |
| struct charseq *in = (struct charseq *) data; |
| struct charseq *out = charmap_find_value (to_charmap, key, keylen); |
| |
| if (out != NULL) |
| add_bytes (rettbl, in, out); |
| } |
| |
| return rettbl; |
| } |
| |
| |
| static int |
| process_block (struct convtable *tbl, char *addr, size_t len, FILE *output) |
| { |
| size_t n = 0; |
| |
| while (n < len) |
| { |
| struct convtable *cur = tbl; |
| unsigned char *curp = (unsigned char *) addr; |
| unsigned int byte = *curp; |
| int cnt; |
| struct charseq *out; |
| |
| while (! is_term (cur, byte)) |
| if (cur->val[byte].sub == NULL) |
| { |
| /* This is an invalid sequence. Skip the first byte if we are |
| ignoring errors. Otherwise punt. */ |
| if (! omit_invalid) |
| { |
| error (0, 0, _("illegal input sequence at position %Zd"), n); |
| return -1; |
| } |
| |
| n -= curp - (unsigned char *) addr; |
| |
| byte = *(curp = (unsigned char *) ++addr); |
| if (++n >= len) |
| /* All converted. */ |
| return 0; |
| |
| cur = tbl; |
| } |
| else |
| { |
| cur = cur->val[byte].sub; |
| |
| if (++n >= len) |
| { |
| error (0, 0, _("\ |
| incomplete character or shift sequence at end of buffer")); |
| return -1; |
| } |
| |
| byte = *++curp; |
| } |
| |
| /* We found a final byte. Write the output bytes. */ |
| out = cur->val[byte].out; |
| for (cnt = 0; cnt < out->nbytes; ++cnt) |
| fputc_unlocked (out->bytes[cnt], output); |
| |
| addr = (char *) curp + 1; |
| ++n; |
| } |
| |
| return 0; |
| } |
| |
| |
/* Read everything available from FD and convert it with TBL, writing
   the result to OUTPUT.

   The data cannot be converted piecewise because a chunk might end in
   the middle of a character or shift sequence.  Therefore the complete
   input is collected in a buffer first and then handed to
   process_block() in one go.  The buffer is static: it is kept, together
   with its size, for the next call and only ever grows, in steps of
   32768 bytes.

   Returns the result of process_block(), or -1 if reading fails or the
   buffer cannot be enlarged.  */
static int
process_fd (struct convtable *tbl, int fd, FILE *output)
{
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      ssize_t got;

      if (actlen == maxlen)
        {
          /* The buffer is full (or does not exist yet).  Increase it.
             The old pointer is kept until we know realloc() worked.  */
          char *bigger = (char *) realloc (inbuf, maxlen + 32768);

          if (bigger == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }

          inbuf = bigger;
          maxlen += 32768;
        }

      got = read (fd, inbuf + actlen, maxlen - actlen);

      if (got == 0)
        /* No more text to read.  */
        break;

      if (got == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += got;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (tbl, inbuf, actlen, output);
}
| |
| |
/* Convert the contents of the stream INPUT with TBL and write the result
   to OUTPUT.  The data is read straight from the descriptor underlying
   INPUT, bypassing the stream's buffer.  This should be safe since the
   function is used only for `stdin' and nothing has been read from it
   so far.  Returns what process_fd() returns.  */
static int
process_file (struct convtable *tbl, FILE *input, FILE *output)
{
  int fd = fileno (input);

  return process_fd (tbl, fd, output);
}