Line data Source code
1 : /**
2 : * @file
3 : * Conversion between different character encodings
4 : *
5 : * @authors
6 : * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
7 : *
8 : * @copyright
9 : * This program is free software: you can redistribute it and/or modify it under
10 : * the terms of the GNU General Public License as published by the Free Software
11 : * Foundation, either version 2 of the License, or (at your option) any later
12 : * version.
13 : *
14 : * This program is distributed in the hope that it will be useful, but WITHOUT
15 : * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 : * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
17 : * details.
18 : *
19 : * You should have received a copy of the GNU General Public License along with
20 : * this program. If not, see <http://www.gnu.org/licenses/>.
21 : */
22 :
23 : /**
24 : * @page charset Conversion between different character encodings
25 : *
26 : * Conversion between different character encodings
27 : */
28 :
29 : #include "config.h"
30 : #include <ctype.h>
31 : #include <errno.h>
32 : #include <iconv.h>
33 : #include <langinfo.h>
34 : #include <limits.h>
35 : #include <stdbool.h>
36 : #include <stdio.h>
37 : #include <string.h>
38 : #include "charset.h"
39 : #include "buffer.h"
40 : #include "memory.h"
41 : #include "queue.h"
42 : #include "regex3.h"
43 : #include "string2.h"
44 : #ifdef ENABLE_NLS
45 : #include <libintl.h>
46 : #endif
47 :
48 : #ifndef EILSEQ
49 : #define EILSEQ EINVAL
50 : #endif
51 :
char *C_AssumedCharset; ///< Config: Character set to assume when a message doesn't declare one
char *C_Charset;        ///< Config: Character set used when displaying text on screen

/**
 * ReplacementChar - Shown in place of a Unicode character that can't be displayed
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - True when the user's current character set is utf-8
 */
bool CharsetIsUtf8 = false;
64 :
65 : /**
66 : * struct Lookup - Regex to String lookup table
67 : *
68 : * This is used by 'charset-hook' and 'iconv-hook'.
69 : */
70 : struct Lookup
71 : {
72 : enum LookupType type; ///< Lookup type
73 : struct Regex regex; ///< Regular expression
74 : char *replacement; ///< Alternative charset to use
75 : TAILQ_ENTRY(Lookup) entries; ///< Linked list
76 : };
77 : TAILQ_HEAD(LookupList, Lookup);
78 :
79 : static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
80 :
/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Alias as it may be found in the wild
  const char *pref; ///< Preferred MIME name for that alias
};

// clang-format off
/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * The following list has been created manually from the data under:
 * http://www.isi.edu/in-notes/iana/assignments/character-sets
 * Last update: 2000-09-07
 *
 * The table is terminated by an entry whose key is NULL.
 *
 * @note It includes only the subset of character sets for which a preferred
 * MIME name is given.
 */
const struct MimeNames PreferredMimeNames[] =
{
  { "ansi_x3.4-1968",        "us-ascii"      },
  { "iso-ir-6",              "us-ascii"      },
  { "iso_646.irv:1991",      "us-ascii"      },
  { "ascii",                 "us-ascii"      },
  { "iso646-us",             "us-ascii"      },
  { "us",                    "us-ascii"      },
  { "ibm367",                "us-ascii"      },
  { "cp367",                 "us-ascii"      },
  { "csASCII",               "us-ascii"      },

  { "csISO2022KR",           "iso-2022-kr"   },
  { "csEUCKR",               "euc-kr"        },
  { "csISO2022JP",           "iso-2022-jp"   },
  { "csISO2022JP2",          "iso-2022-jp-2" },

  { "ISO_8859-1:1987",       "iso-8859-1"    },
  { "iso-ir-100",            "iso-8859-1"    },
  { "iso_8859-1",            "iso-8859-1"    },
  { "latin1",                "iso-8859-1"    },
  { "l1",                    "iso-8859-1"    },
  { "IBM819",                "iso-8859-1"    },
  { "CP819",                 "iso-8859-1"    },
  { "csISOLatin1",           "iso-8859-1"    },

  { "ISO_8859-2:1987",       "iso-8859-2"    },
  { "iso-ir-101",            "iso-8859-2"    },
  { "iso_8859-2",            "iso-8859-2"    },
  { "latin2",                "iso-8859-2"    },
  { "l2",                    "iso-8859-2"    },
  { "csISOLatin2",           "iso-8859-2"    },

  { "ISO_8859-3:1988",       "iso-8859-3"    },
  { "iso-ir-109",            "iso-8859-3"    },
  { "ISO_8859-3",            "iso-8859-3"    },
  { "latin3",                "iso-8859-3"    },
  { "l3",                    "iso-8859-3"    },
  { "csISOLatin3",           "iso-8859-3"    },

  { "ISO_8859-4:1988",       "iso-8859-4"    },
  { "iso-ir-110",            "iso-8859-4"    },
  { "ISO_8859-4",            "iso-8859-4"    },
  { "latin4",                "iso-8859-4"    },
  { "l4",                    "iso-8859-4"    },
  { "csISOLatin4",           "iso-8859-4"    },

  { "ISO_8859-6:1987",       "iso-8859-6"    },
  { "iso-ir-127",            "iso-8859-6"    },
  { "iso_8859-6",            "iso-8859-6"    },
  { "ECMA-114",              "iso-8859-6"    },
  { "ASMO-708",              "iso-8859-6"    },
  { "arabic",                "iso-8859-6"    },
  { "csISOLatinArabic",      "iso-8859-6"    },

  { "ISO_8859-7:1987",       "iso-8859-7"    },
  { "iso-ir-126",            "iso-8859-7"    },
  { "ISO_8859-7",            "iso-8859-7"    },
  { "ELOT_928",              "iso-8859-7"    },
  { "ECMA-118",              "iso-8859-7"    },
  { "greek",                 "iso-8859-7"    },
  { "greek8",                "iso-8859-7"    },
  { "csISOLatinGreek",       "iso-8859-7"    },

  { "ISO_8859-8:1988",       "iso-8859-8"    },
  { "iso-ir-138",            "iso-8859-8"    },
  { "ISO_8859-8",            "iso-8859-8"    },
  { "hebrew",                "iso-8859-8"    },
  { "csISOLatinHebrew",      "iso-8859-8"    },

  { "ISO_8859-5:1988",       "iso-8859-5"    },
  { "iso-ir-144",            "iso-8859-5"    },
  { "ISO_8859-5",            "iso-8859-5"    },
  { "cyrillic",              "iso-8859-5"    },
  { "csISOLatinCyrillic",    "iso-8859-5"    },

  { "ISO_8859-9:1989",       "iso-8859-9"    },
  { "iso-ir-148",            "iso-8859-9"    },
  { "ISO_8859-9",            "iso-8859-9"    },
  { "latin5",                "iso-8859-9"    }, /* this is not a bug */
  { "l5",                    "iso-8859-9"    },
  { "csISOLatin5",           "iso-8859-9"    },

  { "ISO_8859-10:1992",      "iso-8859-10"   },
  { "iso-ir-157",            "iso-8859-10"   },
  { "latin6",                "iso-8859-10"   }, /* this is not a bug */
  { "l6",                    "iso-8859-10"   },
  { "csISOLatin6",           "iso-8859-10"   },

  { "csKOI8r",               "koi8-r"        },

  { "MS_Kanji",              "Shift_JIS"     }, /* Note the underscore! */
  { "csShiftJis",            "Shift_JIS"     },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
                             "euc-jp"        },
  { "csEUCPkdFmtJapanese",   "euc-jp"        },

  { "csGB2312",              "gb2312"        },
  { "csbig5",                "big5"          },

  /* End of official brain damage.
   * What follows has been taken from glibc's localedata files. */

  /* Not listed above, but it is the spelling produced when the misspelling
   * "iso8859-10" is rewritten to the "iso_8859-" form before table lookup */
  { "iso_8859-10",           "iso-8859-10"   },

  { "iso_8859-13",           "iso-8859-13"   },
  { "iso-ir-179",            "iso-8859-13"   },
  { "latin7",                "iso-8859-13"   }, /* this is not a bug */
  { "l7",                    "iso-8859-13"   },

  { "iso_8859-14",           "iso-8859-14"   },
  { "latin8",                "iso-8859-14"   }, /* this is not a bug */
  { "l8",                    "iso-8859-14"   },

  { "iso_8859-15",           "iso-8859-15"   },
  { "latin9",                "iso-8859-15"   }, /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0",                "iso-8859-15"   }, /* this is not a bug */

  { "iso_8859-16",           "iso-8859-16"   },
  { "latin10",               "iso-8859-16"   }, /* this is not a bug */

  { "646",                   "us-ascii"      },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */

  { "eucJP",                 "euc-jp"        },
  { "PCK",                   "Shift_JIS"     },
  { "ko_KR-euc",             "euc-kr"        },
  { "zh_TW-big5",            "big5"          },

  /* seems to be common on some systems */

  { "sjis",                  "Shift_JIS"     },
  { "euc-jp-ms",             "eucJP-ms"      },

  /* If you happen to encounter system-specific brain-damage with respect to
   * character set naming, please add it above this comment, and submit a patch
   * to <neomutt-devel@neomutt.org> */

  { NULL, NULL },
};
// clang-format on
244 :
245 : /**
246 : * lookup_new - Create a new Lookup
247 : * @retval ptr New Lookup
248 : */
249 2 : static struct Lookup *lookup_new(void)
250 : {
251 2 : return mutt_mem_calloc(1, sizeof(struct Lookup));
252 : }
253 :
254 : /**
255 : * lookup_free - Free a Lookup
256 : * @param ptr Lookup to free
257 : */
258 2 : static void lookup_free(struct Lookup **ptr)
259 : {
260 2 : if (!ptr || !*ptr)
261 0 : return;
262 :
263 2 : struct Lookup *l = *ptr;
264 2 : FREE(&l->replacement);
265 2 : FREE(&l->regex.pattern);
266 2 : if (l->regex.regex)
267 2 : regfree(l->regex.regex);
268 2 : FREE(&l->regex.regex);
269 2 : FREE(&l->regex);
270 :
271 2 : FREE(ptr);
272 : }
273 :
274 : /**
275 : * lookup_charset - Look for a preferred character set name
276 : * @param type Type, e.g. #MUTT_LOOKUP_CHARSET
277 : * @param cs Character set
278 : * @retval ptr Charset string
279 : *
280 : * If the character set matches one of the regexes,
281 : * then return the replacement name.
282 : */
283 2120 : static const char *lookup_charset(enum LookupType type, const char *cs)
284 : {
285 2120 : if (!cs)
286 4 : return NULL;
287 :
288 2116 : struct Lookup *l = NULL;
289 :
290 2116 : TAILQ_FOREACH(l, &Lookups, entries)
291 : {
292 0 : if (l->type != type)
293 0 : continue;
294 0 : if (mutt_regex_match(&l->regex, cs))
295 0 : return l->replacement;
296 : }
297 2116 : return NULL;
298 : }
299 :
300 : /**
301 : * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets
302 : * @param[in,out] ps String to be converted
303 : * @retval 0 Success
304 : * @retval -1 Error
305 : *
306 : * Work through `$assumed_charset` looking for a character set conversion that
307 : * works. Failing that, try mutt_ch_get_default_charset().
308 : */
309 2 : int mutt_ch_convert_nonmime_string(char **ps)
310 : {
311 2 : if (!ps)
312 2 : return -1;
313 :
314 0 : char *u = *ps;
315 0 : const size_t ulen = mutt_str_len(u);
316 0 : if (ulen == 0)
317 0 : return 0;
318 :
319 0 : const char *c1 = NULL;
320 :
321 0 : for (const char *c = C_AssumedCharset; c; c = c1 ? c1 + 1 : 0)
322 : {
323 0 : c1 = strchr(c, ':');
324 0 : size_t n = c1 ? c1 - c : mutt_str_len(c);
325 0 : if (n == 0)
326 0 : return 0;
327 0 : char *fromcode = mutt_mem_malloc(n + 1);
328 0 : mutt_str_copy(fromcode, c, n + 1);
329 0 : char *s = mutt_strn_dup(u, ulen);
330 0 : int m = mutt_ch_convert_string(&s, fromcode, C_Charset, 0);
331 0 : FREE(&fromcode);
332 0 : FREE(&s);
333 0 : if (m == 0)
334 : {
335 0 : return 0;
336 : }
337 : }
338 0 : mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
339 : C_Charset, MUTT_ICONV_HOOK_FROM);
340 0 : return -1;
341 : }
342 :
343 : /**
344 : * mutt_ch_canonical_charset - Canonicalise the charset of a string
345 : * @param buf Buffer for canonical character set name
346 : * @param buflen Length of buffer
347 : * @param name Name to be canonicalised
348 : *
349 : * This first ties off any charset extension such as "//TRANSLIT",
350 : * canonicalizes the charset and re-adds the extension
351 : */
352 3099 : void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
353 : {
354 3099 : if (!buf || !name)
355 10 : return;
356 :
357 3089 : char in[1024], scratch[1024];
358 :
359 3089 : mutt_str_copy(in, name, sizeof(in));
360 3089 : char *ext = strchr(in, '/');
361 3089 : if (ext)
362 0 : *ext++ = '\0';
363 :
364 3089 : if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
365 : {
366 3061 : mutt_str_copy(buf, "utf-8", buflen);
367 3061 : goto out;
368 : }
369 :
370 : /* catch some common iso-8859-something misspellings */
371 : size_t plen;
372 28 : if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
373 0 : snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
374 28 : else if ((plen = mutt_istr_startswith(in, "8859-")))
375 0 : snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
376 28 : else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
377 0 : snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
378 28 : else if ((plen = mutt_istr_startswith(in, "iso8859-")))
379 0 : snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
380 : else
381 28 : mutt_str_copy(scratch, in, sizeof(scratch));
382 :
383 2856 : for (size_t i = 0; PreferredMimeNames[i].key; i++)
384 : {
385 2828 : if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
386 : {
387 0 : mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
388 0 : goto out;
389 : }
390 : }
391 :
392 28 : mutt_str_copy(buf, scratch, buflen);
393 :
394 : /* for cosmetics' sake, transform to lowercase. */
395 216 : for (char *p = buf; *p; p++)
396 188 : *p = tolower(*p);
397 :
398 3089 : out:
399 3089 : if (ext && *ext)
400 : {
401 0 : mutt_str_cat(buf, buflen, "/");
402 0 : mutt_str_cat(buf, buflen, ext);
403 : }
404 : }
405 :
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set
 * @param cs2 Second character set
 * @retval true  Names are equivalent
 * @retval false Names differ, or either name is NULL
 *
 * `cs1` is canonicalised first; mutt_ch_canonical_charset() keeps any
 * extension it finds. `cs2` is expected to come from neomutt code rather than
 * user input, i.e. to carry no extension, and is used as given. The two are
 * considered equivalent when the shorter one is a case-insensitive prefix of
 * the longer one.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int len1 = mutt_str_len(canon);
  const int len2 = mutt_str_len(cs2);

  const char *longer = cs2;
  const char *shorter = canon;
  if (len1 > len2)
  {
    longer = canon;
    shorter = cs2;
  }

  return mutt_istrn_equal(longer, shorter, MIN(len1, len2));
}
433 :
434 : /**
435 : * mutt_ch_get_default_charset - Get the default character set
436 : * @retval ptr Name of the default character set
437 : *
438 : * @warning This returns a pointer to a static buffer. Do not free it.
439 : */
440 2 : char *mutt_ch_get_default_charset(void)
441 : {
442 : static char fcharset[128];
443 2 : const char *c = C_AssumedCharset;
444 2 : const char *c1 = NULL;
445 :
446 2 : if (c)
447 : {
448 0 : c1 = strchr(c, ':');
449 0 : mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
450 0 : return fcharset;
451 : }
452 2 : return strcpy(fcharset, "us-ascii");
453 : }
454 :
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * Canonicalise the codeset reported by `nl_langinfo(CODESET)`. If that yields
 * an empty name, fall back to "iso-8859-1".
 * The caller must free the returned string.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char canon[1024] = { 0 };

  mutt_ch_canonical_charset(canon, sizeof(canon), nl_langinfo(CODESET));

  const char *cs = (canon[0] == '\0') ? "iso-8859-1" : canon;
  return mutt_str_dup(cs);
}
473 :
474 : /**
475 : * mutt_ch_lookup_add - Add a new character set lookup
476 : * @param type Type of character set, e.g. #MUTT_LOOKUP_CHARSET
477 : * @param pat Pattern to match
478 : * @param replace Replacement string
479 : * @param err Buffer for error message
480 : * @retval true Lookup added to list
481 : * @retval false Regex string was invalid
482 : *
483 : * Add a regex for a character set and a replacement name.
484 : */
485 6 : bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
486 : const char *replace, struct Buffer *err)
487 : {
488 6 : if (!pat || !replace)
489 4 : return false;
490 :
491 2 : regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
492 2 : int rc = REG_COMP(rx, pat, REG_ICASE);
493 2 : if (rc != 0)
494 : {
495 0 : regerror(rc, rx, err->data, err->dsize);
496 0 : FREE(&rx);
497 0 : return false;
498 : }
499 :
500 2 : struct Lookup *l = lookup_new();
501 2 : l->type = type;
502 2 : l->replacement = mutt_str_dup(replace);
503 2 : l->regex.pattern = mutt_str_dup(pat);
504 2 : l->regex.regex = rx;
505 2 : l->regex.pat_not = false;
506 :
507 2 : TAILQ_INSERT_TAIL(&Lookups, l, entries);
508 :
509 2 : return true;
510 : }
511 :
512 : /**
513 : * mutt_ch_lookup_remove - Remove all the character set lookups
514 : *
515 : * Empty the list of replacement character set names.
516 : */
517 2 : void mutt_ch_lookup_remove(void)
518 : {
519 2 : struct Lookup *l = NULL;
520 2 : struct Lookup *tmp = NULL;
521 :
522 4 : TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
523 : {
524 2 : TAILQ_REMOVE(&Lookups, l, entries);
525 2 : lookup_free(&l);
526 : }
527 2 : }
528 :
529 : /**
530 : * mutt_ch_charset_lookup - Look for a replacement character set
531 : * @param chs Character set to lookup
532 : * @retval ptr Replacement character set (if a 'charset-hook' matches)
533 : * @retval NULL No matching hook
534 : *
535 : * Look through all the 'charset-hook's.
536 : * If one matches return the replacement character set.
537 : */
538 26 : const char *mutt_ch_charset_lookup(const char *chs)
539 : {
540 26 : return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
541 : }
542 :
543 : /**
544 : * mutt_ch_iconv_open - Set up iconv for conversions
545 : * @param tocode Current character set
546 : * @param fromcode Target character set
547 : * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
548 : * @retval ptr iconv handle for the conversion
549 : *
550 : * Like iconv_open, but canonicalises the charsets, applies charset-hooks,
551 : * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips
552 : * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers
553 : * should use flags=0 when fromcode can safely be considered true, either some
554 : * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be
555 : * used only when fromcode is unsure, taken from a possibly wrong incoming MIME
556 : * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
557 : * in some setups.
558 : *
559 : * @note By design charset-hooks should never be, and are never, applied
560 : * to tocode.
561 : *
562 : * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks,
563 : * not at all on iconv-hooks.
564 : */
565 1046 : iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
566 : {
567 1046 : char tocode1[128];
568 1046 : char fromcode1[128];
569 1046 : const char *tocode2 = NULL, *fromcode2 = NULL;
570 1046 : const char *tmp = NULL;
571 :
572 : iconv_t cd;
573 :
574 : /* transform to MIME preferred charset names */
575 1046 : mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
576 1046 : mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
577 :
578 : /* maybe apply charset-hooks and recanonicalise fromcode,
579 : * but only when caller asked us to sanitize a potentially wrong
580 : * charset name incoming from the wild exterior. */
581 1046 : if (flags & MUTT_ICONV_HOOK_FROM)
582 : {
583 24 : tmp = mutt_ch_charset_lookup(fromcode1);
584 24 : if (tmp)
585 0 : mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
586 : }
587 :
588 : /* always apply iconv-hooks to suit system's iconv tastes */
589 1046 : tocode2 = mutt_ch_iconv_lookup(tocode1);
590 1046 : tocode2 = tocode2 ? tocode2 : tocode1;
591 1046 : fromcode2 = mutt_ch_iconv_lookup(fromcode1);
592 1046 : fromcode2 = fromcode2 ? fromcode2 : fromcode1;
593 :
594 : /* call system iconv with names it appreciates */
595 1046 : cd = iconv_open(tocode2, fromcode2);
596 1046 : if (cd != (iconv_t) -1)
597 1038 : return cd;
598 :
599 8 : return (iconv_t) -1;
600 : }
601 :
602 : /**
603 : * mutt_ch_iconv - Change the encoding of a string
604 : * @param[in] cd Iconv conversion descriptor
605 : * @param[in,out] inbuf Buffer to convert
606 : * @param[in,out] inbytesleft Length of buffer to convert
607 : * @param[in,out] outbuf Buffer for the result
608 : * @param[in,out] outbytesleft Length of result buffer
609 : * @param[in] inrepls Input replacement characters
610 : * @param[in] outrepl Output replacement characters
611 : * @param[out] iconverrno Errno if iconv() fails, 0 if it succeeds
612 : * @retval num Characters converted
613 : *
614 : * Like iconv, but keeps going even when the input is invalid
615 : * If you're supplying inrepls, the source charset should be stateless;
616 : * if you're supplying an outrepl, the target charset should be.
617 : */
618 972 : size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
619 : char **outbuf, size_t *outbytesleft, const char **inrepls,
620 : const char *outrepl, int *iconverrno)
621 : {
622 972 : size_t rc = 0;
623 972 : const char *ib = *inbuf;
624 972 : size_t ibl = *inbytesleft;
625 972 : char *ob = *outbuf;
626 972 : size_t obl = *outbytesleft;
627 :
628 : while (true)
629 0 : {
630 972 : errno = 0;
631 972 : const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
632 972 : if (ret1 != (size_t) -1)
633 972 : rc += ret1;
634 972 : if (iconverrno)
635 972 : *iconverrno = errno;
636 :
637 972 : if (ibl && obl && (errno == EILSEQ))
638 : {
639 0 : if (inrepls)
640 : {
641 : /* Try replacing the input */
642 0 : const char **t = NULL;
643 0 : for (t = inrepls; *t; t++)
644 : {
645 0 : const char *ib1 = *t;
646 0 : size_t ibl1 = strlen(*t);
647 0 : char *ob1 = ob;
648 0 : size_t obl1 = obl;
649 0 : iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
650 0 : if (ibl1 == 0)
651 : {
652 0 : ib++;
653 0 : ibl--;
654 0 : ob = ob1;
655 0 : obl = obl1;
656 0 : rc++;
657 0 : break;
658 : }
659 : }
660 0 : if (*t)
661 0 : continue;
662 : }
663 : /* Replace the output */
664 0 : if (!outrepl)
665 0 : outrepl = "?";
666 0 : iconv(cd, NULL, NULL, &ob, &obl);
667 0 : if (obl)
668 : {
669 0 : int n = strlen(outrepl);
670 0 : if (n > obl)
671 : {
672 0 : outrepl = "?";
673 0 : n = 1;
674 : }
675 0 : memcpy(ob, outrepl, n);
676 0 : ib++;
677 0 : ibl--;
678 0 : ob += n;
679 0 : obl -= n;
680 0 : rc++;
681 0 : iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
682 0 : continue;
683 : }
684 : }
685 972 : *inbuf = ib;
686 972 : *inbytesleft = ibl;
687 972 : *outbuf = ob;
688 972 : *outbytesleft = obl;
689 972 : return rc;
690 : }
691 : }
692 :
693 : /**
694 : * mutt_ch_iconv_lookup - Look for a replacement character set
695 : * @param chs Character set to lookup
696 : * @retval ptr Replacement character set (if a 'iconv-hook' matches)
697 : * @retval NULL No matching hook
698 : *
699 : * Look through all the 'iconv-hook's.
700 : * If one matches return the replacement character set.
701 : */
702 2094 : const char *mutt_ch_iconv_lookup(const char *chs)
703 : {
704 2094 : return lookup_charset(MUTT_LOOKUP_ICONV, chs);
705 : }
706 :
707 : /**
708 : * mutt_ch_check - Check whether a string can be converted between encodings
709 : * @param[in] s String to check
710 : * @param[in] slen Length of the string to check
711 : * @param[in] from Current character set
712 : * @param[in] to Target character set
713 : * @retval 0 Success
714 : * @retval -1 Error in iconv_open()
715 : * @retval >0 Errno as set by iconv()
716 : */
717 20 : int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
718 : {
719 20 : if (!s || !from || !to)
720 6 : return -1;
721 :
722 14 : int rc = 0;
723 14 : iconv_t cd = mutt_ch_iconv_open(to, from, 0);
724 14 : if (cd == (iconv_t) -1)
725 2 : return -1;
726 :
727 12 : size_t outlen = MB_LEN_MAX * slen;
728 12 : char *out = mutt_mem_malloc(outlen + 1);
729 12 : char *saved_out = out;
730 :
731 : const size_t convlen =
732 12 : iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
733 12 : if (convlen == -1)
734 0 : rc = errno;
735 :
736 12 : FREE(&saved_out);
737 12 : iconv_close(cd);
738 12 : return rc;
739 : }
740 :
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0     Success (also for a NULL or empty string, which is left alone)
 * @retval -1    Invalid arguments or failure to open an iconv channel
 * @retval errno Failure in iconv conversion
 *
 * Parameter flags is given as-is to mutt_ch_iconv_open().
 * See there for its meaning and usage policy.
 *
 * Once the iconv channel is open, `*ps` is freed and replaced by the newly
 * allocated result, whatever value is returned.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!from || !to)
    return -1;

  const char *repls[] = { "\357\277\275", "?", 0 };

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  /* Pick how invalid input gets substituted: U+FFFD on the output side when
   * producing utf-8, on the input side when reading utf-8, otherwise "?" */
  const char **inrepls = NULL;
  const char *outrepl = NULL;
  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* The terminator is converted along with the text */
  const char *ib = src;
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *dst = mutt_mem_malloc(obl + 1);
  char *ob = dst;
  int rc = 0;

  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = dst;

  mutt_str_adjust(ps);
  return rc;
}
806 :
807 : /**
808 : * mutt_ch_check_charset - Does iconv understand a character set?
809 : * @param cs Character set to check
810 : * @param strict Check strictly by using iconv
811 : * @retval true Character set is valid
812 : *
813 : * If `strict` is false, then finding a matching character set in
814 : * #PreferredMimeNames will be enough.
815 : * If `strict` is true, or the charset is not in #PreferredMimeNames, then
816 : * iconv() with be run.
817 : */
818 6 : bool mutt_ch_check_charset(const char *cs, bool strict)
819 : {
820 6 : if (!cs)
821 2 : return false;
822 :
823 4 : if (mutt_ch_is_utf8(cs))
824 2 : return true;
825 :
826 2 : if (!strict)
827 : {
828 0 : for (int i = 0; PreferredMimeNames[i].key; i++)
829 : {
830 0 : if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
831 0 : mutt_istr_equal(PreferredMimeNames[i].pref, cs))
832 : {
833 0 : return true;
834 : }
835 : }
836 : }
837 :
838 2 : iconv_t cd = mutt_ch_iconv_open(cs, cs, 0);
839 2 : if (cd != (iconv_t)(-1))
840 : {
841 2 : iconv_close(cd);
842 2 : return true;
843 : }
844 :
845 0 : return false;
846 : }
847 :
848 : /**
849 : * mutt_ch_fgetconv_open - Prepare a file for charset conversion
850 : * @param fp FILE ptr to prepare
851 : * @param from Current character set
852 : * @param to Destination character set
853 : * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
854 : * @retval ptr fgetconv handle
855 : *
856 : * Parameter flags is given as-is to mutt_ch_iconv_open().
857 : */
858 6 : struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
859 : {
860 6 : struct FgetConv *fc = NULL;
861 6 : iconv_t cd = (iconv_t) -1;
862 :
863 6 : if (from && to)
864 2 : cd = mutt_ch_iconv_open(to, from, flags);
865 :
866 6 : if (cd != (iconv_t) -1)
867 : {
868 : static const char *repls[] = { "\357\277\275", "?", 0 };
869 :
870 0 : fc = mutt_mem_malloc(sizeof(struct FgetConv));
871 0 : fc->p = fc->bufo;
872 0 : fc->ob = fc->bufo;
873 0 : fc->ib = fc->bufi;
874 0 : fc->ibl = 0;
875 0 : fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
876 : }
877 : else
878 6 : fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
879 6 : fc->fp = fp;
880 6 : fc->cd = cd;
881 6 : return fc;
882 : }
883 :
884 : /**
885 : * mutt_ch_fgetconv_close - Close an fgetconv handle
886 : * @param[out] fc fgetconv handle
887 : */
888 10 : void mutt_ch_fgetconv_close(struct FgetConv **fc)
889 : {
890 10 : if (!fc || !*fc)
891 4 : return;
892 :
893 6 : if ((*fc)->cd != (iconv_t) -1)
894 0 : iconv_close((*fc)->cd);
895 6 : FREE(fc);
896 : }
897 :
898 : /**
899 : * mutt_ch_fgetconv - Convert a file's character set
900 : * @param fc FgetConv handle
901 : * @retval num Next character in the converted file
902 : * @retval EOF Error
903 : *
904 : * A file is read into a buffer and its character set is converted.
905 : * Each call to this function will return one converted character.
906 : * The buffer is refilled automatically when empty.
907 : */
908 4 : int mutt_ch_fgetconv(struct FgetConv *fc)
909 : {
910 4 : if (!fc)
911 4 : return EOF;
912 0 : if (fc->cd == (iconv_t) -1)
913 0 : return fgetc(fc->fp);
914 0 : if (!fc->p)
915 0 : return EOF;
916 0 : if (fc->p < fc->ob)
917 0 : return (unsigned char) *(fc->p)++;
918 :
919 : /* Try to convert some more */
920 0 : fc->p = fc->bufo;
921 0 : fc->ob = fc->bufo;
922 0 : if (fc->ibl)
923 : {
924 0 : size_t obl = sizeof(fc->bufo);
925 0 : iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
926 0 : if (fc->p < fc->ob)
927 0 : return (unsigned char) *(fc->p)++;
928 : }
929 :
930 : /* If we trusted iconv a bit more, we would at this point
931 : * ask why it had stopped converting ... */
932 :
933 : /* Try to read some more */
934 0 : if ((fc->ibl == sizeof(fc->bufi)) ||
935 0 : (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
936 : {
937 0 : fc->p = 0;
938 0 : return EOF;
939 : }
940 0 : if (fc->ibl)
941 0 : memcpy(fc->bufi, fc->ib, fc->ibl);
942 0 : fc->ib = fc->bufi;
943 0 : fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
944 :
945 : /* Try harder this time to convert some */
946 0 : if (fc->ibl)
947 : {
948 0 : size_t obl = sizeof(fc->bufo);
949 0 : mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
950 : fc->inrepls, 0, NULL);
951 0 : if (fc->p < fc->ob)
952 0 : return (unsigned char) *(fc->p)++;
953 : }
954 :
955 : /* Either the file has finished or one of the buffers is too small */
956 0 : fc->p = 0;
957 0 : return EOF;
958 : }
959 :
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, nothing could be read, or `buflen` is 0
 *
 * Read a file into a buffer, converting the character set as it goes.
 * Reading stops after a newline (which is stored), at EOF, or when the buffer
 * is full. The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminator */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
992 :
993 : /**
994 : * mutt_ch_set_charset - Update the records for a new character set
995 : * @param charset New character set
996 : *
997 : * Check if this character set is utf-8 and pick a suitable replacement
998 : * character for unprintable characters.
999 : *
1000 : * @note This calls `bind_textdomain_codeset()` which will affect future
1001 : * message translations.
1002 : */
1003 3 : void mutt_ch_set_charset(const char *charset)
1004 : {
1005 3 : char buf[256];
1006 :
1007 3 : mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1008 :
1009 3 : if (mutt_ch_is_utf8(buf))
1010 : {
1011 1 : CharsetIsUtf8 = true;
1012 1 : ReplacementChar = 0xfffd; /* replacement character */
1013 : }
1014 : else
1015 : {
1016 2 : CharsetIsUtf8 = false;
1017 2 : ReplacementChar = '?';
1018 : }
1019 :
1020 : #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1021 3 : bind_textdomain_codeset(PACKAGE, buf);
1022 : #endif
1023 3 : }
1024 :
/**
 * mutt_ch_choose - Figure the best charset to encode a string
 * @param[in]  fromcode Original charset of the string
 * @param[in]  charsets Colon-separated list of potential charsets to use
 * @param[in]  u        String to encode
 * @param[in]  ulen     Length of the string to encode
 * @param[out] d        If not NULL, point it to the converted string
 * @param[out] dlen     If not NULL, point it to the length of the d string
 * @retval ptr  Best performing charset, canonicalised, in allocated memory
 * @retval NULL None could be found
 *
 * Each candidate is tried in turn: with `d` set the string is really
 * converted, otherwise the conversion is only checked. `d` and `dlen` are
 * written only if a charset was found.
 *
 * NOTE(review): a later working candidate displaces an earlier one only when
 * its *name* is shorter - confirm this ranking is what is intended.
 */
char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
                     size_t ulen, char **d, size_t *dlen)
{
  if (!fromcode)
    return NULL;

  char *best_text = NULL;
  char *best_cs = NULL;
  size_t best_text_len = 0;
  size_t best_name_len = 0;
  const char *next = NULL;

  for (const char *cur = charsets; cur; cur = next)
  {
    const char *colon = strchr(cur, ':');
    next = colon ? colon + 1 : NULL;

    const size_t n = colon ? (size_t) (colon - cur) : strlen(cur);
    if (n == 0)
      continue;

    /* Isolate this candidate's name */
    char *name = mutt_mem_malloc(n + 1);
    memcpy(name, cur, n);
    name[n] = '\0';

    /* Work on a private copy of the text */
    char *copy = mutt_strn_dup(u, ulen);
    int rc;
    if (d)
      rc = mutt_ch_convert_string(&copy, fromcode, name, 0);
    else
      rc = mutt_ch_check(copy, ulen, fromcode, name);

    if (rc != 0)
    {
      FREE(&name);
      FREE(&copy);
      continue;
    }

    const size_t copy_len = mutt_str_len(copy);

    if (best_cs && (n >= best_name_len))
    {
      /* The current favourite stays */
      FREE(&name);
      FREE(&copy);
      continue;
    }

    /* This candidate becomes the new favourite */
    best_name_len = n;
    FREE(&best_cs);
    best_cs = name;
    if (d)
    {
      FREE(&best_text);
      best_text = copy;
    }
    else
    {
      FREE(&copy);
    }
    best_text_len = copy_len;
  }

  if (best_cs)
  {
    if (d)
      *d = best_text;
    if (dlen)
      *dlen = best_text_len;

    char canonical_buf[1024];
    mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), best_cs);
    mutt_str_replace(&best_cs, canonical_buf);
  }

  return best_cs;
}
|