LCOV - code coverage report
Current view: top level - mutt - charset.c (source / functions) Hit Total Coverage
Test: lcov.info Lines: 259 385 67.3 %
Date: 2020-09-01 11:23:34 Functions: 23 23 100.0 %

          Line data    Source code
       1             : /**
       2             :  * @file
       3             :  * Conversion between different character encodings
       4             :  *
       5             :  * @authors
       6             :  * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
       7             :  *
       8             :  * @copyright
       9             :  * This program is free software: you can redistribute it and/or modify it under
      10             :  * the terms of the GNU General Public License as published by the Free Software
      11             :  * Foundation, either version 2 of the License, or (at your option) any later
      12             :  * version.
      13             :  *
      14             :  * This program is distributed in the hope that it will be useful, but WITHOUT
      15             :  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
      16             :  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
      17             :  * details.
      18             :  *
      19             :  * You should have received a copy of the GNU General Public License along with
      20             :  * this program.  If not, see <http://www.gnu.org/licenses/>.
      21             :  */
      22             : 
      23             : /**
      24             :  * @page charset Conversion between different character encodings
      25             :  *
      26             :  * Conversion between different character encodings
      27             :  */
      28             : 
      29             : #include "config.h"
      30             : #include <ctype.h>
      31             : #include <errno.h>
      32             : #include <iconv.h>
      33             : #include <langinfo.h>
      34             : #include <limits.h>
      35             : #include <stdbool.h>
      36             : #include <stdio.h>
      37             : #include <string.h>
      38             : #include "charset.h"
      39             : #include "buffer.h"
      40             : #include "memory.h"
      41             : #include "queue.h"
      42             : #include "regex3.h"
      43             : #include "string2.h"
      44             : #ifdef ENABLE_NLS
      45             : #include <libintl.h>
      46             : #endif
      47             : 
      48             : #ifndef EILSEQ
      49             : #define EILSEQ EINVAL
      50             : #endif
      51             : 
      52             : char *C_AssumedCharset; ///< Config: If a message is missing a character set, assume this character set
      53             : char *C_Charset; ///< Config: Default character set for displaying text on screen
      54             : 
      55             : /**
      56             :  * ReplacementChar - When a Unicode character can't be displayed, use this instead
      57             :  */
      58             : wchar_t ReplacementChar = '?';
      59             : 
      60             : /**
      61             :  * CharsetIsUtf8 - Is the user's current character set utf-8?
      62             :  */
      63             : bool CharsetIsUtf8 = false;
      64             : 
      65             : /**
      66             :  * struct Lookup - Regex to String lookup table
      67             :  *
      68             :  * This is used by 'charset-hook' and 'iconv-hook'.
      69             :  */
      70             : struct Lookup
      71             : {
      72             :   enum LookupType type;        ///< Lookup type
      73             :   struct Regex regex;          ///< Regular expression
      74             :   char *replacement;           ///< Alternative charset to use
      75             :   TAILQ_ENTRY(Lookup) entries; ///< Linked list
      76             : };
      77             : TAILQ_HEAD(LookupList, Lookup);
      78             : 
      79             : static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
      80             : 
      81             : /**
      82             :  * struct MimeNames - MIME name lookup entry
      83             :  */
      84             : struct MimeNames
      85             : {
      86             :   const char *key;
      87             :   const char *pref;
      88             : };
      89             : 
      90             : // clang-format off
      91             : /**
      92             :  * PreferredMimeNames - Lookup table of preferred charsets
      93             :  *
      94             :  * The following list has been created manually from the data under:
      95             :  * http://www.isi.edu/in-notes/iana/assignments/character-sets
      96             :  * Last update: 2000-09-07
      97             :  *
      98             :  * @note It includes only the subset of character sets for which a preferred
      99             :  * MIME name is given.
     100             :  */
     101             : const struct MimeNames PreferredMimeNames[] =
     102             : {
     103             :   { "ansi_x3.4-1968",        "us-ascii"      },
     104             :   { "iso-ir-6",              "us-ascii"      },
     105             :   { "iso_646.irv:1991",      "us-ascii"      },
     106             :   { "ascii",                 "us-ascii"      },
     107             :   { "iso646-us",             "us-ascii"      },
     108             :   { "us",                    "us-ascii"      },
     109             :   { "ibm367",                "us-ascii"      },
     110             :   { "cp367",                 "us-ascii"      },
     111             :   { "csASCII",               "us-ascii"      },
     112             : 
     113             :   { "csISO2022KR",           "iso-2022-kr"   },
     114             :   { "csEUCKR",               "euc-kr"        },
     115             :   { "csISO2022JP",           "iso-2022-jp"   },
     116             :   { "csISO2022JP2",          "iso-2022-jp-2" },
     117             : 
     118             :   { "ISO_8859-1:1987",       "iso-8859-1"    },
     119             :   { "iso-ir-100",            "iso-8859-1"    },
     120             :   { "iso_8859-1",            "iso-8859-1"    },
     121             :   { "latin1",                "iso-8859-1"    },
     122             :   { "l1",                    "iso-8859-1"    },
     123             :   { "IBM819",                "iso-8859-1"    },
     124             :   { "CP819",                 "iso-8859-1"    },
     125             :   { "csISOLatin1",           "iso-8859-1"    },
     126             : 
     127             :   { "ISO_8859-2:1987",       "iso-8859-2"    },
     128             :   { "iso-ir-101",            "iso-8859-2"    },
     129             :   { "iso_8859-2",            "iso-8859-2"    },
     130             :   { "latin2",                "iso-8859-2"    },
     131             :   { "l2",                    "iso-8859-2"    },
     132             :   { "csISOLatin2",           "iso-8859-2"    },
     133             : 
     134             :   { "ISO_8859-3:1988",       "iso-8859-3"    },
     135             :   { "iso-ir-109",            "iso-8859-3"    },
     136             :   { "ISO_8859-3",            "iso-8859-3"    },
     137             :   { "latin3",                "iso-8859-3"    },
     138             :   { "l3",                    "iso-8859-3"    },
     139             :   { "csISOLatin3",           "iso-8859-3"    },
     140             : 
     141             :   { "ISO_8859-4:1988",       "iso-8859-4"    },
     142             :   { "iso-ir-110",            "iso-8859-4"    },
     143             :   { "ISO_8859-4",            "iso-8859-4"    },
     144             :   { "latin4",                "iso-8859-4"    },
     145             :   { "l4",                    "iso-8859-4"    },
     146             :   { "csISOLatin4",           "iso-8859-4"    },
     147             : 
     148             :   { "ISO_8859-6:1987",       "iso-8859-6"    },
     149             :   { "iso-ir-127",            "iso-8859-6"    },
     150             :   { "iso_8859-6",            "iso-8859-6"    },
     151             :   { "ECMA-114",              "iso-8859-6"    },
     152             :   { "ASMO-708",              "iso-8859-6"    },
     153             :   { "arabic",                "iso-8859-6"    },
     154             :   { "csISOLatinArabic",      "iso-8859-6"    },
     155             : 
     156             :   { "ISO_8859-7:1987",       "iso-8859-7"    },
     157             :   { "iso-ir-126",            "iso-8859-7"    },
     158             :   { "ISO_8859-7",            "iso-8859-7"    },
     159             :   { "ELOT_928",              "iso-8859-7"    },
     160             :   { "ECMA-118",              "iso-8859-7"    },
     161             :   { "greek",                 "iso-8859-7"    },
     162             :   { "greek8",                "iso-8859-7"    },
     163             :   { "csISOLatinGreek",       "iso-8859-7"    },
     164             : 
     165             :   { "ISO_8859-8:1988",       "iso-8859-8"    },
     166             :   { "iso-ir-138",            "iso-8859-8"    },
     167             :   { "ISO_8859-8",            "iso-8859-8"    },
     168             :   { "hebrew",                "iso-8859-8"    },
     169             :   { "csISOLatinHebrew",      "iso-8859-8"    },
     170             : 
     171             :   { "ISO_8859-5:1988",       "iso-8859-5"    },
     172             :   { "iso-ir-144",            "iso-8859-5"    },
     173             :   { "ISO_8859-5",            "iso-8859-5"    },
     174             :   { "cyrillic",              "iso-8859-5"    },
     175             :   { "csISOLatinCyrillic",    "iso-8859-5"    },
     176             : 
     177             :   { "ISO_8859-9:1989",       "iso-8859-9"    },
     178             :   { "iso-ir-148",            "iso-8859-9"    },
     179             :   { "ISO_8859-9",            "iso-8859-9"    },
     180             :   { "latin5",                "iso-8859-9"    },  /* this is not a bug */
     181             :   { "l5",                    "iso-8859-9"    },
     182             :   { "csISOLatin5",           "iso-8859-9"    },
     183             : 
     184             :   { "ISO_8859-10:1992",      "iso-8859-10"   },
     185             :   { "iso-ir-157",            "iso-8859-10"   },
     186             :   { "latin6",                "iso-8859-10"   },  /* this is not a bug */
     187             :   { "l6",                    "iso-8859-10"   },
     188             :   { "csISOLatin6",           "iso-8859-10"   },
     189             : 
     190             :   { "csKOI8r",               "koi8-r"        },
     191             : 
     192             :   { "MS_Kanji",              "Shift_JIS"     },  /* Note the underscore! */
     193             :   { "csShiftJis",            "Shift_JIS"     },
     194             : 
     195             :   { "Extended_UNIX_Code_Packed_Format_for_Japanese",
     196             :                              "euc-jp"        },
     197             :   { "csEUCPkdFmtJapanese",   "euc-jp"        },
     198             : 
     199             :   { "csGB2312",              "gb2312"        },
     200             :   { "csbig5",                "big5"          },
     201             : 
     202             :   /* End of official brain damage.
     203             :    * What follows has been taken from glibc's localedata files.  */
     204             : 
     205             :   { "iso_8859-13",           "iso-8859-13"   },
     206             :   { "iso-ir-179",            "iso-8859-13"   },
     207             :   { "latin7",                "iso-8859-13"   },  /* this is not a bug */
     208             :   { "l7",                    "iso-8859-13"   },
     209             : 
     210             :   { "iso_8859-14",           "iso-8859-14"   },
     211             :   { "latin8",                "iso-8859-14"   },  /* this is not a bug */
     212             :   { "l8",                    "iso-8859-14"   },
     213             : 
     214             :   { "iso_8859-15",           "iso-8859-15"   },
     215             :   { "latin9",                "iso-8859-15"   },  /* this is not a bug */
     216             : 
     217             :   /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
     218             :   { "latin0",                "iso-8859-15"   },  /* this is not a bug */
     219             : 
     220             :   { "iso_8859-16",           "iso-8859-16"   },
     221             :   { "latin10",               "iso-8859-16"   },  /* this is not a bug */
     222             : 
     223             :   { "646",                   "us-ascii"      },
     224             : 
     225             :   /* http://www.sun.com/software/white-papers/wp-unicode/ */
     226             : 
     227             :   { "eucJP",                 "euc-jp"        },
     228             :   { "PCK",                   "Shift_JIS"     },
     229             :   { "ko_KR-euc",             "euc-kr"        },
     230             :   { "zh_TW-big5",            "big5"          },
     231             : 
     232             :   /* seems to be common on some systems */
     233             : 
     234             :   { "sjis",                  "Shift_JIS"     },
     235             :   { "euc-jp-ms",             "eucJP-ms"      },
     236             : 
     237             :   /* If you happen to encounter system-specific brain-damage with respect to
     238             :    * character set naming, please add it above this comment, and submit a patch
     239             :    * to <neomutt-devel@neomutt.org> */
     240             : 
     241             :   { NULL,                     NULL           },
     242             : };
     243             : // clang-format on
     244             : 
     245             : /**
     246             :  * lookup_new - Create a new Lookup
     247             :  * @retval ptr New Lookup
     248             :  */
     249           2 : static struct Lookup *lookup_new(void)
     250             : {
     251           2 :   return mutt_mem_calloc(1, sizeof(struct Lookup));
     252             : }
     253             : 
     254             : /**
     255             :  * lookup_free - Free a Lookup
     256             :  * @param ptr Lookup to free
     257             :  */
     258           2 : static void lookup_free(struct Lookup **ptr)
     259             : {
     260           2 :   if (!ptr || !*ptr)
     261           0 :     return;
     262             : 
     263           2 :   struct Lookup *l = *ptr;
     264           2 :   FREE(&l->replacement);
     265           2 :   FREE(&l->regex.pattern);
     266           2 :   if (l->regex.regex)
     267           2 :     regfree(l->regex.regex);
     268           2 :   FREE(&l->regex.regex);
     269           2 :   FREE(&l->regex);
     270             : 
     271           2 :   FREE(ptr);
     272             : }
     273             : 
     274             : /**
     275             :  * lookup_charset - Look for a preferred character set name
     276             :  * @param type Type, e.g. #MUTT_LOOKUP_CHARSET
     277             :  * @param cs   Character set
     278             :  * @retval ptr Charset string
     279             :  *
     280             :  * If the character set matches one of the regexes,
     281             :  * then return the replacement name.
     282             :  */
     283        2120 : static const char *lookup_charset(enum LookupType type, const char *cs)
     284             : {
     285        2120 :   if (!cs)
     286           4 :     return NULL;
     287             : 
     288        2116 :   struct Lookup *l = NULL;
     289             : 
     290        2116 :   TAILQ_FOREACH(l, &Lookups, entries)
     291             :   {
     292           0 :     if (l->type != type)
     293           0 :       continue;
     294           0 :     if (mutt_regex_match(&l->regex, cs))
     295           0 :       return l->replacement;
     296             :   }
     297        2116 :   return NULL;
     298             : }
     299             : 
     300             : /**
     301             :  * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets
     302             :  * @param[in,out] ps String to be converted
     303             :  * @retval 0  Success
     304             :  * @retval -1 Error
     305             :  *
     306             :  * Work through `$assumed_charset` looking for a character set conversion that
     307             :  * works.  Failing that, try mutt_ch_get_default_charset().
     308             :  */
     309           2 : int mutt_ch_convert_nonmime_string(char **ps)
     310             : {
     311           2 :   if (!ps)
     312           2 :     return -1;
     313             : 
     314           0 :   char *u = *ps;
     315           0 :   const size_t ulen = mutt_str_len(u);
     316           0 :   if (ulen == 0)
     317           0 :     return 0;
     318             : 
     319           0 :   const char *c1 = NULL;
     320             : 
     321           0 :   for (const char *c = C_AssumedCharset; c; c = c1 ? c1 + 1 : 0)
     322             :   {
     323           0 :     c1 = strchr(c, ':');
     324           0 :     size_t n = c1 ? c1 - c : mutt_str_len(c);
     325           0 :     if (n == 0)
     326           0 :       return 0;
     327           0 :     char *fromcode = mutt_mem_malloc(n + 1);
     328           0 :     mutt_str_copy(fromcode, c, n + 1);
     329           0 :     char *s = mutt_strn_dup(u, ulen);
     330           0 :     int m = mutt_ch_convert_string(&s, fromcode, C_Charset, 0);
     331           0 :     FREE(&fromcode);
     332           0 :     FREE(&s);
     333           0 :     if (m == 0)
     334             :     {
     335           0 :       return 0;
     336             :     }
     337             :   }
     338           0 :   mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
     339             :                          C_Charset, MUTT_ICONV_HOOK_FROM);
     340           0 :   return -1;
     341             : }
     342             : 
     343             : /**
     344             :  * mutt_ch_canonical_charset - Canonicalise the charset of a string
     345             :  * @param buf Buffer for canonical character set name
     346             :  * @param buflen Length of buffer
     347             :  * @param name Name to be canonicalised
     348             :  *
     349             :  * This first ties off any charset extension such as "//TRANSLIT",
     350             :  * canonicalizes the charset and re-adds the extension
     351             :  */
     352        3099 : void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
     353             : {
     354        3099 :   if (!buf || !name)
     355          10 :     return;
     356             : 
     357        3089 :   char in[1024], scratch[1024];
     358             : 
     359        3089 :   mutt_str_copy(in, name, sizeof(in));
     360        3089 :   char *ext = strchr(in, '/');
     361        3089 :   if (ext)
     362           0 :     *ext++ = '\0';
     363             : 
     364        3089 :   if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
     365             :   {
     366        3061 :     mutt_str_copy(buf, "utf-8", buflen);
     367        3061 :     goto out;
     368             :   }
     369             : 
     370             :   /* catch some common iso-8859-something misspellings */
     371             :   size_t plen;
     372          28 :   if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
     373           0 :     snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
     374          28 :   else if ((plen = mutt_istr_startswith(in, "8859-")))
     375           0 :     snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
     376          28 :   else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
     377           0 :     snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
     378          28 :   else if ((plen = mutt_istr_startswith(in, "iso8859-")))
     379           0 :     snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
     380             :   else
     381          28 :     mutt_str_copy(scratch, in, sizeof(scratch));
     382             : 
     383        2856 :   for (size_t i = 0; PreferredMimeNames[i].key; i++)
     384             :   {
     385        2828 :     if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
     386             :     {
     387           0 :       mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
     388           0 :       goto out;
     389             :     }
     390             :   }
     391             : 
     392          28 :   mutt_str_copy(buf, scratch, buflen);
     393             : 
     394             :   /* for cosmetics' sake, transform to lowercase. */
     395         216 :   for (char *p = buf; *p; p++)
     396         188 :     *p = tolower(*p);
     397             : 
     398        3089 : out:
     399        3089 :   if (ext && *ext)
     400             :   {
     401           0 :     mutt_str_cat(buf, buflen, "/");
     402           0 :     mutt_str_cat(buf, buflen, ext);
     403             :   }
     404             : }
     405             : 
     406             : /**
     407             :  * mutt_ch_chscmp - Are the names of two character sets equivalent?
     408             :  * @param cs1 First character set
     409             :  * @param cs2 Second character set
     410             :  * @retval true  Names are equivalent
     411             :  * @retval false Names differ
     412             :  *
     413             :  * Charsets may have extensions that mutt_ch_canonical_charset() leaves intact;
     414             :  * we expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2'
     415             :  * does _not_ have any extension) we simply check if the shorter string is a
     416             :  * prefix for the longer.
     417             :  */
     418         985 : bool mutt_ch_chscmp(const char *cs1, const char *cs2)
     419             : {
     420         985 :   if (!cs1 || !cs2)
     421           4 :     return false;
     422             : 
     423         981 :   char buf[256];
     424             : 
     425         981 :   mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
     426             : 
     427         981 :   int len1 = mutt_str_len(buf);
     428         981 :   int len2 = mutt_str_len(cs2);
     429             : 
     430         981 :   return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
     431         981 :                           ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
     432             : }
     433             : 
     434             : /**
     435             :  * mutt_ch_get_default_charset - Get the default character set
     436             :  * @retval ptr Name of the default character set
     437             :  *
     438             :  * @warning This returns a pointer to a static buffer.  Do not free it.
     439             :  */
     440           2 : char *mutt_ch_get_default_charset(void)
     441             : {
     442             :   static char fcharset[128];
     443           2 :   const char *c = C_AssumedCharset;
     444           2 :   const char *c1 = NULL;
     445             : 
     446           2 :   if (c)
     447             :   {
     448           0 :     c1 = strchr(c, ':');
     449           0 :     mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
     450           0 :     return fcharset;
     451             :   }
     452           2 :   return strcpy(fcharset, "us-ascii");
     453             : }
     454             : 
     455             : /**
     456             :  * mutt_ch_get_langinfo_charset - Get the user's choice of character set
     457             :  * @retval ptr Charset string
     458             :  *
     459             :  * Get the canonical character set used by the user's locale.
     460             :  * The caller must free the returned string.
     461             :  */
     462           3 : char *mutt_ch_get_langinfo_charset(void)
     463             : {
     464           3 :   char buf[1024] = { 0 };
     465             : 
     466           3 :   mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
     467             : 
     468           3 :   if (buf[0] != '\0')
     469           3 :     return mutt_str_dup(buf);
     470             : 
     471           0 :   return mutt_str_dup("iso-8859-1");
     472             : }
     473             : 
     474             : /**
     475             :  * mutt_ch_lookup_add - Add a new character set lookup
     476             :  * @param type    Type of character set, e.g. #MUTT_LOOKUP_CHARSET
     477             :  * @param pat     Pattern to match
     478             :  * @param replace Replacement string
     479             :  * @param err     Buffer for error message
     480             :  * @retval true  Lookup added to list
     481             :  * @retval false Regex string was invalid
     482             :  *
     483             :  * Add a regex for a character set and a replacement name.
     484             :  */
     485           6 : bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
     486             :                         const char *replace, struct Buffer *err)
     487             : {
     488           6 :   if (!pat || !replace)
     489           4 :     return false;
     490             : 
     491           2 :   regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
     492           2 :   int rc = REG_COMP(rx, pat, REG_ICASE);
     493           2 :   if (rc != 0)
     494             :   {
     495           0 :     regerror(rc, rx, err->data, err->dsize);
     496           0 :     FREE(&rx);
     497           0 :     return false;
     498             :   }
     499             : 
     500           2 :   struct Lookup *l = lookup_new();
     501           2 :   l->type = type;
     502           2 :   l->replacement = mutt_str_dup(replace);
     503           2 :   l->regex.pattern = mutt_str_dup(pat);
     504           2 :   l->regex.regex = rx;
     505           2 :   l->regex.pat_not = false;
     506             : 
     507           2 :   TAILQ_INSERT_TAIL(&Lookups, l, entries);
     508             : 
     509           2 :   return true;
     510             : }
     511             : 
     512             : /**
     513             :  * mutt_ch_lookup_remove - Remove all the character set lookups
     514             :  *
     515             :  * Empty the list of replacement character set names.
     516             :  */
     517           2 : void mutt_ch_lookup_remove(void)
     518             : {
     519           2 :   struct Lookup *l = NULL;
     520           2 :   struct Lookup *tmp = NULL;
     521             : 
     522           4 :   TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
     523             :   {
     524           2 :     TAILQ_REMOVE(&Lookups, l, entries);
     525           2 :     lookup_free(&l);
     526             :   }
     527           2 : }
     528             : 
     529             : /**
     530             :  * mutt_ch_charset_lookup - Look for a replacement character set
     531             :  * @param chs Character set to lookup
     532             :  * @retval ptr  Replacement character set (if a 'charset-hook' matches)
     533             :  * @retval NULL No matching hook
     534             :  *
     535             :  * Look through all the 'charset-hook's.
     536             :  * If one matches return the replacement character set.
     537             :  */
     538          26 : const char *mutt_ch_charset_lookup(const char *chs)
     539             : {
     540          26 :   return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
     541             : }
     542             : 
     543             : /**
     544             :  * mutt_ch_iconv_open - Set up iconv for conversions
     545             :  * @param tocode   Current character set
     546             :  * @param fromcode Target character set
     547             :  * @param flags    Flags, e.g. #MUTT_ICONV_HOOK_FROM
     548             :  * @retval ptr iconv handle for the conversion
     549             :  *
     550             :  * Like iconv_open, but canonicalises the charsets, applies charset-hooks,
     551             :  * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips
     552             :  * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers
     553             :  * should use flags=0 when fromcode can safely be considered true, either some
     554             :  * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be
     555             :  * used only when fromcode is unsure, taken from a possibly wrong incoming MIME
     556             :  * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
     557             :  * in some setups.
     558             :  *
     559             :  * @note By design charset-hooks should never be, and are never, applied
     560             :  * to tocode.
     561             :  *
     562             :  * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks,
     563             :  * not at all on iconv-hooks.
     564             :  */
     565        1046 : iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
     566             : {
     567        1046 :   char tocode1[128];
     568        1046 :   char fromcode1[128];
     569        1046 :   const char *tocode2 = NULL, *fromcode2 = NULL;
     570        1046 :   const char *tmp = NULL;
     571             : 
     572             :   iconv_t cd;
     573             : 
     574             :   /* transform to MIME preferred charset names */
     575        1046 :   mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
     576        1046 :   mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
     577             : 
     578             :   /* maybe apply charset-hooks and recanonicalise fromcode,
     579             :    * but only when caller asked us to sanitize a potentially wrong
     580             :    * charset name incoming from the wild exterior. */
     581        1046 :   if (flags & MUTT_ICONV_HOOK_FROM)
     582             :   {
     583          24 :     tmp = mutt_ch_charset_lookup(fromcode1);
     584          24 :     if (tmp)
     585           0 :       mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
     586             :   }
     587             : 
     588             :   /* always apply iconv-hooks to suit system's iconv tastes */
     589        1046 :   tocode2 = mutt_ch_iconv_lookup(tocode1);
     590        1046 :   tocode2 = tocode2 ? tocode2 : tocode1;
     591        1046 :   fromcode2 = mutt_ch_iconv_lookup(fromcode1);
     592        1046 :   fromcode2 = fromcode2 ? fromcode2 : fromcode1;
     593             : 
     594             :   /* call system iconv with names it appreciates */
     595        1046 :   cd = iconv_open(tocode2, fromcode2);
     596        1046 :   if (cd != (iconv_t) -1)
     597        1038 :     return cd;
     598             : 
     599           8 :   return (iconv_t) -1;
     600             : }
     601             : 
     602             : /**
     603             :  * mutt_ch_iconv - Change the encoding of a string
     604             :  * @param[in]     cd           Iconv conversion descriptor
     605             :  * @param[in,out] inbuf        Buffer to convert
     606             :  * @param[in,out] inbytesleft  Length of buffer to convert
     607             :  * @param[in,out] outbuf       Buffer for the result
     608             :  * @param[in,out] outbytesleft Length of result buffer
     609             :  * @param[in]     inrepls      Input replacement characters
     610             :  * @param[in]     outrepl      Output replacement characters
     611             :  * @param[out]    iconverrno   Errno if iconv() fails, 0 if it succeeds
     612             :  * @retval num Characters converted
     613             :  *
     614             :  * Like iconv, but keeps going even when the input is invalid
     615             :  * If you're supplying inrepls, the source charset should be stateless;
     616             :  * if you're supplying an outrepl, the target charset should be.
     617             :  */
     618         972 : size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
     619             :                      char **outbuf, size_t *outbytesleft, const char **inrepls,
     620             :                      const char *outrepl, int *iconverrno)
     621             : {
     622         972 :   size_t rc = 0;
     623         972 :   const char *ib = *inbuf;
     624         972 :   size_t ibl = *inbytesleft;
     625         972 :   char *ob = *outbuf;
     626         972 :   size_t obl = *outbytesleft;
     627             : 
     628             :   while (true)
     629           0 :   {
     630         972 :     errno = 0;
     631         972 :     const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
     632         972 :     if (ret1 != (size_t) -1)
     633         972 :       rc += ret1;
     634         972 :     if (iconverrno)
     635         972 :       *iconverrno = errno;
     636             : 
     637         972 :     if (ibl && obl && (errno == EILSEQ))
     638             :     {
     639           0 :       if (inrepls)
     640             :       {
     641             :         /* Try replacing the input */
     642           0 :         const char **t = NULL;
     643           0 :         for (t = inrepls; *t; t++)
     644             :         {
     645           0 :           const char *ib1 = *t;
     646           0 :           size_t ibl1 = strlen(*t);
     647           0 :           char *ob1 = ob;
     648           0 :           size_t obl1 = obl;
     649           0 :           iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
     650           0 :           if (ibl1 == 0)
     651             :           {
     652           0 :             ib++;
     653           0 :             ibl--;
     654           0 :             ob = ob1;
     655           0 :             obl = obl1;
     656           0 :             rc++;
     657           0 :             break;
     658             :           }
     659             :         }
     660           0 :         if (*t)
     661           0 :           continue;
     662             :       }
     663             :       /* Replace the output */
     664           0 :       if (!outrepl)
     665           0 :         outrepl = "?";
     666           0 :       iconv(cd, NULL, NULL, &ob, &obl);
     667           0 :       if (obl)
     668             :       {
     669           0 :         int n = strlen(outrepl);
     670           0 :         if (n > obl)
     671             :         {
     672           0 :           outrepl = "?";
     673           0 :           n = 1;
     674             :         }
     675           0 :         memcpy(ob, outrepl, n);
     676           0 :         ib++;
     677           0 :         ibl--;
     678           0 :         ob += n;
     679           0 :         obl -= n;
     680           0 :         rc++;
     681           0 :         iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
     682           0 :         continue;
     683             :       }
     684             :     }
     685         972 :     *inbuf = ib;
     686         972 :     *inbytesleft = ibl;
     687         972 :     *outbuf = ob;
     688         972 :     *outbytesleft = obl;
     689         972 :     return rc;
     690             :   }
     691             : }
     692             : 
     693             : /**
     694             :  * mutt_ch_iconv_lookup - Look for a replacement character set
     695             :  * @param chs Character set to lookup
     696             :  * @retval ptr  Replacement character set (if a 'iconv-hook' matches)
     697             :  * @retval NULL No matching hook
     698             :  *
     699             :  * Look through all the 'iconv-hook's.
     700             :  * If one matches return the replacement character set.
     701             :  */
     702        2094 : const char *mutt_ch_iconv_lookup(const char *chs)
     703             : {
     704        2094 :   return lookup_charset(MUTT_LOOKUP_ICONV, chs);
     705             : }
     706             : 
     707             : /**
     708             :  * mutt_ch_check - Check whether a string can be converted between encodings
     709             :  * @param[in] s     String to check
     710             :  * @param[in] slen  Length of the string to check
     711             :  * @param[in] from  Current character set
     712             :  * @param[in] to    Target character set
     713             :  * @retval 0  Success
     714             :  * @retval -1 Error in iconv_open()
     715             :  * @retval >0 Errno as set by iconv()
     716             :  */
     717          20 : int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
     718             : {
     719          20 :   if (!s || !from || !to)
     720           6 :     return -1;
     721             : 
     722          14 :   int rc = 0;
     723          14 :   iconv_t cd = mutt_ch_iconv_open(to, from, 0);
     724          14 :   if (cd == (iconv_t) -1)
     725           2 :     return -1;
     726             : 
     727          12 :   size_t outlen = MB_LEN_MAX * slen;
     728          12 :   char *out = mutt_mem_malloc(outlen + 1);
     729          12 :   char *saved_out = out;
     730             : 
     731             :   const size_t convlen =
     732          12 :       iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
     733          12 :   if (convlen == -1)
     734           0 :     rc = errno;
     735             : 
     736          12 :   FREE(&saved_out);
     737          12 :   iconv_close(cd);
     738          12 :   return rc;
     739             : }
     740             : 
     741             : /**
     742             :  * mutt_ch_convert_string - Convert a string between encodings
     743             :  * @param[in,out] ps    String to convert
     744             :  * @param[in]     from  Current character set
     745             :  * @param[in]     to    Target character set
     746             :  * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
     747             :  * @retval 0      Success
     748             :  * @retval -1     Invalid arguments or failure to open an iconv channel
     749             :  * @retval errno  Failure in iconv conversion
     750             :  *
     751             :  * Parameter flags is given as-is to mutt_ch_iconv_open().
     752             :  * See there for its meaning and usage policy.
     753             :  */
     754         998 : int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
     755             : {
     756         998 :   if (!ps)
     757           2 :     return -1;
     758             : 
     759         996 :   char *s = *ps;
     760             : 
     761         996 :   if (!s || (*s == '\0'))
     762          10 :     return 0;
     763             : 
     764         986 :   if (!to || !from)
     765          14 :     return -1;
     766             : 
     767         972 :   const char *repls[] = { "\357\277\275", "?", 0 };
     768         972 :   int rc = 0;
     769             : 
     770         972 :   iconv_t cd = mutt_ch_iconv_open(to, from, flags);
     771         972 :   if (cd == (iconv_t) -1)
     772           0 :     return -1;
     773             : 
     774             :   size_t len;
     775         972 :   const char *ib = NULL;
     776         972 :   char *buf = NULL, *ob = NULL;
     777         972 :   size_t ibl, obl;
     778         972 :   const char **inrepls = NULL;
     779         972 :   const char *outrepl = NULL;
     780             : 
     781         972 :   if (mutt_ch_is_utf8(to))
     782         970 :     outrepl = "\357\277\275";
     783           2 :   else if (mutt_ch_is_utf8(from))
     784           2 :     inrepls = repls;
     785             :   else
     786           0 :     outrepl = "?";
     787             : 
     788         972 :   len = strlen(s);
     789         972 :   ib = s;
     790         972 :   ibl = len + 1;
     791         972 :   obl = MB_LEN_MAX * ibl;
     792         972 :   buf = mutt_mem_malloc(obl + 1);
     793         972 :   ob = buf;
     794             : 
     795         972 :   mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
     796         972 :   iconv_close(cd);
     797             : 
     798         972 :   *ob = '\0';
     799             : 
     800         972 :   FREE(ps);
     801         972 :   *ps = buf;
     802             : 
     803         972 :   mutt_str_adjust(ps);
     804         972 :   return rc;
     805             : }
     806             : 
     807             : /**
     808             :  * mutt_ch_check_charset - Does iconv understand a character set?
     809             :  * @param cs     Character set to check
     810             :  * @param strict Check strictly by using iconv
     811             :  * @retval true Character set is valid
     812             :  *
     813             :  * If `strict` is false, then finding a matching character set in
     814             :  * #PreferredMimeNames will be enough.
     815             :  * If `strict` is true, or the charset is not in #PreferredMimeNames, then
     816             :  * iconv() with be run.
     817             :  */
     818           6 : bool mutt_ch_check_charset(const char *cs, bool strict)
     819             : {
     820           6 :   if (!cs)
     821           2 :     return false;
     822             : 
     823           4 :   if (mutt_ch_is_utf8(cs))
     824           2 :     return true;
     825             : 
     826           2 :   if (!strict)
     827             :   {
     828           0 :     for (int i = 0; PreferredMimeNames[i].key; i++)
     829             :     {
     830           0 :       if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
     831           0 :           mutt_istr_equal(PreferredMimeNames[i].pref, cs))
     832             :       {
     833           0 :         return true;
     834             :       }
     835             :     }
     836             :   }
     837             : 
     838           2 :   iconv_t cd = mutt_ch_iconv_open(cs, cs, 0);
     839           2 :   if (cd != (iconv_t)(-1))
     840             :   {
     841           2 :     iconv_close(cd);
     842           2 :     return true;
     843             :   }
     844             : 
     845           0 :   return false;
     846             : }
     847             : 
     848             : /**
     849             :  * mutt_ch_fgetconv_open - Prepare a file for charset conversion
     850             :  * @param fp    FILE ptr to prepare
     851             :  * @param from  Current character set
     852             :  * @param to    Destination character set
     853             :  * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
     854             :  * @retval ptr fgetconv handle
     855             :  *
     856             :  * Parameter flags is given as-is to mutt_ch_iconv_open().
     857             :  */
     858           6 : struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
     859             : {
     860           6 :   struct FgetConv *fc = NULL;
     861           6 :   iconv_t cd = (iconv_t) -1;
     862             : 
     863           6 :   if (from && to)
     864           2 :     cd = mutt_ch_iconv_open(to, from, flags);
     865             : 
     866           6 :   if (cd != (iconv_t) -1)
     867             :   {
     868             :     static const char *repls[] = { "\357\277\275", "?", 0 };
     869             : 
     870           0 :     fc = mutt_mem_malloc(sizeof(struct FgetConv));
     871           0 :     fc->p = fc->bufo;
     872           0 :     fc->ob = fc->bufo;
     873           0 :     fc->ib = fc->bufi;
     874           0 :     fc->ibl = 0;
     875           0 :     fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
     876             :   }
     877             :   else
     878           6 :     fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
     879           6 :   fc->fp = fp;
     880           6 :   fc->cd = cd;
     881           6 :   return fc;
     882             : }
     883             : 
     884             : /**
     885             :  * mutt_ch_fgetconv_close - Close an fgetconv handle
     886             :  * @param[out] fc fgetconv handle
     887             :  */
     888          10 : void mutt_ch_fgetconv_close(struct FgetConv **fc)
     889             : {
     890          10 :   if (!fc || !*fc)
     891           4 :     return;
     892             : 
     893           6 :   if ((*fc)->cd != (iconv_t) -1)
     894           0 :     iconv_close((*fc)->cd);
     895           6 :   FREE(fc);
     896             : }
     897             : 
     898             : /**
     899             :  * mutt_ch_fgetconv - Convert a file's character set
     900             :  * @param fc FgetConv handle
     901             :  * @retval num Next character in the converted file
     902             :  * @retval EOF Error
     903             :  *
     904             :  * A file is read into a buffer and its character set is converted.
     905             :  * Each call to this function will return one converted character.
     906             :  * The buffer is refilled automatically when empty.
     907             :  */
     908           4 : int mutt_ch_fgetconv(struct FgetConv *fc)
     909             : {
     910           4 :   if (!fc)
     911           4 :     return EOF;
     912           0 :   if (fc->cd == (iconv_t) -1)
     913           0 :     return fgetc(fc->fp);
     914           0 :   if (!fc->p)
     915           0 :     return EOF;
     916           0 :   if (fc->p < fc->ob)
     917           0 :     return (unsigned char) *(fc->p)++;
     918             : 
     919             :   /* Try to convert some more */
     920           0 :   fc->p = fc->bufo;
     921           0 :   fc->ob = fc->bufo;
     922           0 :   if (fc->ibl)
     923             :   {
     924           0 :     size_t obl = sizeof(fc->bufo);
     925           0 :     iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
     926           0 :     if (fc->p < fc->ob)
     927           0 :       return (unsigned char) *(fc->p)++;
     928             :   }
     929             : 
     930             :   /* If we trusted iconv a bit more, we would at this point
     931             :    * ask why it had stopped converting ... */
     932             : 
     933             :   /* Try to read some more */
     934           0 :   if ((fc->ibl == sizeof(fc->bufi)) ||
     935           0 :       (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
     936             :   {
     937           0 :     fc->p = 0;
     938           0 :     return EOF;
     939             :   }
     940           0 :   if (fc->ibl)
     941           0 :     memcpy(fc->bufi, fc->ib, fc->ibl);
     942           0 :   fc->ib = fc->bufi;
     943           0 :   fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
     944             : 
     945             :   /* Try harder this time to convert some */
     946           0 :   if (fc->ibl)
     947             :   {
     948           0 :     size_t obl = sizeof(fc->bufo);
     949           0 :     mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
     950             :                   fc->inrepls, 0, NULL);
     951           0 :     if (fc->p < fc->ob)
     952           0 :       return (unsigned char) *(fc->p)++;
     953             :   }
     954             : 
     955             :   /* Either the file has finished or one of the buffers is too small */
     956           0 :   fc->p = 0;
     957           0 :   return EOF;
     958             : }
     959             : 
     960             : /**
     961             :  * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
     962             :  * @param buf    Buffer for result
     963             :  * @param buflen Length of buffer
     964             :  * @param fc     FgetConv handle
     965             :  * @retval ptr  Success, result buffer
     966             :  * @retval NULL Error
     967             :  *
     968             :  * Read a file into a buffer, converting the character set as it goes.
     969             :  */
     970           4 : char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
     971             : {
     972           4 :   if (!buf)
     973           2 :     return NULL;
     974             : 
     975             :   size_t r;
     976           2 :   for (r = 0; (r + 1) < buflen;)
     977             :   {
     978           2 :     const int c = mutt_ch_fgetconv(fc);
     979           2 :     if (c == EOF)
     980           2 :       break;
     981           0 :     buf[r++] = (char) c;
     982           0 :     if (c == '\n')
     983           0 :       break;
     984             :   }
     985           2 :   buf[r] = '\0';
     986             : 
     987           2 :   if (r > 0)
     988           0 :     return buf;
     989             : 
     990           2 :   return NULL;
     991             : }
     992             : 
     993             : /**
     994             :  * mutt_ch_set_charset - Update the records for a new character set
     995             :  * @param charset New character set
     996             :  *
     997             :  * Check if this character set is utf-8 and pick a suitable replacement
     998             :  * character for unprintable characters.
     999             :  *
    1000             :  * @note This calls `bind_textdomain_codeset()` which will affect future
    1001             :  * message translations.
    1002             :  */
    1003           3 : void mutt_ch_set_charset(const char *charset)
    1004             : {
    1005           3 :   char buf[256];
    1006             : 
    1007           3 :   mutt_ch_canonical_charset(buf, sizeof(buf), charset);
    1008             : 
    1009           3 :   if (mutt_ch_is_utf8(buf))
    1010             :   {
    1011           1 :     CharsetIsUtf8 = true;
    1012           1 :     ReplacementChar = 0xfffd; /* replacement character */
    1013             :   }
    1014             :   else
    1015             :   {
    1016           2 :     CharsetIsUtf8 = false;
    1017           2 :     ReplacementChar = '?';
    1018             :   }
    1019             : 
    1020             : #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
    1021           3 :   bind_textdomain_codeset(PACKAGE, buf);
    1022             : #endif
    1023           3 : }
    1024             : 
    1025             : /**
    1026             :  * mutt_ch_choose - Figure the best charset to encode a string
    1027             :  * @param[in] fromcode Original charset of the string
    1028             :  * @param[in] charsets Colon-separated list of potential charsets to use
    1029             :  * @param[in] u        String to encode
    1030             :  * @param[in] ulen     Length of the string to encode
    1031             :  * @param[out] d       If not NULL, point it to the converted string
    1032             :  * @param[out] dlen    If not NULL, point it to the length of the d string
    1033             :  * @retval ptr  Best performing charset
    1034             :  * @retval NULL None could be found
    1035             :  */
    1036          22 : char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
    1037             :                      size_t ulen, char **d, size_t *dlen)
    1038             : {
    1039          22 :   if (!fromcode)
    1040           2 :     return NULL;
    1041             : 
    1042          20 :   char *e = NULL, *tocode = NULL;
    1043          20 :   size_t elen = 0, bestn = 0;
    1044          20 :   const char *q = NULL;
    1045             : 
    1046          38 :   for (const char *p = charsets; p; p = q ? q + 1 : 0)
    1047             :   {
    1048          18 :     q = strchr(p, ':');
    1049             : 
    1050          18 :     size_t n = q ? q - p : strlen(p);
    1051          18 :     if (n == 0)
    1052           0 :       continue;
    1053             : 
    1054          18 :     char *t = mutt_mem_malloc(n + 1);
    1055          18 :     memcpy(t, p, n);
    1056          18 :     t[n] = '\0';
    1057             : 
    1058          18 :     char *s = mutt_strn_dup(u, ulen);
    1059          18 :     const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, 0) :
    1060          14 :                        mutt_ch_check(s, ulen, fromcode, t);
    1061          18 :     if (rc)
    1062             :     {
    1063           2 :       FREE(&t);
    1064           2 :       FREE(&s);
    1065           2 :       continue;
    1066             :     }
    1067          16 :     size_t slen = mutt_str_len(s);
    1068             : 
    1069          16 :     if (!tocode || (n < bestn))
    1070             :     {
    1071          16 :       bestn = n;
    1072          16 :       FREE(&tocode);
    1073          16 :       tocode = t;
    1074          16 :       if (d)
    1075             :       {
    1076           4 :         FREE(&e);
    1077           4 :         e = s;
    1078             :       }
    1079             :       else
    1080          12 :         FREE(&s);
    1081          16 :       elen = slen;
    1082             :     }
    1083             :     else
    1084             :     {
    1085           0 :       FREE(&t);
    1086           0 :       FREE(&s);
    1087             :     }
    1088             :   }
    1089          20 :   if (tocode)
    1090             :   {
    1091          16 :     if (d)
    1092           4 :       *d = e;
    1093          16 :     if (dlen)
    1094           2 :       *dlen = elen;
    1095             : 
    1096          16 :     char canonical_buf[1024];
    1097          16 :     mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
    1098          16 :     mutt_str_replace(&tocode, canonical_buf);
    1099             :   }
    1100          20 :   return tocode;
    1101             : }

Generated by: LCOV version 1.15