idna.c

Go to the documentation of this file.
00001 /* idna.c --- Convert to or from IDN strings.
00002  * Copyright (C) 2002, 2003, 2004  Simon Josefsson
00003  *
00004  * This file is part of GNU Libidn.
00005  *
00006  * GNU Libidn is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * GNU Libidn is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with GNU Libidn; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
00019  *
00020  */
00021 
00022 #ifdef HAVE_CONFIG_H
00023 # include "config.h"
00024 #endif
00025 
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <stringprep.h>
00029 #include <punycode.h>
00030 
00031 #include "idna.h"
00032 
/* Nonzero when code point C is one of the four characters that must be
   recognized as a label separator: U+002E (full stop), U+3002
   (ideographic full stop), U+FF0E (fullwidth full stop) or U+FF61
   (halfwidth ideographic full stop).  C is evaluated several times, so
   pass an expression without side effects. */
#define DOTP(c)                                 \
  ((c) == 0x002E                                \
   || (c) == 0x3002                             \
   || (c) == 0xFF0E                             \
   || (c) == 0xFF61)
00035 
00036 /* Core functions */
00037 
00069 int
00070 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
00071 {
00072   size_t len, outlen;
00073   uint32_t *src;                /* XXX don't need to copy data? */
00074   int rc;
00075 
00076   /*
00077    * ToASCII consists of the following steps:
00078    *
00079    * 1. If all code points in the sequence are in the ASCII range (0..7F)
00080    * then skip to step 3.
00081    */
00082 
00083   {
00084     size_t i;
00085     int inasciirange;
00086 
00087     inasciirange = 1;
00088     for (i = 0; i < inlen; i++)
00089       if (in[i] > 0x7F)
00090         inasciirange = 0;
00091     if (inasciirange)
00092       {
00093         src = malloc (sizeof (in[0]) * (inlen + 1));
00094         if (src == NULL)
00095           return IDNA_MALLOC_ERROR;
00096 
00097         memcpy (src, in, sizeof (in[0]) * inlen);
00098         src[inlen] = 0;
00099 
00100         goto step3;
00101       }
00102   }
00103 
00104   /*
00105    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
00106    * an error. The AllowUnassigned flag is used in [NAMEPREP].
00107    */
00108 
00109   {
00110     char *p;
00111 
00112     p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
00113     if (p == NULL)
00114       return IDNA_MALLOC_ERROR;
00115 
00116     len = strlen (p);
00117     do
00118       {
00119         char *newp;
00120 
00121         len = 2 * len + 10;     /* XXX better guess? */
00122         newp = realloc (p, len);
00123         if (newp == NULL)
00124           {
00125             free (p);
00126             return IDNA_MALLOC_ERROR;
00127           }
00128         p = newp;
00129 
00130         if (flags & IDNA_ALLOW_UNASSIGNED)
00131           rc = stringprep_nameprep (p, len);
00132         else
00133           rc = stringprep_nameprep_no_unassigned (p, len);
00134       }
00135     while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00136 
00137     if (rc != STRINGPREP_OK)
00138       {
00139         free (p);
00140         return IDNA_STRINGPREP_ERROR;
00141       }
00142 
00143     src = stringprep_utf8_to_ucs4 (p, -1, NULL);
00144 
00145     free (p);
00146   }
00147 
00148 step3:
00149   /*
00150    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
00151    *
00152    * (a) Verify the absence of non-LDH ASCII code points; that is,
00153    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
00154    *
00155    * (b) Verify the absence of leading and trailing hyphen-minus;
00156    * that is, the absence of U+002D at the beginning and end of
00157    * the sequence.
00158    */
00159 
00160   if (flags & IDNA_USE_STD3_ASCII_RULES)
00161     {
00162       size_t i;
00163 
00164       for (i = 0; src[i]; i++)
00165         if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
00166             (src[i] >= 0x3A && src[i] <= 0x40) ||
00167             (src[i] >= 0x5B && src[i] <= 0x60) ||
00168             (src[i] >= 0x7B && src[i] <= 0x7F))
00169           {
00170             free (src);
00171             return IDNA_CONTAINS_NON_LDH;
00172           }
00173 
00174       if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
00175         {
00176           free (src);
00177           return IDNA_CONTAINS_MINUS;
00178         }
00179     }
00180 
00181   /*
00182    * 4. If all code points in the sequence are in the ASCII range
00183    * (0..7F), then skip to step 8.
00184    */
00185 
00186   {
00187     size_t i;
00188     int inasciirange;
00189 
00190     inasciirange = 1;
00191     for (i = 0; src[i]; i++)
00192       {
00193         if (src[i] > 0x7F)
00194           inasciirange = 0;
00195         /* copy string to output buffer if we are about to skip to step8 */
00196         if (i < 64)
00197           out[i] = src[i];
00198       }
00199     if (i < 64)
00200       out[i] = '\0';
00201     if (inasciirange)
00202       goto step8;
00203   }
00204 
00205   /*
00206    * 5. Verify that the sequence does NOT begin with the ACE prefix.
00207    *
00208    */
00209 
00210   {
00211     size_t i;
00212     int match;
00213 
00214     match = 1;
00215     for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
00216       if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
00217         match = 0;
00218     if (match)
00219       {
00220         free (src);
00221         return IDNA_CONTAINS_ACE_PREFIX;
00222       }
00223   }
00224 
00225   /*
00226    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
00227    * and fail if there is an error.
00228    */
00229   for (len = 0; src[len]; len++)
00230     ;
00231   src[len] = '\0';
00232   outlen = 63 - strlen (IDNA_ACE_PREFIX);
00233   rc = punycode_encode (len, src, NULL,
00234                         &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
00235   if (rc != PUNYCODE_SUCCESS)
00236     {
00237       free (src);
00238       return IDNA_PUNYCODE_ERROR;
00239     }
00240   out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
00241 
00242   /*
00243    * 7. Prepend the ACE prefix.
00244    */
00245 
00246   memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
00247 
00248   /*
00249    * 8. Verify that the number of code points is in the range 1 to 63
00250    * inclusive (0 is excluded).
00251    */
00252 
00253 step8:
00254   free (src);
00255   if (strlen (out) < 1 || strlen (out) > 63)
00256     return IDNA_INVALID_LENGTH;
00257 
00258   return IDNA_SUCCESS;
00259 }
00260 
00261 /* ToUnicode().  May realloc() utf8in.  Will free utf8in unconditionally. */
00262 static int
00263 idna_to_unicode_internal (char *utf8in,
00264                           uint32_t * out, size_t * outlen, int flags)
00265 {
00266   int rc;
00267   char tmpout[64];
00268   size_t utf8len = strlen (utf8in) + 1;
00269   size_t addlen = 0;
00270 
00271   /*
00272    * ToUnicode consists of the following steps:
00273    *
00274    * 1. If the sequence contains any code points outside the ASCII range
00275    * (0..7F) then proceed to step 2, otherwise skip to step 3.
00276    */
00277 
00278   {
00279     size_t i;
00280     int inasciirange;
00281 
00282     inasciirange = 1;
00283     for (i = 0; utf8in[i]; i++)
00284       if (utf8in[i] & ~0x7F)
00285         inasciirange = 0;
00286     if (inasciirange)
00287       goto step3;
00288   }
00289 
00290   /*
00291    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
00292    * error. (If step 3 of ToASCII is also performed here, it will not
00293    * affect the overall behavior of ToUnicode, but it is not
00294    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
00295    */
00296   do
00297     {
00298       char *newp = realloc (utf8in, utf8len + addlen);
00299       if (newp == NULL)
00300         {
00301           free (utf8in);
00302           return IDNA_MALLOC_ERROR;
00303         }
00304       utf8in = newp;
00305       if (flags & IDNA_ALLOW_UNASSIGNED)
00306         rc = stringprep_nameprep (utf8in, utf8len + addlen);
00307       else
00308         rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
00309       addlen += 1;
00310     }
00311   while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00312 
00313   if (rc != STRINGPREP_OK)
00314     {
00315       free (utf8in);
00316       return IDNA_STRINGPREP_ERROR;
00317     }
00318 
00319   /* 3. Verify that the sequence begins with the ACE prefix, and save a
00320    * copy of the sequence.
00321    */
00322 
00323 step3:
00324   if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
00325     {
00326       free (utf8in);
00327       return IDNA_NO_ACE_PREFIX;
00328     }
00329 
00330   /* 4. Remove the ACE prefix.
00331    */
00332 
00333   memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
00334            strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
00335 
00336   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
00337    * and fail if there is an error. Save a copy of the result of
00338    * this step.
00339    */
00340 
00341   (*outlen)--;                  /* reserve one for the zero */
00342 
00343   rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
00344   if (rc != PUNYCODE_SUCCESS)
00345     {
00346       free (utf8in);
00347       return IDNA_PUNYCODE_ERROR;
00348     }
00349 
00350   out[*outlen] = 0;             /* add zero */
00351 
00352   /* 6. Apply ToASCII.
00353    */
00354 
00355   rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
00356   if (rc != IDNA_SUCCESS)
00357     {
00358       free (utf8in);
00359       return rc;
00360     }
00361 
00362   /* 7. Verify that the result of step 6 matches the saved copy from
00363    * step 3, using a case-insensitive ASCII comparison.
00364    */
00365 
00366   if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
00367     {
00368       free (utf8in);
00369       return IDNA_ROUNDTRIP_VERIFY_ERROR;
00370     }
00371 
00372   /* 8. Return the saved copy from step 5.
00373    */
00374 
00375   free (utf8in);
00376   return IDNA_SUCCESS;
00377 }
00378 
00414 int
00415 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
00416                      uint32_t * out, size_t * outlen, int flags)
00417 {
00418   int rc;
00419   size_t outlensave = *outlen;
00420   char *p;
00421 
00422   p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
00423   if (p == NULL)
00424     return IDNA_MALLOC_ERROR;
00425 
00426   rc = idna_to_unicode_internal (p, out, outlen, flags);
00427   if (rc != IDNA_SUCCESS)
00428     {
00429       memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
00430                                          inlen : outlensave));
00431       *outlen = inlen;
00432     }
00433 
00434   /* p is freed in idna_to_unicode_internal.  */
00435 
00436   return rc;
00437 }
00438 
00439 /* Wrappers that handle several labels */
00440 
00454 int
00455 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
00456 {
00457   const uint32_t *start = input;
00458   const uint32_t *end = input;
00459   char buf[64];
00460   char *out = NULL;
00461   int rc;
00462 
00463   /* 1) Whenever dots are used as label separators, the following
00464      characters MUST be recognized as dots: U+002E (full stop),
00465      U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
00466      U+FF61 (halfwidth ideographic full stop). */
00467 
00468   if (input[0] == 0)
00469     {
00470       /* Handle implicit zero-length root label. */
00471       *output = malloc (1);
00472       if (!*output)
00473         return IDNA_MALLOC_ERROR;
00474       strcpy (*output, "");
00475       return IDNA_SUCCESS;
00476     }
00477 
00478   if (DOTP (input[0]) && input[1] == 0)
00479     {
00480       /* Handle explicit zero-length root label. */
00481       *output = malloc (2);
00482       if (!*output)
00483         return IDNA_MALLOC_ERROR;
00484       strcpy (*output, ".");
00485       return IDNA_SUCCESS;
00486     }
00487 
00488   *output = NULL;
00489   do
00490     {
00491       end = start;
00492 
00493       for (; *end && !DOTP (*end); end++)
00494         ;
00495 
00496       if (*end == '\0' && start == end)
00497         {
00498           /* Handle explicit zero-length root label. */
00499           buf[0] = '\0';
00500         }
00501       else
00502         {
00503           rc = idna_to_ascii_4i (start, end - start, buf, flags);
00504           if (rc != IDNA_SUCCESS)
00505             return rc;
00506         }
00507 
00508       if (out)
00509         {
00510           char *newp = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
00511           if (!newp)
00512             {
00513               free (out);
00514               return IDNA_MALLOC_ERROR;
00515             }
00516           out = newp;
00517           strcat (out, ".");
00518           strcat (out, buf);
00519         }
00520       else
00521         {
00522           out = (char *) malloc (strlen (buf) + 1);
00523           if (!out)
00524             return IDNA_MALLOC_ERROR;
00525           strcpy (out, buf);
00526         }
00527 
00528       start = end + 1;
00529     }
00530   while (*end);
00531 
00532   *output = out;
00533 
00534   return IDNA_SUCCESS;
00535 }
00536 
00550 int
00551 idna_to_ascii_8z (const char *input, char **output, int flags)
00552 {
00553   uint32_t *ucs4;
00554   size_t ucs4len;
00555   int rc;
00556 
00557   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00558   if (!ucs4)
00559     return IDNA_ICONV_ERROR;
00560 
00561   rc = idna_to_ascii_4z (ucs4, output, flags);
00562 
00563   free (ucs4);
00564 
00565   return rc;
00566 
00567 }
00568 
00583 int
00584 idna_to_ascii_lz (const char *input, char **output, int flags)
00585 {
00586   char *utf8;
00587   int rc;
00588 
00589   utf8 = stringprep_locale_to_utf8 (input);
00590   if (!utf8)
00591     return IDNA_ICONV_ERROR;
00592 
00593   rc = idna_to_ascii_8z (utf8, output, flags);
00594 
00595   free (utf8);
00596 
00597   return rc;
00598 }
00599 
00614 int
00615 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
00616 {
00617   const uint32_t *start = input;
00618   const uint32_t *end = input;
00619   uint32_t *buf;
00620   size_t buflen;
00621   uint32_t *out = NULL;
00622   size_t outlen = 0;
00623   int rc;
00624 
00625   *output = NULL;
00626 
00627   do
00628     {
00629       end = start;
00630 
00631       for (; *end && !DOTP (*end); end++)
00632         ;
00633 
00634       buflen = end - start;
00635       buf = malloc (sizeof (buf[0]) * (buflen + 1));
00636       if (!buf)
00637         return IDNA_MALLOC_ERROR;
00638 
00639       rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
00640       /* don't check rc as per specification! */
00641 
00642       if (out)
00643         {
00644           uint32_t *newp = realloc (out,
00645                                     sizeof (out[0])
00646                                     * (outlen + 1 + buflen + 1));
00647           if (!newp)
00648             {
00649               free (buf);
00650               free (out);
00651               return IDNA_MALLOC_ERROR;
00652             }
00653           out = newp;
00654           out[outlen++] = 0x002E;       /* '.' (full stop) */
00655           memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
00656           outlen += buflen;
00657           out[outlen] = 0x0;
00658           free (buf);
00659         }
00660       else
00661         {
00662           out = buf;
00663           outlen = buflen;
00664           out[outlen] = 0x0;
00665         }
00666 
00667       start = end + 1;
00668     }
00669   while (*end);
00670 
00671   *output = out;
00672 
00673   return IDNA_SUCCESS;
00674 }
00675 
00690 int
00691 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
00692 {
00693   uint32_t *ucs4;
00694   size_t ucs4len;
00695   int rc;
00696 
00697   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00698   if (!ucs4)
00699     return IDNA_ICONV_ERROR;
00700 
00701   rc = idna_to_unicode_4z4z (ucs4, output, flags);
00702   free (ucs4);
00703 
00704   return rc;
00705 }
00706 
00721 int
00722 idna_to_unicode_8z8z (const char *input, char **output, int flags)
00723 {
00724   uint32_t *ucs4;
00725   int rc;
00726 
00727   rc = idna_to_unicode_8z4z (input, &ucs4, flags);
00728   *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
00729   free (ucs4);
00730 
00731   if (!*output)
00732     return IDNA_ICONV_ERROR;
00733 
00734   return rc;
00735 }
00736 
00752 int
00753 idna_to_unicode_8zlz (const char *input, char **output, int flags)
00754 {
00755   char *utf8;
00756   int rc;
00757 
00758   rc = idna_to_unicode_8z8z (input, &utf8, flags);
00759   *output = stringprep_utf8_to_locale (utf8);
00760   free (utf8);
00761 
00762   if (!*output)
00763     return IDNA_ICONV_ERROR;
00764 
00765   return rc;
00766 }
00767 
00784 int
00785 idna_to_unicode_lzlz (const char *input, char **output, int flags)
00786 {
00787   char *utf8;
00788   int rc;
00789 
00790   utf8 = stringprep_locale_to_utf8 (input);
00791   if (!utf8)
00792     return IDNA_ICONV_ERROR;
00793 
00794   rc = idna_to_unicode_8zlz (utf8, output, flags);
00795   free (utf8);
00796 
00797   return rc;
00798 }
00799 

Generated on Wed Sep 13 10:20:31 2006 for libidn by  doxygen 1.4.7