libidn: punycode.c Source File

00001 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
00002  * Copyright (C) 2002, 2003, 2004  Simon Josefsson
00003  *
00004  * This file is part of GNU Libidn.
00005  *
00006  * GNU Libidn is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * GNU Libidn is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with GNU Libidn; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
00019  *
00020  */
00021 
00022 /*
00023  * This file is derived from RFC 3492bis written by Adam M. Costello.
00024  *
00025  * Disclaimer and license: Regarding this entire document or any
00026  * portion of it (including the pseudocode and C code), the author
00027  * makes no guarantees and is not responsible for any damage resulting
00028  * from its use.  The author grants irrevocable permission to anyone
00029  * to use, modify, and distribute it in any way that does not diminish
00030  * the rights of anyone else to use, modify, and distribute it,
00031  * provided that redistributed derivative works do not contain
00032  * misleading author or version information.  Derivative works need
00033  * not be licensed under similar terms.
00034  *
00035  * Copyright (C) The Internet Society (2003).  All Rights Reserved.
00036  *
00037  * This document and translations of it may be copied and furnished to
00038  * others, and derivative works that comment on or otherwise explain it
00039  * or assist in its implementation may be prepared, copied, published
00040  * and distributed, in whole or in part, without restriction of any
00041  * kind, provided that the above copyright notice and this paragraph are
00042  * included on all such copies and derivative works.  However, this
00043  * document itself may not be modified in any way, such as by removing
00044  * the copyright notice or references to the Internet Society or other
00045  * Internet organizations, except as needed for the purpose of
00046  * developing Internet standards in which case the procedures for
00047  * copyrights defined in the Internet Standards process must be
00048  * followed, or as required to translate it into languages other than
00049  * English.
00050  *
00051  * The limited permissions granted above are perpetual and will not be
00052  * revoked by the Internet Society or its successors or assigns.
00053  *
00054  * This document and the information contained herein is provided on an
00055  * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
00056  * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
00057  * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
00058  * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
00059  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
00060  */
00061 
00062 #include <string.h>
00063 
00064 #include "punycode.h"
00065 
00066 /*** Bootstring parameters for Punycode ***/
00067 
00068 enum
00069 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
00070   initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
00071 };
00072 
00073 /* basic(cp) tests whether cp is a basic code point: */
00074 #define basic(cp) ((punycode_uint)(cp) < 0x80)
00075 
00076 /* delim(cp) tests whether cp is a delimiter: */
00077 #define delim(cp) ((cp) == delimiter)
00078 
00079 /* decode_digit(cp) returns the numeric value of a basic code */
00080 /* point (for use in representing integers) in the range 0 to */
00081 /* base-1, or base if cp does not represent a value.          */
00082 
00083 static punycode_uint
00084 decode_digit (punycode_uint cp)
00085 {
00086   return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
00087     cp - 97 < 26 ? cp - 97 : base;
00088 }
00089 
00090 /* encode_digit(d,flag) returns the basic code point whose value      */
00091 /* (when used for representing integers) is d, which needs to be in   */
00092 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
00093 /* nonzero, in which case the uppercase form is used.  The behavior   */
00094 /* is undefined if flag is nonzero and digit d has no uppercase form. */
00095 
00096 static char
00097 encode_digit (punycode_uint d, int flag)
00098 {
00099   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
00100   /*  0..25 map to ASCII a..z or A..Z */
00101   /* 26..35 map to ASCII 0..9         */
00102 }
00103 
00104 /* flagged(bcp) tests whether a basic code point is flagged */
00105 /* (uppercase).  The behavior is undefined if bcp is not a  */
00106 /* basic code point.                                        */
00107 
00108 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
00109 
00110 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
00111 /* if flag is zero, uppercase if flag is nonzero, and returns    */
00112 /* the resulting code point.  The code point is unchanged if it  */
00113 /* is caseless.  The behavior is undefined if bcp is not a basic */
00114 /* code point.                                                   */
00115 
00116 static char
00117 encode_basic (punycode_uint bcp, int flag)
00118 {
00119   bcp -= (bcp - 97 < 26) << 5;
00120   return bcp + ((!flag && (bcp - 65 < 26)) << 5);
00121 }
00122 
00123 /*** Platform-specific constants ***/
00124 
00125 /* maxint is the maximum value of a punycode_uint variable: */
00126 static const punycode_uint maxint = -1;
00127 /* Because maxint is unsigned, -1 becomes the maximum value. */
00128 
00129 /*** Bias adaptation function ***/
00130 
00131 static punycode_uint
00132 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
00133 {
00134   punycode_uint k;
00135 
00136   delta = firsttime ? delta / damp : delta >> 1;
00137   /* delta >> 1 is a faster way of doing delta / 2 */
00138   delta += delta / numpoints;
00139 
00140   for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
00141     {
00142       delta /= base - tmin;
00143     }
00144 
00145   return k + (base - tmin + 1) * delta / (delta + skew);
00146 }
00147 
00148 /*** Main encode function ***/
00149 
00187 int
00188 punycode_encode (size_t input_length,
00189                  const punycode_uint input[],
00190                  const unsigned char case_flags[],
00191                  size_t * output_length, char output[])
00192 {
00193   punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
00194   size_t out, max_out;
00195 
00196   /* The Punycode spec assumes that the input length is the same type */
00197   /* of integer as a code point, so we need to convert the size_t to  */
00198   /* a punycode_uint, which could overflow.                           */
00199 
00200   if (input_length > maxint)
00201     return punycode_overflow;
00202   input_len = (punycode_uint) input_length;
00203 
00204   /* Initialize the state: */
00205 
00206   n = initial_n;
00207   delta = 0;
00208   out = 0;
00209   max_out = *output_length;
00210   bias = initial_bias;
00211 
00212   /* Handle the basic code points: */
00213 
00214   for (j = 0; j < input_len; ++j)
00215     {
00216       if (basic (input[j]))
00217         {
00218           if (max_out - out < 2)
00219             return punycode_big_output;
00220           output[out++] = case_flags ?
00221             encode_basic (input[j], case_flags[j]) : (char) input[j];
00222         }
00223       /* else if (input[j] < n) return punycode_bad_input; */
00224       /* (not needed for Punycode with unsigned code points) */
00225     }
00226 
00227   h = b = (punycode_uint) out;
00228   /* cannot overflow because out <= input_len <= maxint */
00229 
00230   /* h is the number of code points that have been handled, b is the  */
00231   /* number of basic code points, and out is the number of ASCII code */
00232   /* points that have been output.                                    */
00233 
00234   if (b > 0)
00235     output[out++] = delimiter;
00236 
00237   /* Main encoding loop: */
00238 
00239   while (h < input_len)
00240     {
00241       /* All non-basic code points < n have been     */
00242       /* handled already.  Find the next larger one: */
00243 
00244       for (m = maxint, j = 0; j < input_len; ++j)
00245         {
00246           /* if (basic(input[j])) continue; */
00247           /* (not needed for Punycode) */
00248           if (input[j] >= n && input[j] < m)
00249             m = input[j];
00250         }
00251 
00252       /* Increase delta enough to advance the decoder's    */
00253       /* <n,i> state to <m,0>, but guard against overflow: */
00254 
00255       if (m - n > (maxint - delta) / (h + 1))
00256         return punycode_overflow;
00257       delta += (m - n) * (h + 1);
00258       n = m;
00259 
00260       for (j = 0; j < input_len; ++j)
00261         {
00262           /* Punycode does not need to check whether input[j] is basic: */
00263           if (input[j] < n /* || basic(input[j]) */ )
00264             {
00265               if (++delta == 0)
00266                 return punycode_overflow;
00267             }
00268 
00269           if (input[j] == n)
00270             {
00271               /* Represent delta as a generalized variable-length integer: */
00272 
00273               for (q = delta, k = base;; k += base)
00274                 {
00275                   if (out >= max_out)
00276                     return punycode_big_output;
00277                   t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
00278                     k >= bias + tmax ? tmax : k - bias;
00279                   if (q < t)
00280                     break;
00281                   output[out++] = encode_digit (t + (q - t) % (base - t), 0);
00282                   q = (q - t) / (base - t);
00283                 }
00284 
00285               output[out++] = encode_digit (q, case_flags && case_flags[j]);
00286               bias = adapt (delta, h + 1, h == b);
00287               delta = 0;
00288               ++h;
00289             }
00290         }
00291 
00292       ++delta, ++n;
00293     }
00294 
00295   *output_length = out;
00296   return punycode_success;
00297 }
00298 
00299 /*** Main decode function ***/
00300 
00336 int
00337 punycode_decode (size_t input_length,
00338                  const char input[],
00339                  size_t * output_length,
00340                  punycode_uint output[], unsigned char case_flags[])
00341 {
00342   punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
00343   size_t b, j, in;
00344 
00345   /* Initialize the state: */
00346 
00347   n = initial_n;
00348   out = i = 0;
00349   max_out = *output_length > maxint ? maxint
00350     : (punycode_uint) * output_length;
00351   bias = initial_bias;
00352 
00353   /* Handle the basic code points:  Let b be the number of input code */
00354   /* points before the last delimiter, or 0 if there is none, then    */
00355   /* copy the first b code points to the output.                      */
00356 
00357   for (b = j = 0; j < input_length; ++j)
00358     if (delim (input[j]))
00359       b = j;
00360   if (b > max_out)
00361     return punycode_big_output;
00362 
00363   for (j = 0; j < b; ++j)
00364     {
00365       if (case_flags)
00366         case_flags[out] = flagged (input[j]);
00367       if (!basic (input[j]))
00368         return punycode_bad_input;
00369       output[out++] = input[j];
00370     }
00371 
00372   /* Main decoding loop:  Start just after the last delimiter if any  */
00373   /* basic code points were copied; start at the beginning otherwise. */
00374 
00375   for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
00376     {
00377 
00378       /* in is the index of the next ASCII code point to be consumed, */
00379       /* and out is the number of code points in the output array.    */
00380 
00381       /* Decode a generalized variable-length integer into delta,  */
00382       /* which gets added to i.  The overflow checking is easier   */
00383       /* if we increase i as we go, then subtract off its starting */
00384       /* value at the end to obtain delta.                         */
00385 
00386       for (oldi = i, w = 1, k = base;; k += base)
00387         {
00388           if (in >= input_length)
00389             return punycode_bad_input;
00390           digit = decode_digit (input[in++]);
00391           if (digit >= base)
00392             return punycode_bad_input;
00393           if (digit > (maxint - i) / w)
00394             return punycode_overflow;
00395           i += digit * w;
00396           t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
00397             k >= bias + tmax ? tmax : k - bias;
00398           if (digit < t)
00399             break;
00400           if (w > maxint / (base - t))
00401             return punycode_overflow;
00402           w *= (base - t);
00403         }
00404 
00405       bias = adapt (i - oldi, out + 1, oldi == 0);
00406 
00407       /* i was supposed to wrap around from out+1 to 0,   */
00408       /* incrementing n each time, so we'll fix that now: */
00409 
00410       if (i / (out + 1) > maxint - n)
00411         return punycode_overflow;
00412       n += i / (out + 1);
00413       i %= (out + 1);
00414 
00415       /* Insert n at position i of the output: */
00416 
00417       /* not needed for Punycode: */
00418       /* if (basic(n)) return punycode_invalid_input; */
00419       if (out >= max_out)
00420         return punycode_big_output;
00421 
00422       if (case_flags)
00423         {
00424           memmove (case_flags + i + 1, case_flags + i, out - i);
00425           /* Case of last ASCII code point determines case flag: */
00426           case_flags[i] = flagged (input[in - 1]);
00427         }
00428 
00429       memmove (output + i + 1, output + i, (out - i) * sizeof *output);
00430       output[i++] = n;
00431     }
00432 
00433   *output_length = (size_t) out;
00434   /* cannot overflow because out <= old value of *output_length */
00435   return punycode_success;
00436 }
00437