// rfcdecoder.cc
/**********************************************************************
 *
 *   rfcdecoder.cc  - handler for various rfc/mime encodings
 *   Copyright (C) 2000 s.carstens@gmx.de
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 *   Send comments and bug fixes to s.carstens@gmx.de
 *
 *********************************************************************/
#include "rfcdecoder.h"

#include <ctype.h>
#include <sys/types.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <tqtextcodec.h>
#include <tqbuffer.h>
#include <tqregexp.h>
#include <kmdcodec.h>

// This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
00037 // adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000 00038 00039 static unsigned char base64chars[] = 00040 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; 00041 #define UNDEFINED 64 00042 #define MAXLINE 76 00043 00044 /* UTF16 definitions */ 00045 #define UTF16MASK 0x03FFUL 00046 #define UTF16SHIFT 10 00047 #define UTF16BASE 0x10000UL 00048 #define UTF16HIGHSTART 0xD800UL 00049 #define UTF16HIGHEND 0xDBFFUL 00050 #define UTF16LOSTART 0xDC00UL 00051 #define UTF16LOEND 0xDFFFUL 00052 00053 /* Convert an IMAP mailbox to a Unicode path 00054 */ 00055 TQString rfcDecoder::fromIMAP (const TQString & inSrc) 00056 { 00057 unsigned char c, i, bitcount; 00058 unsigned long ucs4, utf16, bitbuf; 00059 unsigned char base64[256], utf8[6]; 00060 unsigned long srcPtr = 0; 00061 TQCString dst; 00062 TQCString src = inSrc.ascii (); 00063 uint srcLen = inSrc.length(); 00064 00065 /* initialize modified base64 decoding table */ 00066 memset (base64, UNDEFINED, sizeof (base64)); 00067 for (i = 0; i < sizeof (base64chars); ++i) 00068 { 00069 base64[(int)base64chars[i]] = i; 00070 } 00071 00072 /* loop until end of string */ 00073 while (srcPtr < srcLen) 00074 { 00075 c = src[srcPtr++]; 00076 /* deal with literal characters and &- */ 00077 if (c != '&' || src[srcPtr] == '-') 00078 { 00079 /* encode literally */ 00080 dst += c; 00081 /* skip over the '-' if this is an &- sequence */ 00082 if (c == '&') 00083 srcPtr++; 00084 } 00085 else 00086 { 00087 /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */ 00088 bitbuf = 0; 00089 bitcount = 0; 00090 ucs4 = 0; 00091 while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED) 00092 { 00093 ++srcPtr; 00094 bitbuf = (bitbuf << 6) | c; 00095 bitcount += 6; 00096 /* enough bits for a UTF-16 character? */ 00097 if (bitcount >= 16) 00098 { 00099 bitcount -= 16; 00100 utf16 = (bitcount ? 
bitbuf >> bitcount : bitbuf) & 0xffff; 00101 /* convert UTF16 to UCS4 */ 00102 if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) 00103 { 00104 ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT; 00105 continue; 00106 } 00107 else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) 00108 { 00109 ucs4 += utf16 - UTF16LOSTART + UTF16BASE; 00110 } 00111 else 00112 { 00113 ucs4 = utf16; 00114 } 00115 /* convert UTF-16 range of UCS4 to UTF-8 */ 00116 if (ucs4 <= 0x7fUL) 00117 { 00118 utf8[0] = ucs4; 00119 i = 1; 00120 } 00121 else if (ucs4 <= 0x7ffUL) 00122 { 00123 utf8[0] = 0xc0 | (ucs4 >> 6); 00124 utf8[1] = 0x80 | (ucs4 & 0x3f); 00125 i = 2; 00126 } 00127 else if (ucs4 <= 0xffffUL) 00128 { 00129 utf8[0] = 0xe0 | (ucs4 >> 12); 00130 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); 00131 utf8[2] = 0x80 | (ucs4 & 0x3f); 00132 i = 3; 00133 } 00134 else 00135 { 00136 utf8[0] = 0xf0 | (ucs4 >> 18); 00137 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); 00138 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); 00139 utf8[3] = 0x80 | (ucs4 & 0x3f); 00140 i = 4; 00141 } 00142 /* copy it */ 00143 for (c = 0; c < i; ++c) 00144 { 00145 dst += utf8[c]; 00146 } 00147 } 00148 } 00149 /* skip over trailing '-' in modified UTF-7 encoding */ 00150 if (src[srcPtr] == '-') 00151 ++srcPtr; 00152 } 00153 } 00154 return TQString::fromUtf8 (dst.data ()); 00155 } 00156 00157 /* replace " with \" and \ with \\ " and \ characters */ 00158 TQString rfcDecoder::quoteIMAP(const TQString &src) 00159 { 00160 uint len = src.length(); 00161 TQString result; 00162 result.reserve(2 * len); 00163 for (unsigned int i = 0; i < len; i++) 00164 { 00165 if (src[i] == '"' || src[i] == '\\') 00166 result += '\\'; 00167 result += src[i]; 00168 } 00169 //result.squeeze(); - unnecessary and slow 00170 return result; 00171 } 00172 00173 /* Convert Unicode path to modified UTF-7 IMAP mailbox 00174 */ 00175 TQString rfcDecoder::toIMAP (const TQString & inSrc) 00176 { 00177 unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag; 00178 
unsigned long ucs4, bitbuf; 00179 TQCString src = inSrc.utf8 (); 00180 TQString dst; 00181 00182 ulong srcPtr = 0; 00183 utf7mode = 0; 00184 utf8total = 0; 00185 bitstogo = 0; 00186 utf8pos = 0; 00187 bitbuf = 0; 00188 ucs4 = 0; 00189 while (srcPtr < src.length ()) 00190 { 00191 c = (unsigned char) src[srcPtr++]; 00192 /* normal character? */ 00193 if (c >= ' ' && c <= '~') 00194 { 00195 /* switch out of UTF-7 mode */ 00196 if (utf7mode) 00197 { 00198 if (bitstogo) 00199 { 00200 dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; 00201 bitstogo = 0; 00202 } 00203 dst += '-'; 00204 utf7mode = 0; 00205 } 00206 dst += c; 00207 /* encode '&' as '&-' */ 00208 if (c == '&') 00209 { 00210 dst += '-'; 00211 } 00212 continue; 00213 } 00214 /* switch to UTF-7 mode */ 00215 if (!utf7mode) 00216 { 00217 dst += '&'; 00218 utf7mode = 1; 00219 } 00220 /* Encode US-ASCII characters as themselves */ 00221 if (c < 0x80) 00222 { 00223 ucs4 = c; 00224 utf8total = 1; 00225 } 00226 else if (utf8total) 00227 { 00228 /* save UTF8 bits into UCS4 */ 00229 ucs4 = (ucs4 << 6) | (c & 0x3FUL); 00230 if (++utf8pos < utf8total) 00231 { 00232 continue; 00233 } 00234 } 00235 else 00236 { 00237 utf8pos = 1; 00238 if (c < 0xE0) 00239 { 00240 utf8total = 2; 00241 ucs4 = c & 0x1F; 00242 } 00243 else if (c < 0xF0) 00244 { 00245 utf8total = 3; 00246 ucs4 = c & 0x0F; 00247 } 00248 else 00249 { 00250 /* NOTE: can't convert UTF8 sequences longer than 4 */ 00251 utf8total = 4; 00252 ucs4 = c & 0x03; 00253 } 00254 continue; 00255 } 00256 /* loop to split ucs4 into two utf16 chars if necessary */ 00257 utf8total = 0; 00258 do 00259 { 00260 if (ucs4 >= UTF16BASE) 00261 { 00262 ucs4 -= UTF16BASE; 00263 bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART); 00264 ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART; 00265 utf16flag = 1; 00266 } 00267 else 00268 { 00269 bitbuf = (bitbuf << 16) | ucs4; 00270 utf16flag = 0; 00271 } 00272 bitstogo += 16; 00273 /* spew out base64 */ 00274 while (bitstogo >= 6) 
00275 { 00276 bitstogo -= 6; 00277 dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F]; 00278 } 00279 } 00280 while (utf16flag); 00281 } 00282 /* if in UTF-7 mode, finish in ASCII */ 00283 if (utf7mode) 00284 { 00285 if (bitstogo) 00286 { 00287 dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; 00288 } 00289 dst += '-'; 00290 } 00291 return quoteIMAP(dst); 00292 } 00293 00294 //----------------------------------------------------------------------------- 00295 TQString rfcDecoder::decodeQuoting(const TQString &aStr) 00296 { 00297 TQString result; 00298 unsigned int strLength(aStr.length()); 00299 for (unsigned int i = 0; i < strLength ; i++) 00300 { 00301 if (aStr[i] == "\\") i++; 00302 result += aStr[i]; 00303 } 00304 return result; 00305 } 00306 00307 //----------------------------------------------------------------------------- 00308 TQTextCodec * 00309 rfcDecoder::codecForName (const TQString & _str) 00310 { 00311 if (_str.isEmpty ()) 00312 return NULL; 00313 return TQTextCodec::codecForName (_str.lower (). 
00314 replace ("windows", "cp").latin1 ()); 00315 } 00316 00317 //----------------------------------------------------------------------------- 00318 const TQString 00319 rfcDecoder::decodeRFC2047String (const TQString & _str) 00320 { 00321 TQString throw_away; 00322 00323 return decodeRFC2047String (_str, throw_away); 00324 } 00325 00326 //----------------------------------------------------------------------------- 00327 const TQString 00328 rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset) 00329 { 00330 TQString throw_away; 00331 00332 return decodeRFC2047String (_str, charset, throw_away); 00333 } 00334 00335 //----------------------------------------------------------------------------- 00336 const TQString 00337 rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset, 00338 TQString & language) 00339 { 00340 //do we have a rfc string 00341 if (_str.find("=?") < 0) 00342 return _str; 00343 00344 TQCString aStr = _str.ascii (); // TQString.length() means Unicode chars 00345 TQCString result; 00346 char *pos, *beg, *end, *mid = NULL; 00347 TQCString str; 00348 char encoding = 0, ch; 00349 bool valid; 00350 const int maxLen = 200; 00351 int i; 00352 00353 // result.truncate(aStr.length()); 00354 for (pos = aStr.data (); *pos; pos++) 00355 { 00356 if (pos[0] != '=' || pos[1] != '?') 00357 { 00358 result += *pos; 00359 continue; 00360 } 00361 beg = pos + 2; 00362 end = beg; 00363 valid = TRUE; 00364 // parse charset name 00365 for (i = 2, pos += 2; 00366 i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos))); 00367 i++) 00368 pos++; 00369 if (*pos != '?' 
|| i < 4 || i >= maxLen) 00370 valid = FALSE; 00371 else 00372 { 00373 charset = TQCString (beg, i - 1); // -2 + 1 for the zero 00374 int pt = charset.findRev('*'); 00375 if (pt != -1) 00376 { 00377 // save language for later usage 00378 language = charset.right (charset.length () - pt - 1); 00379 00380 // tie off language as defined in rfc2047 00381 charset.truncate(pt); 00382 } 00383 // get encoding and check delimiting question marks 00384 encoding = toupper (pos[1]); 00385 if (pos[2] != '?' 00386 || (encoding != 'Q' && encoding != 'B' && encoding != 'q' 00387 && encoding != 'b')) 00388 valid = FALSE; 00389 pos += 3; 00390 i += 3; 00391 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl; 00392 } 00393 if (valid) 00394 { 00395 mid = pos; 00396 // search for end of encoded part 00397 while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '=')) 00398 { 00399 i++; 00400 pos++; 00401 } 00402 end = pos + 2; //end now points to the first char after the encoded string 00403 if (i >= maxLen || !*pos) 00404 valid = FALSE; 00405 } 00406 if (valid) 00407 { 00408 ch = *pos; 00409 *pos = '\0'; 00410 str = TQCString (mid).left ((int) (mid - pos - 1)); 00411 if (encoding == 'Q') 00412 { 00413 // decode quoted printable text 00414 for (i = str.length () - 1; i >= 0; i--) 00415 if (str[i] == '_') 00416 str[i] = ' '; 00417 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl; 00418 00419 str = KCodecs::quotedPrintableDecode(str); 00420 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl; 00421 } 00422 else 00423 { 00424 // decode base64 text 00425 str = KCodecs::base64Decode(str); 00426 } 00427 *pos = ch; 00428 int len = str.length(); 00429 for (i = 0; i < len; i++) 00430 result += (char) (TQChar) str[i]; 00431 00432 pos = end - 1; 00433 } 00434 else 00435 { 00436 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - 
invalid" << endl; 00437 //result += "=?"; 00438 //pos = beg -1; // because pos gets increased shortly afterwards 00439 pos = beg - 2; 00440 result += *pos++; 00441 result += *pos; 00442 } 00443 } 00444 if (!charset.isEmpty ()) 00445 { 00446 TQTextCodec *aCodec = codecForName (charset.ascii ()); 00447 if (aCodec) 00448 { 00449 // kdDebug(7116) << "Codec is " << aCodec->name() << endl; 00450 return aCodec->toUnicode (result); 00451 } 00452 } 00453 return result; 00454 } 00455 00456 00457 //----------------------------------------------------------------------------- 00458 const char especials[17] = "()<>@,;:\"/[]?.= "; 00459 00460 const TQString 00461 rfcDecoder::encodeRFC2047String (const TQString & _str) 00462 { 00463 if (_str.isEmpty ()) 00464 return _str; 00465 const signed char *latin = reinterpret_cast<const signed char *>(_str.latin1()), *l, *start, *stop; 00466 char hexcode; 00467 int numQuotes, i; 00468 int rptr = 0; 00469 // My stats show this number results in 12 resize() out of 73,000 00470 int resultLen = 3 * _str.length() / 2; 00471 TQCString result(resultLen); 00472 00473 while (*latin) 00474 { 00475 l = latin; 00476 start = latin; 00477 while (*l) 00478 { 00479 if (*l == 32) 00480 start = l + 1; 00481 if (*l < 0) 00482 break; 00483 l++; 00484 } 00485 if (*l) 00486 { 00487 numQuotes = 1; 00488 while (*l) 00489 { 00490 /* The encoded word must be limited to 75 character */ 00491 for (i = 0; i < 16; i++) 00492 if (*l == especials[i]) 00493 numQuotes++; 00494 if (*l < 0) 00495 numQuotes++; 00496 /* Stop after 58 = 75 - 17 characters or at "<user@host..." */ 00497 if (l - start + 2 * numQuotes >= 58 || *l == 60) 00498 break; 00499 l++; 00500 } 00501 if (*l) 00502 { 00503 stop = l - 1; 00504 while (stop >= start && *stop != 32) 00505 stop--; 00506 if (stop <= start) 00507 stop = l; 00508 } 00509 else 00510 stop = l; 00511 if (resultLen - rptr - 1 <= start - latin + 1 + 16 /* =?iso-88... 
*/) { 00512 resultLen += (start - latin + 1) * 2 + 20; // more space 00513 result.resize(resultLen); 00514 } 00515 while (latin < start) 00516 { 00517 result[rptr++] = *latin; 00518 latin++; 00519 } 00520 strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15; 00521 if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) { 00522 resultLen += (stop - latin + 1) * 4 + 20; // more space 00523 result.resize(resultLen); 00524 } 00525 while (latin < stop) // can add up to 3 chars/iteration 00526 { 00527 numQuotes = 0; 00528 for (i = 0; i < 16; i++) 00529 if (*latin == especials[i]) 00530 numQuotes = 1; 00531 if (*latin < 0) 00532 numQuotes = 1; 00533 if (numQuotes) 00534 { 00535 result[rptr++] = '='; 00536 hexcode = ((*latin & 0xF0) >> 4) + 48; 00537 if (hexcode >= 58) 00538 hexcode += 7; 00539 result[rptr++] = hexcode; 00540 hexcode = (*latin & 0x0F) + 48; 00541 if (hexcode >= 58) 00542 hexcode += 7; 00543 result[rptr++] = hexcode; 00544 } 00545 else 00546 { 00547 result[rptr++] = *latin; 00548 } 00549 latin++; 00550 } 00551 result[rptr++] = '?'; 00552 result[rptr++] = '='; 00553 } 00554 else 00555 { 00556 while (*latin) 00557 { 00558 if (rptr == resultLen - 1) { 00559 resultLen += 30; 00560 result.resize(resultLen); 00561 } 00562 result[rptr++] = *latin; 00563 latin++; 00564 } 00565 } 00566 } 00567 result[rptr] = 0; 00568 //free (latinStart); 00569 return result; 00570 } 00571 00572 00573 //----------------------------------------------------------------------------- 00574 const TQString 00575 rfcDecoder::encodeRFC2231String (const TQString & _str) 00576 { 00577 if (_str.isEmpty ()) 00578 return _str; 00579 signed char *latin = (signed char *) calloc (1, _str.length () + 1); 00580 char *latin_us = (char *) latin; 00581 strcpy (latin_us, _str.latin1 ()); 00582 signed char *l = latin; 00583 char hexcode; 00584 int i; 00585 bool quote; 00586 while (*l) 00587 { 00588 if (*l < 0) 00589 break; 00590 l++; 00591 } 00592 if (!*l) { 00593 free(latin); 00594 return _str.ascii (); 00595 } 
00596 TQCString result; 00597 l = latin; 00598 while (*l) 00599 { 00600 quote = *l < 0; 00601 for (i = 0; i < 16; i++) 00602 if (*l == especials[i]) 00603 quote = true; 00604 if (quote) 00605 { 00606 result += "%"; 00607 hexcode = ((*l & 0xF0) >> 4) + 48; 00608 if (hexcode >= 58) 00609 hexcode += 7; 00610 result += hexcode; 00611 hexcode = (*l & 0x0F) + 48; 00612 if (hexcode >= 58) 00613 hexcode += 7; 00614 result += hexcode; 00615 } 00616 else 00617 { 00618 result += *l; 00619 } 00620 l++; 00621 } 00622 free (latin); 00623 return result; 00624 } 00625 00626 00627 //----------------------------------------------------------------------------- 00628 const TQString 00629 rfcDecoder::decodeRFC2231String (const TQString & _str) 00630 { 00631 int p = _str.find ('\''); 00632 00633 //see if it is an rfc string 00634 if (p < 0) 00635 return _str; 00636 00637 int l = _str.findRev ('\''); 00638 00639 //second is language 00640 if (p >= l) 00641 return _str; 00642 00643 //first is charset or empty 00644 TQString charset = _str.left (p); 00645 TQString st = _str.mid (l + 1); 00646 TQString language = _str.mid (p + 1, l - p - 1); 00647 00648 //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl; 00649 00650 char ch, ch2; 00651 p = 0; 00652 while (p < (int) st.length ()) 00653 { 00654 if (st.at (p) == 37) 00655 { 00656 ch = st.at (p + 1).latin1 () - 48; 00657 if (ch > 16) 00658 ch -= 7; 00659 ch2 = st.at (p + 2).latin1 () - 48; 00660 if (ch2 > 16) 00661 ch2 -= 7; 00662 st.at (p) = ch * 16 + ch2; 00663 st.remove (p + 1, 2); 00664 } 00665 p++; 00666 } 00667 return st; 00668 }