akregator/src/librss

tools_p.cpp
00001 /*
00002  * tools_p.cpp
00003  *
00004  * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
00005  *
00006  * This program is distributed in the hope that it will be useful, but WITHOUT
00007  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00008  * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
00009  * accompanying file 'COPYING'.
00010  */
00011 #include "tools_p.h"
00012 
00013 #include <krfcdate.h>
00014 #include <tqdom.h>
00015 #include <kcharsets.h>
00016 #include <tqregexp.h>
00017 
00018 namespace RSS {
00019 
00020 time_t parseISO8601Date(const TQString &s)
00021 {
00022     // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
00023     if (s.stripWhiteSpace().left(4).toInt() < 1000)
00024         return 0; // error
00025 
00026     // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
00027     if (s.find('T') != -1)
00028         return KRFCDate::parseDateISO8601(s);
00029     else
00030         return KRFCDate::parseDateISO8601(s + "T12:00:00");
00031 }
00032 
00033 TQString childNodesAsXML(const TQDomNode& parent)
00034 {
00035     TQDomNodeList list = parent.childNodes();
00036     TQString str;
00037     TQTextStream ts( &str, IO_WriteOnly );
00038     for (uint i = 0; i < list.count(); ++i)
00039         ts << list.item(i);
00040     return str.stripWhiteSpace();
00041 }
00042 
00043 static TQString plainTextToHtml(const TQString& plainText)
00044 {
00045     TQString str(plainText);
00046     str.replace("&", "&amp;");
00047     str.replace("\"", "&quot;");
00048     str.replace("<", "&lt;");
00049     //str.replace(">", "&gt;");
00050     str.replace("\n", "<br/>");
00051     return str;
00052 }
00053 
/** Kinds of content distinguished when extracting an Atom content element. */
enum ContentFormat
{
    Text,   // plain text
    HTML,   // escaped HTML markup
    XML,    // inline XML child nodes
    Binary  // anything else
};
00055         
00056 static ContentFormat mapTypeToFormat(const TQString& modep, const TQString& typep,  const TQString& src)
00057 {
00058     TQString mode = modep.isNull() ? "escaped" : modep;
00059     TQString type = typep;
00060     
00061     //"If neither the type attribute nor the src attribute is provided,
00062     //Atom Processors MUST behave as though the type attribute were
00063     //present with a value of "text""
00064     if (type.isNull() && src.isEmpty())
00065         type = TQString::fromUtf8("text");
00066 
00067     if (type == TQString::fromUtf8("html")
00068         || type == TQString::fromUtf8("text/html"))
00069         return HTML;
00070     
00071     if (type == TQString::fromUtf8("text")
00072         || (type.startsWith(TQString::fromUtf8("text/"), false)
00073         && !type.startsWith(TQString::fromUtf8("text/xml"), false))
00074        )
00075         return Text;
00076     
00077     TQStringList xmltypes;
00078     xmltypes.append(TQString::fromUtf8("xhtml"));
00079     // XML media types as defined in RFC3023:
00080     xmltypes.append(TQString::fromUtf8("text/xml"));
00081     xmltypes.append(TQString::fromUtf8("application/xml"));
00082     xmltypes.append(TQString::fromUtf8("text/xml-external-parsed-entity"));
00083     xmltypes.append(TQString::fromUtf8("application/xml-external-parsed-entity"));
00084     xmltypes.append(TQString::fromUtf8("application/xml-dtd"));
00085     
00086     
00087     if (xmltypes.contains(type)
00088         || type.endsWith(TQString::fromUtf8("+xml"), false)
00089         || type.endsWith(TQString::fromUtf8("/xml"), false))
00090         return XML;
00091     
00092     return Binary;
00093 }
00094 
00095 static TQString extractAtomContent(const TQDomElement& e)
00096 {
00097     ContentFormat format = mapTypeToFormat(e.attribute("mode"),
00098                                            e.attribute("type"),
00099                                            e.attribute("src"));
00100     
00101     switch (format)
00102     {
00103         case HTML:
00104         {
00105             const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false );
00106             return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() );
00107         }
00108         case Text:
00109             return plainTextToHtml(e.text().stripWhiteSpace());
00110         case XML:
00111             return childNodesAsXML(e).simplifyWhiteSpace();
00112         case Binary:
00113         default:
00114             return TQString();
00115     }
00116     
00117     return TQString();
00118 }
00119 
00120 TQString extractNode(const TQDomNode &parent, const TQString &elemName, bool isInlined)
00121 {
00122     TQDomNode node = parent.namedItem(elemName);
00123     if (node.isNull())
00124         return TQString();
00125 
00126     TQDomElement e = node.toElement();
00127         TQString result = e.text().stripWhiteSpace(); // let's assume plain text
00128  
00129         if (elemName == "content") // we have Atom here
00130         {
00131             result = extractAtomContent(e);
00132         }        
00133         else // check for HTML; not necessary for Atom:content
00134         {
00135             bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
00136             bool hasHtml = hasPre || result.contains("<");  // FIXME: test if we have html, should be more clever -> regexp
00137             if(!isInlined && !hasHtml)                      // perform nl2br if not a inline elt and it has no html elts
00138                     result = result = result.replace(TQChar('\n'), "<br />");
00139             if(!hasPre)                                     // strip white spaces if no <pre>
00140                     result = result.simplifyWhiteSpace();
00141         }
00142         
00143         return result.isEmpty() ? TQString() : result;
00144 }
00145 
00146 TQString extractTitle(const TQDomNode & parent)
00147 {
00148     TQDomNode node = parent.namedItem(TQString::fromLatin1("title"));
00149     if (node.isNull())
00150         return TQString();
00151 
00152     TQString result = node.toElement().text();
00153 
00154     result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(TQRegExp("<[^>]*>"), "").remove("\\"));
00155     result = result.simplifyWhiteSpace();
00156 
00157     if (result.isEmpty())
00158         return TQString();
00159 
00160     return result;
00161 }
00162 
00163 static void authorFromString(const TQString& strp, TQString& name, TQString& email)
00164 {
00165     TQString str = strp.stripWhiteSpace();
00166     if (str.isEmpty())
00167         return;
00168     
00169     // look for something looking like a mail address ( "foo@bar.com", 
00170     // "<foo@bar.com>") and extract it
00171     
00172     TQRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
00173        // search kmail source for it
00174     
00175     int pos = remail.search(str);
00176     if (pos != -1)
00177     {
00178         TQString all = remail.cap(0);
00179         email = remail.cap(1);
00180         str.replace(all, ""); // remove mail address
00181     }
00182     
00183     // simplify the rest and use it as name
00184     
00185     name = str.simplifyWhiteSpace();
00186     
00187     // after removing the email, str might have 
00188     // the format "(Foo M. Bar)". We cut off 
00189     // parentheses if there are any. However, if
00190     // str is of the format "Foo M. Bar (President)",
00191     // we should not cut anything.
00192 
00193     TQRegExp rename("^\\(([^\\)]*)\\)");
00194     
00195     pos = rename.search(name);
00196     
00197     if (pos != -1)
00198     {
00199         name = rename.cap(1);
00200     }
00201     
00202     name = name.isEmpty() ? TQString() : name;
00203     email = email.isEmpty() ? TQString() : email;
00204 }
00205 
00206 TQString parseItemAuthor(const TQDomElement& element, Format format, Version version)
00207 {
00208     TQString name;
00209     TQString email;
00210 
00211     TQDomElement dcCreator = element.namedItem("dc:creator").toElement();
00212     
00213     if (!dcCreator.isNull())
00214          authorFromString(dcCreator.text(), name, email);
00215     else if (format == AtomFeed)
00216     {
00217         TQDomElement atomAuthor = element.namedItem("author").toElement();
00218         if (atomAuthor.isNull())
00219             atomAuthor = element.namedItem("atom:author").toElement();
00220         if (!atomAuthor.isNull())
00221         {
00222             TQDomElement atomName = atomAuthor.namedItem("name").toElement();
00223             if (atomName.isNull())
00224                 atomName = atomAuthor.namedItem("atom:name").toElement();
00225             name = atomName.text().stripWhiteSpace();
00226             
00227             TQDomElement atomEmail = atomAuthor.namedItem("email").toElement();
00228             if (atomEmail.isNull())
00229                 atomEmail = atomAuthor.namedItem("atom:email").toElement();
00230             email = atomEmail.text().stripWhiteSpace();
00231         }
00232     }
00233     else if (format == RSSFeed)
00234     {
00235         authorFromString(element.namedItem("author").toElement().text(), name, email);
00236     }
00237     
00238     if (name.isNull())
00239         name = email;
00240     
00241     if (!email.isNull())
00242         return TQString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
00243     else
00244         return name;
00245 }
00246 
00247 } // namespace RSS
00248 
00249 // vim:noet:ts=4