feeddetector.cpp
00001 /* 00002 This file is part of Akregator. 00003 00004 Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net> 00005 00006 This program is free software; you can redistribute it and/or modify 00007 it under the terms of the GNU General Public License as published by 00008 the Free Software Foundation; either version 2 of the License, or 00009 (at your option) any later version. 00010 00011 This program is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 GNU General Public License for more details. 00015 00016 You should have received a copy of the GNU General Public License 00017 along with this program; if not, write to the Free Software 00018 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00019 00020 As a special exception, permission is given to link this program 00021 with any edition of TQt, and distribute the resulting executable, 00022 without including the source code for TQt in the source distribution. 00023 */ 00024 00025 #include <tqregexp.h> 00026 #include <tqstring.h> 00027 #include <tqstringlist.h> 00028 #include <tqvaluelist.h> 00029 #include <kcharsets.h> 00030 #include <kurl.h> 00031 00032 #include "feeddetector.h" 00033 00034 00035 using namespace RSS; 00036 00037 FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s) 00038 { 00039 //reduce all sequences of spaces, newlines etc. to one space: 00040 TQString str = s.simplifyWhiteSpace(); 00041 00042 // extracts <link> tags 00043 TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false); 00044 00045 // extracts the URL (href="url") 00046 TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); 00047 // extracts type attribute 00048 TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); 00049 // extracts the title (title="title") 00050 TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); 00051 00052 int pos = 0; 00053 int matchpos = 0; 00054 00055 // get all <link> tags 00056 TQStringList linkTags; 00057 //int strlength = str.length(); 00058 while ( matchpos != -1 ) 00059 { 00060 matchpos = reLinkTag.search(str, pos); 00061 if (matchpos != -1) 00062 { 00063 linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) ); 00064 pos = matchpos + reLinkTag.matchedLength(); 00065 } 00066 } 00067 00068 FeedDetectorEntryList list; 00069 00070 for ( TQStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it ) 00071 { 00072 TQString type; 00073 int pos = reType.search(*it, 0); 00074 if (pos != -1) 00075 type = TQString(reType.cap(1)).lower(); 00076 00077 // we accept only type attributes indicating a feed 00078 if ( type != "application/rss+xml" && type != "application/rdf+xml" 00079 && type != "application/atom+xml" && type != "text/xml" ) 00080 continue; 00081 00082 TQString title; 00083 pos = reTitle.search(*it, 0); 00084 if (pos != -1) 00085 title = reTitle.cap(1); 00086 00087 title = KCharsets::resolveEntities(title); 00088 00089 TQString url; 00090 pos = reHref.search(*it, 0); 00091 if (pos != -1) 00092 url = reHref.cap(1); 00093 00094 url = KCharsets::resolveEntities(url); 00095 00096 // if feed has no title, use the url as preliminary title (until feed is parsed) 00097 if ( title.isEmpty() ) 00098 title = url; 00099 00100 if ( !url.isEmpty() ) 00101 list.append(FeedDetectorEntry(url, title) ); 00102 } 00103 00104 00105 return list; 00106 } 00107 00108 TQStringList FeedDetector::extractBruteForce(const TQString& s) 00109 { 00110 TQString str = s.simplifyWhiteSpace(); 00111 00112 TQRegExp reAhrefTag("<[\\s]?A[^>]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false); 00113 00114 // extracts the URL (href="url") 00115 TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); 00116 00117 TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false); 00118 00119 int pos = 0; 00120 int matchpos = 0; 00121 00122 // get all <a href> tags and capture url 00123 TQStringList list; 00124 //int strlength = str.length(); 00125 while ( matchpos != -1 ) 00126 { 00127 matchpos = reAhrefTag.search(str, pos); 00128 if ( matchpos != -1 ) 00129 { 00130 TQString ahref = str.mid(matchpos, reAhrefTag.matchedLength()); 00131 int hrefpos = reHref.search(ahref, 0); 00132 if ( hrefpos != -1 ) 00133 { 00134 TQString url = reHref.cap(1); 00135 00136 url = KCharsets::resolveEntities(url); 00137 00138 if ( rssrdfxml.exactMatch(url) ) 00139 list.append(url); 00140 } 00141 00142 pos = matchpos + reAhrefTag.matchedLength(); 00143 } 00144 } 00145 00146 return list; 00147 } 00148 00149 TQString FeedDetector::fixRelativeURL(const TQString &s, const KURL &baseurl) 00150 { 00151 TQString s2=s; 00152 KURL u; 00153 if (KURL::isRelativeURL(s2)) 00154 { 00155 if (s2.startsWith("//")) 00156 { 00157 s2=s2.prepend(baseurl.protocol()+":"); 00158 u=s2; 00159 } 00160 else if (s2.startsWith("/")) 00161 { 00162 KURL b2(baseurl); 00163 b2.setPath(TQString()); // delete path and query, so that only protocol://host remains 00164 b2.setQuery(TQString()); 00165 u = KURL(b2, s2.remove(0,1)); // remove leading "/" 00166 } 00167 else 00168 { 00169 u = KURL(baseurl, s2); 00170 } 00171 } 00172 else 00173 u=s2; 00174 00175 u.cleanPath(); 00176 //kdDebug() << "AKREGATOR_PLUGIN_FIXURL: " << "url=" << s << " baseurl=" << baseurl.url() << " fixed=" << u.url() << 00177 //endl; 00178 return u.url(); 00179 }