source: trunk/www.guidonia.net/wp/wp-content/plugins/parteibuch-aggregator/bdp-rssfeed.php@ 44

Last change on this file since 44 was 44, checked in by luciano, 15 years ago
File size: 10.9 KB
Line 
1<?php
2//this is a completely new parser based on brian's 0.63 version, just with the old class names
3//http://www.ozpolitics.info/plugins/bdp-rssaggregator-0-6-3.zip
4//shall be ready now
5
if (!class_exists('BDPFeed')) {
    // Align the multibyte string/regex engines with the blog charset.
    // Guarded so that a missing WordPress environment or an empty option
    // does not abort loading (mb_*_encoding() throws on PHP 8 for bad names).
    if (function_exists('get_option') && get_option('blog_charset')) {
        mb_internal_encoding(get_option('blog_charset'));
        mb_regex_encoding(get_option('blog_charset'));
    }

    /**
     * Quick and dirty regex-based parser for a single-channel RSS, RDF or
     * ATOM feed.
     *
     * The feed is fetched with Snoopy, converted to the blog charset and then
     * picked apart with multibyte regular expressions. All text returned by
     * parse() has quotes and angle brackets entity-encoded.
     */
    class BDPFeed
    {
        /** @var string URL of the feed to fetch and parse. */
        public $url;

        /**
         * @param string $url URL of the feed.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * PHP4-style constructor name, kept only so that legacy code calling
         * BDPFeed() explicitly keeps working. PHP 8 no longer treats a method
         * named after the class as a constructor, hence __construct() above.
         *
         * @param string $url URL of the feed.
         */
        public function BDPFeed($url)
        {
            $this->__construct($url);
        }

        /**
         * Normalise a piece of feed text so it is safe to store and echo.
         *
         * Doubly encoded entities are decoded once, CDATA wrappers and HTML
         * comments are removed, quotes and angle brackets are entity-encoded,
         * bare ampersands become &amp;, German umlaut entities become numeric
         * entities, and runs of whitespace/control characters collapse to a
         * single blank.
         *
         * @param string $title       Raw text taken from the feed.
         * @param bool   $captureTags Unused; kept for interface compatibility.
         * @return string The recoded text.
         */
        public function title_recode($title, $captureTags = false)
        {
            // blogspot puts encoded tags into the feed stream - decode them
            $title = mb_eregi_replace('\&lt;', '<', $title);
            $title = mb_eregi_replace('\&gt;', '>', $title);
            $title = mb_eregi_replace('\&amp;([a-z]+);', '&\\1;', $title);
            $title = mb_eregi_replace('\&amp;#([0-9]+);', '&#\\1;', $title);
            // hex digits run a-f (the class used to stop at "e")
            $title = mb_eregi_replace('\&amp;#x([a-f0-9]+);', '&#x\\1;', $title);

            // remove CDATA tags - leave XHTML tags
            $title = preg_replace('/<!\[CDATA\[(.*?)\]\]>/si', '\\1', $title);

            // tidy-up quotes - prevents SQL insertion
            $title = mb_ereg_replace('"', '&quot;', $title);
            $title = mb_ereg_replace("'", '&#39;', $title);
            $title = mb_eregi_replace('\&apos;', '&#39;', $title); // old browser fix

            // Parteibuch change: strip comments, including ones never closed
            $title = mb_eregi_replace('<!--[^>]*-->', '', $title);
            $title = mb_eregi_replace('<!--', '', $title);

            // Encode angle brackets, then encode only the *bare* ampersands:
            // valid entities are parked as <name> while every remaining "&"
            // becomes "&amp;", and are restored to &name; afterwards.
            $title = mb_ereg_replace('<', '&lt;', $title);
            $title = mb_ereg_replace('>', '&gt;', $title);
            $title = mb_eregi_replace('\&([a-z]+);', '<\\1>', $title);       // alpha
            $title = mb_eregi_replace('\&(#[0-9]+);', '<\\1>', $title);      // decimal
            $title = mb_eregi_replace('\&(#x[a-f0-9]+);', '<\\1>', $title);  // hex
            $title = mb_eregi_replace('\&', '&amp;', $title);
            $title = mb_eregi_replace('<([^>]+)>', '&\\1;', $title);

            // Parteibuch fix: avoid umlaut encoding errors. Only the entity
            // name is replaced; the trailing ";" is left in place.
            $title = mb_ereg_replace('\&A[uU][mM][lL]', '&#196', $title);
            $title = mb_ereg_replace('\&O[uU][mM][lL]', '&#214', $title);
            $title = mb_ereg_replace('\&U[uU][mM][lL]', '&#220', $title);
            $title = mb_ereg_replace('\&a[uU][mM][lL]', '&#228', $title);
            $title = mb_ereg_replace('\&o[uU][mM][lL]', '&#246', $title);
            $title = mb_ereg_replace('\&u[uU][mM][lL]', '&#252', $title);
            $title = mb_eregi_replace('\&szlig', '&#223', $title);

            // tidy-up white spaces
            $title = mb_eregi_replace('\&nbsp;', ' ', $title);

            // Parteibuch: treat all low ASCII control characters as blanks.
            // The class previously also contained literal "(" and ")", which
            // silently removed parentheses from the text.
            $title = mb_eregi_replace('[ \n\r\s\x00-\x1F]+', ' ', $title);

            return $title;
        }

        /**
         * Simplify the HTML of an item body and make site-relative links
         * absolute.
         *
         * Expects text as produced by title_recode() (quotes and angle
         * brackets entity-encoded) and returns text encoded the same way.
         *
         * @param string $itemtext Entity-encoded item HTML.
         * @param string $siteURL  Scheme and host of the feed's site, without
         *                         a trailing slash.
         * @return string The rewritten, entity-encoded item HTML.
         */
        public function rebaseAddresses($itemtext, $siteURL)
        {
            // 1 - restore quotes and angle brackets -- just for a moment
            $itemtext = mb_eregi_replace('&quot;', '"', $itemtext);
            $itemtext = mb_eregi_replace('&#39;', "'", $itemtext);
            $itemtext = mb_eregi_replace('&lt;', '<', $itemtext);
            $itemtext = mb_eregi_replace('&gt;', '>', $itemtext);

            // 2 - simplify and standardise the HTML: src first, then width,
            //     then height on images; links reduced to href + fixed attributes
            $itemtext = mb_eregi_replace('<img ([^>]*)src="([^">]*)"([^>]*) />',
                "<img src='\\2' \\1 \\3 />", $itemtext);
            $itemtext = mb_eregi_replace("<img ([^>]*)src='([^'>]*)'([^>]*) />",
                "<img src='\\2' \\1 \\3 />", $itemtext);
            $itemtext = mb_eregi_replace("<img (src='[^'>]*')([^>]*)width=['\"]([^'\">]*)['\"]([^>]*) />",
                "<img \\1 width='\\3' \\2 \\4 />", $itemtext);
            $itemtext = mb_eregi_replace(
                "<img (src='[^'>]*' width='[^'>]*')[^>]*height=['\"]([^'\">]*)['\"][^>]* />",
                "<img \\1 height='\\2' />", $itemtext);
            $itemtext = mb_eregi_replace("<a [^>]*href='([^\'>]*)'[^>]*>",
                "<a href='\\1' target='_blank' rel='nofollow'>", $itemtext);
            $itemtext = mb_eregi_replace('<a [^>]*href="([^"\'>]*)"[^>]*>',
                "<a href='\\1' target='_blank' rel='nofollow'>", $itemtext);

            // 3 - substitute in full address to relative addresses.
            //     The first character after the leading "/" must not be another
            //     "/", so protocol-relative URLs (//host/path) are left alone.
            $itemtext = mb_eregi_replace("<img src='/([^/'>][^'>]*'[^>]+) />",
                "<img src='$siteURL/\\1 />", $itemtext);
            $itemtext = mb_eregi_replace("<a href='/([^/'>][^'>]*'[^>]+)>",
                "<a href='$siteURL/\\1>", $itemtext);

            // 4 -- other tidy-ups: drop attributes from <p>, <li> and <br />
            $itemtext = mb_eregi_replace('<p [^>]*>', '<p>', $itemtext);
            $itemtext = mb_eregi_replace('<li [^>]*>', '<li>', $itemtext);
            $itemtext = mb_eregi_replace('<br[^>]* />', '<br />', $itemtext);

            // 5 - kill the quotes to be SQL secure
            $itemtext = mb_eregi_replace('"', '&quot;', $itemtext);
            $itemtext = mb_eregi_replace("'", '&#39;', $itemtext);
            $itemtext = mb_eregi_replace('<', '&lt;', $itemtext);
            $itemtext = mb_eregi_replace('>', '&gt;', $itemtext);

            return $itemtext;
        }

        /**
         * Return the first capture group of a case-insensitive multibyte
         * regex match.
         *
         * @param string $pattern mb_ereg-style pattern (no delimiters) with at
         *                        least one capture group.
         * @param string $subject Text to search.
         * @return string|false The first group, or FALSE when there is none.
         */
        public function reg_capture($pattern, $subject)
        {
            mb_eregi($pattern, $subject, $out);

            return isset($out[1]) ? $out[1] : FALSE;
        }

        /**
         * Split a text on an opening pattern and trim each following piece at
         * a closing pattern.
         *
         * @param string $splitter mb_split pattern marking the start of each piece.
         * @param string $tail     Pattern removed from each piece (typically
         *                         the closing tag and everything after it).
         * @param string $subject  Text to split.
         * @return array|false List of pieces, or FALSE when $splitter never matched.
         */
        public function reg_capture_all($splitter, $tail, $subject)
        {
            $pieces = mb_split($splitter, $subject);
            $total  = count($pieces);

            // a single piece means the splitter never matched
            if ($total <= 1) {
                return FALSE;
            }

            // piece 0 is whatever preceded the first match - skip it
            $out = array();
            for ($i = 1; $i < $total; $i++) {
                $out[$i - 1] = mb_eregi_replace($tail, '', $pieces[$i]);
            }
            return $out;
        }

        /**
         * Capture the content of one tag, trying the ATOM <link href="...">
         * patterns in order when they are supplied and the tag is "link".
         *
         * @param string      $tag          Tag name to look for.
         * @param string      $text         Feed or item text to search.
         * @param array|false $linkPatterns Ordered ATOM link patterns, or FALSE.
         * @return string|false Captured text, or FALSE when nothing matched.
         */
        private function capture_tag($tag, $text, $linkPatterns)
        {
            if ($tag == 'link' && $linkPatterns) {
                $found = FALSE;
                foreach ($linkPatterns as $pattern) {
                    $found = $this->reg_capture($pattern, $text);
                    if ($found) {
                        break;
                    }
                }
                return $found;
            }

            return $this->reg_capture('<'.$tag.'[^>]*?>(.*?)</'.$tag.'>', $text);
        }

        /**
         * Fetch the feed at $this->url and parse it.
         *
         * Problems are reported through the global $bdprss_db->recordError().
         *
         * @return array|false FALSE when the feed cannot be fetched or its
         *                     type cannot be determined. Otherwise an array
         *                     with 'feedtype' ('RSS', 'RDF' or 'ATOM'), the
         *                     channel tags that were found, and 'items', a
         *                     list of arrays keyed by item tag name. 'items'
         *                     is empty when the feed has no entries or no
         *                     site link.
         */
        public function parse()
        {
            global $bdprss_db;

            $result = array();
            $result['items'] = array();

            $snoopy = new Snoopy();
            $snoopy->agent = PBA_PRODUCT . ' ' . PBA_VERSION;
            $snoopy->read_timeout = 8;      // THINK ABOUT THIS!
            $snoopy->curl_path = FALSE;     // THINK ABOUT THIS!
            $snoopy->maxredirs = 2;

            if (!@$snoopy->fetch($this->url)) {
                $bdprss_db->recordError($this->url, "Could not open ".$this->url);
                return FALSE;
            }
            $content = $snoopy->results;

            if ($snoopy->error) {
                $bdprss_db->recordError($this->url, $snoopy->error);
            }

            if (!$content) {
                $bdprss_db->recordError($this->url, "Snoopy did not recover any content?");
                return FALSE;
            }

            // Parteibuch: detect the charset declared in the XML prolog.
            // mb_eregi takes no pattern delimiters and "?" must be escaped;
            // both quote styles are accepted around the encoding name.
            $old_charset = $this->reg_capture(
                '<\?xml[^>]*encoding=["\']([^"\']*)["\'][^>]*>', $content);
            $new_charset = get_option('blog_charset');

            // sort out character encoding
            if (!$old_charset || strcasecmp($old_charset, $new_charset) != 0) {
                mb_detect_order('WINDOWS-1252, UTF-8, ISO-8859-1');
                if (!$old_charset) {
                    $old_charset = mb_detect_encoding($content);
                }
                if ($old_charset) {
                    // An unknown encoding name returns FALSE before PHP 8 and
                    // throws ValueError from PHP 8 on; either way keep the
                    // original bytes rather than losing the whole feed.
                    $converted = FALSE;
                    try {
                        $converted = @mb_convert_encoding($content, /*to*/$new_charset, /*from*/$old_charset);
                    } catch (ValueError $e) {
                        $converted = FALSE;
                    }
                    if ($converted !== FALSE) {
                        $content = $converted;
                    } else {
                        $bdprss_db->recordError($this->url,
                            "Could not convert feed from $old_charset to $new_charset");
                    }
                }
            }

            // quick and dirty -- work out the feedtype
            $feedtype = FALSE;
            $linkPatterns = FALSE;
            if (mb_eregi('<rss[^>]*?>.*?</rss>', $content)) {
                $feedtype = 'RSS';
                $feed = $this->reg_capture('<channel[^>]*?>(.*?)</channel>', $content);
                $channeltags = array('title', 'link', 'description', 'copyright');
                $itemtags = array('title', 'link', 'description', 'content:encoded', 'pubDate', 'dc:date', 'guid', 'issued', 'modified', 'created', 'published', 'updated', 'dc:creator', 'dc:source', 'dc:rights');
                $item = 'item';
            } elseif (mb_eregi('<rdf[^>]*?>.*?</rdf[^>]*?>', $content)) {
                $feedtype = 'RDF';
                $feed = $content;
                $channeltags = array('title', 'link', 'description', 'dc:creator', 'dc:date');
                $itemtags = array('title', 'link', 'description', 'dc:date', 'dc:subject', 'dc:creator');
                $item = 'item';
            } elseif (mb_eregi('<feed[^>]*?>.*?</feed>', $content)) {
                $feedtype = 'ATOM';
                $feed = $this->reg_capture('<feed[^\>]*?>(.*?)</feed>', $content);
                $channeltags = array('title', 'tagline', 'link');
                $itemtags = array('title', 'summary', 'link', 'content', 'issued', 'modified', 'created', 'published', 'updated');
                // ATOM links live in an href attribute; prefer the text/html
                // link (attributes in either order), then fall back to any link
                $linkPatterns = array(
                    "<link[^\>]*href=[\"']([^\"']*)[\"'][^\>]*?type=[\"']text/html[\"'][^\>]*?>",
                    "<link[^\>]*type=[\"']text/html[\"'][^\>]*?href=[\"']([^\"']*)[\"'][^\>]*?>",
                    "<link[^\>]*href=[\"']([^\"']*)[\"'][^\>]*>",
                );
                $item = 'entry';
            }
            if (!$feedtype) {
                $bdprss_db->recordError($this->url, "Cannot ascertain feed-type, therefore ignored");
                return FALSE;
            }
            $result['feedtype'] = $feedtype;

            if (defined('BDPRSS2_DEBUG') && BDPRSS2_DEBUG) {
                $bdprss_db->recordError($this->url, "DEBUG feedtype: $feedtype");
                $bdprss_db->recordError($this->url, "DEBUG feedsize: ".strlen($feed));
            }

            // get the overarching feed information
            foreach ($channeltags as $tag) {
                $tmp = $this->capture_tag($tag, $feed, $linkPatterns);
                if (!$tmp) {
                    continue;
                }
                $result[$tag] = $this->title_recode($tmp);
            }

            // manipulate site URL for use with indirect references:
            // keep only scheme and host (http and https alike)
            $siteURL = isset($result['link']) ? $result['link'] : FALSE;
            if (!$siteURL) {
                $bdprss_db->recordError($this->url, "Feed does not include a site URL");
            } else {
                $siteURL = mb_eregi_replace("(https?://[^/]*).*$", "\\1", $siteURL);
            }

            // get the item information
            $itemArray = $this->reg_capture_all('<'.$item.'[^>]*?>', '</'.$item.'>.*', $feed);
            if (!$itemArray) {
                $bdprss_db->recordError($this->url, "Feed did not contain any items");
                return $result;
            }

            // parteibuch - we don't want feeds without a siteurl
            if (!$siteURL) {
                return $result;
            }

            $bodytags = array('content:encoded', 'description', 'content', 'summary');
            foreach ($itemArray as $i => $itm) {
                foreach ($itemtags as $itag) {
                    $tmp = $this->capture_tag($itag, $itm, $linkPatterns);
                    if ($tmp === FALSE || $tmp === '') {
                        continue;
                    }

                    $tmp = $this->title_recode($tmp);

                    if (in_array($itag, $bodytags, true)) {
                        $tmp = $this->rebaseAddresses($tmp, $siteURL);
                    }

                    // parteibuch blog.de fix of links directed to www.blog.de instead of subdomain
                    // (mb_eregi_replace replaces eregi_replace, which was removed in PHP 7)
                    if ($itag == 'link'
                        && strpos($tmp, 'www.blog.de') !== false
                        && strpos($siteURL, 'blog.de') !== false
                    ) {
                        $tmp = mb_eregi_replace("https?://[^/]*/(.*)$", "$siteURL/\\1", $tmp);
                    }

                    $result['items'][$i][$itag] = $tmp;
                }
            }
            return $result;
        }
    }
}
303?>
Note: See TracBrowser for help on using the repository browser.