<?php
// This is a completely new parser based on Brian's 0.63 version, just with the old class names.
// http://www.ozpolitics.info/plugins/bdp-rssaggregator-0-6-3.zip
// It should be ready now.

if (!class_exists('BDPFeed')) {
    // Make every mb_* string and regex call below operate in the blog's character set.
    mb_internal_encoding(get_option('blog_charset'));
    mb_regex_encoding(get_option('blog_charset'));

    /**
     * Quick and dirty reader for a single-channel feed (RSS, RDF or Atom).
     *
     * The feed is fetched with Snoopy and picked apart with multibyte regular
     * expressions rather than an XML parser. Every text value handed back by
     * parse() has been through title_recode(), so it contains no raw quotes or
     * angle brackets - only their entity forms.
     */
    class BDPFeed
    {
        /** @var string Address of the feed to fetch. */
        public $url;

        /**
         * @param string $url Address of the feed to fetch.
         */
        public function __construct($url)
        {
            $this->url = $url;
        }

        /**
         * PHP 4 style constructor, kept so that code calling it explicitly keeps working.
         *
         * PHP 8 no longer treats a method named after its class as the constructor,
         * so the real initialisation lives in __construct().
         *
         * @param string $url Address of the feed to fetch.
         */
        public function BDPFeed($url)
        {
            $this->__construct($url);
        }

        /**
         * Normalises a piece of feed text into an entity-escaped, single-line string.
         *
         * Quotes and angle brackets are turned into entities (they would otherwise
         * interfere with the SQL and the echo statements the value ends up in),
         * existing entities are kept, bare ampersands are encoded, and runs of
         * whitespace/control characters collapse into one blank.
         *
         * @param string $title       Raw text taken from the feed.
         * @param bool   $captureTags Unused; kept for interface compatibility.
         * @return string The recoded text.
         */
        public function title_recode($title, $captureTags = false)
        {
            // 1 - blogspot puts encoded tags into the feed stream: peel one layer of escaping off.
            $title = mb_eregi_replace('&lt;', '<', $title);
            $title = mb_eregi_replace('&gt;', '>', $title);
            // Entity names may contain digits (&frac12;, &sup2;) and hex digits run up to "f".
            $title = mb_eregi_replace('&amp;([a-z][a-z0-9]*);', '&\\1;', $title);
            $title = mb_eregi_replace('&amp;#([0-9]+);', '&#\\1;', $title);
            $title = mb_eregi_replace('&amp;#x([a-f0-9]+);', '&#x\\1;', $title);

            // 2 - remove the CDATA wrappers but leave what they contain.
            $unwrapped = preg_replace('/<!\[CDATA\[(.*?)\]\]>/si', '\\1', $title);
            if ($unwrapped !== NULL) {
                // preg_replace() yields NULL on a PCRE error; keep the text in that case.
                $title = $unwrapped;
            }

            // 3 - tidy-up quotes, so the text cannot break out of SQL or attribute strings.
            $title = mb_ereg_replace('"', '&quot;', $title);
            $title = mb_ereg_replace("'", '&#39;', $title);
            $title = mb_eregi_replace('&apos;', '&#39;', $title); // old browser fix

            // 4 - Parteibuch change: drop comments, including ones that are never closed.
            $title = mb_eregi_replace('<!--[^>]*-->', '', $title);
            $title = mb_eregi_replace('<!--', '', $title);

            // 5 - escape the angle brackets, then encode every ampersand that does not
            //     start an entity. Entities are parked as <name> while the bare
            //     ampersands are rewritten; no real "<" can be left at that point.
            $title = mb_ereg_replace('<', '&lt;', $title);
            $title = mb_ereg_replace('>', '&gt;', $title);
            $title = mb_eregi_replace('&([a-z][a-z0-9]*);', '<\\1>', $title); // alpha
            $title = mb_eregi_replace('&(#[0-9]+);', '<\\1>', $title);        // decimal
            $title = mb_eregi_replace('&(#x[a-f0-9]+);', '<\\1>', $title);    // hex
            $title = mb_eregi_replace('&', '&amp;', $title);
            $title = mb_eregi_replace('<([^>]+)>', '&\\1;', $title);

            // 6 - Parteibuch fix: umlaut entities whose ampersand ended up escaped.
            // NOTE(review): reconstructed as "&amp;Xuml" -> "&Xuml"; confirm against the upstream plugin.
            $title = mb_ereg_replace('&amp;([AOUaou])[uU][mM][lL]', '&\\1uml', $title);
            $title = mb_eregi_replace('&amp;szlig;', '&szlig;', $title);

            // 7 - tidy-up white space: non-breaking spaces become plain blanks ...
            $title = mb_eregi_replace('&nbsp;', ' ', $title);

            // ... and every run of whitespace or control characters becomes one blank.
            // Parentheses are deliberately not part of this class.
            $title = mb_eregi_replace('[\s\x00-\x1F]+', ' ', $title);

            return $title;
        }

        /**
         * Simplifies the markup of an item body and makes root-relative addresses absolute.
         *
         * Expects text that has been through title_recode(): the escaping of quotes and
         * angle brackets is undone, the tags are rewritten, and the escaping is applied
         * again before returning.
         *
         * @param string $itemtext Entity-escaped item text.
         * @param string $siteURL  Scheme and host that root-relative addresses are resolved against.
         * @return string The rewritten, entity-escaped item text.
         */
        public function rebaseAddresses($itemtext, $siteURL)
        {
            // Applied strictly in this order; each entry is (pattern, replacement).
            $rules = array(
                // 1 - restore quotes and angle brackets -- just for a moment
                array('&quot;', '"'),
                array('&#39;', "'"),
                array('&lt;', '<'),
                array('&gt;', '>'),

                // 2 - simplify and standardise the HTML: src first, then width, then height
                array('<img ([^>]*)src="([^">]*)"([^>]*) />',
                      "<img src='\\2' \\1 \\3 />"),
                array("<img ([^>]*)src='([^'>]*)'([^>]*) />",
                      "<img src='\\2' \\1 \\3 />"),
                array("<img (src='[^'>]*')([^>]*)width=['\"]([^'\">]*)['\"]([^>]*) />",
                      "<img \\1 width='\\3' \\2 \\4 />"),
                array("<img (src='[^'>]*' width='[^'>]*')[^>]*height=['\"]([^'\">]*)['\"][^>]* />",
                      "<img \\1 height='\\2' />"),
                // links keep only their href and always open in a new window, unfollowed
                array("<a [^>]*href='([^'>]*)'[^>]*>",
                      "<a href='\\1' target='_blank' rel='nofollow'>"),
                array('<a [^>]*href="([^"\'>]*)"[^>]*>',
                      "<a href='\\1' target='_blank' rel='nofollow'>"),

                // 3 - substitute in the full address for root-relative addresses
                array("<img src='/([^'>]+'[^>]+) />",
                      "<img src='" . $siteURL . "/\\1 />"),
                array("<a href='/([^'>]+'[^>]+)>",
                      "<a href='" . $siteURL . "/\\1>"),

                // 4 - other tidy-ups: strip attributes from paragraphs, list items and breaks
                array('<p [^>]*>', '<p>'),
                array('<li [^>]*>', '<li>'),
                array('<br[^>]* />', '<br />'),

                // 5 - kill the quotes and brackets again to be SQL secure
                array('"', '&quot;'),
                array("'", '&#39;'),
                array('<', '&lt;'),
                array('>', '&gt;'),
            );

            foreach ($rules as $rule) {
                $itemtext = mb_eregi_replace($rule[0], $rule[1], $itemtext);
            }

            return $itemtext;
        }

        /**
         * Returns the first capture group of a case-insensitive multibyte regex match.
         *
         * @param string $pattern mb_eregi() pattern (no delimiters) with at least one group.
         * @param string $subject Text to search.
         * @return string|false The text of group 1, or FALSE when there is none.
         */
        public function reg_capture($pattern, $subject)
        {
            $out = array();
            mb_eregi($pattern, $subject, $out);

            return isset($out[1]) ? $out[1] : FALSE;
        }

        /**
         * Splits a text on a pattern and trims the tail off every piece after the first.
         *
         * @param string $splitter mb_split() pattern marking where a piece starts.
         * @param string $tail     mb_eregi_replace() pattern removed from each piece.
         * @param string $subject  Text to split.
         * @return array|false Zero-indexed pieces, or FALSE when the splitter never matched.
         */
        public function reg_capture_all($splitter, $tail, $subject)
        {
            $pieces = mb_split($splitter, $subject);
            $total = count($pieces);
            if ($total <= 1) {
                return FALSE;
            }

            // Piece 0 is whatever precedes the first match, so it is skipped.
            $out = array();
            for ($n = 1; $n < $total; $n++) {
                $out[$n - 1] = mb_eregi_replace($tail, '', $pieces[$n]);
            }

            return $out;
        }

        /**
         * Tells whether mbstring knows an encoding under the given name or alias.
         *
         * @param string $name Encoding name, compared case-insensitively.
         * @return bool
         */
        private function is_supported_encoding($name)
        {
            $canLookUpAliases = function_exists('mb_encoding_aliases');

            foreach (mb_list_encodings() as $encoding) {
                if (strcasecmp($encoding, $name) == 0) {
                    return TRUE;
                }
                $aliases = $canLookUpAliases ? @mb_encoding_aliases($encoding) : FALSE;
                if (!is_array($aliases)) {
                    continue;
                }
                foreach ($aliases as $alias) {
                    if (strcasecmp($alias, $name) == 0) {
                        return TRUE;
                    }
                }
            }

            return FALSE;
        }

        /**
         * Captures the value of one tag from a chunk of feed text.
         *
         * @param string $tag          Tag name to look for.
         * @param string $text         Feed or item text to search.
         * @param array  $linkPatterns Patterns tried in order for the 'link' tag; an
         *                             empty array means 'link' is read like any other tag.
         * @return string|false The captured value, or FALSE when nothing matched.
         */
        private function capture_tag($tag, $text, $linkPatterns)
        {
            if ($tag == 'link' && $linkPatterns) {
                $found = FALSE;
                foreach ($linkPatterns as $pattern) {
                    $found = $this->reg_capture($pattern, $text);
                    if ($found) {
                        break;
                    }
                }
                return $found;
            }

            return $this->reg_capture('<' . $tag . '[^>]*?>(.*?)</' . $tag . '>', $text);
        }

        /**
         * Fetches the feed and extracts the channel data and the items.
         *
         * Problems are reported through $bdprss_db->recordError().
         *
         * @return array|false FALSE when the feed could not be fetched or its type could
         *                     not be determined. Otherwise an array holding 'feedtype',
         *                     the recoded channel tags that were present, and 'items'
         *                     (a list of tag => recoded value arrays, which stays empty
         *                     when the feed has no items or no site URL).
         */
        public function parse()
        {
            global $bdprss_db;

            $result = array('items' => array());

            $snoopy = new Snoopy();
            $snoopy->agent = PBA_PRODUCT . ' ' . PBA_VERSION;
            $snoopy->read_timeout = 8;   // THINK ABOUT THIS!
            $snoopy->curl_path = FALSE;  // THINK ABOUT THIS!
            $snoopy->maxredirs = 2;

            if (!@$snoopy->fetch($this->url)) {
                $bdprss_db->recordError($this->url, "Could not open " . $this->url);
                return FALSE;
            }
            $content = $snoopy->results;

            if ($snoopy->error) {
                $bdprss_db->recordError($this->url, $snoopy->error);
            }

            if (!$content) {
                $bdprss_db->recordError($this->url, "Snoopy did not recover any content?");
                return FALSE;
            }

            // Parteibuch: detect the charset. The XML declaration is read byte-wise from
            // the head of the document, because the content is not yet in the encoding
            // the mb_* regex functions are set to.
            $new_charset = get_option('blog_charset');
            $old_charset = FALSE;
            $declaration = array();
            if (preg_match('/<\?xml[^>]*encoding=["\']([^"\']+)["\']/i', substr($content, 0, 1024), $declaration)) {
                $old_charset = trim($declaration[1]);
            }
            // An encoding mbstring does not know would make the conversion fail (it throws
            // on PHP 8), so such a declaration is ignored and detection is used instead.
            if ($old_charset && !$this->is_supported_encoding($old_charset)) {
                $old_charset = FALSE;
            }

            // sort out character encoding
            if (!$old_charset || strcasecmp((string) $old_charset, (string) $new_charset) != 0) {
                mb_detect_order('WINDOWS-1252, UTF-8, ISO-8859-1');
                if (!$old_charset) {
                    $old_charset = mb_detect_encoding($content);
                }
                if ($old_charset) {
                    $content = @mb_convert_encoding($content, /*to*/ $new_charset, /*from*/ $old_charset);
                }
            }

            // quick and dirty -- work out the feedtype
            // NOTE(review): the ".*?" patterns below need "." to match newlines; that is
            // mbstring's default regex option set - confirm nothing changes it via
            // mb_regex_set_options().
            $linkPatterns = array(); // only Atom needs special handling of <link>
            if (mb_eregi('<rss[^>]*?>.*?</rss>', $content)) {
                $feedtype = 'RSS';
                $feed = $this->reg_capture('<channel[^>]*?>(.*?)</channel>', $content);
                $channeltags = array('title', 'link', 'description', 'copyright');
                $itemtags = array('title', 'link', 'description', 'content:encoded', 'pubDate', 'dc:date', 'guid', 'issued', 'modified', 'created', 'published', 'updated', 'dc:creator', 'dc:source', 'dc:rights');
                $item = 'item';
            } elseif (mb_eregi('<rdf[^>]*?>.*?</rdf[^>]*?>', $content)) {
                $feedtype = 'RDF';
                $feed = $content;
                $channeltags = array('title', 'link', 'description', 'dc:creator', 'dc:date');
                $itemtags = array('title', 'link', 'description', 'dc:date', 'dc:subject', 'dc:creator');
                $item = 'item';
            } elseif (mb_eregi('<feed[^>]*?>.*?</feed>', $content)) {
                $feedtype = 'ATOM';
                $feed = $this->reg_capture('<feed[^>]*?>(.*?)</feed>', $content);
                $channeltags = array('title', 'tagline', 'link');
                $itemtags = array('title', 'summary', 'link', 'content', 'issued', 'modified', 'created', 'published', 'updated');
                // Atom carries the address in an href attribute: prefer a text/html link
                // (type after or before href) and fall back to any link at all.
                $linkPatterns = array(
                    "<link[^>]*href=[\"']([^\"']*)[\"'][^>]*?type=[\"']text/html[\"'][^>]*?>",
                    "<link[^>]*type=[\"']text/html[\"'][^>]*?href=[\"']([^\"']*)[\"'][^>]*?>",
                    "<link[^>]*href=[\"']([^\"']*)[\"'][^>]*>",
                );
                $item = 'entry';
            } else {
                $bdprss_db->recordError($this->url, "Cannot ascertain feed-type, therefore ignored");
                return FALSE;
            }
            $result['feedtype'] = $feedtype;

            if (defined('BDPRSS2_DEBUG') && BDPRSS2_DEBUG) {
                $bdprss_db->recordError($this->url, "DEBUG feedtype: $feedtype");
                $bdprss_db->recordError($this->url, "DEBUG feedsize: " . strlen($feed));
            }

            // get the overarching feed information
            foreach ($channeltags as $tag) {
                $tmp = $this->capture_tag($tag, $feed, $linkPatterns);
                if (!$tmp) {
                    continue;
                }
                $result[$tag] = $this->title_recode($tmp);
            }

            // manipulate site URL for use with indirect references: keep scheme and host only
            $siteURL = isset($result['link']) ? $result['link'] : FALSE;
            if (!$siteURL) {
                $bdprss_db->recordError($this->url, "Feed does not include a site URL");
            } else {
                $siteURL = mb_eregi_replace('(https?://[^/]*).*$', '\\1', $siteURL);
            }

            // get the item information
            $itemArray = $this->reg_capture_all('<' . $item . '[^>]*?>', '</' . $item . '>.*', $feed);
            if (!$itemArray) {
                $bdprss_db->recordError($this->url, "Feed did not contain any items");
                return $result;
            }

            // parteibuch - we don't want feeds without a siteurl: their items stay empty
            if (!$siteURL) {
                return $result;
            }

            // Tags whose value is markup and therefore needs its addresses rebased.
            $bodyTags = array('content:encoded', 'description', 'content', 'summary');

            foreach ($itemArray as $i => $itm) {
                foreach ($itemtags as $itag) {
                    $tmp = $this->capture_tag($itag, $itm, $linkPatterns);
                    if ($tmp === FALSE || $tmp === '') {
                        continue;
                    }

                    $tmp = $this->title_recode($tmp);

                    if (in_array($itag, $bodyTags, TRUE)) {
                        $tmp = $this->rebaseAddresses($tmp, $siteURL);
                    }

                    // parteibuch blog.de fix of links directed to www.blog.de instead of subdomain
                    if ($itag == 'link'
                        && strpos($tmp, 'www.blog.de') !== FALSE
                        && strpos($siteURL, 'blog.de') !== FALSE
                    ) {
                        $tmp = mb_eregi_replace('https?://[^/]*/(.*)$', $siteURL . '/\\1', $tmp);
                    }

                    $result['items'][$i][$itag] = $tmp;
                }
            }

            return $result;
        }
    }
}