url = $url; } function title_recode($title, $captureTags=false) { /* get rid of difficult characters - we don't want quotes as they affect the SQL and echos * we don't want XHTML code here as it might cause problems when printed * encode unencoded ampersands */ // blogspot puts encoded tags into the feed stream - decode them $title = mb_eregi_replace("\<", '<', $title); $title = mb_eregi_replace("\>", '>', $title); $title = mb_eregi_replace("\&([a-z]+);", '&\\1;', $title); $title = mb_eregi_replace("\&#([0-9]+);", '&#\\1;', $title); $title = mb_eregi_replace("\&#x([a-e0-9]+);",'&#x\\1;', $title); // remove CDATA tags - leave XHTML tags $title = preg_replace("''si", '\\1', $title); // tidy-up quotes - prevents SQL insertion $title = mb_ereg_replace('"', '"', $title); $title = mb_ereg_replace("'", ''', $title); $title = mb_eregi_replace('\'', ''', $title); // old browser fix //Parteibuch change: Problems with comments not closed fixed $title = mb_eregi_replace('', '', $title); $title = mb_eregi_replace('\n"; // 5 - kill the quotes to be SQL secure $itemtext = mb_eregi_replace('"', '"', $itemtext); $itemtext = mb_eregi_replace("'", ''', $itemtext); $itemtext = mb_eregi_replace('<', '<', $itemtext); $itemtext = mb_eregi_replace('>', '>', $itemtext); return $itemtext; } /** Runs a case-insensitive multibyte regex match (mb_eregi) of $pattern against $subject and hands back the first capture group. @param string $pattern mbstring regular expression; it needs a first capture group for anything to be returned. @param string $subject Text to search. @return mixed The value mb_eregi stored in $out[1] when that entry is set, otherwise FALSE. NOTE(review): callers visible in this file test the result with !$tmp, so a captured "0" or empty string is treated like no match - confirm that is intended. */ function reg_capture($pattern, $subject) { // start regular expression mb_eregi($pattern, $subject, $out); // if there is some result... process it and return it if(isset($out[1])) return $out[1]; else // if there is NO result, return nothing return FALSE; } function reg_capture_all($splitter, $tail, $subject) { $a = mb_split($splitter, $subject); $out = array(); if (count($a) > 1 ) { for($i=1; $iagent = PBA_PRODUCT . ' ' . PBA_VERSION; $snoopy->read_timeout = 8; // THINK ABOUT THIS! $snoopy->curl_path = FALSE; // THINK ABOUT THIS! $snoopy->maxredirs = 2; if(! 
@$snoopy->fetch($this->url)) { $bdprss_db->recordError($this->url, "Could not open ".$this->url); return FALSE; } $content = $snoopy->results; if($snoopy->error) $bdprss_db->recordError($this->url, $snoopy->error); if(!$content) { $bdprss_db->recordError($this->url, "Snoopy did not recover any content?"); return FALSE; } //Parteibuch: detect charset $old_charset = $this->reg_capture("']*encoding=\"(.*?)\"[^>]*?>'", $content); $new_charset = get_option( 'blog_charset' ); // sort out character encoding if($old_charset != $new_charset){ mb_detect_order('WINDOWS-1252, UTF-8, ISO-8859-1'); if(!$old_charset) $old_charset = mb_detect_encoding( $content ); //print 'DEBUG: ' . $old_charset; $content = @mb_convert_encoding($content, /*to*/$new_charset, /*from*/$old_charset); } // quick and dirty -- work out the feedtype //mb_regex_encoding('UTF-8'); // this file is written in UTF-8 $feedtype = FALSE; if ( mb_eregi(']*?>.*?', $content) ) { $feedtype = 'RSS'; $feed = $this->reg_capture(']*?>(.*?)', $content); $channeltags = array ('title', 'link', 'description', 'copyright'); $itemtags = array('title', 'link', 'description', 'content:encoded', 'pubDate', 'dc:date', 'guid', 'issued', 'modified', 'created', 'published', 'updated', 'dc:creator', 'dc:source', 'dc:rights'); $item = 'item'; } if ( !$feedtype && mb_eregi(']*?>.*?]*?>', $content) ) { $feedtype = 'RDF'; $feed = $content; $channeltags = array ('title', 'link', 'description', 'dc:creator', 'dc:date'); $itemtags = array('title', 'link', 'description', 'dc:date', 'dc:subject', 'dc:creator', ); $item = 'item'; } if ( !$feedtype && mb_eregi(']*?>.*?', $content) ) { $feedtype = 'ATOM'; $feed = $this->reg_capture(']*?>(.*?)', $content); $channeltags = array('title', 'tagline', 'link'); $itemtags = array('title', 'summary', 'link', 'content', 'issued', 'modified', 'created', 'published', 'updated'); $bloggerlink1 = "]*href=[\"']([^\"']*)[\"'][^\>]*?type=[\"']text/html[\"'][^\>]*?>"; $bloggerlink2 = 
"]*type=[\"']text/html[\"'][^\>]*?href=[\"']([^\"']*)[\"'][^\>]*?>"; $bloggerlink3 = "]*href=[\"']([^\"']*)[\"'][^\>]*>"; $item = 'entry'; } if ( !$feedtype ) { $bdprss_db->recordError($this->url, "Cannot ascertain feed-type, therefore ignored"); return FALSE; } $result['feedtype'] = $feedtype; if(BDPRSS2_DEBUG) $bdprss_db->recordError($this->url, "DEBUG feedtype: $feedtype"); if(BDPRSS2_DEBUG) $bdprss_db->recordError($this->url, "DEBUG feedsize: ".strlen($feed)); // get the overarching feed information foreach($channeltags as $tag) { if($feedtype == 'ATOM' && $tag == 'link') { $tmp = $this->reg_capture($bloggerlink1, $feed); if(!$tmp) $tmp = $this->reg_capture($bloggerlink2, $feed); if(!$tmp) $tmp = $this->reg_capture($bloggerlink3, $feed); } else $tmp = $this->reg_capture('<'.$tag.'[^>]*?>(.*?)', $feed); if(!$tmp) continue; $result[$tag] = $this->title_recode($tmp); } // manipulate site URL for use with indirect references $siteURL = $result['link']; if(!$siteURL) $bdprss_db->recordError($this->url, "Feed does not include a site URL"); else $siteURL = mb_eregi_replace("(http://[^/]*).*$", "\\1", $siteURL); // get the item information $itemArray = $this->reg_capture_all('<'.$item.'[^>]*?>', '.*', $feed); if(!$itemArray) { $bdprss_db->recordError($this->url, "Feed did not contain any items"); return $result; } $itemcount = count($itemArray); $i = 0; while ( $i < $itemcount ) { $itm = $itemArray[$i]; foreach( $itemtags as $itag ) { if($feedtype == 'ATOM' && $itag == 'link') { $tmp = $this->reg_capture($bloggerlink1, $itm); if(!$tmp) $tmp = $this->reg_capture($bloggerlink2, $itm); if(!$tmp) $tmp = $this->reg_capture($bloggerlink3, $itm); } else $tmp = $this->reg_capture('<'.$itag.'[^>]*?>(.*?)', $itm); //parteibuch - we don't want feeds without a siteurl if ($tmp == '' || !$siteURL) continue; $tmp = $this->title_recode($tmp); if($siteURL) if($itag == 'content:encoded' || $itag == 'description' || $itag == 'content' || $itag == 'summary') $tmp = 
$this->rebaseAddresses($tmp, $siteURL); //parteibuch blog.de fix of links directed to www.blog.de instead of subdomain if(strstr($tmp,'www.blog.de') && strstr($siteURL,'blog.de') && $itag == 'link'){ $tmp = eregi_replace("http://[^/]*/(.*)$", "$siteURL/\\1", $tmp); //if(strstr($siteURL,'blog.de')) $bdprss_db->recordError($this->url, "DEBUG for rebased blog.de item link: $tmp"); } $result['items'][$i][$itag] = $tmp; } $i++; } return $result; } } } ?>