*
* Licensed under The MIT License
* Redistributions of files must retain the above copyright notice.
*
* @copyright Copyright (c) 2006, Felix Geisendörfer.
* @link http://www.fg-webdesign.de/
* @link http://www.thinkingphp.org/
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
require_once MODELS.'web_model.php';

/**
 * Model that downloads RSS feeds, caches the raw XML and parses it into plain
 * arrays using regular expressions (no XML extension is used).
 *
 * NOTE(review): cache(), $this->httpGet() and $this->__createCacheHash() are not
 * defined in this file; they are presumably provided by CakePHP / WebModel -
 * confirm their exact contracts there.
 */
class Rss extends WebModel
{
    var $name = 'Rss';

    /**
     * Default cache lifetime, used when no explicit lifetime is passed in.
     */
    var $cacheExpires = '+2 hours';

    /**
     * Prefix of the cache path under which raw feeds are stored.
     */
    var $cacheFolder = 'web/rss';

    /**
     * Fetches and parses a feed and limits the number of returned items.
     *
     * @param string $feedUrl      URL of the feed. An empty value yields an empty array.
     * @param int    $limit        Maximum number of items to return.
     * @param mixed  $cacheExpires Cache lifetime; falls back to $this->cacheExpires when empty.
     * @return array array('Channel' => ..., 'Items' => ...) on success,
     *               array('Error' => message) when parsing failed, or an empty
     *               array when there was no URL / no data.
     */
    function findAll($feedUrl, $limit = 10, $cacheExpires = null)
    {
        if (empty($feedUrl)) {
            return array();
        }

        $feed = $this->__parseRSS($this->__getRawRSS($feedUrl, null, $cacheExpires));

        if (isset($feed['Error'])) {
            return $feed;
        }

        // The original compared the array with $limit *inside* count(), i.e. it
        // counted a boolean. Also guard against results without an 'Items' key
        // (__parseRSS returns an empty array for empty data).
        if (isset($feed['Items']) && is_array($feed['Items']) && count($feed['Items']) > $limit) {
            $feed['Items'] = array_slice($feed['Items'], 0, $limit);
        }

        return $feed;
    }

    /**
     * Returns the raw feed data, from the cache when available, otherwise via
     * an HTTP GET request whose result is then written to the cache.
     *
     * @param string $feedUrl      URL of the feed.
     * @param array  $vars         Request variables handed to httpGet(); also part of the cache hash.
     * @param mixed  $cacheExpires Cache lifetime; falls back to $this->cacheExpires when empty.
     * @return mixed Whatever cache() returns for the cached / freshly fetched data.
     */
    function __getRawRSS($feedUrl, $vars = array(), $cacheExpires = null)
    {
        $url = $feedUrl;

        // The hash is built from the *unnormalised* $vars on purpose, so cache
        // file names stay identical to the ones created by earlier versions.
        $cachePath = $this->cacheFolder.$this->__createCacheHash('.rss', $url, $vars);

        if (empty($cacheExpires)) {
            $cacheExpires = $this->cacheExpires;
        }
        if (empty($vars)) {
            $vars = array();
        }

        // Passing null as data asks cache() for the stored copy.
        $rssData = cache($cachePath, null, $cacheExpires);
        if (empty($rssData)) {
            $rssData = cache($cachePath, $this->httpGet($url, $vars));
        }

        return $rssData;
    }

    /**
     * Detects the RSS version of a feed and dispatches to the matching
     * __parseRSS_<version>() method (dots in the version become underscores).
     *
     * @param string $data Raw feed XML.
     * @return array Result of the version specific parser, array('Error' => message)
     *               when the feed is not recognised, or an empty array for empty data.
     */
    function __parseRSS($data)
    {
        if (empty($data)) {
            return array();
        }

        // Locate the opening <rss ...> tag and capture its attribute string.
        if (!preg_match('/\<rss\b([^\>]*)\>/i', $data, $match)) {
            return array('Error' => 'No valid feed (no feed version found).');
        }

        // A missing or empty version attribute defaults to RSS 2.0.
        $version = '2.0';
        if (preg_match('/\bversion\s*=\s*(["\'])(.*)\\1/iUs', $match[1], $versionMatch)
            && !empty($versionMatch[2])) {
            $version = $versionMatch[2];
        }

        // Check if we have a valid version number. This also keeps arbitrary
        // feed content out of the method name that is built below.
        if (!preg_match('/^[0-9.]+$/iUs', $version)) {
            return array('Error' => '"'.$version.'" is no valid RSS version.');
        }

        $rssFunction = '__parseRSS_'.str_replace('.', '_', $version);
        if (!method_exists($this, $rssFunction)) {
            return array('Error' => 'No function for parsing RSS feeds of version "'.$version.'" available.');
        }

        return $this->$rssFunction($data);
    }

    /**
     * Parses an RSS 2.0 document.
     *
     * @param string $data Raw feed XML.
     * @return array array('Channel' => channel fields, 'Items' => list of item fields).
     *               'Items' is an empty array when the feed contains no items.
     */
    function __parseRSS_2_0($data)
    {
        // First thing we need to do is to identify all html/otherwise formatted
        // contents. (.*) instead of (.+) so an empty CDATA section does not
        // swallow everything up to the end of the next one.
        preg_match_all('/\<\!\[CDATA\[(.*)\]\]\>/iUs', $data, $cdata, PREG_SET_ORDER);

        // Create the md5 hash of the data to parse
        $dataHash = md5($data);

        // Replace every CDATA section with a placeholder that won't confuse our
        // parser, while $cdata keeps the original contents.
        // [[CDATA:$dataHash:$cdataNum]] should be pretty unique, so a feed that
        // talks about this replacement method does not cause trouble.
        foreach ($cdata as $cdataNum => $cdataItem) {
            $data = str_replace($cdataItem[0], '[[CDATA:'.$dataHash.':'.$cdataNum.']]', $data);
        }

        // Let's get the information about the channel
        $channel = array();
        if (preg_match('/\<channel(?:\s[^\>]*)?\>(.+)\<\/channel\>/iUs', $data, $match)) {
            $channel = $this->__getNodeFields($match[1], $cdata, $dataHash, 'channel');
        }

        // This will get us a list with all items contained in the feed.
        // $items is always initialised so that feeds without items yield an
        // empty list instead of an undefined variable.
        $items = array();
        if (preg_match_all('/\<item(?:\s[^\>]*)?\>(.+)\<\/item\>/iUs', $data, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $itemNr => $item) {
                // Find all fields in our item
                $items[$itemNr] = $this->__getNodeFields($item[1], $cdata, $dataHash);
            }
        }

        // Return everything
        return array('Channel' => $channel, 'Items' => $items);
    }

    /**
     * Splits the inner XML of a node into its child fields.
     *
     * @param string $rawFields Inner XML of the node.
     * @param array  $cdata     CDATA matches (PREG_SET_ORDER) collected by the caller.
     * @param string $dataHash  Hash used inside the [[CDATA:hash:n]] placeholders.
     * @param string $type      'channel' enables recursive parsing of the <image> child.
     * @return mixed array(fieldName => array('value' => ..., 'attributes' => ...));
     *               a later field with the same name overwrites an earlier one.
     *               When no child fields are found, $rawFields is returned unchanged.
     */
    function __getNodeFields($rawFields, $cdata = null, $dataHash = null, $type = null)
    {
        // Don't ask - it works. The original author spent a lot of time on this regex.
        // The [^\x00] character sets match new lines without requiring the /s modifier.
        // First alternative: <tag attrs>content</tag>, second one: <tag attrs/>.
        $fieldRegex = '/\<(.+)( [^\x00]*)?\>([^\x00]*)\<\/\\1\>|\<(.+)( [^\x00]*)?\/\>/U';
        preg_match_all($fieldRegex, $rawFields, $fieldMatches, PREG_SET_ORDER);

        // Collect the CDATA placeholders once; they are the same for every field.
        $placeholders = array();
        $originals = array();
        if (!empty($cdata)) {
            $dummyRegex = '/\[\[CDATA:'.preg_quote((string)$dataHash, '/').':([0-9]+)\]\]/';
            preg_match_all($dummyRegex, $rawFields, $cdataDummies, PREG_SET_ORDER);
            foreach ($cdataDummies as $cdataDummy) {
                if (isset($cdata[$cdataDummy[1]][1])) {
                    $placeholders[] = $cdataDummy[0];
                    $originals[] = $cdata[$cdataDummy[1]][1];
                }
            }
        }

        $fields = array();

        // Loop through those fields
        foreach ($fieldMatches as $fieldMatch) {
            if (count($fieldMatch) === 4) {
                // <tag attrs>content</tag>
                list(, $field, $attributes, $value) = $fieldMatch;
            } else {
                // This is for self-closing tags that don't have enclosed content.
                // The attribute group is missing from the match when the tag has
                // no attributes, so it must not be read unconditionally.
                $field = $fieldMatch[4];
                $attributes = isset($fieldMatch[5]) ? $fieldMatch[5] : '';
                $value = null;
            }

            if ($type === 'channel' && $field === 'image' && $value !== null) {
                // The child image in channel has child elements in RSS, so let's
                // make sure we parse them too
                $value = $this->__getNodeFields($value, $cdata, $dataHash);
            } elseif ($value !== null && !empty($placeholders)) {
                // Find CDATA replaced stuff and put it back in.
                $value = str_replace($placeholders, $originals, $value);
            }

            // Parse the attributes contained in our node / item field and add
            // the new node to the list of fields.
            $fields[$field] = array(
                'value' => $value,
                'attributes' => $this->__getXMLNodeAttributes($attributes)
            );
        }

        if (empty($fields)) {
            return $rawFields;
        }

        return $fields;
    }

    /**
     * Parses the attribute string of an XML tag.
     *
     * @param string $attributesData Attribute part of a tag, e.g. ' url="..." type="..."'.
     * @return array List of single-pair arrays: array(array(name => value), ...).
     *               Empty when there are no attributes.
     */
    function __getXMLNodeAttributes($attributesData)
    {
        if (empty($attributesData)) {
            return array();
        }

        // (.*) instead of (.+) so that empty values (a="") are matched correctly
        // instead of running on into the following attribute.
        preg_match_all('/ ([^ \r\n]+)=(["\'])(.*)\\2/iUs', $attributesData, $attributeMatches, PREG_SET_ORDER);

        $attributes = array();
        foreach ($attributeMatches as $attribute) {
            list(, $attributeKey, , $attributeValue) = $attribute;
            $attributes[] = array($attributeKey => $attributeValue);
        }

        return $attributes;
    }
}
?>