Browse | Submit New Snippet | Create Package

 

Rss Model

Type:
Function
Category:
Other
License:
MIT/X Consortium License
Language:
PHP
 
Description:
This Model allows you to do basic RSS 2.0 feed parsing. It requires WebModel to be present and included in order to work. (See the other snippets in this package.)

Versions Of This Snippet::

Felix Geisendörfer
Snippet ID Download Version Date Posted Author Delete
237 | 1.1 | 2006-09-06 17:28 | Felix Geisendörfer
Changes since last version::
- Bug Fix for attribute support
- Now recognizes <node .../> style nodes
134 | 1.0 | 2006-06-14 22:03 | Felix Geisendörfer

Download a raw-text version of this code by clicking on "Download Version"

 


Latest Snippet Version: :1.1

<?php
/**
 * Created: Wed Sep 06 18:03:26 CEST 2006
 * 
 * This Model allows you to parse a given RSS 2.0 feed and have it returned in a big 
 * array.
 * 
 * PHP versions 4 and 5
 *
 * Copyright (c) Felix Geisendörfer <info@fg-webdesign.de>
 *
 * Licensed under The MIT License
 * Redistributions of files must retain the above copyright notice.
 *
 * @copyright		Copyright (c) 2006, Felix Geisendörfer. 
 * @link			http://www.fg-webdesign.de/
 * @link			http://www.thinkingphp.org/ 
 * @license			http://www.opensource.org/licenses/mit-license.php The MIT License
 */


require_once MODELS.'web_model.php';

/**
 * Model that fetches, caches and parses RSS 2.0 feeds into nested arrays.
 *
 * Parsing is done with regular expressions rather than a real XML parser, so only
 * reasonably well-formed RSS 2.0 documents are supported.
 *
 * NOTE(review): httpGet() and __createCacheHash() are not defined in this class and
 * are presumably inherited from WebModel; cache() is presumably the framework's
 * global file cache helper - confirm against the including application.
 */
class Rss extends WebModel 
{
    /**
     * Name of this model.
     *
     * @var string
     */
    var $name = 'Rss';

    /**
     * Default cache lifetime, used whenever a caller does not pass its own value.
     *
     * @var string
     */
    var $cacheExpires = '+2 hours';

    /**
     * Prefix of the cache path under which raw feeds are stored.
     *
     * @var string
     */
    var $cacheFolder = 'web/rss';
    
    /**
     * Fetches and parses a feed and returns at most $limit of its items.
     *
     * @param string $feedUrl      Url of the feed to load.
     * @param int    $limit        Maximum number of items to keep (passed on to array_slice()).
     * @param mixed  $cacheExpires Optional cache lifetime; the class default is used when empty.
     * @return array An empty array if no url or no feed data is available, array('Error' => message)
     *               if the feed could not be parsed, otherwise array('Channel' => ..., 'Items' => ...).
     */
    function findAll($feedUrl, $limit = 10, $cacheExpires = null)
    {
        if (empty($feedUrl))
            return array();

        $rawFeed = $this->__getRawRSS($feedUrl, null, $cacheExpires);
        $feed = $this->__parseRSS($rawFeed);
        
        if (isset($feed['Error']))
            return $feed;
        
        // Count the items first and compare afterwards. Also make sure there is an item list
        // at all, because __parseRSS() returns a plain empty array when there was no data.
        if (isset($feed['Items']) && is_array($feed['Items']) && count($feed['Items']) > $limit)
        {                                
            $feed['Items'] = array_slice($feed['Items'], 0, $limit);
        }

        return $feed;
    }

    /**
     * Returns the raw feed document, either from the cache or freshly downloaded.
     *
     * @param string $feedUrl      Url of the feed to load.
     * @param array  $vars         Optional request variables handed to httpGet().
     * @param mixed  $cacheExpires Optional cache lifetime; the class default is used when empty.
     * @return mixed Whatever cache() returns for the feed (expected to be the raw feed text).
     */
    function __getRawRSS($feedUrl, $vars = array(), $cacheExpires = null)
    {
        // The cache key is derived from the variables exactly as they were passed in.
        $cachePath = $this->cacheFolder.$this->__createCacheHash('.rss', $feedUrl, $vars);

        if (empty($cacheExpires))
            $cacheExpires = $this->cacheExpires;
            
        if (empty($vars))
            $vars = array();
                    
        // Calling cache() without data asks for the stored copy
        $rssData = cache($cachePath, null, $cacheExpires);
        
        if (empty($rssData))
        {
            // Nothing usable in the cache: download the feed and store the result
            $rssData = cache($cachePath, $this->httpGet($feedUrl, $vars));
        }
        
        return $rssData;
    }
    
    /**
     * Detects the RSS version of a feed and dispatches to the matching
     * __parseRSS_<version>() method (dots replaced by underscores).
     *
     * @param string $data Raw feed document.
     * @return array An empty array for empty input, array('Error' => message) if the version
     *               is missing, malformed or unsupported, otherwise the parser's result.
     */
    function __parseRSS($data)
    {
        if (empty($data))
            return array();
    
        // The version attribute may be enclosed in double or single quotes
        $regex = '/\<rss.+version=(["\'])(.+)\\1.*\>/iUs';
        
        if (!preg_match($regex, $data, $match))
            return array('Error' => 'No valid feed (no feed version found).');
        
        $version = $match[2];
                
        if (empty($version))
            $version = '2.0';

        // Check if we have a valid version number. This also guarantees that the method
        // name built below consists of digits and underscores only.
        if (!preg_match('/^[0-9.]+$/iUs', $version))
        {
            return array('Error' => '"'.$version.'" is no valid RSS version.');
        }
                    
        $rssFunction = '__parseRSS_'.str_replace('.', '_', $version);
                   
        if (!method_exists($this, $rssFunction))
        {
            return array('Error' => 'No function for parsing RSS feeds of version "'.$version.'" available.');
        }

        return $this->$rssFunction($data);
    }
    
    
    /**
     * Parses an RSS 2.0 document.
     *
     * @param string $data Raw feed document.
     * @return array array('Channel' => channel fields, 'Items' => list of item fields). Fields have
     *               the structure returned by __getNodeFields(). 'Items' is an empty array if the
     *               feed contains no items.
     */
    function __parseRSS_2_0($data)
    {       
        // First thing we need to do is to identify all html/otherwise formatted contents.
        // (.*) instead of (.+) so that an empty CDATA section ends at its own "]]>" and does
        // not swallow everything up to the end of the next section.
        preg_match_all('/\<\!\[CDATA\[(.*)\]\]\>/iUs', $data, $cdata, PREG_SET_ORDER);

        // Create the md5 hash of the data to parse
        $dataHash = md5($data);
        
        // Now we have to replace them with something that won't confuse our parser, but still keep
        // the array containing their original content. [[CDATA:$dataHash:$cdataNum]] should be pretty
        // unique, so we don't have to deal with errors in an rss feed that talks about this
        // replacement method.
        foreach ($cdata as $cdataNum => $cdataItem)
        {
            $data = str_replace($cdataItem[0], '[[CDATA:'.$dataHash.':'.$cdataNum.']]', $data);
        }            
        
        // Let's get the information about the channel: everything between <channel> and the first
        // <item>. Feeds without any item fall back to everything up to </channel>.
        $channel = array();
        if (preg_match('/\<channel\>(.+)\<item\>/iUs', $data, $match)
            || preg_match('/\<channel\>(.+)\<\/channel\>/iUs', $data, $match))
        {
            $channel = $this->__getNodeFields($match[1], $cdata, $dataHash, 'channel');
        }
        
        // This will get us a list with all Items contained in the feed. The list is created
        // up front so that 'Items' is always an array, even if there is nothing to parse.
        $items = array();
        
        if (preg_match_all('/\<item\>(.+)\<\/item\>/iUs', $data, $matches, PREG_SET_ORDER))
        {                            
            foreach ($matches as $itemNr => $item)
            {   
                // Find all fields in our Item
                $items[$itemNr] = $this->__getNodeFields($item[1], $cdata, $dataHash);
            }
        }
    
        // Return everything
        return array('Channel' => $channel,
                     'Items' => $items);
    }    
    
    /**
     * Splits the inner markup of a node into its child fields.
     *
     * @param string $rawFields Inner markup of the node, with CDATA sections replaced by placeholders.
     * @param array  $cdata     CDATA matches as collected by __parseRSS_2_0(), indexed by placeholder number.
     * @param string $dataHash  Hash that is part of the CDATA placeholders.
     * @param string $type      'channel' makes the <image> child get parsed recursively.
     * @return mixed array(fieldName => array('value' => ..., 'attributes' => ...)), or $rawFields
     *               unchanged if it contains no child nodes. A field name that occurs more than
     *               once keeps only its last occurrence.
     */
    function __getNodeFields($rawFields, $cdata = null, $dataHash = null, $type = null)
    {
        // The first alternative matches <node ...>content</node>, the second one <node ... />.
        // In case you wonder about the \x00's, that's an optimization trick to generate a character
        // set that matches new lines but doesn't require the /s modifier.
        $fieldRegex = '/\<(.+)( [^\x00]*)?\>([^\x00]*)\<\/\\1\>|\<(.+)( [^\x00]*)?\/\>/U';
            
        preg_match_all($fieldRegex, $rawFields, $fieldMatches, PREG_SET_ORDER);

        $cdataRegex = '/\[\[CDATA:'.$dataHash.':([0-9]+)\]\]/iUs';
        $fields = array();
        
        // Loop through those fields
        foreach ($fieldMatches as $fieldMatch)
        {
            // preg_match_all() drops trailing groups that did not take part in the match, so
            // exactly 4 entries mean that the first alternative (node with content) matched.
            if (count($fieldMatch)==4)
            {
                $field = $fieldMatch[1];
                $attributes = $fieldMatch[2];
                $value = $fieldMatch[3];
            }
            else 
            {
                // This is for <nodes ... /> that don't have enclosed content. The attribute
                // group is missing from the match if the node has no attributes at all.
                $field = $fieldMatch[4];
                $attributes = isset($fieldMatch[5]) ? $fieldMatch[5] : '';
                $value = null;
            }
             
            if ($type=='channel' && $field=='image')
            {
                // The child image in channel has child elements in RSS, so let's make sure we
                // parse them too (there is nothing to parse in a self-closing node).
                if ($value !== null)
                    $value = $this->__getNodeFields($value, $cdata, $dataHash);
            }
            elseif ($value !== null)
            {
                // Find CDATA replaced stuff in this field and put it back in
                preg_match_all($cdataRegex, $value, $cdataDummies, PREG_SET_ORDER);
                foreach ($cdataDummies as $cdataDummy)
                {
                    // Replace CDATA dummies with the actual contents of the cdata field
                    if (isset($cdata[$cdataDummy[1]][1]))
                        $value = str_replace($cdataDummy[0], $cdata[$cdataDummy[1]][1], $value);
                }
            }
        
            // Add our new Node to the list of fields, together with its parsed attributes
            $fields[$field] = array('value' => $value,
                                    'attributes' => $this->__getXMLNodeAttributes($attributes));
        }    
        
        // No child nodes found: hand the raw content back untouched
        if (empty($fields))
            return $rawFields;
        
        return $fields;
    }
    
    /**
     * Parses the attribute string of a node.
     *
     * @param string $attributesData Raw attributes including the leading space, e.g. ' url="..." type="..."'.
     * @return array List with one array(name => value) entry per attribute, in document order.
     *               Empty if there are no attributes.
     */
    function __getXMLNodeAttributes($attributesData)
    {
        if (empty($attributesData))
            return array();
    
        // Each attribute has to be preceded by a space; the value may be enclosed in either
        // quote style and ends at the next quote of the same kind.
        preg_match_all('/ ([^ \r\n]+)=(["\'])(.+)\\2/iUs', $attributesData, $attributeMatches, PREG_SET_ORDER);
        
        $attributes = array();
        
        foreach ($attributeMatches as $attribute)
        {
            // $attribute: 0 => raw match, 1 => name, 2 => quote character, 3 => value
            $attributes[] = array($attribute[1] => $attribute[3]);
        }
        
        return $attributes;
    }
}

?>
		

Submit a new version

You can submit a new version of this snippet if you have modified it and you feel it is appropriate to share with others.