crawler (google sitemap xml)

This crawler returns an array. The format looks like this:

array( 'url_address' => array( 'title' => 'page_title', 'code' => 'status_code', 'type' => 'content_type' ) )

* Requires PEAR::HTTP_Client (which itself depends on Net_URL). The batch script further down also uses PEAR::File.
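
If the packages are missing, they can usually be installed with the PEAR installer (package names as published on pear.php.net; PEAR::File is only needed for the batch script further down):

pear install --alldeps HTTP_Client
pear install File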

With the returned array, you can build a sitemap page and/or a sitemap XML file for Google Webmaster Central.
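
For example, after a crawl the list returned by getUrlList() might look like this (hypothetical URLs and values):

array(
    'http://example.com/' => array(
        'title' => 'Top page',
        'code'  => 200,
        'type'  => 'text/html; charset=utf-8'
    ),
    'http://example.com/about' => array(
        'title' => 'About us',
        'code'  => 200,
        'type'  => 'text/html; charset=utf-8'
    )
)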

<?php
require_once('HTTP/Client.php');
require_once('Net/URL.php');    // Net_URL is used to build the base URL object
 
class HTTP_Crawler extends HTTP_Client {
 
    // override: never follow <meta> refresh redirects automatically;
    // the crawler extracts them itself in retrieveLinks()
    function _getMetaRedirect(&$request){
        return null;
    }
 
    // expose HTTP_Client's internal URL resolution, so relative
    // links can be turned into absolute URLs
    function getAbsoluteUrl($netUrl, $ambiguousUrl){
        return $this->_redirectUrl($netUrl, $ambiguousUrl);
    }
}
 
class Crawler {
 
    private 
        $baseNetUrl,
        $baseUrlEscaped,
        $client,
        $urlList = array(),
        $sleepMicroSeconds = 500000;    // 0.5 seconds
 
 
    /**
     * constructor
     *
     * @param string $baseUrl your site's base URL (e.g. http://example.com/)
     * @param int $sleepMicroSeconds crawler sleep time on each request
     */
    function __construct($baseUrl, $sleepMicroSeconds = 500000){
        $this->baseNetUrl = new Net_URL($baseUrl, true);
        $this->baseUrlEscaped = preg_quote($baseUrl, '/');
 
        $this->sleepMicroSeconds = $sleepMicroSeconds;
 
        $this->client = new HTTP_Crawler();
        $this->client->setDefaultHeader('User-Agent', 'php crawler');
    }
 
    /**
     * crawls the site recursively, starting at $url
     *
     * nothing is returned directly; the found pages are collected
     * internally and can be read afterwards with getUrlList(), in
     * this format:
     *
     * array(
     *      'url_address' => array(
     *          'title' => 'page_title',
     *          'code'  => 'status_code',
     *          'type'  => 'content_type'
     *      )
     * )
     *
     * @param string $url crawling starting URL
     */
    public function crawlThis($url){
        $url = $this->client->getAbsoluteUrl($this->baseNetUrl, $url);
        $this->setUrls($url);
        $links = $this->retrieveLinks($url);
        $this->setUrls($links);
 
        foreach ($this->urlList as $eachUrl => $statusCode){
            if($statusCode === false){
                $this->crawlThis($eachUrl);
            }
        }
    }
 
    private static function isValidFile($each){
 
        $doNotCrawlFiles = array(
            'jpg',
            'gif',
            'png',
            'zip',
            'lzh',
            'xls',
            'ppt',
            'doc',
            'tif',
            'exe',
            'avi',
            'mpg',
            'swf',
            'mp3',
            '#[^#]*'
        );
 
        if(preg_match('/\.(' . join('|', $doNotCrawlFiles) . ')$/i', $each)){
            return false;
        }
        return true;
    }
 
    private function isLocalAddress($url){
        if(preg_match(sprintf('/^%s/i', $this->baseUrlEscaped), $url)){
            return true;
        }
        return false;
    }
 
    private function setUrls($urls = array()){
        if(!is_array($urls)){
            $urls = array($urls);
        }
 
        foreach($urls as $each){
            $each = $this->client->getAbsoluteUrl($this->baseNetUrl, $each);
 
            if(!$this->isLocalAddress($each) ||
                !$this->isValidFile($each) ||
                isset($this->urlList[$each])){
                continue;
            }
 
            $this->urlList[$each] = false;
        }
    }
 
    private function retrieveLinks($url){
        if(!isset($this->urlList[$url]) || $this->urlList[$url] !== false){
            // already fetched, or filtered out by setUrls()
            return array();
        }
 
        if(!$this->isLocalAddress($url)){
            return array();
        }
 
        $this->client->get($url);
        $response = $this->client->currentResponse();
        $body = mb_convert_encoding($response['body'], mb_internal_encoding(), mb_detect_encoding($response['body']));
 
 
        $this->urlList[$url] = array(
            'title' => '',
            'code'  => $response['code'],
            'type'  => $response['headers']['content-type']
        );
 
        $document = new DOMDocument('1.0', mb_internal_encoding());
        $document->preserveWhiteSpace = false;
        @$document->loadHTML($body);    // @: real-world markup is rarely well-formed
 
        foreach ($document->getElementsByTagName('title') as $title){
            $this->urlList[$url]['title'] = $title->textContent;
            break;  // only the first <title> is needed
        }
 
 
        $links = array();
        foreach ($document->getElementsByTagName('a') as $eachElement){
            $links[] = $eachElement->getAttribute('href');
        }
        foreach ($document->getElementsByTagName('meta') as $eachElement){
            if(preg_match('/refresh/i', $eachElement->getAttribute('http-equiv'))){
                $content = $eachElement->getAttribute('content');
 
                foreach (explode(';', $content) as $each){    // split() is deprecated; the delimiter is a plain string anyway
                    if(preg_match('/url=(.*)/i', trim($each), $match)){
                        $links[] = $match[1];
                        break;
                    }
                }
            }
        }
 
        usleep($this->sleepMicroSeconds);
        return array_unique($links);
    }
 
    public function getUrlList(){
        $list = array();
        foreach ($this->urlList as $url => $attributes){
            if(
                is_array($attributes) &&    // skip URLs that were never fetched
                preg_match('/text\/html/i', $attributes['type']) &&
                !preg_match('/#(.*)$/i', $url)
            ){
                $list[$url] = $attributes;
            }
        }
 
        ksort($list);
        return $list;
    }
}

You can write a batch script to crawl your site periodically:

<?php
ini_set('max_execution_time', 0);
error_reporting(E_ALL);
require_once('Crawler.class.php');
require_once('File.php');   // PEAR::File
 
$root = 'http://YOUR_OWN_SITE_URL';
 
$crawler = new Crawler($root);
$crawler->crawlThis($root);
 
$pageList = $crawler->getUrlList();
// run this script from your project's bin/ directory, so the
// action below can find sitemap.txt via sf_bin_dir
$fileName = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'sitemap.txt';
File::writeLine($fileName, serialize($pageList), FILE_MODE_WRITE);
File::close($fileName, FILE_MODE_WRITE);
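
To run it periodically, a crontab entry along these lines rebuilds the cache every night at 3 a.m. (the script path here is only an illustration):

0 3 * * * php /path/to/myproject/bin/crawl_sitemap.php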

To output the sitemap XML, write an action like this:

public function executeSitemapXml(){
    error_reporting(E_ALL);
 
    $sitemapCacheFile = sfConfig::get('sf_bin_dir') . DIRECTORY_SEPARATOR . 'sitemap.txt';
 
    require_once('File.php');
    $sitemapCache = File::readAll($sitemapCacheFile);
    $this->pageList = unserialize($sitemapCache);
    $this->today = date('Y-m-d', filemtime($sitemapCacheFile));
}
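
The template below starts with an XML declaration, so the response must not be wrapped in your HTML layout and should be served with an XML content type. A minimal sketch using the standard symfony 1.x action and response methods, added to the action above:

$this->setLayout(false);                           // don't decorate the XML with the HTML layout
$this->getResponse()->setContentType('text/xml');  // serve as XML instead of text/html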

Its template looks like this:

<?xml version="1.0" encoding="UTF-8"?>
<urlset
  xmlns="http://www.google.com/schemas/sitemap/0.84"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
 
<?php foreach ($pageList as $url => $attributes):?>
<url>
  <loc><?php echo htmlspecialchars($url)?></loc>
  <lastmod><?php echo $today?></lastmod>
  <priority>0.8</priority>
  <changefreq>weekly</changefreq>
</url>
<?php endforeach;?>
 
</urlset>
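
Rendered with a crawled page list, the output looks something like this (hypothetical URL and date):

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
<url>
  <loc>http://example.com/about</loc>
  <lastmod>2006-10-04</lastmod>
  <priority>0.8</priority>
  <changefreq>weekly</changefreq>
</url>
</urlset>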

Google expects the sitemap XML to live at the top level of your site, so you need to write a routing rule like this:

sitemap_xml:
  url:   /sitemapxml
  param: { module: default, action: sitemapXml }
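
With the route in place, the sitemap is served at http://YOUR_OWN_SITE_URL/sitemapxml; that is the URL to submit to Google Webmaster Central. You can also advertise it from robots.txt with the standard sitemaps.org directive:

Sitemap: http://YOUR_OWN_SITE_URL/sitemapxml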

This is actually used on our site, sticker20.com.

by sticker 20 on 2006-10-04, tagged crawler, google, sitemap, xml