Code snippets for symfony 1.x

Snippets tagged "google"

Google Analytics code by filter

This simple filter automatically adds the Google Analytics tracking code to every page. There is no need to modify all your layouts: just put the Google code in app.yml and the filter does everything for you.

Create a lib/filters/gaFilter.class.php file:

<?php
class gaFilter extends sfFilter
{
  public function execute($filterChain)
  {
    // Nothing to do before the action
    $filterChain->execute();
    // Find the Google code and check that the current module is not disabled
    if (($gaCode = sfConfig::get('app_ga_code', false)) !== false
      && !in_array($this->getContext()->getModuleName(), sfConfig::get('app_ga_disabled_modules', array())))
    {
      // Decorate the response with the tracker code
      $response = $this->getContext()->getResponse();
      $response->setContent(str_ireplace('</body>', $gaCode.'</body>', $response->getContent()));
    }
  }
}
 

Add the filter to filters.yml:

# insert your own filters here
 
# Google analytics filter
ga_filter:
  class: gaFilter
 

Now you can insert your GA code into every page. Simply copy the whole tracking code from Google Analytics and add it as the code option under the ga key in your app.yml file:

all:
  ...
  ga:
    code: >
      <script type="text/javascript">
      try {
      var pageTracker = _gat._getTracker("...");
      pageTracker._trackPageview();
      } catch(err) {}</script>
 

This relies on the fact that a </body> tag will probably appear nowhere other than at the very end of your page, and that you will not use it in any content type other than text/html.
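If you prefer to make the content-type assumption explicit rather than implicit, you can guard the replacement. This is only a sketch, not part of the original snippet; it relies on the standard sfWebResponse::getContentType() method:

<?php
// Sketch: inside gaFilter::execute(), after $filterChain->execute(),
// skip any response that is not HTML before injecting the tracker
$response = $this->getContext()->getResponse();
if (stripos($response->getContentType(), 'text/html') === false)
{
  return; // leave JSON, XML, binary responses etc. untouched
}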

Optionally, you can disable the GA filter for specific modules, such as an administration backend. To do this, add the disabled_modules option to your app.yml:

all:
  ...
  ga:
    disabled_modules: [administration]
    code: >
      ...
 
by Radek Petráň on 2010-03-07, tagged analytics  filter  google 

Crawler (Google sitemap XML)

This crawler returns an array in the following format:

array( 'url_address' => array( 'title' => 'page_title', 'code' => 'status_code', 'type' => 'content_type' ) )

Requires PEAR::HTTP_Client.

With the returned array, you can create a sitemap page and/or a sitemap XML for Google Webmaster Central.
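For illustration, a hypothetical result (made-up URLs and values) looks like this:

array(
    'http://example.com/'      => array('title' => 'Home',  'code' => 200, 'type' => 'text/html; charset=utf-8'),
    'http://example.com/about' => array('title' => 'About', 'code' => 200, 'type' => 'text/html; charset=utf-8')
)

The crawler itself: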

<?php
require_once('HTTP/Client.php');
require_once('Net/URL.php');
 
class HTTP_Crawler extends HTTP_Client {
 
    // override: return null so that meta-refresh redirects are not
    // followed automatically (the crawler parses them itself)
    function _getMetaRedirect(&$request){
        return null;
    }
 
    // expose HTTP_Client's internal URL resolution helper
    function getAbsoluteUrl($netUrl, $ambiguousUrl){
        return $this->_redirectUrl($netUrl, $ambiguousUrl);
    }
}
 
class Crawler {
 
    private 
        $baseNetUrl,
        $baseUrlEscaped,
        $client,
        $urlList = array(),
        $sleepMicroSeconds = 500000;    // 0.5 seconds
 
 
    /**
     * constructor
     *
     * @param string $baseUrl your domain name
     * @param int $sleepMicroSeconds crawler sleep time on each request
     */
    function __construct($baseUrl, $sleepMicroSeconds = 500000){
        $this->baseNetUrl = new Net_URL($baseUrl, true);
        $this->baseUrlEscaped = str_replace('/', '\/', preg_quote($baseUrl));
 
        $this->sleepMicroSeconds = $sleepMicroSeconds;
 
        $this->client = new HTTP_Crawler();
        $this->client->setDefaultHeader('User-Agent', 'php crawler');
    }
 
    /**
     * crawls the site recursively, starting from the given URL
     *
     * the results are stored internally; call getUrlList() to
     * retrieve them in this format:
     *
     * array(
     *      'url_address' => array(
     *          'title' => 'page_title',
     *          'code'  => 'status_code',
     *          'type'  => 'content_type'
     *      )
     * )
     * 
     * @param string $url crawling starting URL
     */
    public function clawThis($url){
        $url = $this->client->getAbsoluteUrl($this->baseNetUrl, $url);
        $this->setUrls($url);
        $links = $this->retrieveLinks($url);
        $this->setUrls($links);
 
        foreach ($this->urlList as $eachUrl => $statusCode){
            if($statusCode === false){
                $this->clawThis($eachUrl);
            }
        }
    }
 
    private static function isValidFile($each){
 
        $doNotCrawlFiles = array(
            'jpg',
            'gif',
            'png',
            'zip',
            'lzh',
            'xls',
            'ppt',
            'doc',
            'tif',
            'exe',
            'avi',
            'mpg',
            'swf',
            'mp3',
            '#[^#]*'
        );
 
        if(preg_match('/\.(' . join('|', $doNotCrawlFiles) . ')$/i', $each)){
            return false;
        }
        return true;
    }
 
    private function isLocalAddress($url){
        if(preg_match(sprintf('/^%s/i', $this->baseUrlEscaped), $url)){
            return true;
        }
        return false;
    }
 
    private function setUrls($urls = array()){
        if(!is_array($urls)){
            $urls = array($urls);
        }
 
        foreach($urls as $each){
            $each = $this->client->getAbsoluteUrl($this->baseNetUrl, $each);
 
            if(!$this->isLocalAddress($each) ||
                !$this->isValidFile($each) ||
                isset($this->urlList[$each])){
                continue;
            }
 
            $this->urlList[$each] = false;
        }
    }
 
    private function retrieveLinks($url){
        if($this->urlList[$url] !== false){
            return array();
        }
 
        if(!$this->isLocalAddress($url)){
            return array();
        }
 
        $this->client->get($url);
        $response = $this->client->currentResponse();
        $body = mb_convert_encoding($response['body'], mb_internal_encoding(), mb_detect_encoding($response['body']));
 
        // initialize the entry explicitly (setUrls() stored it as false)
        $this->urlList[$url] = array(
            'title' => '',
            'code'  => $response['code'],
            'type'  => $response['headers']['content-type']
        );
 
        $document = new DOMDocument('1.0', mb_internal_encoding());
        $document->preserveWhiteSpace = false;
        @$document->loadHTML($body); // suppress warnings on malformed markup
 
        foreach ($document->getElementsByTagName('title') as $title){
            $this->urlList[$url]['title'] = $title->textContent; //innerHTML
            break;
        }
 
 
        $links = array();
        foreach ($document->getElementsByTagName('a') as $eachElement){
            $links[] = $eachElement->getAttribute('href');
        }
        foreach ($document->getElementsByTagName('meta') as $eachElement){
            if(preg_match('/refresh/i', $eachElement->getAttribute('http-equiv'))){
                $content = $eachElement->getAttribute('content');
 
                foreach (explode(';', $content) as $each){ // explode() replaces the deprecated split()
                    if(preg_match('/url=(.*)/i', trim($each), $match)){
                        $links[] = $match[1];
                        break;
                    }
                }
            }
        }
 
        usleep($this->sleepMicroSeconds);
        return array_unique($links);
    }
 
    public function getUrlList(){
        $list = array();
        foreach ($this->urlList as $url =>$attributes){
            if(
                preg_match('/text\/html/i', $attributes['type']) &&
                !preg_match('/#(.*)$/i', $url)
            ){
                $list[$url] = $attributes;
            }
        }
 
        ksort($list);
        return $list;
    }
}

You can write a batch script to crawl your site periodically.

<?php
ini_set('max_execution_time', 0);
error_reporting(E_ALL);
require_once('Crawler.class.php');
require_once('File.php');
 
$root = 'http://YOUR_OWN_SITE_URL';
 
$crawler = new Crawler($root);
$crawler->clawThis($root);
 
$pageList = $crawler->getUrlList();
$fileName = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'sitemap.txt'; // the action below reads this file from the project's bin/ directory
File::writeLine($fileName, serialize($pageList), FILE_MODE_WRITE);
File::close($fileName, FILE_MODE_WRITE);

To output the sitemap XML, write an action like this:

public function executeSitemapXml(){
    error_reporting(E_ALL);
 
    $sitemapCacheFile = sfConfig::get('sf_bin_dir') . DIRECTORY_SEPARATOR . 'sitemap.txt';
 
    require_once('File.php');
    $sitemapCache = File::readAll($sitemapCacheFile);
    $this->pageList = unserialize($sitemapCache);
    $this->today = date('Y-m-d', filemtime($sitemapCacheFile));
}
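The response should be served as XML rather than HTML. The original snippet does not show this step; one way to do it (a sketch using the standard symfony 1.x response and layout APIs) is at the top of the action:

<?php
// Sketch: serve the sitemap as XML, without the HTML layout
$this->getResponse()->setContentType('text/xml');
$this->setLayout(false);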

Its template looks like this:

<?xml version="1.0" encoding="UTF-8"?>
<urlset
  xmlns="http://www.google.com/schemas/sitemap/0.84"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
 
<?php foreach ($pageList as $url => $attributes):?>
<url>
  <loc><?php echo $url?></loc>
  <lastmod><?php echo $today?></lastmod>
  <priority>0.8</priority>
  <changefreq>weekly</changefreq>
</url>
<?php endforeach;?>
 
</urlset>

Google expects the sitemap XML at the top level of your site, so you need to write a routing rule like this:

sitemap_xml:
  url:   /sitemapxml
  param: { module: default, action: sitemapXml }
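If you prefer to expose the conventional sitemap.xml filename, a static rule like the following should also work, since symfony 1.x treats the dot as an ordinary segment separator in route patterns. This is an untested sketch, not part of the original snippet:

sitemap_xml:
  url:   /sitemap.xml
  param: { module: default, action: sitemapXml }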

This is actually used on our site, sticker20.com.

by sticker 20 on 2006-10-04, tagged crawler  google  sitemap  xml 