Custom Solr indexer for an RSS feed

In one of my recent projects we split a TYPO3 website with an integrated blog (t3blog extension) into a TYPO3-based website and a WordPress-based blog.
However, we still wanted the blog entries to be searchable via Solr on the TYPO3 website.

The two websites run on different machines and I couldn't grant remote database access. So the simplest thing that came to my mind was to use a dedicated Solr indexer for the WordPress RSS feed. We could have used Apache Nutch, but I think it is far too complex for this requirement.

We used version 3.0.0 of the TYPO3 Solr extension.

So here is my solution:

First of all, I created a scheduler task through the Extbase API by registering a command controller:

File: ext_localconf.php

// Stop immediately when this file is not loaded from within TYPO3.
defined('TYPO3_MODE') || die('Access denied.');

// Register the Solr command controller with Extbase when TYPO3_MODE is 'BE'.
if (TYPO3_MODE === 'BE') {
    $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['extbase']['commandControllers'][] =
        'Ssch\\SschBlog\\Command\\SolrCommandController';
}

File: SolrCommandController.php

<?php

namespace Ssch\SschBlog\Command;

/**
 * Extbase command controller that hands blog feed entries to the Solr feed indexer.
 */
class SolrCommandController extends \TYPO3\CMS\Extbase\Mvc\Controller\CommandController {

    /**
     * Repository queried for the entries of a feed URL (via findByUrl()).
     *
     * @var \Ssch\SschBlog\Domain\Repository\FeedRepository
     * @inject
     */
    protected $feedRepository;

    /**
     * Provides the Solr connection that belongs to a root page.
     *
     * @var \Tx_Solr_ConnectionManager
     * @inject
     */
    protected $solrConnectionManager;

    /**
     * Indexer that converts the feed entries into Solr documents and sends them.
     *
     * @var \Ssch\SschBlog\Indexer\FeedIndexer
     * @inject
     */
    protected $feedIndexer;

    /**
     * Add feed to the solr index
     *
     * Collects the entries of every given feed URL (several URLs may be
     * passed as one comma-separated string) and passes them to the feed
     * indexer, using the Solr connection of the given root page.
     *
     * @param string $url The url of the feed
     * @param integer $rootPageUid The uid of the rootpage
     * @return void
     */
    public function addBlogFeedCommand($url, $rootPageUid = NULL) {
        // TRUE removes empty list entries, so a stray or trailing comma
        // ("a,,b", "a,") no longer causes a lookup for an empty URL.
        $feedUrls = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $url, TRUE);

        $allFeeds = array();
        // A separate loop variable keeps the $url argument intact.
        foreach ($feedUrls as $feedUrl) {
            $feeds = $this->feedRepository->findByUrl($feedUrl);
            $allFeeds = array_merge($allFeeds, $feeds->toArray());
        }

        $solrService = $this->solrConnectionManager->getConnectionByRootPageId($rootPageUid);
        $this->feedIndexer->initialize($solrService);
        $this->feedIndexer->indexFeeds($allFeeds, $rootPageUid);
    }
}

It is up to you to implement the RSS feed importer that maps the feed items to the following model:

Feed.php

<?php

namespace Ssch\SschBlog\Domain\Model;


/**
 * Plain value object holding the data of a single feed item
 * (title, content, description, link, publication date, category, author).
 */
class Feed {

    /**
     * Title of the item.
     *
     * @var string
     */
    protected $title;

    /**
     * Content of the item.
     *
     * @var string
     */
    protected $content;

    /**
     * Description of the item.
     *
     * @var string
     */
    protected $description;

    /**
     * Link of the item.
     *
     * @var string
     */
    protected $link;

    /**
     * Publication date of the item; NULL when not set.
     *
     * @var \DateTime
     */
    protected $pubDate;

    /**
     * Category of the item.
     *
     * @var string
     */
    protected $category;

    /**
     * Name of the author.
     *
     * @var string
     */
    protected $authorName;

    /**
     * E-mail address of the author.
     *
     * @var string
     */
    protected $authorEmail;

    /**
     * @return string The title, or NULL when none has been set
     */
    public function getTitle() {
        return $this->title;
    }

    /**
     * @param string $title
     * @return void
     */
    public function setTitle($title) {
        $this->title = $title;
    }

    /**
     * @return string The content, or NULL when none has been set
     */
    public function getContent() {
        return $this->content;
    }

    /**
     * @param string $content
     * @return void
     */
    public function setContent($content) {
        $this->content = $content;
    }

    /**
     * @return string The description, or NULL when none has been set
     */
    public function getDescription() {
        return $this->description;
    }

    /**
     * @param string $description
     * @return void
     */
    public function setDescription($description) {
        $this->description = $description;
    }

    /**
     * @return string The link, or NULL when none has been set
     */
    public function getLink() {
        return $this->link;
    }

    /**
     * @param string $link
     * @return void
     */
    public function setLink($link) {
        $this->link = $link;
    }

    /**
     * @return \DateTime The publication date, or NULL when none has been set
     */
    public function getPubDate() {
        return $this->pubDate;
    }

    /**
     * @param \DateTime $pubDate The publication date; NULL clears it
     * @return void
     */
    public function setPubDate(\DateTime $pubDate = NULL) {
        $this->pubDate = $pubDate;
    }

    /**
     * @return string The category, or NULL when none has been set
     */
    public function getCategory() {
        return $this->category;
    }

    /**
     * @param string $category
     * @return void
     */
    public function setCategory($category) {
        $this->category = $category;
    }

    /**
     * @return string The author name, or NULL when none has been set
     */
    public function getAuthorName() {
        return $this->authorName;
    }

    /**
     * @param string $authorName
     * @return void
     */
    public function setAuthorName($authorName) {
        $this->authorName = $authorName;
    }

    /**
     * @return string The author e-mail address, or NULL when none has been set
     */
    public function getAuthorEmail() {
        return $this->authorEmail;
    }

    /**
     * @param string $authorEmail
     * @return void
     */
    public function setAuthorEmail($authorEmail) {
        $this->authorEmail = $authorEmail;
    }

    /**
     * Builds "<title> - <author name>", leaving out whichever part is empty.
     *
     * @return string The combined label; an empty string when both parts are empty
     */
    public function getTitleAndAuthor() {
        // array_filter() without a callback drops the same falsy values
        // that a plain if ($value) check would skip.
        $parts = array_filter(array($this->getTitle(), $this->getAuthorName()));
        return implode(' - ', $parts);
    }

    /**
     * String representation of the item: its title.
     *
     * @return string The title, or an empty string when no title has been set
     */
    public function __toString() {
        // __toString() has to return a string; the cast covers the case
        // where the title is still NULL.
        return (string) $this->getTitle();
    }

}

So far, so good. Now for the really relevant part, the Solr indexer:

<?php

namespace Ssch\SschBlog\Indexer;


use \TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Turns Feed objects into Solr documents and sends them to a Solr service.
 */
class FeedIndexer {

    /**
     * Value written to the "type" field of every document built here;
     * the same value is used to delete those documents before re-indexing.
     */
    const ITEM_TYPE = 'blog';

    /**
     * Solr connection
     *
     * @var \tx_solr_SolrService
     */
    protected $solr = NULL;

    /**
     * Uid of the root page the documents are indexed for; set by indexFeeds().
     *
     * @var integer
     */
    protected $rootPageId;

    /**
     * Sets the Solr service to work with. Must be called before indexFeeds().
     *
     * @param \tx_solr_SolrService $solrService
     * @return void
     */
    public function initialize($solrService) {
        $this->solr = $solrService;
    }

    /**
     * Replaces the indexed documents of type ITEM_TYPE with the given feeds.
     *
     * The existing documents of that type are deleted and the deletion is
     * committed before the new documents are added.
     *
     * @param array $feeds The \Ssch\SschBlog\Domain\Model\Feed objects to index
     * @param integer $rootPageId The uid of the root page the feeds belong to
     * @return void
     */
    public function indexFeeds($feeds, $rootPageId) {
        $this->solr->deleteByType(self::ITEM_TYPE);
        $this->solr->commit();
        $this->rootPageId = $rootPageId;

        $solrDocuments = array();
        $itemIndex = 0;
        foreach ($feeds as $feed) {
            /* @var $feed \Ssch\SschBlog\Domain\Model\Feed */
            $solrDocuments[] = $this->feedToSolrDocument($feed, $itemIndex);
            $itemIndex++;
        }

        // Nothing to add: do not send an empty add request.
        if (empty($solrDocuments)) {
            return;
        }

        // @TODO: Implement Exception Handling here
        $this->solr->addDocuments($solrDocuments);
    }

    /**
     * Builds the Solr document for a single feed item.
     *
     * @param \Ssch\SschBlog\Domain\Model\Feed $feed The feed item to convert
     * @param integer $itemIndex Position of the item in the list being indexed; becomes part of the document id
     * @return \Apache_Solr_Document
     */
    protected function feedToSolrDocument(\Ssch\SschBlog\Domain\Model\Feed $feed, $itemIndex) {
        // Resolve the site once; both the site hash and the domain come from it.
        $site = \Tx_Solr_Site::getSiteByPageId($this->rootPageId);

        // The document id combines a short hash of the link with the item position.
        $documentId = \Tx_Solr_Util::getDocumentId(
            self::ITEM_TYPE,
            $this->rootPageId,
            GeneralUtility::shortMD5($feed->getLink()) . '_' . $itemIndex
        );

        $solrDocument = GeneralUtility::makeInstance('Apache_Solr_Document');
        /* @var $solrDocument \Apache_Solr_Document */
        $solrDocument->addField('id', $documentId);
        $solrDocument->setField('appKey', 'EXT:ssch_blog');
        $solrDocument->addField('type', self::ITEM_TYPE);
        $solrDocument->addField('siteHash', $site->getSiteHash());
        $solrDocument->addField('site', $site->getDomain());
        $solrDocument->addField('title', $feed->getTitleAndAuthor());
        // Only the text of the content is indexed; markup is stripped.
        $solrDocument->addField('content', strip_tags($feed->getContent()));
        $solrDocument->addField('description', $feed->getDescription());
        $solrDocument->addField('url', $feed->getLink());
        $solrDocument->addField('language', 0);
        $solrDocument->addField('group', '0');
        $solrDocument->addField('altType_stringS', self::ITEM_TYPE);
        return $solrDocument;
    }

}

As you can see, it is quite easy to create a dedicated indexer thanks to the very clever API of the TYPO3 Solr extension.
I just wanted to give you an idea of how to implement such an indexer. If you have any questions, don't hesitate to ask me.

Veröffentlicht von

avatar

Sebastian Schreiber

Ich bin 1980 in Bergisch Gladbach geboren. Nach dem Abitur und Zivildienst habe ich 2000 an der Fachhochschule Lippe & Höxter das Studium der Medienproduktion begonnen und nach längeren Aufenthalten in Valencia und Berlin 2003 das Studium mit dem Abschluss Bachelor of Science in Medienproduktion abgeschlossen. Nach einer Festanstellung kurz nach dem Studium in einer kleinen Webagentur in Köln mit Schwerpunkt TYPO3 bin ich nun seit 2008 freiberuflicher Webentwickler.

Schreibe einen Kommentar

Deine E-Mail-Adresse wird nicht veröffentlicht. Erforderliche Felder sind mit * markiert.