Searching With Perl

YAPC::EU 2006 - Birmingham, British Empire

August 30, 2006

Why I care

continued...

Why I care

continued...

Why I care

continued...

Why I care

continued...

Why I care

continued...

Why I care

In The Beginning

  # Report every input file whose contents match the pattern given as the
  # first command-line argument (case-insensitive).
  my $search = shift;
  undef $/;    # no record separator: each read returns a whole file
  while (my $contents = <>) {
    print "$ARGV contains $search\n" if $contents =~ /$search/i;
  }

Back to the Present

Clucene / Lucene

Plucene

Search::Xapian

KinoSearch

Search::Indexer

testbed

Indexing Results

Indexing size

Searching

Plucene::Simple

use Plucene::Simple;

# Build a Plucene::Simple index from the input files: one document per
# file, keyed by the digits that precede ".txt" in the file name.
my $dir = "./data/plucene.simple.full";
my $index = Plucene::Simple->open( $dir );
local $/;    # slurp mode: each read of <> returns a whole file
while (<>) {
  # The dot is escaped so only a literal ".txt" matches; files whose
  # name carries no numeric id are skipped instead of indexed as undef.
  my ($id) = ($ARGV =~ /(\d+)\.txt/) or next;
  $index->index_document($id => $_);
}
$index->optimize;

Plucene::Simple (Search)

# Search the Plucene::Simple index for the command-line words and print a
# context excerpt from every matching document.
# NOTE(review): read_file() and Text::Context are not imported in this
# excerpt -- read_file is presumably File::Slurp's; confirm.
my $query = join(' ',@ARGV);
if ($query) {
    # Split once, outside the loop. The ' ' form also ignores leading
    # whitespace, so no empty keyword reaches Text::Context.
    my @keywords = split ' ', $query;
    my $index = Plucene::Simple->open("./data/plucene.simple.full");
    foreach my $id ($index->search($query)) {
      my $text = read_file("text/$id.txt");
      my $match = Text::Context->new($text, @keywords);
      print $match->as_text,"\n";
    }
}

Plucene

# Index each input file as one Plucene document carrying a Keyword field
# "id" (digits from the file name) and an UnStored field "text".
my $indexer = Plucene::Index::Writer->new(
    "./data/plucene.full",
    Plucene::Analysis::SimpleAnalyzer->new(), # cheat
    1);  # presumably the "create a new index" flag -- confirm
local $/;    # slurp mode: each read of <> returns a whole file
while (<>) {
  # Escaped dot: match a literal ".txt"; skip files without a numeric id.
  my ($id) = ($ARGV =~ /(\d+)\.txt/) or next;
  my $doc = Plucene::Document->new();
  $doc->add(Plucene::Document::Field->Keyword(id => $id));
  $doc->add(Plucene::Document::Field->UnStored(text => $_));
  $indexer->add_document($doc);
}
$indexer->optimize;

Plucene (Searching)

# Parse the query and run it against the Plucene index, collecting the
# document of every hit into @docs.
# $querystring (the user's query text) is defined outside this excerpt.
my $searcher = Plucene::Search::IndexSearcher->new( "./data/plucene.full" );
my $parser = Plucene::QueryParser->new({
    # Same analyzer class the index was written with.
    analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
    default  => "text"});
my $parsedq = $parser->parse($querystring);

my @docs;
my $hc = Plucene::Search::HitCollector->new(
      collect => sub {
          # Called once per hit; $score is available but unused here.
          my ($self, $doc, $score) = @_;
          push @docs, $searcher->doc($doc);
      });

$searcher->search_hc($parsedq, $hc);

Plucene (Searching) continues

# Reduce the collected documents to their "id" values, then print a
# context excerpt from each matching file.
my @results = map {
  $_->get("id")->string
} @docs;

# Split the query once; it is the same $querystring that was parsed for
# the search. The ' ' form drops leading whitespace, so no empty keyword.
my @keywords = split ' ', $querystring;
foreach my $id (@results) {
  my $text = read_file("text/$id.txt");
  my $match = Text::Context->new($text, @keywords);
  print $match->as_text,"\n";
}

Plucene Redux

Search::Indexer

use Search::Indexer;

# Index each input file as one document, keyed by the digits that
# precede ".txt" in its name.
my $indexer = Search::Indexer->new(
    dir => './data/SearchIndexer',
    writeMode => 1);

# Slurp mode: without it <> returns single lines, and every line of a
# file would be added under that file's one id.
local $/;
while (<>) {
  # Escaped dot: match a literal ".txt"; skip files without a numeric id.
  my ($id) = ($ARGV =~ /(\d+)\.txt/) or next;
  $indexer->add($id, $_);
}

Search::Indexer (Search) ParseRequest

# Search the Search::Indexer index for the command-line words and print
# each hit's id, score and excerpts, best score first.
# Searching only reads the index, so it is not opened in write mode.
my $indexer = Search::Indexer->new(dir => './data/SearchIndexer');

my $result = $indexer->search(join(' ',@ARGV));
my $scores = $result->{scores};
# Hash keys come back in arbitrary order; rank the hits by score.
my @ids = sort { $scores->{$b} <=> $scores->{$a} } keys %$scores;
my $killedWords = join ", ", @{$result->{killedWords}};
print scalar(@ids), " documents found\n";
print "words $killedWords were ignored during the search\n" if $killedWords;
foreach my $id (@ids) {
  my $text = read_file("text/$id.txt");
  my $score = $scores->{$id};
  my $excerpts = join "\n", @{$indexer->excerpts($text, $result->{regex})};
  print "$id, score $score:\n$excerpts\n\n";
}

KinoSearch

# Create a fresh KinoSearch inverted index under ./data/kino, using an
# English PolyAnalyzer, and declare its two fields.
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

my $invindexer = KinoSearch::InvIndexer->new(
    analyzer => $analyzer,
    invindex => './data/kino',
    create   => 1,
);

# One spec_field call per entry: "text" first, then "id".
my @field_specs = (
    [ name => 'text', vectorized => 1 ],
    [ name => 'id',   analyzed   => 0 ],
);
$invindexer->spec_field(@$_) for @field_specs;

KinoSearch (continued)

# Add each input file to the KinoSearch index as one document with a
# "text" field (file contents) and an "id" field (digits from its name).
# Slurp mode: without it <> returns single lines, producing one document
# per line rather than one per file.
local $/;
while (<>) {
  # Escaped dot: match a literal ".txt"; skip files without a numeric id.
  my ($id) = ($ARGV =~ /(\d+)\.txt/) or next;
  my $doc = $invindexer->new_doc;
  $doc->set_value( text => $_);
  $doc->set_value( id => $id);
  $invindexer->add_doc($doc);
}

$invindexer->finish;

KinoSearch (search)

# Search the KinoSearch index for the command-line words and print each
# hit's id, score and a highlighted excerpt of its "text" field.
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
    language => 'en');
my $searcher = KinoSearch::Searcher->new(
    invindex => './data/kino',
    analyzer => $analyzer);
my $hits = $searcher->search(join(' ',@ARGV));
my $highlighter = KinoSearch::Highlight::Highlighter->new(
    excerpt_field => 'text' );
$hits->create_excerpts( highlighter => $highlighter );
while ( my $hit = $hits->fetch_hit_hashref ) {
  # End each record with a newline so the next hit's id does not run
  # straight on from this excerpt.
  print "$hit->{id}: $hit->{score}\n$hit->{excerpt}\n";
}

Search::Xapian

use Search::Xapian ':db';

# Index each input file as one Xapian document: the document data is the
# numeric id from the file name, the terms are the stemmed words of four
# or more characters.
my $db = Search::Xapian::WritableDatabase->new(
  './data/xapian',DB_CREATE_OR_OPEN);
my $stemmer = Search::Xapian::Stem->new('english');

# Slurp mode: without it <> returns single lines, and every line would
# become its own document carrying the same id.
local $/;
while (<>) {
  # Escaped dot: match a literal ".txt"; skip files without a numeric id.
  my ($id) = ($ARGV =~ /(\d+)\.txt/) or next;
  my $doc = Search::Xapian::Document->new;
  $doc->set_data($id);
  foreach my $word (split(' ',$_)) {
    next if (length $word < 4);    # ignore short words
    $doc->add_term($stemmer->stem_word($word));
  }
  $db->add_document($doc);
}

Search::Xapian (Search)

use Search::Xapian qw(:ops);

# OR together the command-line words, then print the first 12 matches:
# a "docid score N%" header line followed by the matching file's text.
my $db = Search::Xapian::Database->new( './data/xapian' );
# if stemming, or parsing text, see QueryParser
my $enq = $db->enquire( OP_OR, @ARGV );

foreach my $match ( $enq->matches(0,12) ) {
  # The header ends with a newline so the file contents start on their
  # own line instead of running on from the percentage.
  printf "%d score %d%%\n",
         $match->get_docid(),
         $match->get_percent();
  my $doc = $match->get_document();
  # The document data holds the numeric id stored at index time.
  print read_file(sprintf("text/%i.txt",$doc->get_data()));
}

Nifty Bits (Things you'll want to do)

Nifty Bits (continues)

scoring/weighting

In the end

Questions?