#!/usr/local/perl5.005_56.Mar06/bin/perl -w
######################### -*- Mode: Perl -*- #########################
##
## $Basename: cpanwait $
## $Revision: 1.7 $
##
## Author           : Ulrich Pfeifer
## Created On       : Sat Jan  4 18:09:28 1997
##
## Last Modified By : Ulrich Pfeifer
## Last Modified On : Sun Nov 22 18:44:36 1998
## 
## Copyright (c) 1996-1997, Ulrich Pfeifer
## 
## 
######################################################################

eval 'exec perl -S $0 "$@"'
  if 0;


use strict;

use File::Path;
use DB_File;
use Getopt::Long;
use File::Find;
use File::Basename;
use IO::File;

require WAIT::Config;
require WAIT::Database;
require WAIT::Parse::Pod;
require WAIT::Document::Tar;


my %OPT = (database    => 'DB',
           dir         => $WAIT::Config->{WAIT_home} || '/tmp',
           table       => 'cpan',
           clean       => 0,
           remove      => [],
           force       => 0,
           cpan        => '/usr/src/perl/CPAN/sources',
           trust_mtime => 1,
           match       => 'authors/id/',
           test        => 0,
#            cpan        => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN',
           cpan        => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN',
           keep        => '/app/unido-i06/src/share/lang/perl/96a/CPAN/sources',
          );

GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'cpan=s',
           'table=s',
           'keep=s',
           'match=s',
           'clean!',
           'test=i',            # test level 0: normal
                                #            1: don't change db
                                #            2: don't look at archives even

           'remove=s@',
           'force!',            # force indexing even if seen
           'trust_mtime!',      # use mtime instead of version number
          ) || die "Usage: ...\n";


clean_database(
               database => $OPT{database},
               dir      => $OPT{dir},
               table    => $OPT{table},
              ) if $OPT{clean};

my $db = WAIT::Database->open(
                              name        => $OPT{database},
                              'directory' => $OPT{dir},
                             )
  ||     WAIT::Database->create(
                                name        => $OPT{database},
                                'directory' => $OPT{dir},
                               )
  or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@";

my $layout= new WAIT::Parse::Pod;

my $tb = $db->table(name => $OPT{table})
  || create_table(db => $db, table => $OPT{table}, layout => $layout);

# Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version
# considerations. Value *must* match common prefix. Aliasing should be
# used if CPAN contains serveral distributions with different name but
# same root directory.
# We still have a problem if there are different root directories!

my %ALIAS = (# tar name                  real (root) name
             'latest'                => 'perl',
             'perl5db-kit'           => 'DB',
             'SGI-FM'                => 'FM',
             'net-ext'               => 'Net',
             'VelocisSQL'            => 'Velocis',
             'Net-ext'               => 'Net',
             'Curses-DevKit'         => 'Cdk',
             'PostgresPerl'          => 'Postgres',
             'perlpdf'               => 'PERLPDF',
             'Des-perl'              => 'Des',
             'SGI-GL'                => 'GL',
             'DBD-DB2'               => 'DB2',
         );
my %NEW_ALIAS;                             # found in this pass

# Map module names to pathes. Generated by wanted() doing alisaing.
my %ARCHIVE;

# Map module names to latest version. Generated by wanted()
my %VERSION;


# Mapping for modules with common root not matching modules name that
# are not aliased. This is just for prefix stripping and not strictly
# necessary.  Takes effect after version considerations.
my %TR = (# tar name                root to strip
          'Net_SSLeay.pm'        => 'SSLeay/',
          'EventDrivenServer'    => 'Server/',
          'bio_lib.pl.'          => '',
          'AlarmCall'            => 'Sys/',
          'Cdk-ext'              => 'Cdk/',
          'Sx'                   => '\d.\d/',
          'DumpStack'            => 'Devel/',
          'StatisticsDescriptive'=> 'Statistics/',
          'Term-Gnuplot'         => 'Gnuplot/',
          'iodbc_ext'            => 'iodbc-ext-\d.\d/',
          'UNIVERSAL'            => '',
          'Term-Query'           => 'Query/',
          'SelfStubber'          => 'Devel/',
          'CallerItem'           => 'Devel/',
         );

my $DIR  = $tb->dir;
my $DATA = $tb->dir . "/data";
my $LWP;


if (@{$OPT{remove}}) {
  my $pod;
  for $pod (@{$OPT{remove}}) {
    unless (-e $pod) {
      $pod = "$DIR/$pod";
    }
    index_pod(file => $pod, remove => 1) if -f $pod;
    unlink $pod or warn "Could not unlink '$pod': $!\n";
    #$tb->sync;
  }
  $tb->close;
  $db->close;
  exit;
}

# Now get the beef
if ($OPT{cpan} =~ /^(http|ftp):/) {
  $LWP = 1;
  require LWP::Simple;
  LWP::Simple->import();

  mkpath($DATA,1,0755) or
    die "Could not generate '$DATA/': $!"
      unless -d $DATA;

  if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) {
    my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz");
    if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
      # we could use Net:FTP here ...
      die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n";
    }
  }
  my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |";
  die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh;

  my $line;
  while (defined ($line = <$fh>)) {
    chomp($line);
    my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11];

    next if defined $is_link;
    my $mtime = mtime($mon, $mday, $time);

    $file             =~ s:^\./::;
    ($_)              =  fileparse($file);
    $File::Find::name = $file;
    wanted($mtime);
  }
} else {
  find(sub {&wanted((stat($_))[9])}, $OPT{cpan});
}

ARCHIVE:
for my $tar (sort keys %ARCHIVE) {
  next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o;
  my $base = (split /\//, $ARCHIVE{$tar})[-1];
  my $parent;

  # logging
  if ($OPT{trust_mtime}) {
    printf "%-20s %10s %s\t", $tar,
        substr(scalar(localtime($VERSION{$tar})),0,10), $base;
  } else {
    printf "%-20s %10.5f %s\t", $tar, $VERSION{$tar}, $base;
  }

  # Remember the archive
  # We should have an extra table for the tar file data ...
  if (!$OPT{force} and $tb->have(docid => $base)) {
    print "skipping\n";
    next ARCHIVE;
  } else {
    $parent = $tb->insert(docid    => $base,
                          headline => $ARCHIVE{$tar}) unless $OPT{test};
    print "indexing\n";
  }

  next ARCHIVE if $OPT{test} > 1;

  my $TAR = myget($tar);

  next ARCHIVE unless $TAR;                # not able to fetch it

  my %tar;
  tie (%tar,
       'WAIT::Document::Tar',
       sub { $_[0] =~ /\.(pm|pod|PL)$/ or $_[0] =~ /readme/i},
       #sub { $_[0] !~ m:/$: },
       $TAR)
    or warn "Could not tie '$TAR'\n";

  my $sloppy;
  my ($key, $val);

 FILE:
  while (($key, $val) = each %tar) {
    my $file = fname($key);

    # don't index directories
    next if $file =~ /\/$/;

    # is it a POD file?
    next FILE unless $file =~ /readme/i or $val =~ /\n=head/;

    # remove directory prefix
    unless ($sloppy                        # no common root
            or $file =~ s:^\Q$tar\E[^/]*/::    # common root, maybe alias
            or ($TR{$tar}                  # common root, not aliased
                and $file =~ s:^\Q$TR{$tar}\E::)
           ) {
      # try to determine an alias
      warn "Bad directory prefix: '$file'\n";
      my ($prefix) = split /\//, $file;

      while ($key = (tied %tar)->NEXTKEY) {
        my $file = fname($key);

        next if $file =~ /\/$/;
        unless ($file =~ m:^$prefix/: or $file eq $prefix) {
          warn "Archive contains different prefixes: $prefix,$file\n";
          $prefix = '';
          last;
        }
      }
      if ($prefix) {
        print "Please alias '$tar' to '$prefix' next time!\n";
        print "See alias table later.\n";
        $NEW_ALIAS{$tar} = $prefix;
        $tb->delete_by_key($parent);
        next ARCHIVE;
      } else {
        print "Assuming that tar file name $tar is a valid prefix\n";
        $sloppy = 1;

        # We may reset too much here! But that this is not exact
        # science anyway. Maybe we should ignore using 'next ARCHIVE'.

        $key = (tied %tar)->FIRSTKEY;
        redo FILE;
      }
    }

    # remove /lib prefix
    $file =~ s:^lib/::;

    # generate new path
    my $path = "$DATA/$tar/$file";

    my ($sbase, $sdir) = fileparse($path);
    my $fh;

    unless ($OPT{test}) {
      if (-f $path) {
        index_pod(file => $path, remove => 1);
        unlink $path or warn "Could not unlink '$path' $!\n";
      } elsif (!-d $sdir) {
        mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n";
      }
      $fh = new IO::File "> $path";
      die "Could not write '$path': $!\n" unless $fh;
    }

    if ($file =~ /readme|install/i) {   # make READMEs verbatim pods
      $val =~ s/\n/\n /g;
      $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val"
         unless $val =~ /^=head/m;
    } else {                    # remove non-pod stuff
      my $nval    = $val; $val = '';
      my $cutting = 1;

      for (split /\n/, $nval) {
        if (/^=cut|!NO!SUBS!/) {
          $cutting = 1;
        } elsif ($cutting and /^=head/) {
          $cutting = 0;
        }
        unless ($cutting) {
          $val .= $_ . "\n";
        }
      }
    }
    unless ($OPT{test}) {
      $fh->print($val);
      index_pod(file => $path, parent => $parent,
                text => $val,  source => $ARCHIVE{$tar});
    }
  }

  if ($LWP and !$OPT{keep}) {
    unlink $TAR or warn
      "Could not unlink '$TAR': $!\n";
  }
}

if (%NEW_ALIAS) {
  print "\%ALIAS = (\n";
  for (keys %NEW_ALIAS) {
    print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n";
  }
  print "\t);\n";
}

# we are done
$db->close();
exit;

sub fname ($) {
  my $key = shift;
  my ($ntar, $file) = split $;, $key;

  # remove leading './' - shudder
  $file =~ s/^\.\///;

  return($file);
}

sub myget {
  my $tar = shift;
  my $TAR;
  
  if ($LWP) {                   # fetch the archive
    if ($OPT{keep}) {
      $TAR = "$OPT{keep}/$ARCHIVE{$tar}";
      print "Keeping in '$TAR'\n" unless -e $TAR;
      my ($base, $path) = fileparse($TAR);
      unless (-d $path) {
        mkpath($path,1,0755) or
          die "Could not mkpath($path)\n";
      }
    } else {
      $TAR = "/tmp/$tar.tar.gz";
    }
    unless (-e $TAR) {          # lwp mirror seems to fetch ftp: in any case?
      print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n";
      my  $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR);
      if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
        warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n";
        return;
      }
    }
  }
  $TAR;
}

sub index_pod {
  my %parm = @_;
  my $did = $parm{file};
  my $rel_did = $did;
  my $abs_did = $did;

  if ($rel_did =~ s:$DIR/::) {
    $abs_did = "$DIR/$rel_did";
  }

  undef $did;

  # check for both variants
  if ($tb->have('docid' => $rel_did)) {
    $did = $rel_did;
  } elsif ($tb->have('docid' => $abs_did)) {
    $did = $abs_did;
  }
  if ($did) {                   # have it version
    if (!$parm{remove}) {
      warn "duplicate: $did\n";
      return;
    }
  } else {                      # not seen yet
    $did = $rel_did;
    if ($parm{remove}) {
      print "missing: $did\n";
      return;
    }
  }

  $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did);

  unless (defined $parm{'text'}) {
    print "unavailable: $did\n";
    return;
  }

  my $record      =  $layout->split($parm{'text'});
  $record->{size} =  length($parm{'text'});
  my $headline    =  $record->{name} || $did;

  $headline =~ s/^$DATA//o;     # $did
  $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;

  printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70);
  if ($parm{remove}) {
    $tb->delete('docid'  => $did,
                headline => $headline,
                %{$record});
  } else {
    $tb->insert('docid'  => $did,
                headline => $headline,
                source   => $parm{source},
                parent   => $parm{parent},
                %{$record});
  }
}

# This *must* remove the version in *any* case. It should compute a
# resonable version number - but usually mtimes should be used.
sub version {
  local ($_) = @_;

  # remove alpha/beta postfix
  s/([-_\d])(a|b|alpha|beta|src)$/$1/;

  # jperl1.3@4.019.tar.gz
  s/@\d.\d+//;

  # oraperl-v2.4-gk.tar.gz
  s/-v(\d)/$1/;

  # lettered versions - shudder
  s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei;
  s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei;

  # thanks libwww-5b12 ;-)
  s/(\d+)b/($1-1).'.'/e;
  s/(\d+)a/($1-2).'.'/e;

  # replace '-pre' by '0.'
  s/-pre([\.\d])/-0.$1/;
  s/\.\././g;
  s/(\d)_(\d)/$1$2/g;

  # chop '[-.]' and thelike
  s/\W$//;

  # ram's versions Storable-0.4@p
   s/\@/./;

  if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) {
    return($_, $1 + "0.$2" + $3 / 1000000);
  } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) {
    return($_, $1 + $2/1000 + $3 / 1000000);
  } elsif (s/[-_]?(\d+\.[\d_]+)$//) {
    return($_, $1);
  } elsif (s/[-_]?([\d_]+)$//) {
    return($_, $1);
  } elsif (s/-(\d+.\d+)-/-/) {  # perl-4.019-ref-guide
    return($_, $1);
  } else {
    if ($_ =~ /\d/) {           # smells like an unknown scheme
      warn "Odd version Numbering: '$File::Find::name'\n";
      return($_, undef);
    } else {                    # assume version 0
      warn "No  version Numbering: '$File::Find::name'\n";
      return($_, 0);
    }

  }
}

sub wanted {
  my $mtime = shift;            # called by parse_file_ls();

  return unless /^(.*)\.tar(\.gz|\.Z)$/;
  my ($archive, $version) = version($1);
  
  unless (defined $version) {
    warn "Skipping $1\n";
    return;
  }
  
  # Check for file alias
  $archive = $ALIAS{$archive} if $ALIAS{$archive};
  
  # Check for path alias.
  if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) {
    if ($ALIAS{$1}) {
      $archive = $ALIAS{$1};
    }
  }

  if ($OPT{trust_mtime}) {
    $version = $mtime;
  } else {
    $version =~ s/(\d)_/$1/;
    $version ||= $mtime;   # mtime
  }

  if (!exists $ARCHIVE{$archive}
      or $VERSION{$archive} < $version) {
    $ARCHIVE{$archive} = $File::Find::name;
    $VERSION{$archive} = $version;
  }
}

sub clean_database {
  my %parm = @_;

  my $db = WAIT::Database->open(
                                name        => $parm{database},
                                'directory' => $parm{dir},
                               )
    or die "Could not open database '$parm{dir}/$parm{database}': $@";
  my $tbl = $db->table(name => $parm{table});
  if ($tbl) {
    $tbl->drop or
      die "Could not open table '$parm{tabel}': $@";
  }

  $db->close;
}

sub create_table {
  my %parm = @_;

  my $access = bless {}, 'WAIT::Document::Find';

  my $stem = [{
               'prefix'    => ['isotr', 'isolc'],
               'intervall' => ['isotr', 'isolc'],
              }, 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
  my $text = [{
               'prefix'    => ['isotr', 'isolc'],
               'intervall' => ['isotr', 'isolc'],
              },
              'isotr', 'isolc', 'split2', 'stop'];
  my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;

  my $tb =
    $parm{db}->create_table
      (name     => $parm{table},
       attr     => ['docid', 'headline', 'source', 'size', 'parent'],
       keyset   => [['docid']],
       layout   => $parm{layout},
       access   => $access,
       invindex =>
       [
        'name'         => $stem,
        'synopsis'     => $stem,
        'bugs'         => $stem,
        'description'  => $stem,
        'text'         => $stem,
        'environment'  => $text,
        'example'      => $text,  'example' => $stem,
        'author'       => $sound, 'author'  => $stem,
       ]
      );
  die "Could not create table '$parm{table}'" unless $tb;
  $tb;
}

my %MON;
my $YEAR;

BEGIN {
  my $i = 1;
  for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
    $MON{$_} = $i++;
  }
  $YEAR = (localtime(time))[5];
}

# We could/should use Date::GetDate here
use Time::Local;
sub mtime {
  my ($mon, $mday, $time) = @_;
  my ($hour, $min, $year, $monn) = (0,0);

  if ($time =~ /(\d+):(\d+)/) {
    ($hour, $min) = ($1, $2);
    $year = $YEAR;
  } else {
    $year = $time;
  }
  $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'";
  my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year);
  if ($guess > time) {
    $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1);
  }
  $guess;
}


__END__
## ###################################################################
## pod
## ###################################################################

=head1 NAME

cpan - generate an WAIT index for CPAN

=head1 SYNOPSIS

B<cpan>
[B<-clean>] [B<-noclean>]
[B<-cpan> I<url or directory>]
[B<-database> I<dbname>]
[B<-dir> I<directory>]
[B<-force>] [B<-noforce>]
[B<-keep> I<directory>]
[B<-match> I<regexp>]
[B<-table> I<table name>]
[B<-test> I<level>]
[B<-trust_mtime>] [B<-notrust_mtime>]

=head1 DESCRIPTION

TBS

=head1 OPTIONS

=over 5

=item B<-clean> / B<-noclean>

Clean the table befor indexing. Default is B<off>.

=item B<-cpan> I<url or directory>

Default directory or URL for indexing. If an URL is given, there
currently must be a file F<indices/find-ls.gz> relative to it which
contains the output of C<find . -ls | gzip>.
Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>.


=item B<-database> I<dbname>

Specify database name. Default is F<DB>.

=item B<-dir> I<directory>

Alternate directory were databases are located. Default is the
directory specified during configuration of WAIT.

=item B<-force>

Force reindexing, even if B<cpan> thinks files are up to date.
Default is B<off>

=item B<-keep> I<directory>

If fetching from a remote server, keep files in I<directory>. Default is
F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>.

=item B<-match> I<regexp>

Limit to patches matching I<regexp>. Default is F<authors/id/>.

=item B<-table> I<table name>

Specify an alternate table name. Default is C<cpan>.

=item B<-test> I<level>

Set test level, were B<0> means normal operation, B<1> means, don't
really index and B<2> means, don't even get archives and examine them.

=item B<-trust_mtime> / B<-notrust_mtime>

If B<on>, the files mtimes are used to decide, which version of an
archive is the newest. If b<off>, the version extracted is used
(beware, there are far more version numbering schemes than B<cpan> can
parse).

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt>
