#!/bin/sh

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

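# custom-index - rebuilds custom index files for the local "pubmed" archive
#
# Usage: custom-index <indexing-script> <field> [<field> ...]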

pth=$( dirname "$0" )

case "$pth" in
  /* )
    ;; # already absolute
  *  )
    pth=$(cd "$pth" && pwd)
    ;;
esac

case ":$PATH:" in
  *:"$pth":* )
    ;;
  * )
    PATH="$PATH:$pth"
    export PATH
    ;;
esac

# database-specific parameters

dbase="pubmed"
helper=""

if [ $# -gt 0 ]
then
  helper="$1"
  shift
else
  echo "Must supply name of indexing script"
  exit 1
fi

if [ $# -lt 1 ]
then
  echo "Must supply name of one or more indexed fields"
  exit 1
fi

# get path to local folder

osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )

GetLocalArchiveFolder() {

  dbs="$1"
  fld="$2"

  # find selected local archive folder from environment variables or configuration file
  target=$( rchive -local "$dbs" "$fld" )

  if [ -z "$target" ] || [ "$target" = "" ]
  then
    echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_ARCHIVE environment variable" >&2
    exit 1
  fi

  if [ -n "$osname" ] && [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
  then
    target=$( cygpath -w "$target" )
  fi

  # remove trailing slash
  target=${target%/}

  echo "$target"
}

date

archiveBase=$( GetLocalArchiveFolder "$dbase" "Archive" )
mergedBase=$( GetLocalArchiveFolder "$dbase" "Merged" )
postingsBase=$( GetLocalArchiveFolder "$dbase" "Postings" )
currentBase=$( GetLocalArchiveFolder "$dbase" "Current" )
indexedBase=$( GetLocalArchiveFolder "$dbase" "Indexed" )
invertedBase=$( GetLocalArchiveFolder "$dbase" "Inverted" )

seconds_start=$(date "+%s")
echo "Removing Previous Indices"
cd "${indexedBase}"
target="${indexedBase}"
find "$target" -name "*.e2x" -delete
find "$target" -name "*.e2x.gz" -delete
cd "${invertedBase}"
target="${invertedBase}"
find "$target" -name "*.inv" -delete
find "$target" -name "*.inv.gz" -delete
cd "${mergedBase}"
target="${mergedBase}"
find "$target" -name "*.mrg" -delete
find "$target" -name "*.mrg.gz" -delete
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
CLR=$seconds

seconds_start=$(date "+%s")
echo "Collecting Current PubMed Archive"
cd "${currentBase}"
target="${currentBase}"
if [ \! -f pubmed001.xml ]
then
  [ -f pubmed001.xml.gz ] || pm-collect "${archiveBase}" "${currentBase}"
  echo "Expanding Current PubMed Archive"
  for fl in *.xml.gz
  do
    base=${fl%.xml.gz}
    echo "$base.xml"
    gunzip -c "$fl" |
    xtract -set PubmedArticleSet -index -pattern PubmedArticle > "$target/$base.xml"
    sleep 1
    rm "$fl"
  done
fi
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
EXP=$seconds

seconds_start=$(date "+%s")
cd "${indexedBase}"
target="${indexedBase}"
find "$target" -name "*.e2x" -delete
find "$target" -name "*.e2x.gz" -delete
seconds_start=$(date "+%s")
echo "Indexing Custom Field"
cd "${currentBase}"
target=${target%/}
if [ -f "pubmed001.xml.gz" ]
then
  for fl in *.xml.gz
  do
    base=${fl%.xml.gz}
    echo "$base.e2x"
    gunzip -c "$fl" |
    "$helper" |
    gzip -1 > "$target/$base.e2x.gz"
    sleep 1
  done
elif [ -f "pubmed001.xml" ]
then
  for fl in *.xml
  do
    base=${fl%.xml}
    echo "$base.e2x"
    cat "$fl" |
    "$helper" |
    gzip -1 > "$target/$base.e2x.gz"
    sleep 1
  done
else
  echo "Unable to find current PubMed working files"
  exit 1
fi
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
IDX=$seconds

seconds_start=$(date "+%s")
cd "${indexedBase}"
echo "Inverting Custom Indices"
target="${invertedBase}"
find "$target" -name "*.inv" -delete
find "$target" -name "*.inv.gz" -delete
for fl in *.e2x.gz
do
  base=${fl%.e2x.gz}
  echo "$base.inv"
  gunzip -c "$fl" |
  rchive -e2invert |
  gzip -1 > "$target/$base.inv.gz"
  sleep 1
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
INV=$seconds

seconds_start=$(date "+%s")
cd "${invertedBase}"
echo "Merging Custom Indices"
target="${mergedBase}"
find "$target" -name "*.mrg" -delete
find "$target" -name "*.mrg.gz" -delete
osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
then
  target=`cygpath -w "$target"`
fi
target=${target%/}
rchive -gzip -merge "$target" *.inv.gz
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
MRG=$seconds

seconds_start=$(date "+%s")
cd "${mergedBase}"
echo "Producing Custom Postings"
target="${postingsBase}"
osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
then
  target=`cygpath -w "$target"`
fi
target=${target%/}
for fl in *.mrg.gz
do
  echo "$fl"
done |
sort |
xargs -n 100 echo |
while read files
do
  rchive -promote "$target" "$*" $files
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
PST=$seconds

echo "CLR $CLR seconds"
echo "EXP $EXP seconds"
echo "IDX $IDX seconds"
echo "INV $INV seconds"
echo "MRG $MRG seconds"
echo "PST $PST seconds"

echo ""

date
