#!/usr/bin/perl
# Authors     : Enrique Amigó and Jesús Giménez
# Date        : June 13, 2006
# Description : Responsible for extracting the txt content from a set of sgml
#               translation documents.

# Usage: IQsgml2txt  [-V <0|1|2>]  '<file_pattern>'  >  output.txt
#                                   (input)              (output, via STDOUT)
#
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------

#Copyright (C) 2006 Enrique Amigó and Jesús Giménez

#This library is free software; you can redistribute it and/or
#modify it under the terms of the GNU Lesser General Public
#License as published by the Free Software Foundation; either
#version 2.1 of the License, or (at your option) any later version.

#This library is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#Lesser General Public License for more details.

#You should have received a copy of the GNU Lesser General Public
#License along with this library; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# ------------------------------------------------------------------------

use strict;
use Data::Dumper;
use IO;
use IO::File;

sub get_out
{
   $0 =~ /\/([^\/]*$)/;
   print STDERR "Usage : ", $1, "  <file_list>  > <output_file>\n";
   print STDERR "                           (input)      (output)\n\n";
   print STDERR "  - V <0|1|2>     : verbosity\n";
   print STDERR "                    0 - non-verbose (default)\n";
   print STDERR "                    1 - low verbosity\n";
   print STDERR "                    2 - medium verbosity\n";
   print STDERR "\nExample: $1 'toy/*.sgm' > toy.txt\n\n";
   print STDERR "\nExample: $1 -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' > cha.ref\n";
   print STDERR "\nExample: $1 -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' | tokenizer.pl > cha.ref.tok\n";
   print STDERR "\nExample: $1 -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' | tokenizer.pl | lc-i.perl -1 > cha.ref.tok.lc\n";
   exit;
}

# ------------------------------------- MAIN ------------------------------------------
my $NARG = 1;

# check number of arguments
my $ARGLEN = scalar(@ARGV);
if ($ARGLEN < $NARG) { get_out(); }

my $input = "";
#my $output = "";
my $verbose = 0;

my $ARGOK = 0;
my $i = 0;
while (($i < $ARGLEN) and (!$ARGOK)) {
   my $opt = shift(@ARGV);
   if (($opt eq "-V") or ($opt eq "-v")) { $verbose = shift(@ARGV); }
   else {
      if ($opt ne "") {
         $input = $opt;
         #if (scalar(@ARGV)) { $output = shift(@ARGV); $ARGOK = 1; }
         $ARGOK = 1;
      }
   }
   $i++;
}

if (!($ARGOK)) { get_out(); }

print STDERR "PROCESSING <$input>...";

my $iter = 0;
my $DOCid = "";
my $n = -1;
my $FLIST = new IO::File("ls $input |") or die "Couldn't open input files <$input>\n";
while (defined( my $file = $FLIST->getline())) {
   chomp($file);
   my $FILE = new IO::File("< $file") or die "Couldn't open input file <$file>\n";
   while (defined( my $line = $FILE->getline())) {
      chomp($line);
      if ($line =~ /^<DOC .*/) {
         my @l = split(" ", $line);
         my $i = 1;
         while ($i < scalar(@l)) {
	    if ($l[$i] =~ /^docid/) {
               my @ll = split(/\"/, $l[$i]);
               $DOCid = $ll[1];
               $n = 0;
	    }
            $i++;
	 }
      }
      elsif ($line =~ /.*<DOC .*/) {
         my @l = split("<DOC", $line);

         my @ll = split(" ", $l[1]);
         my $i = 0;
         while ($i < scalar(@ll)) {
	    if ($ll[$i] =~ /^docid/) {
               my @lll = split(/\"/, $ll[$i]);
               $DOCid = $lll[1];
               $n = 0;
	    }
            $i++;
	 }
      }
      elsif ($line =~ /.*<doc .*/) {
         my @l = split("<doc", $line);

         my @ll = split(" ", $l[1]);
         my $i = 0;
         while ($i < scalar(@ll)) {
	    if ($ll[$i] =~ /^docid/) {
               my @lll = split(/\"/, $ll[$i]);
               $DOCid = $lll[1];
               $n = 0;
	    }
            $i++;
	 }
      }
      elsif ($line =~ /^<seg [^>]*>.*<\/seg>$/) {
         chomp($line);
         my @l = split(/[<>]/, $line);
         my $segment = $l[2];
         $segment =~ s/^ +//g;
         $segment =~ s/ +$//g;
         my $N = $n;
         if ($n < 1000) { $N = "0".$N; }
         if ($n < 100) { $N = "0".$N; }
         if ($n < 10) { $N = "0".$N; }
         print "$DOCid:$N: $segment\n";
         if ($verbose) {
            if (($iter%100) == 0) { print STDERR "."; }
            if (($iter%1000) == 0) { print STDERR "$iter"; }
         }
         $iter++;
         $n++;
      }
      elsif (($line =~ /^<seg [^>]*>$/) or ($line =~ /^<seg [^>]*>.+$/)) {
         $line = $FILE->getline();
         chomp($line);
         my $segment = $line;
         $segment =~ s/^ +//g;
         $segment =~ s/ +$//g;
         my $N = $n;
         if ($n < 1000) { $N = "0".$N; }
         if ($n < 100) { $N = "0".$N; }
         if ($n < 10) { $N = "0".$N; }
         print "$DOCid:$N: $segment\n";
         if ($verbose) {
            if (($iter%100) == 0) { print STDERR "."; }
            if (($iter%1000) == 0) { print STDERR "$iter"; }
         }
         $iter++;
         $n++;
      }
      #elsif ($line =~ /^<seg[^>]*>/) {
      elsif ($line =~ /^<seg .*/) {
         chomp($line);
         my @l = split(/[<>]/, $line);
         my $segment = $l[2];
         $segment =~ s/^ +//g;
         $segment =~ s/ +$//g;
         my $N = $n;
         if ($n < 1000) { $N = "0".$N; }
         if ($n < 100) { $N = "0".$N; }
         if ($n < 10) { $N = "0".$N; }
         print "$DOCid:$N: $segment\n";
         if ($verbose) {
            if (($iter%100) == 0) { print STDERR "."; }
            if (($iter%1000) == 0) { print STDERR "$iter"; }
         }
         $iter++;
         $n++;
      }
   }
   $FILE->close();
}
$FLIST->close();

if ($verbose) { print STDERR "..$iter segments [DONE]\n"; }