#!/usr/bin/perl

# Authors     : Enrique Amigó and Jesús Giménez
# Date        : June 13, 2006
# Description : Responsible for extracting the txt content from a set of sgml
#               translation documents.
# Usage: IQsgml2txt input_list output.txt
#        (input) (output)
#
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
#Copyright (C) 2006 Enrique Amigó and Jesús Giménez
#This library is free software; you can redistribute it and/or
#modify it under the terms of the GNU Lesser General Public
#License as published by the Free Software Foundation; either
#version 2.1 of the License, or (at your option) any later version.
#This library is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
#Lesser General Public License for more details.
#You should have received a copy of the GNU Lesser General Public
#License along with this library; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# ------------------------------------------------------------------------

use strict;
use warnings;
use Data::Dumper;
use IO;
use IO::File;
use File::Basename qw(basename);

# get_out()
#
# Prints the usage message to STDERR and terminates the program.
#
# Takes no arguments and never returns.  The process ends through a bare
# `exit`, i.e. with status 0, exactly as before.
#
# The program name shown in the message is the last path component of $0.
# It is taken with basename() rather than with a "text after the last
# slash" match, because that match fails (and prints an empty name) when
# the script is started without any directory part, e.g. `perl IQsgml2txt`.
sub get_out {
    my $prog = basename($0);

    # The placeholders follow the header's own "input_list output.txt"
    # wording; the text is written to STDOUT, hence the shell redirection.
    print STDERR "Usage : ", $prog, " <input_list> > <output.txt>\n";
    print STDERR " (input) (output)\n\n";

    # The option is parsed as "-V" (or "-v") by the caller, so it is
    # documented in that exact spelling.
    print STDERR " -V <0|1|2> : verbosity\n";
    print STDERR " 0 - non-verbose (default)\n";
    print STDERR " 1 - low verbosity\n";
    print STDERR " 2 - medium verbosity\n";

    print STDERR "\nExample: $prog 'toy/*.sgm' > toy.txt\n\n";
    print STDERR "\nExample: $prog -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' > cha.ref\n";
    print STDERR "\nExample: $prog -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' | tokenizer.pl > cha.ref.tok\n";
    print STDERR "\nExample: $prog -V 1 'cha/*sgm' | sort -n | awk '{gsub(/^[^ ]*: /,\"\"); print}' | tokenizer.pl | lc-i.perl -1 > cha.ref.tok.lc\n";

    exit;
}
# ------------------------------------- MAIN ------------------------------------------
#
# Reads the command line, expands the input file pattern, scans every file
# line by line and prints one line per recognised segment to STDOUT in the
# form "<DOCid>:<N>: <segment text>".  Progress and diagnostics go to STDERR.
#
# Command line:  [-V|-v <level>] <input pattern>
#   The first non-empty, non-option argument is the input pattern; anything
#   after it is ignored.  Without an input pattern the usage text is shown
#   through get_out().
#
# NOTE(review): this section was recovered from a copy in which line breaks
# were collapsed and every "<letter...>" span had been stripped, which ate
# parts of the regular expressions.  The first pattern is restored from its
# surviving "<\/seg>" tail; the second is assumed to use the same tag; the
# pattern of the third branch did not survive at all (see the note there).
# Confirm all three against a pristine copy of the script.
#
# NOTE(review): nothing in the recovered code assigns $DOCid or resets $n,
# so ids are printed as ":000-1:", ":0000:", ":0001:", ...  A branch that
# handled document headers may have been lost -- TODO confirm.

my $NARG = 1;    # minimum number of command-line arguments

if (scalar(@ARGV) < $NARG) { get_out(); }

my $input   = "";
my $verbose = 0;
my $ARGOK   = 0;

# Consume arguments until an input pattern has been found.  The loop is
# bounded by @ARGV itself so that "-V" as the last argument cannot make the
# next iteration shift an undefined option.
while (@ARGV and !$ARGOK) {
    my $opt = shift(@ARGV);
    if (($opt eq "-V") or ($opt eq "-v")) {
        my $level = shift(@ARGV);
        $verbose = defined($level) ? $level : 0;
    }
    elsif ($opt ne "") {
        $input = $opt;
        $ARGOK = 1;
    }
}

if (!$ARGOK) { get_out(); }

print STDERR "PROCESSING <$input>...";

my $iter  = 0;     # segments printed so far, over all input files
my $DOCid = "";    # id printed in front of every segment number
my $n     = -1;    # number of the next segment

# Prints one segment and advances the counters.
#   $segment : raw segment text; undef is treated as the empty string.
# Leading and trailing blanks (spaces only) are removed.  The number gets
# one "0" prepended for each of the limits 1000, 100 and 10 it is below,
# which zero-pads non-negative numbers to four digits (and turns the
# initial -1 into "000-1", as the original code did).
my $emit_segment = sub {
    my ($segment) = @_;
    $segment = "" unless defined($segment);
    $segment =~ s/^ +//;
    $segment =~ s/ +$//;

    my $N = $n;
    foreach my $limit (1000, 100, 10) {
        if ($n < $limit) { $N = "0" . $N; }
    }
    print "$DOCid:$N: $segment\n";

    if ($verbose) {
        if (($iter % 100) == 0)  { print STDERR "."; }
        if (($iter % 1000) == 0) { print STDERR "$iter"; }
    }
    $iter++;
    $n++;
};

# The pattern is expanded by Perl's glob() instead of a shell "ls" pipe, so
# the argument is never handed to a shell.  A pattern without wildcards is
# returned as is; if that file does not exist the open below reports it.
my @files = glob($input);
if (!@files) { warn "No input files match <$input>\n"; }

foreach my $file (@files) {
    # Three-argument open: the name is used literally, whatever it contains.
    open(my $FILE, '<', $file) or die "Couldn't open input file <$file>\n";

    while (defined(my $line = <$FILE>)) {
        chomp($line);

        if ($line =~ /^<seg[^>]*>.*<\/seg>$/) {
            # Whole segment on one line: the text is the field between the
            # opening and the closing tag.
            my @l = split(/[<>]/, $line);
            $emit_segment->($l[2]);
        }
        elsif (($line =~ /^<seg[^>]*>$/) or ($line =~ /^<seg[^>]*>.+$/)) {
            # Opening tag without a closing tag on the same line: the
            # segment text is taken from the following line.  At end of
            # file an empty segment is printed instead of chomping undef.
            my $next = <$FILE>;
            if (defined($next)) { chomp($next); }
            $emit_segment->($next);
        }
        elsif ($line =~ /^<seg/i) {
            # NOTE(review): the original pattern of this branch was lost.
            # /^<seg/i is a stand-in that keeps the branch reachable (it
            # catches e.g. upper-case tags the two patterns above reject);
            # confirm against a pristine copy.  The body is as recovered:
            # the text after the first tag is the segment.
            my @l = split(/[<>]/, $line);
            $emit_segment->($l[2]);
        }
    }

    close($FILE);
}

if ($verbose) { print STDERR "..$iter segments [DONE]\n"; }