#! /usr/bin/perl # Assumes # PoS tags in the Last-but-one column # SE trees in the Last column use strict; my $help = << "EoH;"; collins-fixp.pl : fixes punctuation terminals of Collins parse trees. Initial version. By default, the script raises up in the tree those punctuation terminals that are placed at the end of a constituent node. The tree nodes are processed recursively from the bottom-up. A terminal is never placed outside the root node "TOP". Punctuation terminals are those whose PoS is: . , : '' See options "-a", "-qo" and "-qc" for further heuristics, still experimental. Usage : The script reads columns from STDIN. The last two columns should be: the PoS tags column; and the Collins tree column in Start-End format. By default, the script adds a new column corresponding to the arranged tree, and outputs all columns to STDOUT. To inspect the behavior of the script, it is useful to have the words and the WSJ reference trees in initial columns. Use option "-s" to output only PoS tags and the arranged tree. Options : -a Do NOT raise up a comma when it closes an apposition. In particular, we consider that a node with N childs contains an appositoin when N>3, the N-th child is a ",", and the (N-2)-th child is also a ",". -qo Raise up opening quotation marks (``) at the beginning of a constituent, until they are in a "TOP" or "S" node. -qc Do NOT raise up a closing quotation mark ('') when it is balanced with a opening quotation mark (i.e. the first child of the parent is a "``"). -q Shortcut for "-qo" and "-qc". -s Print only syntax columns, i.e. PoS tags and arranged tree. EoH; my $treat_oqm = 0; my $treat_cqm = 0; my $treat_app = 0; my $syntax_only = 0; while (@ARGV) { my $a = shift @ARGV; if ($a eq "-qo") { $treat_oqm = 1; } elsif ($a eq "-qc") { $treat_cqm = 1; } elsif ($a eq "-q") { $treat_oqm = 1; $treat_cqm = 1; } elsif ($a eq "-a") { $treat_app = 1; } elsif ($a eq "-s") { $syntax_only = 1; } else { print $help and exit(); } } my $cols = SRL::sentence::read_columns(\*STDIN); while (@$cols) { my $last = $cols->[$#{@$cols}]; my $colpos = $#{@$cols} -1; my $l = scalar(@$last); my (@W,$i); for ($i=0;$i<$l;$i++) { push @W, SRL::word->new($i,"xxx", $cols->[$colpos][$i]); } # initialize tree my $tree = SRL::syntree->new(); $tree->load_SE_tagging(\@W, @$last); arrange_tree($tree); my @tags = $tree->to_SE_tagging($l); if ($syntax_only) { $cols = [ $cols->[$colpos], \@tags ]; } else { push @$cols, \@tags; } SRL::sentence::reformat_columns($cols); for ($i=0;$i<$l;$i++) { print join(" ", map { $_->[$i] } @$cols), "\n"; } print "\n"; $cols = SRL::sentence::read_columns(\*STDIN); } sub arrange_tree { my $t = shift; my $r = $t->root; arrange_node($t, $t->root, $t->ref_terminals); } # arranges the punctuation of a node $n # returns a list of node references # the first element is a (possibly undefined) child of $n that is raised up to the left of $n # the remaining elements are the childs of $n that are raised to the right of $n sub arrange_node { my ($t,$n,$T) = @_; if (!$n->is_terminal) { my $s; # the new list of sons for $n my @S; foreach $s ( $n->sons ) { my ($sl, @S1) = arrange_node($t,$s,$T); defined($sl) and push @S, $sl; push @S, $s, @S1; } # stop if we are in the TOP node if ($n->content->type =~ /^TOP/) { return (); } # the phrase associated to the tree node my $p = $n->content; # print STDERR "Processing ", $p->to_string, "\n"; # a son passed to the father (goes to the left of the current) my $sl = undef; if ($treat_oqm and @S and $S[0]->is_terminal and ($S[0]->content->pos eq "``") and ($n->content->type !~ /^(TOP|S)/)) { $sl = shift @S; $p->set_start($p->start+1); } # the list of childs that are passed to the father my @Sout; # continue processing childs my $c = 1; while (@S and $c) { $s = $S[$#S]; if ($s->is_terminal) { my $pos = $s->content->pos; if ($treat_app and ($pos eq ",") and (@S>3) and ($S[$#S-2]->is_terminal) and ($S[$#S-2]->content->pos eq $pos)) { $c = 0; } elsif ($treat_cqm and ($pos eq "''") and (@S>2) and ($S[0]->is_terminal) and ($S[0]->content->pos eq "``")) { $c = 0; } elsif ($pos =~ /^(\.|,|:|'')$/ and ($n->dad->content->type ne "TOP")) { push @Sout, pop @S; } else { $c = 0; } } else { $c = 0; } # $c = 0; } $n->set_sons(@S); $p->set_start( $S[0]->is_terminal ? $S[0]->content->id : $S[0]->content->start ); $p->set_end( $S[$#S]->is_terminal ? $S[$#S]->content->id : $S[$#S]->content->end ); return ($sl, @Sout); } else { return (); } } ################################################################################ # # Package s e n t e n c e # # February 2004 # # Stores information of a sentence, namely words, chunks, clauses, # named entities and propositions (gold and predicted). # # Provides access methods. # Provides methods for reading/writing sentences from/to files in # CoNLL-2004/CoNLL-2005 formats. # # ################################################################################ package SRL::sentence; use strict; sub new { my ($pkg, $id) = @_; my $s = []; $s->[0] = $id; # sentence number $s->[1] = undef; # words (the list or the number of words) $s->[2] = []; # gold props $s->[3] = []; # predicted props $s->[4] = undef; # chunks $s->[5] = undef; # clauses $s->[6] = undef; # full syntactic tree $s->[7] = undef; # named entities return bless $s, $pkg; } #----- sub id { my $s = shift; return $s->[0]; } #----- sub length { my $s = shift; if (ref($s->[1])) { return scalar(@{$s->[1]}); } else { return $s->[1]; } } sub set_length { my $s = shift; $s->[1] = shift; } #----- # returns the i-th word of the sentence sub word { my ($s, $i) = @_; return $s->[1][$i]; } # returns the list of words of the sentence sub words { my $s = shift; if (@_) { return map { $s->[1][$_] } @_; } else { return @{$s->[1]}; } } sub ref_words { my $s = shift; return $s->[1]; } sub chunking { my $s = shift; return $s->[4]; } sub clausing { my $s = shift; return $s->[5]; } sub syntree { my $s = shift; return $s->[6]; } sub named_entities { my $s = shift; return $s->[7]; } #----- sub add_gold_props { my $s = shift; push @{$s->[2]}, @_; } sub gold_props { my $s = shift; return @{$s->[2]}; } sub add_pred_props { my $s = shift; push @{$s->[3]}, @_; } sub pred_props { my $s = shift; return @{$s->[3]}; } #------------------------------------------------------------ # I/O F U N C T I O N S #------------------------------------------------------------ # Reads a complete (words, synt, props) sentence from a stream # Returns: the reference to the sentence object or # undef if no sentence found # The propositions in the file are stored as gold props # For each gold prop, an empty predicted prop is created # # The %C hash contains the column number for each annotation of # the datafile. # sub read_from_stream { my ($pkg, $id, $fh, %C) = @_; if (!%C) { %C = ( words => 0, pos => 1, chunks => 2, clauses => 3, syntree => 4, ne => 5, props => 6 ) } # my $k; # foreach $k ( "words", "pos", "props" ) { # if (!exists($C{$k}) { # die "sentence->read_from_stream :: undefined column number for $k.\n"; # } # } my $cols = read_columns($fh); if (!@$cols) { return undef; } my $s = $pkg->new($id); # words and PoS my $words = $cols->[$C{words}]; my $pos = $cols->[$C{pos}]; # initialize list of words $s->[1] = []; my $i; for ($i=0;$i<@$words;$i++) { push @{$s->[1]}, SRL::word->new($i, $words->[$i], $pos->[$i]); } my $c; # chunks if (exists($C{chunks})) { $c = $cols->[$C{chunks}]; # initialize chunking $s->[4] = SRL::phrase_set->new(); $s->[4]->load_SE_tagging(@$c); } # clauses if (exists($C{clauses})) { $c = $cols->[$C{clauses}]; # initialize clauses $s->[5] = SRL::phrase_set->new(); $s->[5]->load_SE_tagging(@$c); } # syntree if (exists($C{syntree})) { $c = $cols->[$C{syntree}]; # initialize syntree $s->[6] = SRL::syntree->new(); $s->[6]->load_SE_tagging($s->[1], @$c); } # named entities if (exists($C{ne})) { $c = $cols->[$C{ne}]; $s->[7] = SRL::phrase_set->new(); $s->[7]->load_SE_tagging(@$c); } my $i = 0; while ($i<$C{props}) { shift @$cols; $i++; } # gold props my $targets = shift @$cols or die "error :: reading sentence $id :: no targets found!\n"; if (@$cols) { $s->load_props($s->[2], $targets, $cols); } # initialize predicted props foreach $i ( grep { $targets->[$_] ne "-" } ( 0 .. scalar(@$targets)-1 ) ) { push @{$s->[3]}, SRL::prop->new($targets->[$i], $i); } return $s; } #------------------------------------------------------------ # reads the propositions of a sentence from files # allows to store propositions as gold and/or predicted, # by specifying filehandles as values in the %FILES hash # indexed by {GOLD,PRED} keys # expects: each prop file: first column specifying target verbs, # and remaining columns specifying arguments # returns a new sentence, containing the list of prop # objects, one for each column, in gold/pred contexts # returns undef when EOF sub read_props { my ($pkg, $id, %FILES) = @_; my $s = undef; my $length = undef; if (exists($FILES{GOLD})) { my $cols = read_columns($FILES{GOLD}); # end of file if (!@$cols) { return undef; } $s = $pkg->new($id); my $targets = shift @$cols; $length = scalar(@$targets); $s->set_length($length); $s->load_props($s->[2], $targets, $cols); } if (exists($FILES{PRED})) { my $cols = read_columns($FILES{PRED}); if (!defined($s)) { # end of file if (!@$cols) { return undef; } $s = $pkg->new($id); } my $targets = shift @$cols; if (defined($length)) { ($length != scalar(@$targets)) and die "ERROR : sentence $id : gold and pred sentences do not align correctly!\n"; } else { $length = scalar(@$targets); $s->set_length($length); } $s->load_props($s->[3], $targets, $cols); } return $s; } sub load_props { my ($s, $where, $targets, $cols) = @_; my $i; for ($i=0; $i<@$targets; $i++) { if ($targets->[$i] ne "-") { my $prop = SRL::prop->new($targets->[$i], $i); my $col = shift @$cols; if (defined($col)) { # print "SE Tagging: ", join(" ", @$col), "\n"; $prop->load_SE_tagging(@$col); } else { print STDERR "WARNING : sentence ", $s->id, " : can't find column of args for prop ", $prop->verb, "!\n"; } push @$where, $prop; } } } # writes a sentence to an output stream # allows to specify which parts of the sentence are written # by giving true values to the %WHAT hash, indexed by # {WORDS,SYNT,GOLD,PRED} keys sub write_to_stream { my ($s, $fh, %WHAT) = @_; if (!%WHAT) { %WHAT = ( WORDS => 1, PSYNT => 1, FSYNT => 1, GOLD => 0, PRED => 1 ); } my @columns; if ($WHAT{WORDS}) { my @words = map { $_->form } $s->words; push @columns, \@words; } if ($WHAT{PSYNT}) { my @pos = map { $_->pos } $s->words; push @columns, \@pos; my @chunks = $s->chunking->to_SE_tagging($s->length); push @columns, \@chunks; my @clauses = $s->clausing->to_SE_tagging($s->length); push @columns, \@clauses; } if ($WHAT{FSYNT}) { my @pos = map { $_->pos } $s->words; push @columns, \@pos; my @sttags = $s->syntree->to_SE_tagging(); push @columns, \@sttags; } if ($WHAT{GOLD}) { push @columns, $s->props_to_columns($s->[2]); } if ($WHAT{PRED}) { push @columns, $s->props_to_columns($s->[3]); } if ($WHAT{PROPS}) { push @columns, $s->props_to_columns($WHAT{PROPS}); } reformat_columns(\@columns); # finally, print columns word by word my $i; for ($i=0;$i<$s->length;$i++) { print $fh join(" ", map { $_->[$i] } @columns), "\n"; } print $fh "\n"; } # turns a set of propositions (target verbs + args for each one) into a set of # columns in the CoNLL Start-End format sub props_to_columns { my ($s, $Pref) = @_; my @props = sort { $a->position <=> $b->position } @{$Pref}; my $l = $s->length; my $verbs = []; my @cols = ( $verbs ); my $p; foreach $p ( @props ) { defined($verbs->[$p->position]) and die "sentence->preds_to_columns: already defined verb at sentence ", $s->id, " position ", $p->position, "!\n"; $verbs->[$p->position] = sprintf("%-15s", $p->verb); my @tags = $p->to_SE_tagging($l); push @cols, \@tags; } # finally, define empty verb positions my $i; for ($i=0;$i<$l;$i++) { if (!defined($verbs->[$i])) { $verbs->[$i] = sprintf("%-15s", "-"); } } return @cols; } # Writes the predicted propositions of the sentence to an output file handler ($fh) # Specifically, writes a column of target verbs, and a column of arguments # for each target verb # OBSOLETE : the same can be done with write_to_stream($s, PRED => 1) sub write_pred_props { my ($s, $fh) = @_; my @props = sort { $a->position <=> $b->position } $s->pred_props; my $l = $s->length; my @verbs = (); my @cols = (); my $p; foreach $p ( @props ) { defined($verbs[$p->position]) and die "prop->write_pred_props: already defined verb at sentence ", $s->id, " position ", $p->position, "!\n"; $verbs[$p->position] = $p->verb; my @tags = $p->to_SE_tagging($l); push @cols, \@tags; } # finally, print columns word by word my $i; for ($i=0;$i<$l;$i++) { printf $fh "%-15s %s\n", (defined($verbs[$i])? $verbs[$i] : "-"), join(" ", map { $_->[$i] } @cols); } print "\n"; } # reads columns until blank line or EOF # returns an array of columns (each column is a reference to an array containing the column) # each column in the returned array should be the same size sub read_columns { my $fh = shift; # read columns until blank line or eof my @cols; my $i; my @line = split(" ", <$fh>); while (@line) { for ($i=0; $i<@line; $i++) { push @{$cols[$i]}, $line[$i]; } @line = split(" ", <$fh>); } return \@cols; } # reformats the tags of a list of columns, so that each # column has a fixed width along all tags # # sub reformat_columns { my $cols = shift; # a reference to the list of columns of a sentence my $i; for ($i=0;$i[$i]); } } # reformats the tags of a column, so that each # tag has the same width # # tag sequences are left justified # start-end annotations are centered at the asterisk # sub column_pretty_format { my $col = shift; # a reference to the column (array) of tags (!@$col) and return undef; my ($i); if ($col->[0] =~ /\*/) { # Start-End my $ok = 1; my (@s,@e,$t,$ms,$me); $ms = 2; $me = 2; $i = 0; while ($ok and $i<@$col) { if ($col->[$i] =~ /^(.*\*)(.*)$/) { $s[$i] = $1; $e[$i] = $2; if (length($s[$i]) > $ms) { $ms = length($s[$i]); } if (length($e[$i]) > $me) { $me = length($e[$i]); } } else { # In this case, the current token is not compliant with SE format # So, we treat format the column as a sequence of tags $ok = 0; } $i++; } # print "M $ms $me\n"; if ($ok) { my $f = "%".($ms+1)."s%-".($me+1)."s"; for ($i=0; $i<@$col; $i++) { $col->[$i] = sprintf($f, $s[$i], $e[$i]); } return; } } # Tokens my $l=0; map { (length($_)>$l) and ($l=length($_)) } @$col; my $f = "%-".($l+1)."s"; for ($i=0; $i<@$col; $i++) { $col->[$i] = sprintf($f,$col->[$i]); } } 1; ################################## # # ################################## use strict; package SRL::syntree; # a tree node links to a phrase (for non-terminals) or a word (for terminals) =head1 NAME SRL::syntree - a syntactic tree =head1 SYNOPSIS open F, "mytrees.mrg"; $tree = SRL::syntree->read_from_stream(\*F); $tree->pretty_print(); =head1 DESCRIPTION Data structure for a syntactic tree; =head1 METHODS =over 4 =cut ######################################## =item $tree = SRL::syntree->new() Creates a new syntactic tree, empty =cut sub new { my ($pkg) = @_; my $t = []; # pointer to root $t->[0] = undef; # pointer to terminals (leaves) @{$t->[1]} = (); return bless $t, $pkg; } ######################################## sub DESTROY { my $t = shift; # warn "SRL::syntree : Destroying tree $t"; $t->root->destroy_node(); } ######################################## =item $tree = SRL->read_wsj_mrg($fh) Creates a new syntactic tree, read from the filehandle $fh, in "WSJ mrg" format. =cut sub read_wsj_mrg { my ($pkg, $fh) = @_; my $tree = $pkg->new; bless $tree, $pkg; my @N; ## open nodes my $complete = 0; while ( !$complete and !eof($fh)) { my $line = <$fh>; chomp($line); $line =~ s/\s+$//; while ($line ne "") { $line =~ s/^\s+//; if ($line =~ /^\(([^ ()]+) ([^ ()]+)\)/) { ## Terminal my $tag = $1; my $word = $2; $line = $'; #' my $w = SRL::word->new(scalar(@{$tree->[1]}), $word, $tag); my $n = SRL::synnode->new_terminal($w); push @{$tree->[1]}, $n; if (@N) { $N[$#N]->add_sons($n); $n->set_dad($N[$#N]); } else { die "syntree->read_wsj_mrg: found terminal without non-terminal parent!\n"; } } elsif ($line =~ /^\(([^ ()]+)?/) { ## Start of Non-Terminal # print "branch 2\n"; $line = $'; #' my $type = ($1) ? $1 : undef; my $p = SRL::phrase->new(scalar(@{$tree->[1]}), undef, $type); my $n = SRL::synnode->new_non_terminal($p); if (@N) { $N[$#N]->add_sons($n); $n->set_dad($N[$#N]); } else { defined($tree->[0]) and die "syntree->read_wsj_mrg: found a root node when root was already defined!\n"; $tree->[0] = $n; } push @N, $n; } elsif ($line =~ /^\)/) { ## End of Non-Terminal # print "branch 3\n"; $line = $'; #' my $n = pop @N; $n->content->set_end(scalar(@{$tree->[1]})-1); !@N and $complete = 1; } elsif ($line !~ /^\s*$/) { die "syntree->read_wsj_mrg: Unknown token in line : $line\n"; } } } if ($complete) { return $tree; } else { return undef; } } ######################################## =item $tree->load_SE_tagging($W, @tags) $W is a reference to the list of words (SRL::word objects) @tags are the Start-End tags associated to each word. =cut sub load_SE_tagging { my ($tree, $W, @tags) = @_; # $W is a ref to the sequence of words my (@N); # open nodes my $wid = 0; # Create artificial root if necessary if (($tags[0] !~ /^\(/) or ($tags[$#tags] !~ /\)$/)) { } while (@tags) { my $tag = shift @tags; # Open non-terminals at $wid-th word while ($tag !~ /^\*/) { # In the RegExp below, note that "\*" is permitted to be part of the tag $tag =~ /^\(((\\\*|[^*(])+)/ or die "syntree->load_SE_tagging: opening nodes -- bad format in $tag at $wid-th position!\n"; my $type = ($1) ? $1 : undef; $tag = $'; #' my $p = SRL::phrase->new($wid, undef, $type); my $n = SRL::synnode->new_non_terminal($p); if (@N) { $N[$#N]->add_sons($n); $n->set_dad($N[$#N]); } elsif (!defined($tree->[0])) { $tree->[0] = $n; } else { my $root = SRL::synnode->new_non_terminal(undef); $root->add_sons($tree->[0],$n); $tree->[0] = $root; push @N, $root; } push @N, $n; } # Create terminal node for the $wid-th word $tag =~ s/^\*//; my $n = SRL::synnode->new_terminal($W->[$wid]); push @{$tree->[1]}, $n; if (@N) { $N[$#N]->add_sons($n); $n->set_dad($N[$#N]); } else { my $root = SRL::synnode->new_non_terminal(undef); if (defined($tree->[0])) { $root->add_sons($tree->[0]); } $tree->[0] = $root; $root->add_sons($n); push @N, $root; } # Close non-terminals at $wid-th word while ($tag ne "") { $tag =~ /^([^\)]*)\)/ or die "syntree->load_SE_tagging: closing nodes -- bad format in $tag at $wid-th position!\n"; my $type = $1; $tag = $'; #' my $n = pop @N or die "syntree->load_SE_tagging : unbalanced start-end tags!\n"; (!$type) or ($type eq $n->content->type) or die "syntree->load_SE_tagging: types in start-end tags do not match!\n"; $n->content->set_end($wid); } $wid++; } } ######################################## =item $root = $tree->root() Returns the root node of the tree. =cut sub root { my $t = shift; return $t->[0]; } ######################################## =item $tree->set_root($node) Sets $node as the root node of the tree. =cut sub set_root { my ($t,$r) = @_; $t->[0] = $r; } ######################################## =item @T = $tree->terminals() Returns the list of terminal nodes of the tree. =cut sub terminals { my $t = shift; return @{$t->[1]}; } ######################################## =item @T = $tree->ref_terminals() Returns a reference to the list of terminal nodes of the tree. =cut sub ref_terminals { my $t = shift; return $t->[1]; } ######################################## =item $tree->add_terminals($t1, $t2, $t3, @newT) Adds new terminal nodes to the tree. =cut sub add_terminals { my $t = shift; push @{$t->[1]}, @_; } ######################################## =item $tree->set_terminals(@newT) Initializes the list of terminal nodes of the tree to @newT. =cut sub set_terminals { my $t = shift; @{$t->[1]} = @_; } ######################################## =item @N = $tree->dfs(@newT) Returns the list of nodes of the tree in depth-first-search order (i.e., preorder) =cut sub dfs { my $t = shift; return $t->[0]->dfs; } ######################################## =item $s = $tree->to_string() Generates a string that represents the tree in "WSJ mrg" format, with no linebreaks. =cut sub to_string { my $t = shift; return $t->[0]->to_string; } ######################################## =item $s = $tree->to_pretty_string() Generates a string that represents the tree in "WSJ mrg" format, with linebreaks =cut sub to_pretty_string { my $t = shift; if ($t->[0]->is_terminal or ($t->[0]->content and $t->[0]->content->type)) { return $t->[0]->to_pretty_string(" "); } else { return "( " . join("\n ", map { $_->to_pretty_string(" ") } $t->[0]->sons ), ")"; } } ######################################## =item @SEtags = $tree->to_SE_tagging() =cut sub to_SE_tagging { my $t = shift; my @S; my @E; $t->[0]->to_SE_tagging(\@S, \@E); my $l = scalar(@{$t->[1]}); my (@tags, $i); for ($i=0; $i<$l; $i++) { # $tags[$i] = sprintf("%15s*%-15s", $S[$i], $E[$i]); $tags[$i] = sprintf("%s*%s", $S[$i], $E[$i]); } return @tags; } ############################################################ package SRL::synnode; sub new_terminal { my ($pkg, $w) = @_; my $n = []; # if terminal -> undef # else -> list of sons $n->[0] = undef; $n->[1] = $w; # daddy $n->[2] = undef; return bless $n, $pkg; } sub new_non_terminal { my ($pkg, $p) = @_; my $n = []; # if terminal -> undef # else -> list of sons @{$n->[0]} = (); $n->[1] = $p; # daddy $n->[2] = undef; return bless $n, $pkg; } sub destroy_node { my $n = shift; # Destroy sons if not terminal if (defined($n->[0])) { my $s; foreach $s ( @{$n->[0]} ) { $s->destroy_node; } } $n->[0] = undef; # undef the dad $n->[2] = undef; } sub is_terminal { my $n = shift; return !defined($n->[0]); } sub content { my $n = shift; return $n->[1]; } sub set_content { my $n = shift; $n->[1] = shift; } sub dad { my $n = shift; return $n->[2]; } sub set_dad { my $n = shift; $n->[2] = shift; } sub sons { my $n = shift; return @{$n->[0]}; } sub set_sons { my $n = shift; @{$n->[0]} = @_; } sub add_sons { my $n = shift; push @{$n->[0]}, @_; } sub dfs { my $n = shift; if (defined($n->[0])) { return ($n, map { $_->dfs } @{$n->[0]}); } else { return ($n); } } sub to_string { my $n = shift; if (defined($n->[0])) { my $str = "("; if (defined($n->content)) { $str .= $n->content->type; } return $str . " " . join(" ", map { $_->to_string } @{$n->[0]}) . ")"; } else { return "(" . $n->content->pos . " " . $n->content->form . ")"; } } sub to_pretty_string { my $n = shift; my $iniline = shift; if ($n->is_terminal) { return "(" . $n->content->pos . " " . $n->content->form . ") "; } else { my $str; if (defined($n->content)) { $str = "(". $n->content->type . " "; } else { $str = "( "; } my $son; my $sep = ""; foreach $son ( @{$n->[0]} ) { if ($son->is_terminal) { $str .= $sep . $son->to_pretty_string(); $sep = ""; } else { $str .= "\n" . $iniline . $son->to_pretty_string($iniline." "); $sep = "\n" . $iniline; } } $str .= ")"; return $str; } } sub to_SE_tagging { my ($n, $S, $E) = @_; if (!$n->is_terminal) { my $p = $n->content; defined($p) and $S->[$p->start] .= "(".$p->type; map { $_->to_SE_tagging($S,$E) } @{$n->[0]}; defined($p) and $E->[$p->end] .= ")"; #$p->type.")"; } } =head1 SEE ALSO Documentation of the SRL package for CoNLL-2005. Check : http://www.lsi.upc.edu/~srlconll =head1 COPYRIGHT Copyright 2004-2005 Xavier Carreras and Lluís Màrquez Technical University of Catalonia (UPC) This software is free for research and educational purposes. Published work containing results derived from use of this software must contain an appropriate acknowledgement. =cut 1; 1; ################################################################## # # Package p h r a s e : a generic phrase # # January 2004 # # This class represents generic phrases. # A phrase is a sequence of contiguous words in a sentence. # A phrase is identified by the positions of the start/end words # of the sequence that the phrase spans. # A phrase has a type. # A phrase may contain a list of internal subphrases, that is, # phrases found within the phrase. Thus, a phrase object is seen # eventually as a hierarchical structure. # # A syntactic base chunk is a phrase with no internal phrases. # A clause is a phrase which may have internal phrases # A proposition argument is implemented as a special class which # inherits from the phrase class. # ################################################################## use strict; package SRL::phrase; # Constructor: creates a new phrase # Parameters: start position, end position and type sub new { my $pkg = shift; my $ph = []; # start word index $ph->[0] = (@_) ? shift : undef; # end word index $ph->[1] = (@_) ? shift : undef; # phrase type $ph->[2] = (@_) ? shift : undef; # @{$ph->[3]} = (); return bless $ph, $pkg; } # returns the start position of the phrase sub start { my $ph = shift; return $ph->[0]; } # initializes the start position of the phrase sub set_start { my $ph = shift; $ph->[0] = shift; } # returns the end position of the phrase sub end { my $ph = shift; return $ph->[1]; } # initializes the end position of the phrase sub set_end { my $ph = shift; $ph->[1] = shift; } # returns the type of the phrase sub type { my $ph = shift; return $ph->[2]; } # initializes the type of the phrase sub set_type { my $ph = shift; $ph->[2] = shift; } # returns the subphrases of the current phrase sub phrases { my $ph = shift; return @{$ph->[3]}; } # adds phrases as subphrases sub add_phrases { my $ph = shift; push @{$ph->[3]}, @_; } # initializes the set of subphrases sub set_phrases { my $ph = shift; @{$ph->[3]} = @_; } # depth first search # returns the phrases rooted int the current phrase in dfs order sub dfs { my $ph = shift; return ($ph, map { $_->dfs } $ph->phrases); } # generates a string representing the phrase (and subphrases if arg is a TRUE value), for printing sub to_string { my $ph = shift; my $rec = ( @_ ) ? shift : 1; my $str = "(" . $ph->start . " "; $rec and map { $str .= $_->to_string." " } $ph->phrases; $str .= $ph->end . ")"; if (defined($ph->type)) { $str .= "_".$ph->type; } return $str; } 1; ################################################################## # # Package w o r d : a word # # April 2004 # # A word, containing id (position in sentence), form and PoS tag # ################################################################## use strict; package SRL::word; # Constructor: creates a new word # Parameters: id (position), form and PoS tag sub new { my ($pkg, @fields) = @_; my $w = []; $w->[0] = shift @fields; # id (position in sentence) $w->[1] = shift @fields; # form $w->[2] = shift @fields; # PoS return bless $w, $pkg; } # returns the id of the word sub id { my $w = shift; return $w->[0]; } # returns the form of the word sub form { my $w = shift; return $w->[1]; } # returns the PoS tag of the word sub pos { my $w = shift; return $w->[2]; } sub to_string { my $w = shift; return "w@".$w->[0].":".$w->[1].":".$w->[2]; } 1;