diff --git a/bcovington/src/covington.py b/bcovington/src/covington.py new file mode 100644 index 0000000..e29dae5 --- /dev/null +++ b/bcovington/src/covington.py @@ -0,0 +1,875 @@ +from dynet import * +from utils import read_conll, write_conll, CovingtonConfiguration +from operator import itemgetter +from itertools import chain +from tarjan import tarjan +import time, random +import numpy as np +import os +import warnings + + +""" +This module extends the original transition-based BIST-Parser barchybrid: + +https://github.com/elikip/bist-parser/blob/master/barchybrid/ +Kiperwasser, E., & Goldberg, Y. (2016). Simple and accurate dependency parsing using bidirectional LSTM feature representations. arXiv preprint arXiv:1603.04351. + + +It has been adapted to support non-projective transition-based dependency parsing +using an implementation (O(n^2)) of the traditional Covington's (2001) algorithm, according +to the list-based transition system described in Nivre (2008). + +Covington, M. A. (2001). A fundamental algorithm for dependency parsing. In Proceedings of the 39th annual ACM southeast conference (pp. 95-102). +Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553. + +We also include the O(n) dynamic oracle described in Gomez-Rodriguez and Fernandez-Gonzalez (2015). +TODO: Current implementation is O(n^2) + +Gomez-Rodriguez, C., & Fernandez-Gonzalez, D. (2015). An efficient dynamic oracle for unrestricted non-projective parsing. Volume 2: Short Papers, 256. 
class CovingtonBILSTM:
    """BiLSTM-based non-projective dependency parser using Covington's algorithm.

    A configuration is described by ``l1`` (the focus word of the left list),
    ``b`` (the first word of the buffer) and the set of arcs built so far.
    Two feed-forward networks score each configuration: one scores the four
    unlabeled transitions, the other scores the labeled variants.

    NOTE: the order in which parameters are added to ``self.model`` in
    ``__init__`` is kept exactly as in the original implementation; changing
    it may make previously saved models unloadable.
    """

    #ACTIVATION FUNCTIONS
    TANH = 'tanh'
    SIGMOID = 'sigmoid'
    RELU = 'relu'
    TANH3 = 'tanh3'

    #OPTIMIZERS
    SGD = "sgd"
    MOMENTUM = "momentum"
    ADAGRAD = "adagrad"
    ADADELTA = "adadelta"
    ADAM = "adam"

    #SPECIAL INDEXES
    INDEX_WORD_PAD = 1
    INDEX_WORD_INITIAL = 2
    INDEX_POS_PAD = 1
    INDEX_POS_INITIAL = 2
    INIT_WORD_INDEX = 3
    INIT_POS_INDEX = INIT_WORD_INDEX

    INDEX_FEATS_PAD = 1
    INDEX_FEATS_INITIAL = 2
    INIT_FEATS_INDEX = INIT_WORD_INDEX

    #TRANSITIONS
    LEFT_ARC = 0
    RIGHT_ARC = 1
    SHIFT = 2
    NO_ARC = 3
    TRANSITIONS = [LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC]

    #OTHER HYPERPARAMETERS
    SIZE_TRANSITIONS = len(TRANSITIONS)

    def __init__(self, words, lemmas, cpos, pos, feats, rels, w2i, l2i, options, path_oov_external_embedding=None,
                 pretrained=False):
        """Builds the network.

        @param words: Mapping word -> frequency (used for word dropout)
        @param lemmas: Lemmas seen in training (not used to build the network)
        @param cpos, pos, feats: Iterables with the coarse tags, fine tags and feats
        @param rels: List of dependency types
        @param w2i, l2i: Mappings word -> index and lemma -> index
        @param options: Parsed command line options (hyperparameters and paths)
        @param path_oov_external_embedding: Optional file with embeddings for OOV words
        @param pretrained: Stored as self.pretrained
        Raises NotImplementedError for an unknown optimizer and ValueError if
        the OOV embeddings and the word embeddings differ in dimension.
        """
        self.model = Model()
        if options.optimizer == self.ADAM:
            self.trainer = AdamTrainer(self.model)
        elif options.optimizer == self.SGD:
            self.trainer = SimpleSGDTrainer(self.model)
        elif options.optimizer == self.MOMENTUM:
            self.trainer = MomentumSGDTrainer(self.model)
        elif options.optimizer == self.ADAGRAD:
            self.trainer = AdagradTrainer(self.model)
        elif options.optimizer == self.ADADELTA:
            self.trainer = AdadeltaTrainer(self.model)
        else:
            raise NotImplementedError("Selected optimizer is not available")

        random.seed(1)

        self.activations = {self.TANH: tanh,
                            self.SIGMOID: logistic,
                            self.RELU: rectify,
                            self.TANH3: (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
        self.activation = self.activations[options.activation]

        self.oracle = options.oracle

        self.ldims = options.lstm_dims * 2  #*2 because it is a bi-lstm
        self.wdims = options.wembedding_dims
        self.pdims = options.pembedding_dims
        self.rdims = options.rembedding_dims
        self.layers = options.lstm_layers
        self.wordsCount = words

        self.vocab = {word: ind + self.INIT_WORD_INDEX for word, ind in w2i.items()}
        self.lemmas = {lemma: ind + self.INIT_WORD_INDEX for lemma, ind in l2i.items()}
        self.cpos = {tag: ind + self.INIT_POS_INDEX for ind, tag in enumerate(cpos)}
        self.pos = {tag: ind + self.INIT_POS_INDEX for ind, tag in enumerate(pos)}
        self.feats = {f: ind + self.INIT_FEATS_INDEX for ind, f in enumerate(feats)}
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        #List of dependency types
        self.irels = rels

        self.headFlag = options.headFlag
        self.rlMostFlag = options.rlMostFlag
        self.rlFlag = options.rlFlag
        self.kb = options.window_b
        self.kl1 = options.window_l1
        self.kl2_r = options.window_l2r
        self.kl2_l = options.window_l2l

        self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)

        #Reading external embedding files, if they exist.
        #Each call yields (embeddings, dimension, zero vector, key index, lookup)
        #or five None values when the file is not given / does not exist.

        #INFORMATION FOR EXTERNAL WORD EMBEDDINGS
        (self.external_embedding, self.edim, self.noextrn,
         self.extrnd, self.elookup) = self._load_optional_embeddings(
            options.external_embedding, self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL, "FORM")

        #INFORMATION FOR THE EXTERNAL CPOSTAG EMBEDDINGS
        (self.cpos_external_embedding, self.cpos_edim, self.cpos_noextrn,
         self.cpos_extrnd, self.cpos_elookup) = self._load_optional_embeddings(
            options.cpos_external_embedding, self.INDEX_POS_PAD, self.INDEX_POS_INITIAL, "CPOSTAG")

        #INFORMATION FOR THE EXTERNAL POSTAG EMBEDDINGS
        (self.pos_external_embedding, self.pos_edim, self.pos_noextrn,
         self.pos_extrnd, self.pos_elookup) = self._load_optional_embeddings(
            options.pos_external_embedding, self.INDEX_POS_PAD, self.INDEX_POS_INITIAL, "POSTAG")

        #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS
        (self.feats_external_embedding, self.feats_edim, self.feats_noextrn,
         self.feats_extrnd, self.feats_elookup) = self._load_optional_embeddings(
            options.feats_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL, "FEATS")

        #External LEMMAS embeddings are currently not supported.

        #INFORMATION FOR THE EXTERNAL OOV WORD EMBEDDINGS
        #The existence check is done on the OOV path itself (the original code
        #checked the FEATS path here by mistake).
        (self.oov_external_embedding, self.oov_edim, self.oov_noextrn,
         self.oov_extrnd, self.oov_elookup) = self._load_optional_embeddings(
            path_oov_external_embedding, self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL)

        if self.oov_external_embedding is not None and self.oov_edim != self.edim:
            raise ValueError("The dimensions of the embeddings for OOV words is not equal to the dimension of the rest of external word embeddings (self.oov_edim != self.edim)")

        #Obtaining the dimension of the input
        dims = (self.wdims + self.pdims
                + (self.edim if self.external_embedding is not None else 0)
                + (self.cpos_edim if self.cpos_external_embedding is not None else 0)
                + (self.pos_edim if self.pos_external_embedding is not None else 0)
                + (self.feats_edim if self.feats_external_embedding is not None else 0))

        #Initialization of the architecture
        self.blstmFlag = options.blstmFlag
        self.bibiFlag = options.bibiFlag

        #Size of each direction of the bi-lstm (kept as an integer)
        half_ldims = self.ldims // 2

        if self.bibiFlag:
            self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, half_ldims, self.model),
                                    VanillaLSTMBuilder(1, dims, half_ldims, self.model)]
            self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, half_ldims, self.model),
                                     VanillaLSTMBuilder(1, self.ldims, half_ldims, self.model)]
        elif self.blstmFlag:
            if self.layers > 0:
                self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, half_ldims, self.model),
                                        LSTMBuilder(self.layers, dims, half_ldims, self.model)]
            else:
                self.surfaceBuilders = [SimpleRNNBuilder(1, dims, half_ldims, self.model),
                                        LSTMBuilder(1, dims, half_ldims, self.model)]

        self.hidden_units = options.hidden_units
        self.hidden2_units = options.hidden2_units

        self.vocab['*PAD*'] = self.INDEX_WORD_PAD
        self.cpos['*PAD*'] = self.INDEX_POS_PAD
        self.feats['*PAD*'] = self.INDEX_FEATS_PAD

        self.vocab['*INITIAL*'] = self.INDEX_WORD_INITIAL
        self.cpos['*INITIAL*'] = self.INDEX_POS_INITIAL
        self.feats['*INITIAL*'] = self.INDEX_FEATS_INITIAL

        self.wlookup = self.model.add_lookup_parameters((len(words) + self.INIT_WORD_INDEX, self.wdims))
        self.plookup = self.model.add_lookup_parameters((len(cpos) + self.INIT_POS_INDEX, self.pdims))
        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

        self.word2lstm = self.model.add_parameters((self.ldims, dims))
        self.word2lstmbias = self.model.add_parameters((self.ldims))
        self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims))
        self.lstm2lstmbias = self.model.add_parameters((self.ldims))

        #Size of the input of both scorers: one slot per word in the windows
        input_dims = self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb)
        last_hidden = self.hidden2_units if self.hidden2_units > 0 else self.hidden_units
        #One output per (relation, direction) plus one unlabeled output
        n_labeled = 2 * (len(self.irels) + 0) + 1

        self.hidLayer = self.model.add_parameters((self.hidden_units, input_dims))
        self.hidBias = self.model.add_parameters((self.hidden_units))

        self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.hid2Bias = self.model.add_parameters((self.hidden2_units))

        self.outLayer = self.model.add_parameters((self.SIZE_TRANSITIONS, last_hidden))
        self.outBias = self.model.add_parameters((self.SIZE_TRANSITIONS))

        self.rhidLayer = self.model.add_parameters((self.hidden_units, input_dims))
        self.rhidBias = self.model.add_parameters((self.hidden_units))

        self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.rhid2Bias = self.model.add_parameters((self.hidden2_units))

        self.routLayer = self.model.add_parameters((n_labeled, last_hidden))
        self.routBias = self.model.add_parameters((n_labeled))

        self.pretrained = pretrained

    def _load_optional_embeddings(self, path, index_pad, index_initial, label=None):
        """
        Loads an external embedding file only if 'path' is given and exists.
        Otherwise returns five None values and, if 'label' is given, warns
        that no external file is used for that kind of embedding.
        """
        if path is not None and os.path.exists(path):
            return self._assign_external_embeddings(path, index_pad, index_initial)
        if label is not None:
            warnings.warn("Not using any external file for %s embeddings" % label)
        return None, None, None, None, None

    def _assign_external_embeddings(self, option_external_embedding,
                                    index_pad, index_initial):
        """
        Reads an external embedding file (the first line is skipped as a header)
        Returns:
        external_embedding: A dictionary of key:embedding
        edim: Dimension of the embedding
        noextrn: A zero vector of size edim
        extrnd: Index for each key
        elookup: Parameter lookup
        Returns five None values if no path is given and raises ValueError
        if the file does not contain any vector.
        """
        if option_external_embedding is None:
            return None, None, None, None, None

        external_embedding = {}
        with open(option_external_embedding, 'r') as external_embedding_fp:
            external_embedding_fp.readline()
            for line in external_embedding_fp:
                #Blank lines would otherwise create a key with an empty vector
                if not line.strip():
                    continue
                external_embedding[line.split(' ')[0]] = [float(f) for f in line.strip().split(' ')[1:]]

        if not external_embedding:
            raise ValueError("No embeddings found in " + str(option_external_embedding))

        edim = len(next(iter(external_embedding.values())))
        noextrn = [0.0 for _ in range(edim)]
        extrnd = {element: i + self.INIT_WORD_INDEX
                  for i, element in enumerate(external_embedding)}
        elookup = self.model.add_lookup_parameters((len(external_embedding) + self.INIT_WORD_INDEX, edim))

        for element, i in extrnd.items():
            elookup.init_row(i, external_embedding[element])
        extrnd['*PAD*'] = index_pad
        extrnd['*INITIAL*'] = index_initial

        return external_embedding, edim, noextrn, extrnd, elookup

    def __evaluate(self, c, train):
        """
        @param c: A CovingtonConfiguration instance
        @param train: True if used in the training phase, False otherwise
        Returns the scores for all possible transitions (training)
        or the top ones (testing) for a given configuration c, as four lists
        [left arcs, right arcs, shift, no arc] of tuples
        (relation, transition, score[, score expression])
        """
        #Gets the embeddings for the terms to be used in the prediction
        top_l1 = [c.sentence[c.l1-i].lstms if c.l1 - i > 0 else [self.empty] for i in range(self.kl1)]
        top_l2l = [c.sentence[c.l1+1+i].lstms if c.l1+1+i < c.b else [self.empty] for i in range(self.kl2_l)]
        top_l2r = [c.sentence[c.b-i].lstms if c.b-i > c.l1 else [self.empty] for i in range(self.kl2_r)]
        topBuffer = [c.sentence[c.b+i-1].lstms if c.b+i-1 <= c.sentence[-1].id else [self.empty] for i in range(self.kb)]

        features = concatenate(list(chain(*(top_l1 + top_l2l + top_l2r + topBuffer))))

        def mlp(hid, hid_bias, hid2, hid2_bias, out, out_bias):
            #Feed-forward scorer with one or two hidden layers
            hidden = self.activation(hid.expr() * features + hid_bias.expr())
            if self.hidden2_units > 0:
                hidden = self.activation(hid2_bias.expr() + hid2.expr() * hidden)
            return out.expr() * hidden + out_bias.expr()

        #Labeled scores: index 0 is unlabeled, 1+2j / 2+2j are the left / right arc with relation j
        routput = mlp(self.rhidLayer, self.rhidBias, self.rhid2Layer, self.rhid2Bias,
                      self.routLayer, self.routBias)
        #Unlabeled scores, one per transition
        output = mlp(self.hidLayer, self.hidBias, self.hid2Layer, self.hid2Bias,
                     self.outLayer, self.outBias)

        scrs, uscrs = routput.value(), output.value()

        buffer_not_empty = c.b <= c.sentence[-1].id
        can_left = c.l1 > 0 and c.l1 < c.b and buffer_not_empty
        can_right = c.l1 >= 0 and c.l1 < c.b and buffer_not_empty
        can_no_arc = c.l1 > 0 and buffer_not_empty

        #NOTE(review): NO_ARC reads slot 3 of the labeled scorer, which is also the
        #slot of the left arc with the second relation (and needs at least two
        #relations to exist). Kept as it is to stay compatible with trained models.
        if train:
            left_arc_info = [(rel, self.LEFT_ARC, scrs[1+j*2] + uscrs[self.LEFT_ARC], routput[1+j*2] + output[self.LEFT_ARC])
                             for j, rel in enumerate(self.irels)] if can_left else []

            right_arc_info = [(rel, self.RIGHT_ARC, scrs[2+j*2] + uscrs[self.RIGHT_ARC], routput[2+j*2] + output[self.RIGHT_ARC])
                              for j, rel in enumerate(self.irels)] if can_right else []

            shift_info = [(None, self.SHIFT, scrs[0] + uscrs[self.SHIFT], routput[0] + output[self.SHIFT])] if buffer_not_empty else []

            no_arc_info = [(None, self.NO_ARC, scrs[3] + uscrs[self.NO_ARC], routput[3] + output[self.NO_ARC])] if can_no_arc else []

            return [left_arc_info, right_arc_info, shift_info, no_arc_info]

        #It is done different from the 'train' phase, due to the dynamic oracle.
        #In the test phase we already pick the most likely transition/dependency instead of returning them all
        #and then selecting one according to the prediction of the dynamic oracle
        sLEFT, rLEFT = max(zip(scrs[1::2], self.irels))
        sRIGHT, rRIGHT = max(zip(scrs[2::2], self.irels))
        sLEFT += uscrs[self.LEFT_ARC]
        sRIGHT += uscrs[self.RIGHT_ARC]
        return [[(rLEFT, self.LEFT_ARC, sLEFT)] if (can_left and self._is_valid_left_arc(c)) else [],
                [(rRIGHT, self.RIGHT_ARC, sRIGHT)] if (can_right and self._is_valid_right_arc(c)) else [],
                [(None, self.SHIFT, scrs[0] + uscrs[self.SHIFT])] if buffer_not_empty else [],
                [(None, self.NO_ARC, scrs[3] + uscrs[self.NO_ARC])] if can_no_arc else []]

    def Save(self, filename):
        """Saves the model parameters into 'filename'"""
        self.model.save(filename)

    def Load(self, filename):
        """Loads the model parameters from 'filename'"""
        self.model.load(filename)

    def Init(self):
        """
        Computes self.empty, the vector used for the positions of the
        windows that fall outside the lists. It has to be called again
        every time the computation graph is renewed.
        """
        evec = self.elookup[1] if self.external_embedding is not None else None
        cpos_evec = self.cpos_elookup[1] if self.cpos_external_embedding is not None else None
        pos_evec = self.pos_elookup[1] if self.pos_external_embedding is not None else None
        feats_evec = self.feats_elookup[1] if self.feats_external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingPosVec = self.plookup[1] if self.pdims > 0 else None
        padding_inputs = list(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec]))
        paddingVec = tanh(self.word2lstm.expr() * concatenate(padding_inputs) + self.word2lstmbias.expr())
        self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in range(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        """
        Gets the embeddings (also external) for every term in a sentence.
        Nothing is returned: the input vector of every term is stored in
        'ivec' and its contextualized representation in 'vec'.
        During training, words are randomly replaced by the unknown word
        (index 0) with a probability that decreases with their frequency.
        """
        for root in sentence:
            c = float(self.wordsCount.get(root.norm, 0))
            #random.random() is only called in the training phase
            dropFlag = not train or (random.random() < (c/(0.25+c)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0]
            root.cposvec = self.plookup[int(self.cpos.get(root.cpos, 0))] if self.pdims > 0 else None

            #For word embeddings
            if self.external_embedding is None:
                root.evec = None
            elif root.form in self.external_embedding:
                root.evec = self.elookup[self.extrnd[root.form]]
            elif root.norm in self.external_embedding:
                root.evec = self.elookup[self.extrnd[root.norm]]
            elif (self.oov_external_embedding is not None
                  and root.form.replace(" ", "_") in self.oov_external_embedding):
                root.evec = self.oov_elookup[self.oov_extrnd[root.form.replace(" ", "_")]]
            else:
                root.evec = self.elookup[0]

            #For cpostag embeddings
            if self.cpos_external_embedding is None:
                root.cposevec = None
            elif root.cpos in self.cpos_external_embedding:
                root.cposevec = self.cpos_elookup[self.cpos_extrnd[root.cpos]]
            else:
                root.cposevec = self.cpos_elookup[0]

            #For postag embeddings
            if self.pos_external_embedding is None:
                root.posevec = None
            elif root.pos in self.pos_external_embedding:
                root.posevec = self.pos_elookup[self.pos_extrnd[root.pos]]
            else:
                root.posevec = self.pos_elookup[0]

            #For feats embeddings
            if self.feats_external_embedding is None:
                root.featsevec = None
            elif root.feats in self.feats_external_embedding:
                root.featsevec = self.feats_elookup[self.feats_extrnd[root.feats]]
            else:
                root.featsevec = self.feats_elookup[0]

            root.ivec = concatenate(list(filter(None, [root.wordvec, root.cposvec, root.evec,
                                                       root.cposevec, root.posevec, root.featsevec])))

        if self.blstmFlag:
            forward = self.surfaceBuilders[0].initial_state()
            backward = self.surfaceBuilders[1].initial_state()

            for froot, rroot in zip(sentence, reversed(sentence)):
                forward = forward.add_input(froot.ivec)
                backward = backward.add_input(rroot.ivec)
                froot.fvec = forward.output()
                rroot.bvec = backward.output()
            for root in sentence:
                root.vec = concatenate([root.fvec, root.bvec])

            if self.bibiFlag:
                #Second bi-lstm, stacked on top of the first one
                bforward = self.bsurfaceBuilders[0].initial_state()
                bbackward = self.bsurfaceBuilders[1].initial_state()

                for froot, rroot in zip(sentence, reversed(sentence)):
                    bforward = bforward.add_input(froot.vec)
                    bbackward = bbackward.add_input(rroot.vec)
                    froot.bfvec = bforward.output()
                    rroot.bbvec = bbackward.output()
                for root in sentence:
                    root.vec = concatenate([root.bfvec, root.bbvec])
        else:
            for root in sentence:
                root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr()
                root.vec = tanh(root.ivec)

    def _apply_transition(self, best, l1, b, sentence, arcs, hoffset):
        """
        Applies the transition best = (relation, transition, ...) to the
        configuration (l1, b, arcs). 'arcs' and the words of the sentence
        involved in a new arc are updated in place.
        Returns the new l1, the new b and the dependent of the arc that has
        been created (None for SHIFT and NO_ARC)
        """
        relation, transition = best[0], best[1]
        head = None
        child = None

        if transition == self.LEFT_ARC:
            head, child = sentence[b], sentence[l1]
            arcs.add((b, l1))
            l1 = l1 - 1
        elif transition == self.RIGHT_ARC:
            head, child = sentence[l1], sentence[b]
            arcs.add((l1, b))
            l1 = l1 - 1
        elif transition == self.SHIFT:
            l1 = b
            b = b + 1
        elif transition == self.NO_ARC:
            l1 = l1 - 1

        if child is not None:
            child.pred_parent_id = head.id
            child.pred_relation = relation
            #The head keeps a representation of its left/right dependent
            slot = transition + hoffset
            if self.rlMostFlag:
                head.lstms[slot] = child.lstms[slot]
            if self.rlFlag:
                head.lstms[slot] = child.vec

        return l1, b, child

    def Predict(self, conll_path):
        """
        Makes non-projective dependency parsing predictions given a ConLL-X file.
        It is a generator that yields every sentence once it has been parsed
        """
        hoffset = 1 if self.headFlag else 0

        with open(conll_path, 'r') as conllFP:
            for sentence in read_conll(conllFP):
                self.Init()

                l1 = sentence[0].id
                b = sentence[1].id
                arcs = set()

                self.getWordEmbeddings(sentence, False)

                for root in sentence:
                    root.lstms = [root.vec for _ in range(self.nnvecs)]

                c = CovingtonConfiguration(l1, b, sentence, arcs)
                while not self._is_final_state(b, sentence):
                    transition_scores = self.__evaluate(c, False)
                    best = max(chain(*transition_scores), key=itemgetter(2))
                    l1, b, _ = self._apply_transition(best, l1, b, sentence, arcs, hoffset)
                    c = CovingtonConfiguration(l1, b, sentence, arcs)

                renew_cg()
                yield sentence

    def _transition_costs(self, c, l1, b, arcs, sentence, gold_arcs, loss_c, iSentence):
        """
        Computes the cost of every transition that can be applied to the
        configuration c = (l1, b, arcs) as the difference between the loss of
        the configuration it leads to and loss_c.
        Returns a list indexed by transition, with None for the transitions
        that are not valid
        """
        successors = []
        if self._is_valid_left_arc(c):
            successors.append((self.LEFT_ARC, l1 - 1, b, arcs | {(b, l1)}))
        if l1 >= 0 and self._is_valid_right_arc(c):
            successors.append((self.RIGHT_ARC, l1 - 1, b, arcs | {(l1, b)}))
        #SHIFT is always possible before reaching a final state
        successors.append((self.SHIFT, b, b + 1, set(arcs)))
        if l1 > 0:
            successors.append((self.NO_ARC, l1 - 1, b, set(arcs)))

        costs = [None] * self.SIZE_TRANSITIONS
        for transition, new_l1, new_b, new_arcs in successors:
            new_c = CovingtonConfiguration(new_l1, new_b, sentence, new_arcs)
            costs[transition] = float(self._loss(new_c, gold_arcs, iSentence) - loss_c)
        return costs

    def Train(self, conll_path):
        """
        Trains a O(n^2) Covington's parser with a O(n^2) dynamic oracle.
        Runs one epoch over the (shuffled) sentences of the ConLL-X file
        """
        mloss = 0.0
        errors = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0

        hoffset = 1 if self.headFlag else 0
        arc_transitions = [self.LEFT_ARC, self.RIGHT_ARC]

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
        random.shuffle(shuffledData)

        errs = []

        self.Init()

        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 100 == 0 and iSentence != 0:
                #max(etotal, 1) avoids dividing by zero if no transition was taken
                steps = max(etotal, 1)
                print(' '.join(str(item) for item in (
                    'Processing sentence number:', iSentence,
                    'Loss:', eloss / steps,
                    'Errors:', (float(eerrors)) / steps,
                    'Labeled Errors:', (float(lerrors) / steps),
                    'Time', time.time()-start)))
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0

            self.getWordEmbeddings(sentence, True)

            #We obtain the gold arcs to then compute the dynamic oracle for covington
            #TODO: Weird error if not filtered, adds an arc (0,0)
            gold_arcs = set((word.parent_id, word.id) for word in sentence
                            if word.id != word.parent_id)

            l1 = sentence[0].id
            b = sentence[1].id
            arcs = set()
            c = CovingtonConfiguration(l1, b, sentence, arcs)
            loss_c = self._loss(c, gold_arcs, iSentence)

            for word in sentence:
                word.lstms = [word.vec for _ in range(self.nnvecs)]

            while not self._is_final_state(b, sentence):
                transition_scores = self.__evaluate(c, True)

                #We determine if the transitions are valid for a given configuration c
                costs = self._transition_costs(c, l1, b, arcs, sentence, gold_arcs, loss_c, iSentence)

                gold_left_rel = sentence[l1].relation
                gold_right_rel = sentence[b].relation

                def has_gold_label(s):
                    #SHIFT and NO_ARC are unlabeled; an arc needs the gold relation of its dependent
                    if s[1] == self.LEFT_ARC:
                        return s[0] == gold_left_rel
                    if s[1] == self.RIGHT_ARC:
                        return s[0] == gold_right_rel
                    return True

                candidates = list(chain(*transition_scores))

                #Valid transitions are those with cost 0
                #If it is a LEFT/RIGHT arc, also the relation must match with the one in gold standard
                valid_transitions = [s for s in candidates
                                     if costs[s[1]] == 0 and has_gold_label(s)]
                best_valid = max(valid_transitions, key=itemgetter(2))

                wrong_transitions = [s for s in candidates
                                     if costs[s[1]] is not None
                                     and (costs[s[1]] != 0
                                          or (s[1] in arc_transitions and not has_gold_label(s)))]

                #Aggressive exploration as done by Kiperwasser and Golberg (2016)
                best_wrong = None
                if wrong_transitions != []:
                    best_wrong = max(wrong_transitions, key=itemgetter(2))

                    best = best_valid if ((not self.oracle) or (best_valid[2] - best_wrong[2] > 1.0)
                                          or (best_valid[2] > best_wrong[2] and random.random() > 0.1)) else best_wrong
                else:
                    best = best_valid

                #Moving a new configuration based on the "best" choice
                l1, b, child = self._apply_transition(best, l1, b, sentence, arcs, hoffset)

                #Hinge loss. If there is no wrong transition there is nothing to
                #separate from (best_wrong was undefined or stale in this case)
                if best_wrong is not None and best_valid[2] < best_wrong[2] + 1.0:
                    loss = best_wrong[3] - best_valid[3]
                    mloss += 1.0 + best_wrong[2] - best_valid[2]
                    eloss += 1.0 + best_wrong[2] - best_valid[2]
                    errs.append(loss)

                if best[1] not in [self.SHIFT, self.NO_ARC] and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                    lerrors += 1
                    if child.pred_parent_id != child.parent_id:
                        errors += 1
                        eerrors += 1

                etotal += 1
                c = CovingtonConfiguration(l1, b, sentence, arcs)
                loss_c = self._loss(c, gold_arcs, iSentence)

            #The update is done between sentences, never in the middle of one
            if len(errs) > 50:
                eerrs = esum(errs)
                eerrs.scalar_value()
                eerrs.backward()
                self.trainer.update()
                errs = []

                renew_cg()
                self.Init()

        if len(errs) > 0:
            eerrs = esum(errs)
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()
            errs = []

            renew_cg()

        self.trainer.update_epoch()
        #Average over the number of sentences (the original divided by the last index)
        n_sentences = len(shuffledData)
        print(' '.join(str(item) for item in
                       ("Loss: ", mloss / n_sentences if n_sentences else 0.0)))

    def _is_final_state(self, b, sentence):
        """Determines if the buffer is empty (b is beyond the last word)"""
        return b >= len(sentence)

    def _is_valid_left_arc(self, c):
        """
        Determines if the arc b -> l1 can be created: l1 is not the first
        element of the sentence, it has no head yet and no cycle is created
        """
        if c.l1 <= 0 or self._y_has_head(c.A, c.b, c.l1):
            return False
        aux = set(c.A)
        aux.add((c.b, c.l1))
        return self._count_cycles(aux) == 0

    def _is_valid_right_arc(self, c):
        """
        Determines if the arc l1 -> b can be created: b has no head yet
        and no cycle is created
        """
        if self._y_has_head(c.A, c.l1, c.b):
            return False
        aux = set(c.A)
        aux.add((c.l1, c.b))
        return self._count_cycles(aux) == 0

    def _loss(self, c, gold_arcs, iSentence):
        """
        Gomez-Rodriguez & Fernandez-Gonzalez:
        An Efficient Dynamic Oracle for Unrestricted Non-Projective Parsing (ACL,2015)
        Algorithm 1

        Returns the number of gold arcs that are not reachable from the
        configuration c plus the number of cycles in the graph made of the
        arcs already built and the gold arcs that are still reachable.
        iSentence is not used.
        """
        i = c.l1
        j = c.b

        U = set()  #set of unreachable arcs
        for x, y in gold_arcs.difference(c.A):
            left = min(x, y)
            right = max(x, y)
            if (j > right or (j == right and i < left) or self._y_has_head(c.A, x, y)
                    or self._weakly_connected(c.A, x, y, c, gold_arcs)):
                U.add((x, y))

        I = gold_arcs.difference(U)

        return len(U) + self._count_cycles(c.A.union(I))

    def _weakly_connected(self, A, x, y, c, gold_arcs):
        """
        Follows the heads of x in A, starting at x itself, and determines if
        any of the visited nodes is already the head of y. The walk stops at
        node 0, at a node without head or at a node that was already visited.
        c and gold_arcs are not used.
        O(n)
        """
        heads = {dependent: head for head, dependent in A}
        visited = set()
        parent = x

        while parent != 0 and parent not in visited:
            if (parent, y) in A:
                return True
            if parent not in heads:
                #End of the path: the node has no head in A
                return False
            visited.add(parent)
            parent = heads[parent]

        return False

    def _count_cycles(self, A):
        """
        Counts the strongly connected components of A with more than one node.
        Tarjan (1972) implementation at https://github.com/bwesterb/py-tarjan/
        O(n)
        """
        d = {}
        for a, b in A:
            d.setdefault(a, []).append(b)

        return sum([1 for e in tarjan(d) if len(e) > 1])

    #O(n)
    def _y_has_head(self, A, x, y):
        """
        Determines if node y has already a head different from x
        """
        return any(y_prime == y and z != x for z, y_prime in A)
if __name__ == '__main__':

    import subprocess

    def _ensure_dir(path):
        """Create ``path`` (and missing parents) unless it already is a directory."""
        if not os.path.isdir(path):
            os.makedirs(path)

    def _run_evaluation(command, report_path):
        """
        Run an evaluator given as an argument list and store its stdout in
        ``report_path``. A missing interpreter is reported instead of
        aborting the run, as the evaluation is only informative.
        """
        with open(report_path, 'w') as report:
            try:
                subprocess.call(command, stdout=report)
            except OSError as e:
                print('Could not run %s: %s' % (command[0], e))

    def _evaluate(gold_path, system_path, conll2017):
        """Score ``system_path`` against ``gold_path`` into ``system_path + '.txt'``."""
        if not conll2017:
            command = ['perl', 'src/utils/eval.pl', '-g', gold_path, '-s', system_path]
        else:
            command = ['python', 'src/utils/conll17_ud_eval.py', '-v',
                       '-w', 'src/utils/weights.clas', gold_path, system_path]
        _run_evaluation(command, system_path + '.txt')

    parser = ArgumentParser()
    parser.add_argument("--input", dest="input", help="Path to the input file",default=None)
    parser.add_argument("--input_type", dest="input_type",help="Style of the input file [raw|conllu] (only use with --predict)")
    parser.add_argument("--pipe", dest="pipe",default="UDpipe",help="Framework used to do the pipeline. Only \"UDpipe\" supported (only use with --predict)")
    parser.add_argument("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/PTB_SD_3_3_0/train.conll")
    parser.add_argument("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/PTB_SD_3_3_0/dev.conll")
    parser.add_argument("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/PTB_SD_3_3_0/test.conll")
    parser.add_argument("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
    parser.add_argument("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
    parser.add_argument("--extrn_cpos", dest="cpos_external_embedding",help="CPoStag external embeddings", metavar="FILE")
    parser.add_argument("--extrn_pos", dest="pos_external_embedding", help= "PoStag external embeddings", metavar="FILE")
    parser.add_argument("--extrn_feats", dest="feats_external_embedding", help="Feats external embeddings", metavar="FILE")
    parser.add_argument("--model", dest="model", help="Load/Save model file", metavar="FILE", default="bcovington.model")
    parser.add_argument("--wembedding", type=int, dest="wembedding_dims", default=100)
    parser.add_argument("--pembedding", type=int, dest="pembedding_dims", default=25)
    parser.add_argument("--rembedding", type=int, dest="rembedding_dims", default=25)
    parser.add_argument("--epochs", type=int, dest="epochs", default=30)
    parser.add_argument("--hidden", type=int, dest="hidden_units", default=100)
    parser.add_argument("--hidden2", type=int, dest="hidden2_units", default=0)
    parser.add_argument("--kb", type=int, dest="window_b", default=1)
    parser.add_argument("--k1", type=int, dest="window_l1", default=3)
    parser.add_argument("--k2r", type=int, dest="window_l2r", default = 1)
    parser.add_argument("--k2l", type=int, dest="window_l2l", default = 1)
    parser.add_argument("--lr", type=float, dest="learning_rate", default=0.1)
    parser.add_argument("--outdir", type=str, dest="output", default="results")
    parser.add_argument("--activation", type=str, dest="activation", default="tanh")
    parser.add_argument("--optimizer",type=str, dest="optimizer", default="adam")
    parser.add_argument("--lstmlayers", type=int, dest="lstm_layers", default=2)
    parser.add_argument("--lstmdims", type=int, dest="lstm_dims", default=125)
    parser.add_argument("--dynet-seed", type=int, dest="seed", default=7)
    parser.add_argument("--disableoracle", action="store_false", dest="oracle", default=True)
    parser.add_argument("--disableblstm", action="store_false", dest="blstmFlag", default=True)
    parser.add_argument("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
    parser.add_argument("--usehead", action="store_true", dest="headFlag", default=False)
    # NOTE(review): the dest names of the next two flags look swapped with
    # respect to the flag spelling (--userlmost -> rlFlag, --userl -> rlMostFlag).
    # Kept as-is: changing them would change which feature each flag enables.
    parser.add_argument("--userlmost", action="store_true", dest="rlFlag", default=False)
    parser.add_argument("--userl", action="store_true", dest="rlMostFlag", default=False)
    parser.add_argument("--dynet-mem", type=int, dest="cnn_mem", default=512)
    parser.add_argument("--conll2017", action="store_true",dest="conll2017", default=False)
    parser.add_argument("--predict", action="store_true", dest="predictFlag", default=False)

    # parser.add_argument("--conf", metavar="FILE", dest="conf",required=True)

    args = parser.parse_args()

    if not args.predictFlag:

        _ensure_dir(args.output)

        print("Training...")
        if not (args.rlFlag or args.rlMostFlag or args.headFlag):
            print('You must use either --userlmost or --userl or --usehead (you can use multiple)')
            sys.exit()

        path_tmp_file_oov = None

        print('Preparing vocab')
        words, w2i, lemmas, l2i, cpos, pos, feats, rels = utils.vocab(args.conll_train)

        # Pickle data is binary: open the file in binary mode.
        with open(os.path.join(args.output, args.params), 'wb') as paramsfp:
            pickle.dump((words, w2i, lemmas, l2i, cpos, pos, feats, rels, args), paramsfp)
        print('Finished collecting vocab')

        print('Initializing blstm covington:')
        parser = covington.CovingtonBILSTM(words, lemmas, cpos, pos, feats, rels, w2i, l2i, args,
                                           path_tmp_file_oov)

        if path_tmp_file_oov is not None:
            os.unlink(path_tmp_file_oov)

        if args.conll2017:
            # Multiword tokens / empty nodes of the dev file, re-inserted
            # into every prediction before it is scored.
            with codecs.open(args.conll_dev) as f_conll_dev:
                lookup_conll_data = utils.lookup_conll_extra_data(f_conll_dev)

        for epoch in xrange(args.epochs):
            print('Starting epoch %s' % epoch)
            parser.Train(args.conll_train)
            devpath = os.path.join(args.output, 'dev_epoch_' + str(epoch+1) + '.conll')
            utils.write_conll(devpath, parser.Predict(args.conll_dev))

            if args.conll2017:
                utils.dump_lookup_extra_into_conll(devpath, lookup_conll_data)
                utils.transform_to_single_root(devpath)

            print('Executing conll17_eval')
            _evaluate(args.conll_dev, devpath, args.conll2017)

            parser.Save(os.path.join(args.output, args.model))

    else:

        #TEST PHASE
        with open(args.params, 'rb') as paramsfp:
            aux = pickle.load(paramsfp)
        words, w2i, lemmas, l2i, cpos , pos, feats, rels, stored_opt = aux

        stored_opt.external_embedding = args.external_embedding
        stored_opt.pos_external_embedding = args.pos_external_embedding
        stored_opt.feats_external_embedding = args.feats_external_embedding

        print("Running model with this configuration %s" % (stored_opt,))

        parser = covington.CovingtonBILSTM(words, lemmas, cpos, pos, feats, rels, w2i, l2i, stored_opt,
                                           None)

        parser.Load(args.model)

        conllu = (os.path.splitext(args.conll_test.lower())[1] == '.conllu')
        # The predictions are written below args.output, so it has to exist
        # in prediction mode too.
        _ensure_dir(args.output)
        tespath = os.path.join(args.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')

        if args.conll2017:
            with codecs.open(args.conll_test) as f_conll_test:
                lookup_conll_data = utils.lookup_conll_extra_data(f_conll_test)

        pred = list(parser.Predict(args.conll_test))
        utils.write_conll(tespath, pred)

        if args.conll2017:
            utils.dump_lookup_extra_into_conll(tespath, lookup_conll_data)
            utils.transform_to_single_root(tespath)

        _evaluate(args.conll_test, tespath, args.conll2017)
class CovingtonConfiguration(object):
    """
    Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553.

    l1: Word Id of the word at the top of the lambda one list
    b: Word Id of the word at the top of the buffer
    sentence: List of ConllEntry
    A: set of created arcs (tuples (headID,dependentID))
    """

    def __init__(self, l1, b, sentence, A):

        self.l1 = l1
        self.b = b
        self.sentence = sentence
        self.A = A

    def __str__(self):
        return str(self.l1)+" "+str(self.b)+" "+str(self.A)


class ConllEntry(object):
    """
    Contains the information of a line in a CoNLL-X file.
    """

    def __init__(self, id, form, lemma, cpos, pos, feats,
                 parent_id=None, relation=None):

        self.id = id
        self.form = form
        self.lemma = normalize(lemma)
        self.norm = normalize(form)
        self.cpos = cpos
        self.pos = pos
        self.feats = feats
        self.parent_id = parent_id
        self.relation = relation

        #By default everything is assigned to a dummy root
        self.pred_parent_id = 0
        self.pred_relation = 'root'

    #For debugging
    def __str__(self):
        # The separator is a backslash followed by a comma; it is spelled
        # with an escaped backslash to avoid an invalid escape sequence.
        return "["+'\\,'.join(map(str,[self.id,self.form,self.lemma,self.norm,self.cpos,self.pos,self.feats,self.parent_id,self.relation]))+"]"



def vocab(conll_path):
    """
    Collects the vocabulary of the CoNLL file at conll_path.

    Returns a tuple: (word counter, word -> index dict, lemma counter,
    lemma -> index dict, list of cpos tags, list of pos tags, list of
    feats values, list of relations). The artificial root entry of every
    sentence is counted as well.
    """

    wordsCount = Counter()
    lemmasCount = Counter()
    cposCount = Counter()
    posCount = Counter()
    featsCount = Counter()
    relCount = Counter()

    with open(conll_path, 'r') as conllFP:
        for sentence in read_conll(conllFP):

            wordsCount.update([node.norm for node in sentence])
            lemmasCount.update([node.lemma for node in sentence])
            cposCount.update([node.cpos for node in sentence])
            posCount.update([node.pos for node in sentence])
            featsCount.update([node.feats for node in sentence])
            relCount.update([node.relation for node in sentence])

    # Plain lists (not key views), so that the result can be pickled
    # whatever the interpreter version.
    return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())},
            lemmasCount, {l: i for i, l in enumerate(lemmasCount.keys())},
            list(cposCount.keys()), list(posCount.keys()), list(featsCount.keys()),
            list(relCount.keys()))


def read_conll(fh):
    """
    Reads a ConLL file given a file object fh

    Yields one list of ConllEntry per sentence; position 0 holds an
    artificial root entry. Comment lines, multiword token ranges ("1-2")
    and empty nodes ("1.1") are skipped. A '_' head is read as -1.
    Once the input is exhausted the number of sentences and tokens read
    is printed.
    """

    def new_sentence():
        # A fresh root per sentence: the entries are mutable and get
        # per-sentence attributes assigned, so they must not be shared.
        return [ConllEntry(0, '*root*', '*root-lemma*', 'ROOT-POS', 'ROOT-CPOS','FEATS-ROOT', 0, 'rroot')]

    read = 0
    tokens_read = 0
    tokens = new_sentence()
    for line in fh:

        if line.startswith('#'): continue
        tok = line.strip().split('\t')
        if not tok or tok == ['']: #If it is empty line
            if len(tokens)>1:
                yield tokens
                read += 1
                tokens = new_sentence()
        else:
            try:
                if "." in tok[0] or "-" in tok[0]: continue
                tokens.append(ConllEntry(int(tok[0]), tok[1], tok[2] ,tok[3],
                                         tok[4], tok[5], int(tok[6]) if tok[6] != '_' else -1 , tok[7]))
                tokens_read+=1

            except IndexError:
                # Best effort: lines with fewer than 8 columns are ignored.
                pass

    #Last sentence
    if len(tokens) > 1:
        yield tokens
        read += 1
    print('%s sentences read.' % read)
    print('%s tokens read' % tokens_read)


def write_conll(fn, conll_gen):
    """
    Writes a CoNLL file

    fn: path of the file to write
    conll_gen: iterable of sentences (lists of ConllEntry); the entry at
    position 0 of each sentence is not written. The predicted head and
    relation are written in the HEAD and DEPREL columns.
    """
    with open(fn, 'w') as fh:
        for sentence in conll_gen:
            for entry in sentence[1:]:
                fh.write('\t'.join([str(entry.id), entry.form, entry.lemma, entry.cpos, entry.pos, entry.feats, str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
                fh.write('\n')
            fh.write('\n')



numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+")
def normalize(word):
    """
    Maps a word to 'NUM' if it starts with a digit, otherwise lowercases it.

    NOTE(review): match() only anchors at the start of the word, so any
    token with a leading digit (e.g. "3rd") becomes 'NUM'. Left untouched,
    since it presumably affects the vocabulary trained models rely on.
    """
    return 'NUM' if numberRegex.match(word) else word.lower()
def lookup_conll_extra_data(fh):
    """
    Looks for multiword expressions in the CoNLL file and creates a lookup table that
    allows to reconstruct then the output

    fh: iterable of lines of a CoNLL-U file

    Returns a dict: sentence index -> {word id: text}, where text holds,
    in file order, the multiword token ("1-2") and empty node ("1.1")
    lines to write right before the word with that id. Lines found after
    the last word of a sentence are stored under (number of words + 1).
    Sentences are numbered from 0, counting only blocks that contain at
    least one regular word.
    """

    lookup = {}
    sentence_id = 0
    lookup[sentence_id] = {}
    id_insert_before = 1
    has_words = False

    for line in fh:

        if line.startswith('#'): continue
        tok = line.strip().split('\t')

        if not tok or tok == ['']: #If it is empty line
            # Only a block with words counts as a sentence, so that
            # repeated blank lines do not shift the sentence indexes.
            if has_words:
                sentence_id+=1
                lookup[sentence_id] = {}
            id_insert_before = 1
            has_words = False
        else:
            if "." in tok[0] or "-" in tok[0]:
                # Several extra lines may share an insertion point (e.g.
                # "2.1" followed by "3-4"): keep them all, in order.
                extras = lookup[sentence_id]
                extras[id_insert_before] = extras.get(id_insert_before, '') + line
            else:
                id_insert_before+=1
                has_words = True

    return lookup


def dump_lookup_extra_into_conll(conll_path,lookup):
    """
    dumps the content of the lookup table extracted by lookup_conll_extra_data
    into a output conll_path

    The file is rewritten in place. The text stored for a word id is
    written right before the line of that word; text stored past the last
    word of a sentence is written before the blank line closing it.
    """

    sentence_id = 0
    word_id = 1

    with codecs.open(conll_path) as f_conll:
        lines = f_conll.readlines()

    #DUMPING the content of the file
    with codecs.open(conll_path,"w") as f_conll:

        for line in lines:

            extras = lookup.get(sentence_id, {})
            tok = line.strip().split('\t')
            if tok == ['']: #If it is empty line
                # Extra lines located after the last word of the sentence
                if word_id > 1 and word_id in extras:
                    f_conll.write(extras[word_id])
                sentence_id+=1
                word_id = 1
            else:
                if word_id in extras:
                    f_conll.write(extras[word_id])
                word_id+=1
            f_conll.write(line)


def get_rooted(conll_str):
    """
    Returns a list of [id,ctag,head] of the nodes rooted to 0

    conll_str: the lines of one sentence as a single string. Lines whose
    id or head is not an integer, or that are too short, are ignored.
    """
    rooted_elements = []

    lines = conll_str.split('\n')
    for l in lines:
        ls = l.split('\t')
        try:
            identifier,tag,head = int(ls[UD_ID_COLUMN]),ls[UD_CTAG_COLUMN],int(ls[UD_HEAD_COLUMN])
            if head == DUMMY_ROOT:
                rooted_elements.append((identifier,tag,head))
        except (ValueError, IndexError):
            # Comments, multiword ranges, empty nodes, '_' heads, short lines
            pass
    return rooted_elements


def get_new_single_root(lmultiple_rooted):
    """
    Returns the ID of the first VERB rooted to 0 or the leftmost rooted
    element otherwise

    lmultiple_rooted: non-empty list of (id, ctag, head) tuples
    """
    for e in lmultiple_rooted:
        if e[2] == DUMMY_ROOT and e[1] == UD_CTAG_VERB:
            return e[0]
    return lmultiple_rooted[0][0]


def transform_to_single_root(conll_path):
    """
    Rewrites the CoNLL file at conll_path in place so that each sentence
    has one node attached to the dummy root: when several nodes have head
    0, all of them but the one chosen by get_new_single_root are
    re-attached to the chosen one. Other sentences are copied unchanged.
    """

    with codecs.open(conll_path) as f_conll:
        sentences = f_conll.read().split('\n\n')

    with codecs.open(conll_path,"w") as f_conll:

        for s in sentences:
            if s == "": continue
            rooted = get_rooted(s)
            if len(rooted) > 1:
                frv = get_new_single_root(rooted)
                for l in s.split('\n'):
                    ls = l.strip().split('\t')

                    if ls != [''] and not l.startswith("#"): #Neither an empty line nor a comment
                        if ls[UD_HEAD_COLUMN] != "_" and int(ls[UD_HEAD_COLUMN]) == DUMMY_ROOT and int(ls[UD_ID_COLUMN]) != frv:
                            ls[UD_HEAD_COLUMN] = str(frv)

                    f_conll.write('\t'.join(ls)+"\n")
            else:
                f_conll.write(s+"\n")
            f_conll.write('\n')
and int(ls[UD_ID_COLUMN]) != frv: + ls[UD_HEAD_COLUMN] = str(frv) + + f_conll.write('\t'.join(ls)+"\n") + else: + f_conll.write(s+"\n") + f_conll.write('\n') + i+=1 diff --git a/bcovington/src/utils/conll17_ud_eval.py b/bcovington/src/utils/conll17_ud_eval.py new file mode 100644 index 0000000..c1ec200 --- /dev/null +++ b/bcovington/src/utils/conll17_ud_eval.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python + +# CoNLL 2017 UD Parsing evaluation script. +# +# Compatible with Python 2.7 and 3.2+, can be used either as a module +# or a standalone executable. +# +# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL), +# Faculty of Mathematics and Physics, Charles University, Czech Republic. +# +# Changelog: +# - [02 Jan 2017] Version 0.9: Initial release +# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation +# - [10 Mar 2017] Version 1.0: Add documentation and test +# Compare HEADs correctly using aligned words +# Allow evaluation with errorneous spaces in forms +# Compare forms in LCS case insensitively +# Detect cycles and multiple root nodes +# Compute AlignedAccuracy + +# Command line usage +# ------------------ +# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file +# +# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics +# is printed +# - if -v is given, several metrics are printed (as precision, recall, F1 score, +# and in case the metric is computed on aligned words also accuracy on these): +# - Tokens: how well do the gold tokens match system tokens +# - Sentences: how well do the gold sentences match system sentences +# - Words: how well can the gold words be aligned to system words +# - UPOS: using aligned words, how well does UPOS match +# - XPOS: using aligned words, how well does XPOS match +# - Feats: using aligned words, how well does FEATS match +# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match +# - Lemmas: using aligned words, how well does LEMMA match +# 
- UAS: using aligned words, how well does HEAD match +# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match +# - if weights_file is given (with lines containing deprel-weight pairs), +# one more metric is shown: +# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight + +# API usage +# --------- +# - load_conllu(file) +# - loads CoNLL-U file from given file object to an internal representation +# - the file object should return str on both Python 2 and Python 3 +# - raises UDError exception if the given file cannot be loaded +# - evaluate(gold_ud, system_ud) +# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) +# - raises UDError if the concatenated tokens of gold and system file do not match +# - returns a dictionary with the metrics described above, each metrics having +# three fields: precision, recall and f1 + +# Description of token matching +# ----------------------------- +# In order to match tokens of gold file and system file, we consider the text +# resulting from concatenation of gold tokens and text resulting from +# concatenation of system tokens. These texts should match -- if they do not, +# the evaluation fails. +# +# If the texts do match, every token is represented as a range in this original +# text, and tokens are equal only if their range is the same. + +# Description of word matching +# ---------------------------- +# When matching words of gold file and system file, we first match the tokens. +# The words which are also tokens are matched as tokens, but words in multi-word +# tokens have to be handled differently. +# +# To handle multi-word tokens, we start by finding "multi-word spans". 
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
#   are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.


from __future__ import division
from __future__ import print_function

import argparse
import io
import sys
import unittest

# CoNLL-U column names
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

# UD Error is used when raising exceptions in this module
class UDError(Exception):
    pass

# Load given CoNLL-U file into internal representation
def load_conllu(file):
    """Read the CoNLL-U data of the file object `file` and return its representation.

    The returned object has the attributes `characters`, `tokens`, `words`
    and `sentences` described below. UDError is raised for malformed input:
    wrong number of columns, unparsable ID or HEAD, HEAD outside of the
    sentence, cycles, a number of roots other than one, an empty FORM or a
    missing empty line at the end.
    """
    # Internal representation classes
    class UDRepresentation:
        def __init__(self):
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.sentences = []
    class UDSpan:
        def __init__(self, start, end):
            self.start = start
            # Note that self.end marks the first position **after the end** of span,
            # so we can use characters[start:end] or range(start, end).
            self.end = end
    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of this word (or MWT, see below) within ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word token.
            # In that case, self.span marks the span of the whole multi-word token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None if root).
            self.parent = None
            # Let's ignore language-specific deprel subtypes.
            self.columns[DEPREL] = columns[DEPREL].split(':')[0]

    ud = UDRepresentation()

    # Load the CoNLL-U file
    index, sentence_start = 0, None
    while True:
        line = file.readline()
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries
        if sentence_start is None:
            # Skip comments
            if line.startswith("#"):
                continue
            # Start a new sentence
            ud.sentences.append(UDSpan(index, 0))
            sentence_start = len(ud.words)
        if not line:
            # Add parent UDWord links and check there are no cycles
            def process_word(word):
                if word.parent == "remapping":
                    raise UDError("There is a cycle in a sentence")
                if word.parent is None:
                    # The HEAD of words inside multi-word tokens has not been
                    # validated when they were read, so it is checked here.
                    try:
                        head = int(word.columns[HEAD])
                    except ValueError:
                        raise UDError("Cannot parse HEAD '{}'".format(word.columns[HEAD]))
                    # A negative HEAD would silently index words of another sentence.
                    if head < 0 or head > len(ud.words) - sentence_start:
                        raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD]))
                    if head:
                        parent = ud.words[sentence_start + head - 1]
                        word.parent = "remapping"
                        process_word(parent)
                        word.parent = parent

            for word in ud.words[sentence_start:]:
                process_word(word)

            # Check there is a single root node
            if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
                raise UDError("There are multiple roots in a sentence")

            # End the sentence
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line))

        # Skip empty nodes
        if "." in columns[ID]:
            continue

        # Delete spaces from FORM  so gold.characters == system.characters
        # even if one of them tokenizes the space.
        columns[FORM] = columns[FORM].replace(" ", "")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file")

        # Save token
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM])))
        index += len(columns[FORM])

        # Handle multi-word tokens to save word(s)
        if "-" in columns[ID]:
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line))
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic tokens/words
        else:
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))

            try:
                head_id = int(columns[HEAD])
            except ValueError:
                raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
            if head_id < 0:
                raise UDError("HEAD cannot be negative")

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud

# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
    """Score `system_ud` against `gold_ud` (both as returned by load_conllu).

    Returns a dictionary mapping a metric name ("Tokens", "Sentences",
    "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS" and,
    if `deprel_weights` is given, "WeightedLAS") to a Score with the fields
    precision, recall, f1 and aligned_accuracy. UDError is raised if the
    concatenated characters of both inputs differ.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None):
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word
            self.gold_parent = None
            self.system_parent_gold_aligned = None
    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            self.matched_words_map = {}
        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word
        def fill_parents(self):
            # We represent root parents in both gold and system data by '0'.
            # For gold data, we represent non-root parent by corresponding gold word.
            # For system data, we represent non-root parent by either gold word aligned
            # to parent system nodes, or by None if no gold words is aligned to the parent.
            for words in self.matched_words:
                words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
                words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
                    if words.system_word.parent is not None else 0

    def lower(text):
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans):
            if system_spans[si].start < gold_spans[gi].start:
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1

        return Score(len(gold_spans), len(system_spans), correct)

    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
        gold, system, aligned, correct = 0, 0, 0, 0

        for word in alignment.gold_words:
            gold += weight_fn(word)

        for word in alignment.system_words:
            system += weight_fn(word)

        for words in alignment.matched_words:
            aligned += weight_fn(words.gold_word)

        if key_fn is None:
            # Return score for whole aligned words
            return Score(gold, system, aligned)

        for words in alignment.matched_words:
            if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
                correct += weight_fn(words.gold_word)

        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else: # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
              not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        alignment.fill_parents()

        return alignment

    # Check that underlying character sequences do match
    if gold_ud.characters != system_ud.characters:
        index = 0
        # The bounds are checked as well: one sequence may be a prefix of
        # the other, in which case there is no differing position to stop at.
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
              gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_ud.characters[index:index + 20]),
                "".join(system_ud.characters[index:index + 20])
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    result = {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment, None),
        "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
        "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
        "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
        "UAS": alignment_score(alignment, lambda w, parent: parent),
        "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
    }

    # Add WeightedLAS if weights are given
    if deprel_weights is not None:
        def weighted_las(word):
            return deprel_weights.get(word.columns[DEPREL], 1.0)
        result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)

    return result

def load_deprel_weights(weights_file):
    """Parse an iterable of "deprel weight" lines into a dictionary.

    Returns None if `weights_file` is None. Empty lines and lines starting
    with "#" are ignored; ValueError is raised for a line that does not
    consist of two whitespace-separated columns.
    """
    if weights_file is None:
        return None

    deprel_weights = {}
    for line in weights_file:
        # Ignore comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        columns = line.rstrip("\r\n").split()
        if len(columns) != 2:
            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))

        deprel_weights[columns[0]] = float(columns[1])

    return deprel_weights

def load_conllu_file(path):
    """Load the CoNLL-U file stored at `path` (read as UTF-8 on Python 3)."""
    # load_conllu reads the file up to its end before returning, so the
    # handle can be closed as soon as it is done.
    with open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) as _file:
        return load_conllu(_file)

def evaluate_wrapper(args):
    """Load the files named by the parsed arguments and evaluate them."""
    # Load CoNLL-U files
    gold_ud = load_conllu_file(args.gold_file)
    system_ud = load_conllu_file(args.system_file)

    # Load weights if requested
    deprel_weights = load_deprel_weights(args.weights)

    return evaluate(gold_ud, system_ud, deprel_weights)

def main():
    """Command line entry point: parse the arguments, evaluate and print the metrics."""
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
                        metavar="deprel_weights_file",
                        help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
    parser.add_argument("--verbose", "-v", default=0, action="count",
                        help="Print all metrics.")
    args = parser.parse_args()

    # Use verbose if weights are supplied
    if args.weights is not None and not args.verbose:
        args.verbose = 1

    # Evaluate
    evaluation = evaluate_wrapper(args)

    # Print the evaluation
    if not args.verbose:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
    else:
        metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
        if args.weights is not None:
            metrics.append("WeightedLAS")

        print("Metrics    | Precision |    Recall |  F1 Score | AligndAcc")
        print("-----------+-----------+-----------+-----------+-----------")
        for metric in metrics:
            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                metric,
                100 * evaluation[metric].precision,
                100 * evaluation[metric].recall,
                100 * evaluation[metric].f1,
                "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
            ))

if __name__ == "__main__":
    main()

# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
+class TestAlignment(unittest.TestCase): + @staticmethod + def _load_words(words): + """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" + lines, num_words = [], 0 + for w in words: + parts = w.split(" ") + if len(parts) == 1: + num_words += 1 + lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) + else: + lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) + for part in parts[1:]: + num_words += 1 + lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) + return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) + + def _test_exception(self, gold, system): + self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) + + def _test_ok(self, gold, system, correct): + metrics = evaluate(self._load_words(gold), self._load_words(system)) + gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) + system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) + self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), + (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) + + def test_exception(self): + self._test_exception(["a"], ["b"]) + + def test_equal(self): + self._test_ok(["a"], ["a"], 1) + self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) + + def test_equal_with_multiword(self): + self._test_ok(["abc a b c"], ["a", "b", "c"], 3) + self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) + self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) + self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) + + def test_alignment(self): + self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) + self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) + self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) + self._test_ok(["a", 
"bc b c", "d"], ["a", "b", "cd"], 2) + self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) + self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) + self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1) diff --git a/bcovington/src/utils/eval.pl b/bcovington/src/utils/eval.pl new file mode 100644 index 0000000..f8dbbfd --- /dev/null +++ b/bcovington/src/utils/eval.pl @@ -0,0 +1,1827 @@ +#!/usr/bin/env perl + +# Author: Yuval Krymolowski +# Addition of precision and recall +# and of frame confusion list: Sabine Buchholz +# Addition of DEPREL + ATTACHMENT: +# Prokopis Prokopidis (prokopis at ilsp dot gr) +# Acknowledgements: +# to Markus Kuhn for suggesting the use of +# the Unicode category property + +if ($] < 5.008001) +{ + printf STDERR < -s + + This script evaluates a system output with respect to a gold standard. + Both files should be in UTF-8 encoded CoNLL-X tabular format. + + Punctuation tokens (those where all characters have the Unicode + category property "Punctuation") are ignored for scoring (unless the + -p flag is used). + + The output breaks down the errors according to their type and context. 
+ + Optional parameters: + -o FILE : output: print output to FILE (default is standard output) + -q : quiet: only print overall performance, without the details + -b : evalb: produce output in a format similar to evalb + (http://nlp.cs.nyu.edu/evalb/); use together with -q + -p : punctuation: also score on punctuation (default is not to score on it) + -v : version: show the version number + -h : help: print this help text and exit + +EOT +; + +my ($line_num) ; +my ($sep) = '0x01' ; + +my ($START) = '.S' ; +my ($END) = '.E' ; + +my ($con_err_num) = 3 ; +my ($freq_err_num) = 10 ; +my ($spec_err_loc_con) = 8 ; + +################################################################################ +### subfunctions ### +################################################################################ + +# Whether a string consists entirely of characters with the Unicode +# category property "Punctuation" (see "man perlunicode") +sub is_uni_punct +{ + my ($word) = @_ ; + + return scalar(Encode::decode_utf8($word)=~ /^\p{Punctuation}+$/) ; +} + +# The length of a unicode string, excluding non-spacing marks +# (for example vowel marks in Arabic) + +sub uni_len +{ + my ($word) = @_ ; + my ($ch, $l) ; + + $l = 0 ; + foreach $ch (split(//, Encode::decode_utf8($word))) + { + if ($ch !~ /^\p{NonspacingMark}/) + { + $l++ ; + } + } + + return $l ; +} + +sub filter_context_counts +{ # filter_context_counts + + my ($vec, $num, $max_len) = @_ ; + my ($con, $l, $thresh) ; + + $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ; + + foreach $con (keys %{$vec}) + { + if (${$vec}{$con} < $thresh) + { + delete ${$vec}{$con} ; + next ; + } + + $l = uni_len($con) ; + + if ($l > ${$max_len}) + { + ${$max_len} = $l ; + } + } + +} # filter_context_counts + +sub print_context +{ # print_context + + my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ; + my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ; + + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 
'head', 'dep', 'both' ; + printf OUT " ||" ; + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ; + printf OUT "\n" ; + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ; + @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ; + + $n = scalar @v_con ; + if (scalar @v_con_pos > $n) + { + $n = scalar @v_con_pos ; + } + + foreach $i (0 .. $n-1) + { + if (defined $v_con_pos[$i]) + { + $con_pos = $v_con_pos[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos}, + ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos}, + ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT " ||" ; + + if (defined $v_con[$i]) + { + $con = $v_con[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con}, + ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con}, + ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT "\n" ; + } + + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + printf OUT "\n\n" ; + +} # print_context + +sub num_as_word +{ + my ($num) = @_ ; + + $num = abs($num) ; + + if ($num == 1) + { + return ('one word') ; + } + elsif ($num == 2) + { + return ('two words') ; + } + elsif ($num 
== 3)
  {
    return ('three words') ;
  }
  elsif ($num == 4)
  {
    return ('four words') ;
  }
  else
  {
    return ($num.' words') ;
  }
}

# Build a human-readable description of one error type.
#
# Arguments:
#   $head_err     - '-' when the head is correct, otherwise a number that is
#                   rendered through num_as_word() and whose sign selects
#                   "before" / "after"
#   $head_aft_bef - two characters, gold code followed by system code; each
#                   is '0' (head is 0), 'e' (the focus word itself),
#                   'a' (after the focus word) or 'b' (before it)
#   $dep_err      - '-' when the deprel is correct, otherwise "gold->system"
#
# Returns the description string: head part, ', ', dependency part.
sub describe_err
{ # describe_err

  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
  my ($dep_g, $dep_s, $desc) ;
  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;

  # wording for each one-character position code
  my %position = ('0' => '0',
                  'e' => 'the focus word',
                  'a' => 'after the focus word',
                  'b' => 'before the focus word') ;

  if ($head_err eq '-')
  {
    $desc = 'correct head' ;

    if (exists $position{$head_aft_bef_s})
    {
      $desc .= ' ('.$position{$head_aft_bef_s}.')' ;
    }
  }
  elsif ($head_aft_bef_s eq '0')
  {
    $desc = 'head = 0 instead of ' ;
    if ($head_aft_bef_g eq 'a')
    {
      $desc.= 'after ' ;
    }
    if ($head_aft_bef_g eq 'b')
    {
      $desc.= 'before ' ;
    }
    $desc .= 'the focus word' ;
  }
  elsif ($head_aft_bef_g eq '0')
  {
    # The gold head is 0 here, so the direction that is reported must come
    # from the system code; testing the gold code could never match.
    $desc = 'head is ' ;
    if ($head_aft_bef_s eq 'a')
    {
      $desc.= 'after ' ;
    }
    if ($head_aft_bef_s eq 'b')
    {
      $desc.= 'before ' ;
    }
    $desc .= 'the focus word instead of 0' ;
  }
  else
  {
    $desc = num_as_word($head_err) ;
    if ($head_err < 0)
    {
      $desc .= ' before' ;
    }
    else
    {
      $desc .= ' after' ;
    }

    $desc = 'head '.$desc.' the correct head ' ;

    # where the system put the head ...
    if (exists $position{$head_aft_bef_s})
    {
      $desc .= '('.$position{$head_aft_bef_s} ;
    }

    # ... and, when the side differs, where the gold standard has it
    if ($head_aft_bef_g ne $head_aft_bef_s)
    {
      $desc .= ' instead of ' ;
      if (exists $position{$head_aft_bef_g})
      {
        $desc .= $position{$head_aft_bef_g} ;
      }
    }

    $desc .= ')' ;
  }

  $desc .= ', ' ;

  if ($dep_err eq '-')
  {
    $desc .= 'correct dependency' ;
  }
  else
  {
    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
  }

  return($desc) ;

} # describe_err

# Return the words and POS tags around position $i_w of sentence $sent
# (a reference to an array of hashes with 'word' and 'pos' keys).
#
# Result order: word at -2, -1, +1, +2, then pos at -2, -1, +1, +2.
# Positions before the first token yield $START; positions after the last
# token yield $END.
sub get_context
{ # get_context

  my ($sent, $i_w) = @_ ;
  my ($w_2, $w_1, $w1, $w2) ;
  my ($p_2, $p_1, $p1, $p2) ;

  if ($i_w >= 2)
  {
    $w_2 = ${${$sent}[$i_w-2]}{word} ;
    $p_2 = ${${$sent}[$i_w-2]}{pos} ;
  }
  else
  {
    $w_2 = $START ;
    $p_2 = $START ;
  }

  if ($i_w >= 1)
  {
    $w_1 = ${${$sent}[$i_w-1]}{word} ;
    $p_1 = ${${$sent}[$i_w-1]}{pos} ;
  }
  else
  {
    $w_1 = $START ;
    $p_1 = $START ;
  }

  if ($i_w <= scalar @{$sent}-2)
  {
    $w1 = ${${$sent}[$i_w+1]}{word} ;
    $p1 = ${${$sent}[$i_w+1]}{pos} ;
  }
  else
  {
    $w1 = $END ;
    $p1 = $END ;
  }

  if ($i_w <= scalar @{$sent}-3)
  {
    $w2 = ${${$sent}[$i_w+2]}{word} ;
    $p2 = ${${$sent}[$i_w+2]}{pos} ;
  }
  else
  {
    $w2 = $END ;
    $p2 = $END ;
  }

  return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ;

} # get_context

sub read_sent
{ # read_sent

  my ($sent_gold, $sent_sys) = @_ ;
  my ($line_g, $line_s, $new_sent) ;
  my (%fields_g, %fields_s) ;

  $new_sent = 1 ;

  @{$sent_gold} = () ;
  @{$sent_sys} = () ;

  while (1)
  {
# main reading loop + + $line_g = ; + $line_s = ; + + $line_num++ ; + + # system output has fewer lines than gold standard + if ((defined $line_g) && (! defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : past end of file\n" ; + exit(1) ; + } + + # system output has more lines than gold standard + if ((! defined $line_g) && (defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: past end of file\n" ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of file reached for both + if ((! defined $line_g) && (! defined $line_s)) + { + return (1) ; + } + + # one contains end of sentence but other one does not + if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of sentence reached + if ($line_g =~ /^\s+$/) + { + return(0) ; + } + + # now both lines contain information + + if ($new_sent) + { + $new_sent = 0 ; + } + + # 'official' column names + # options.output = ['id','form','lemma','cpostag','postag', + # 'feats','head','deprel','phead','pdeprel'] + + @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ; + + push @{$sent_gold}, { %fields_g } ; + + @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ; + + if (($fields_g{word} ne $fields_s{word}) + || + ($fields_g{pos} ne $fields_s{pos})) + { + printf STDERR "Word/pos mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + push @{$sent_sys}, { %fields_s } ; + + } # main reading loop + +} # read_sent + +################################################################################ +### main ### +################################################################################ + +our 
($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ; + +my ($sent_num, $eof, $word_num, @err_sent) ; +my (@sent_gold, @sent_sys, @starts) ; +my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ; +my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ; +my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ; +my ($loc_con, %loc_con_err_counts, %err_desc) ; +my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ; +my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ; +my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ; +my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ; +my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ; +my (%freq_err, $err) ; + +my ($i, $j, $i_w, $l, $n_args) ; +my ($w_2, $w_1, $w1, $w2) ; +my ($wp_2, $wp_1, $wp1, $wp2) ; +my ($p_2, $p_1, $p1, $p2) ; + +my ($short_output) ; +my ($score_on_punct) ; +$counts{punct} = 0; # initialize + +getopts("g:o:s:qvhpb") ; + +if (defined $opt_v) +{ + my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $'; + my @parts = split ' ',$id; + print "Version $parts[2]\n"; + exit(0); +} + +if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s))) +{ + die $usage ; +} + +if (! defined $opt_g) +{ + die "Gold standard file (-g) missing\n" ; +} + +if (! defined $opt_s) +{ + die "System output file (-s) missing\n" ; +} + +if (! 
defined $opt_o) +{ + $opt_o = '-' ; +} + +if (defined $opt_q) +{ + $short_output = 1 ; +} else { + $short_output = 0 ; +} + +if (defined $opt_p) +{ + $score_on_punct = 1 ; +} else { + $score_on_punct = 0 ; +} + +$line_num = 0 ; +$sent_num = 0 ; +$eof = 0 ; + +@err_sent = () ; +@starts = () ; + +%{$err_sent[0]} = () ; + +$max_pos_len = length('CPOS') ; + +################################################################################ +### reading input ### +################################################################################ + +open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ; +open (SYS, "<$opt_s") || die "Could not open system output file $opt_s\n" ; +open (OUT, ">$opt_o") || die "Could not open output file $opt_o\n" ; + + +if (defined $opt_b) { # produce output similar to evalb + print OUT " Sent. Attachment Correct Scoring \n"; + print OUT " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n"; + print OUT " ============================================================================\n"; +} + + +while (! $eof) +{ # main reading loop + + $starts[$sent_num] = $line_num+1 ; + $eof = read_sent(\@sent_gold, \@sent_sys) ; + + $sent_num++ ; + + %{$err_sent[$sent_num]} = () ; + $word_num = scalar @sent_gold ; + + # for accuracy per sentence + my %sent_counts = ( tot => 0, + err_any => 0, + err_head => 0 + ); + + # printf "$sent_num $word_num\n" ; + + my @frames_g = ('** '); # the initial frame for the virtual root + my @frames_s = ('** '); # the initial frame for the virtual root + foreach $i_w (0 .. $word_num-1) + { # loop on words + push @frames_g, ''; # initialize + push @frames_s, ''; # initialize + } + + foreach $i_w (0 .. $word_num-1) + { # loop on words + + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + $wp = $word.' / '.$pos ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! 
$score_on_punct) && is_uni_punct($word)) + { + $counts{punct}++ ; + # ignore punctuations + next ; + } + + if (length($pos) > $max_pos_len) + { + $max_pos_len = length($pos) ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $counts{tot}++ ; + $counts{word}{$wp}{tot}++ ; + $counts{pos}{$pos}{tot}++ ; + $counts{head}{$head_g-$i_w-1}{tot}++ ; + + # for frame confusions + # add child to frame of parent + $frames_g[$head_g] .= "$dep_g "; + $frames_s[$head_s] .= "$dep_s "; + # add to frame of token itself + $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero + $frames_s[$i_w+1] .= "*$dep_g* "; + + # for precision and recall of DEPREL + $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels + $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions + $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels + $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ... + $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output + + # for precision and recall of HEAD direction + my $dir_g; + if ($head_g == 0) { + $dir_g = 'to_root'; + } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero + # also below + $dir_g = 'left'; + } elsif ($head_g > $i_w+1) { + $dir_g = 'right'; + } else { + # token links to itself; should never happen in correct gold standard + $dir_g = 'self'; + } + my $dir_s; + if ($head_s == 0) { + $dir_s = 'to_root'; + } elsif ($head_s < $i_w+1) { + $dir_s = 'left'; + } elsif ($head_s > $i_w+1) { + $dir_s = 'right'; + } else { + # token links to itself; should not happen in good system + # (but not forbidden in shared task) + $dir_s = 'self'; + } + $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction + $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions + $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction + + # for precision and recall of HEAD distance + my $dist_g; + if ($head_g == 0) { + $dist_g = 'to_root'; + } 
elsif ( abs($head_g - ($i_w+1)) <= 1 ) { + $dist_g = '1'; # includes the 'self' cases + } elsif ( abs($head_g - ($i_w+1)) <= 2 ) { + $dist_g = '2'; + } elsif ( abs($head_g - ($i_w+1)) <= 6 ) { + $dist_g = '3-6'; + } else { + $dist_g = '7-...'; + } + my $dist_s; + if ($head_s == 0) { + $dist_s = 'to_root'; + } elsif ( abs($head_s - ($i_w+1)) <= 1 ) { + $dist_s = '1'; # includes the 'self' cases + } elsif ( abs($head_s - ($i_w+1)) <= 2 ) { + $dist_s = '2'; + } elsif ( abs($head_s - ($i_w+1)) <= 6 ) { + $dist_s = '3-6'; + } else { + $dist_s = '7-...'; + } + $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance + $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions + $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance + + + $err_head = ($head_g ne $head_s) ; # error in head + $err_dep = ($dep_g ne $dep_s) ; # error in deprel + + $head_err = '-' ; + $dep_err = '-' ; + + # for accuracy per sentence + $sent_counts{tot}++ ; + if ($err_dep || $err_head) { + $sent_counts{err_any}++ ; + } + if ($err_head) { + $sent_counts{err_head}++ ; + } + + # total counts and counts for CPOS involved in errors + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 
'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + + $err_sent[$sent_num]{head}++ ; + $counts{err_head}{tot}++ ; + $counts{err_head}{$head_err}++ ; + + $counts{word}{err_head}{$wp}++ ; + $counts{pos}{$pos}{err_head}{tot}++ ; + $counts{pos}{$pos}{err_head}{$head_err}++ ; + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + $err_sent[$sent_num]{dep}++ ; + $counts{err_dep}{tot}++ ; + $counts{err_dep}{$dep_err}++ ; + + $counts{word}{err_dep}{$wp}++ ; + $counts{pos}{$pos}{err_dep}{tot}++ ; + $counts{pos}{$pos}{err_dep}{$dep_err}++ ; + + if ($err_head) + { + $counts{err_both}++ ; + $counts{pos}{$pos}{err_both}++ ; + } + } + + ### DEPREL + ATTACHMENT + if ((!$err_dep) && ($err_head)) { + $counts{err_head_corr_dep}{tot}++ ; + $counts{err_head_corr_dep}{$dep_s}++ ; + } + ### DEPREL + ATTACHMENT + + # counts for words involved in errors + + if (! ($err_head || $err_dep)) + { + next ; + } + + $err_sent[$sent_num]{word}++ ; + $counts{err_any}++ ; + $counts{word}{err_any}{$wp}++ ; + $counts{pos}{$pos}{err_any}++ ; + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + if ($w_2 ne $START) + { + $wp_2 = $w_2.' / '.$p_2 ; + } + else + { + $wp_2 = $w_2 ; + } + + if ($w_1 ne $START) + { + $wp_1 = $w_1.' / '.$p_1 ; + } + else + { + $wp_1 = $w_1 ; + } + + if ($w1 ne $END) + { + $wp1 = $w1.' / '.$p1 ; + } + else + { + $wp1 = $w1 ; + } + + if ($w2 ne $END) + { + $wp2 = $w2.' / '.$p2 ; + } + else + { + $wp2 = $w2 ; + } + + $con_bef = $wp_1 ; + $con_bef_2 = $wp_2.' + '.$wp_1 ; + $con_aft = $wp1 ; + $con_aft_2 = $wp1.' 
+ '.$wp2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + if ($w_1 ne $START) + { + # do not count '.S' as a word context + $counts{con_bef_2}{tot}{$con_bef_2}++ ; + $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ; + $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ; + $counts{con_bef}{tot}{$con_bef}++ ; + $counts{con_bef}{err_head}{$con_bef} += $err_head ; + $counts{con_bef}{err_dep}{$con_bef} += $err_dep ; + } + + if ($w1 ne $END) + { + # do not count '.E' as a word context + $counts{con_aft_2}{tot}{$con_aft_2}++ ; + $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ; + $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ; + $counts{con_aft}{tot}{$con_aft}++ ; + $counts{con_aft}{err_head}{$con_aft} += $err_head ; + $counts{con_aft}{err_dep}{$con_aft} += $err_dep ; + } + + $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ; + $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ; + $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ; + $counts{con_pos_bef}{tot}{$con_pos_bef}++ ; + $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ; + $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ; + + $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ; + $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ; + $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ; + $counts{con_pos_aft}{tot}{$con_pos_aft}++ ; + $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ; + $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ; + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + $freq_err{$err}++ ; + + } # loop on words + + foreach $i_w (0 .. 
$word_num) # including one for the virtual root + { # loop on words + if ($frames_g[$i_w] ne $frames_s[$i_w]) { + $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ; + } + } + + if (defined $opt_b) { # produce output similar to evalb + if ($word_num > 0) { + my ($unlabeled,$labeled) = ('NaN', 'NaN'); + if ($sent_counts{tot} > 0) { # there are scoring tokens + $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot}; + $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot}; + } + printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n", + $sent_num, $word_num, + $unlabeled, $labeled, + $sent_counts{tot}-$sent_counts{err_head}, + $sent_counts{tot}-$sent_counts{err_any}, + $sent_counts{tot},; + } + } + +} # main reading loop + +################################################################################ +### printing output ### +################################################################################ + +if (defined $opt_b) { # produce output similar to evalb + print OUT "\n\n"; +} +printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ; +printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ; +printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ; + +if ($short_output) +{ + exit(0) ; +} +printf OUT "\n %s\n\n", '=' x 80 ; +printf OUT " Evaluation of the results in %s\n vs. 
gold standard %s:\n\n", $opt_s, $opt_g ; + +printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ; + +printf OUT " Number of non-scoring tokens: $counts{punct}\n\n"; + +printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Accuracy', 'words', 'right', 'right', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + ' ', ' ', 'head', ' dep', 'right' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! defined($counts{pos}{$pos}{err_head}{tot})) + { + $counts{pos}{$pos}{err_head}{tot} = 0 ; + } + if (! defined($counts{pos}{$pos}{err_dep}{tot})) + { + $counts{pos}{$pos}{err_dep}{tot} = 0 ; + } + if (! 
defined($counts{pos}{$pos}{err_any})) + { + $counts{pos}{$pos}{err_any} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ; +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT "\n\n" ; + +printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Error', 'words', 'head', ' dep', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + + 'Rate', ' ', 'err', ' err', 'wrong' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot}, + $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! 
defined($counts{pos}{$pos}{err_both})) + { + $counts{pos}{$pos}{err_both} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ; + +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +### added by Sabine Buchholz +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +### DEPREL + ATTACHMENT: +### Same as Sabine's DEPREL apart from $tot_corr calculation +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + if (defined($counts{err_head_corr_dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep}; + } else { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} +### DEPREL + ATTACHMENT + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD direction\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dir ('to_root', 'left', 'right', 'self') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dir2}{$dir}{$dir})) { + $tot_corr = $counts{dir2}{$dir}{$dir}; + } + if (defined($counts{dir_g}{$dir}{tot})) { + $tot_g = $counts{dir_g}{$dir}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dir_s}{$dir}{tot})) { + $tot_s = $counts{dir_s}{$dir}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD distance\n\n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dist ('to_root', '1', '2', '3-6', '7-...') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dist2}{$dist}{$dist})) { + $tot_corr = $counts{dist2}{$dist}{$dist}; + } + if (defined($counts{dist_g}{$dist}{tot})) { + $tot_g = $counts{dist_g}{$dist}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dist_s}{$dist}{tot})) { + $tot_s = $counts{dist_s}{$dist}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n"; +foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}}) +{ + if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later) + { + printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame; + } +} +### end of: added by Sabine Buchholz + + +# +# Leave only the 5 words mostly involved in errors +# + + +$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ; + +# ensure enough space for title +$max_word_len = length('word') ; + +foreach $word (keys %{$counts{word}{err_any}}) +{ + if ($counts{word}{err_any}{$word} < $thresh) + { + delete $counts{word}{err_any}{$word} ; + next ; + } + + $l = uni_len($word) ; + if ($l > $max_word_len) + { + $max_word_len = $l ; + } +} + +# filter a case when the difference between the error counts +# for 2-word and 1-word contexts is small +# (leave the 2-word context) + +foreach $con (keys %{$counts{con_aft_2}{tot}}) +{ + ($w1) = split(/\+/, $con) ; + + if (defined $counts{con_aft}{tot}{$w1} && + 
$counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1) + { + delete $counts{con_aft}{tot}{$w1} ; + } +} + +foreach $con (keys %{$counts{con_bef_2}{tot}}) +{ + ($w_2, $w_1) = split(/\+/, $con) ; + + if (defined $counts{con_bef}{tot}{$w_1} && + $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1) + { + delete $counts{con_bef}{tot}{$w_1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + ($p1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_aft}{tot}{$p1}) && + $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_aft}{tot}{$p1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + ($p_2, $p_1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_bef}{tot}{$p_1}) && + $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_bef}{tot}{$p_1} ; + } +} + +# for each context type, take the three contexts most involved in errors + +$max_con_len = 0 ; + +filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ; + +# for each CPOS context type, take the three CPOS contexts most involved in errors + +$max_con_pos_len = 0 ; + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef}{tot}}) +{ + if 
($counts{con_pos_bef}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft}{tot}}) +{ + if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +# printing + +# ------------- focus words + +printf OUT "\n\n" ; +printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ; + +printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ; +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}}) +{ + if (!defined($counts{word}{err_head}{$word})) + { + $counts{word}{err_head}{$word} = 0 ; + } + if (! defined($counts{word}{err_dep}{$word})) + { + $counts{word}{err_dep}{$word} = 0 ; + } + if (! 
defined($counts{word}{err_any}{$word})) + { + $counts{word}{err_any}{$word} = 0; + } + printf OUT " %-*s | %4d | %4d | %4d | %4d\n", + $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word}, + $counts{word}{err_head}{$word}, + $counts{word}{err_dep}{$word}, + $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ; +} + +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +# ------------- contexts + +printf OUT "\n\n" ; + +printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ; + +printf OUT " one-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ; + +# ------------- Sentences + +printf OUT " Sentence with the highest number of word errors:\n" ; +$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word}) + <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of head errors:\n" ; +$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) + <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. 
$sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of dependency errors:\n" ; +$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) + <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +# +# Second pass, collect statistics of the frequent errors +# + +# filter the errors, leave the most frequent $freq_err_num errors + +$i = 0 ; + +$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ; + +foreach $err (keys %freq_err) +{ + if ($freq_err{$err} < $thresh) + { + delete $freq_err{$err} ; + } +} + +# in case there are several errors with the threshold count + +$freq_err_num = scalar keys %freq_err ; + +%err_counts = () ; + +$eof = 0 ; + +seek (GOLD, 0, 0) ; +seek (SYS, 0, 0) ; + +while (! $eof) +{ # second reading loop + + $eof = read_sent(\@sent_gold, \@sent_sys) ; + $sent_num++ ; + + $word_num = scalar @sent_gold ; + + # printf "$sent_num $word_num\n" ; + + foreach $i_w (0 .. $word_num-1) + { # loop on words + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! $score_on_punct) && is_uni_punct($word)) + { + # ignore punctuations + next ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $err_head = ($head_g ne $head_s) ; + $err_dep = ($dep_g ne $dep_s) ; + + $head_err = '-' ; + $dep_err = '-' ; + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 
'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + } + + if (! ($err_head || $err_dep)) + { + next ; + } + + # handle only the most frequent errors + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + + if (! exists $freq_err{$err}) + { + next ; + } + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + $con_bef = $w_1 ; + $con_bef_2 = $w_2.' + '.$w_1 ; + $con_aft = $w1 ; + $con_aft_2 = $w1.' + '.$w2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ; + + # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n", + # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ; + + @bits = (0, 0, 0, 0, 0, 0) ; + $j = 0 ; + + while ($j == 0) + { + for ($i = 0; $i <= $#bits; $i++) + { + if ($bits[$i] == 0) + { + $bits[$i] = 1 ; + $j = 0 ; + last ; + } + else + { + $bits[$i] = 0 ; + $j = 1 ; + } + } + + @e_bits = @cur_err ; + + for ($i = 0; $i <= $#bits; $i++) + { + if (! 
$bits[$i]) + { + $e_bits[$i] = '*' ; + } + } + + # include also the last case which is the most general + # (wildcards for everything) + $err_counts{$err}{join($sep, @e_bits)}++ ; + + } + + } # loop on words +} # second reading loop + +printf OUT "\n\n" ; +printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ; +printf OUT "\n %s\n", '=' x 41 ; + + +# deleting local contexts which are too general + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + @cur_err = split(/\Q$sep\E/, $loc_con) ; + + # In this loop, one or two elements of the local context are + # replaced with '*' to make it more general. If the entry for + # the general context has the same count it is removed. + + foreach $i (0 .. $#cur_err) + { + $w1 = $cur_err[$i] ; + if ($cur_err[$i] eq '*') + { + next ; + } + $cur_err[$i] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + for ($j = $i+1; $j <=$#cur_err; $j++) + { + if ($cur_err[$j] eq '*') + { + next ; + } + $w2 = $cur_err[$j] ; + $cur_err[$j] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + $cur_err[$j] = $w2 ; + } + $cur_err[$i] = $w1 ; + } + } +} + +# Leaving only the topmost local contexts for each error + +foreach $err (keys %err_counts) +{ + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ; + + # if the threshold is too low, take the 2nd highest count + # (the highest may be the total which is the generic case + # and not relevant for printing) + + if ($thresh < 5) + { + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ; + } + + 
foreach $loc_con (keys %{$err_counts{$err}}) + { + if ($err_counts{$err}{$loc_con} < $thresh) + { + delete $err_counts{$err}{$loc_con} ; + } + else + { + if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*'))) + { + $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ; + } + } + } +} + +# printing an error summary + +# calculating the context field length + +$max_word_spec_len= length('word') ; +$max_con_aft_len = length('word') ; +$max_con_bef_len = length('word') ; +$max_con_pos_len = length('CPOS') ; + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort keys %{$err_counts{$err}}) + { + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $loc_con) ; + + $l = uni_len($word) ; + if ($l > $max_word_spec_len) + { + $max_word_spec_len = $l ; + } + + $l = uni_len($con_bef) ; + if ($l > $max_con_bef_len) + { + $max_con_bef_len = $l ; + } + + $l = uni_len($con_aft) ; + if ($l > $max_con_aft_len) + { + $max_con_aft_len = $l ; + } + + if (length($con_pos_aft) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_aft) ; + } + + if (length($con_pos_bef) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_bef) ; + } + } +} + +$err_counter = 0 ; + +foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err) +{ + + ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ; + + $err_counter++ ; + $err_desc{$err} = sprintf("%2d. ", $err_counter). 
+ describe_err($head_err, $head_aft_bef, $dep_err) ; + + # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ; + printf OUT "\n" ; + printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s | %s\n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After', + 'Count' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*'))) + { + next ; + } + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft, + $err_counts{$err}{$loc_con} ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + +} + +printf OUT "\n\n" ; +printf 
OUT " Local contexts involved in several frequent errors:" ; +printf OUT "\n %s\n", '=' x 51 ; +printf OUT "\n\n" ; + +foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=> + scalar keys %{$loc_con_err_counts{$a}}} + keys %loc_con_err_counts) +{ + + if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1) + { + next ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s \n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=> + $loc_con_err_counts{$loc_con}{$a}} + keys %{$loc_con_err_counts{$loc_con}}) + { + printf OUT " %s : %d times\n", 
$err_desc{$err}, + $loc_con_err_counts{$loc_con}{$err} ; + } + + printf OUT "\n" ; +} + +close GOLD ; +close SYS ; + +close OUT ; + diff --git a/bcovington/src/utils/weights.clas b/bcovington/src/utils/weights.clas new file mode 100644 index 0000000..eee7ac6 --- /dev/null +++ b/bcovington/src/utils/weights.clas @@ -0,0 +1,11 @@ +# Relations used to attach function words to content words +aux 0.1 +case 0.1 +cc 0.1 +clf 0.1 +cop 0.1 +det 0.1 +mark 0.1 + +# Punctuation +punct 0