From 41b15ff56875d92ae9a9961647b9ad5039efecb8 Mon Sep 17 00:00:00 2001
From: valedica
Date: Fri, 11 Mar 2022 10:24:50 +0100
Subject: [PATCH 1/3] Fix punctuation in average embedding

---
 neuralcoref/neuralcoref.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neuralcoref/neuralcoref.pyx b/neuralcoref/neuralcoref.pyx
index a137d50..a5a4602 100644
--- a/neuralcoref/neuralcoref.pyx
+++ b/neuralcoref/neuralcoref.pyx
@@ -893,7 +893,7 @@ cdef class NeuralCoref(object):
         cdef int n = 0
         embed_arr = numpy.zeros(self.static_vectors.shape[1], dtype='float32')
         for token in span:
-            if token.lower not in PUNCTS:
+            if not inside(token.lower, self.hashes.puncts):
                 n += 1
                 embed_vector = self.get_word_embedding(token, tuned=False)
                 embed_arr = embed_arr + embed_vector

From 9016065889b182321b6aca933f915613483eb4cc Mon Sep 17 00:00:00 2001
From: valedica
Date: Fri, 11 Mar 2022 11:00:28 +0100
Subject: [PATCH 2/3] Add doc embeddings calculation during inference

---
 neuralcoref/neuralcoref.pyx | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/neuralcoref/neuralcoref.pyx b/neuralcoref/neuralcoref.pyx
index a5a4602..8ca8bcb 100644
--- a/neuralcoref/neuralcoref.pyx
+++ b/neuralcoref/neuralcoref.pyx
@@ -714,7 +714,7 @@ cdef class NeuralCoref(object):
         # if debug: print("Build single features and pair features arrays")
         # ''' Build single features and pair features arrays '''
         doc_c = doc.c
-        doc_embedding = numpy.zeros(SIZE_EMBEDDING, dtype='float32') # self.embeds.get_average_embedding(doc.c, 0, doc.length + 1, self.hashes.puncts)
+        doc_embedding = self.get_doc_embedding(doc)
         doc_embed = doc_embedding
         for i in range(n_mentions):
             s_inp_arr[i, :SGNL_FEATS_0] = self.get_mention_embeddings(mentions[i], doc_embedding) # Set embeddings
@@ -871,6 +871,14 @@ cdef class NeuralCoref(object):
 
     def normalize(self, Token token):
         return self.hashes.digit_word if token.is_digit else token.lower
+
+    def get_doc_embedding(self, Doc doc):
+        embed_arr = numpy.zeros(self.static_vectors.shape[1], dtype='float32')
+        for sent in doc.sents:
+            utt_embed = self.get_average_embedding(sent)
+            embed_arr += utt_embed
+        embed_arr = numpy.divide(embed_arr, float(max(len(list(doc.sents)), 1)))
+        return embed_arr
 
     def get_static(self, hash_t word):
         return self.static_vectors[word] if word in self.static_vectors else self.static_vectors[self.hashes.unknown_word]

From 11215dcd335a932713977f8fb97a0c45d82edfb2 Mon Sep 17 00:00:00 2001
From: valedica
Date: Fri, 11 Mar 2022 11:28:03 +0100
Subject: [PATCH 3/3] Revert "Add doc embeddings calculation during inference"

This reverts commit 9016065889b182321b6aca933f915613483eb4cc.

---
 neuralcoref/neuralcoref.pyx | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/neuralcoref/neuralcoref.pyx b/neuralcoref/neuralcoref.pyx
index 8ca8bcb..a5a4602 100644
--- a/neuralcoref/neuralcoref.pyx
+++ b/neuralcoref/neuralcoref.pyx
@@ -714,7 +714,7 @@ cdef class NeuralCoref(object):
         # if debug: print("Build single features and pair features arrays")
         # ''' Build single features and pair features arrays '''
         doc_c = doc.c
-        doc_embedding = self.get_doc_embedding(doc)
+        doc_embedding = numpy.zeros(SIZE_EMBEDDING, dtype='float32') # self.embeds.get_average_embedding(doc.c, 0, doc.length + 1, self.hashes.puncts)
         doc_embed = doc_embedding
         for i in range(n_mentions):
             s_inp_arr[i, :SGNL_FEATS_0] = self.get_mention_embeddings(mentions[i], doc_embedding) # Set embeddings
@@ -871,14 +871,6 @@ cdef class NeuralCoref(object):
 
     def normalize(self, Token token):
         return self.hashes.digit_word if token.is_digit else token.lower
-
-    def get_doc_embedding(self, Doc doc):
-        embed_arr = numpy.zeros(self.static_vectors.shape[1], dtype='float32')
-        for sent in doc.sents:
-            utt_embed = self.get_average_embedding(sent)
-            embed_arr += utt_embed
-        embed_arr = numpy.divide(embed_arr, float(max(len(list(doc.sents)), 1)))
-        return embed_arr
 
     def get_static(self, hash_t word):
         return self.static_vectors[word] if word in self.static_vectors else self.static_vectors[self.hashes.unknown_word]
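For reference, a minimal standalone sketch of the averaging scheme that PATCH 2/3 introduces and PATCH 3/3 reverts: each sentence embedding is the mean of its non-punctuation token vectors, and the document embedding is the mean of the sentence embeddings. This is not part of the patches; DIM, PUNCTS, vectors, word_vector, average_embedding, and doc_embedding are illustrative stand-ins for the static_vectors table, hash sets, and methods used inside neuralcoref.pyx.

import numpy as np

DIM = 300                                  # assumed embedding dimensionality
PUNCTS = {".", ",", "!", "?", ";", ":"}    # assumed punctuation set
vectors = {}                               # word -> np.ndarray lookup (stand-in for static_vectors)

def word_vector(word):
    # Unknown words fall back to a zero vector (neuralcoref looks up an unknown-word vector instead).
    return vectors.get(word, np.zeros(DIM, dtype="float32"))

def average_embedding(tokens):
    # Mean of the non-punctuation token vectors, guarding against empty spans.
    embed = np.zeros(DIM, dtype="float32")
    n = 0
    for tok in tokens:
        if tok not in PUNCTS:
            n += 1
            embed += word_vector(tok)
    return embed / max(n, 1)

def doc_embedding(sentences):
    # Mean of the per-sentence embeddings, mirroring get_doc_embedding from PATCH 2/3.
    embed = np.zeros(DIM, dtype="float32")
    for sent in sentences:
        embed += average_embedding(sent)
    return embed / max(len(sentences), 1)

# Example with two pre-tokenized sentences:
print(doc_embedding([["Hello", "world", "!"], ["How", "are", "you", "?"]]).shape)  # (300,)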