
Commit f651015

ygor_gallina authored
Fix 0 division error with overlapping candidates (#177)
* Dealt with overlapping candidates
* Fixed PEP8 warnings
1 parent 485c938 commit f651015
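
In build_topic_graph, edge weights accumulate 1.0 / gap, where gap is the distance between two candidate occurrences minus the length of the earlier candidate. When one candidate's occurrence overlaps the other's span, that subtraction drives gap to zero, so the weighting raised a ZeroDivisionError; the patch clamps the gap to 1 in that case.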

File tree:
pke/unsupervised/graph_based/multipartiterank.py
pke/unsupervised/graph_based/topicrank.py

2 files changed: +47 -21 lines

pke/unsupervised/graph_based/multipartiterank.py (+24 -13)
@@ -50,9 +50,9 @@ class MultipartiteRank(TopicRank):
         stoplist += stopwords.words('english')
         extractor.candidate_selection(pos=pos, stoplist=stoplist)

-        # 4. build the Multipartite graph and rank candidates using random walk,
-        #    alpha controls the weight adjustment mechanism, see TopicRank for
-        #    threshold/method parameters.
+        # 4. build the Multipartite graph and rank candidates using random
+        #    walk, alpha controls the weight adjustment mechanism, see
+        #    TopicRank for threshold/method parameters.
         extractor.candidate_weighting(alpha=1.1,
                                       threshold=0.74,
                                       method='average')
@@ -82,7 +82,7 @@ def topic_clustering(self,
         Args:
             threshold (float): the minimum similarity for clustering,
                 defaults to 0.74, i.e. more than 1/4 of stem overlap
-                 similarity.
+                similarity.
             method (str): the linkage method, defaults to average.
         """


@@ -125,7 +125,8 @@ def build_topic_graph(self):
         for node_i, node_j in combinations(self.candidates.keys(), 2):

             # discard intra-topic edges
-            if self.topic_identifiers[node_i] == self.topic_identifiers[node_j]:
+            if self.topic_identifiers[node_i] \
+                    == self.topic_identifiers[node_j]:
                 continue

             weights = []
@@ -136,14 +137,23 @@ def build_topic_graph(self):
                     gap = abs(p_i - p_j)

                     # alter gap according to candidate length
+                    # if candidates overlap gap is 1
                     if p_i < p_j:
-                        gap -= len(self.candidates[node_i].lexical_form) - 1
+                        len_i = len(self.candidates[node_i].lexical_form)
+                        if gap < len_i:
+                            gap = 1
+                        else:
+                            gap -= len_i - 1
                     if p_j < p_i:
-                        gap -= len(self.candidates[node_j].lexical_form) - 1
+                        len_j = len(self.candidates[node_j].lexical_form)
+                        if gap < len_j:
+                            gap = 1
+                        else:
+                            gap -= len_j - 1

                     weights.append(1.0 / gap)

-                # add weighted edges
+            # add weighted edges
             if weights:
                 # node_i -> node_j
                 self.graph.add_edge(node_i, node_j, weight=sum(weights))
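
The hunk above is easiest to check with concrete numbers. Below is a minimal standalone sketch of the before/after arithmetic; the function names and example offsets are illustrative, not part of the pke API:

def gap_weight_old(p_i, p_j, length):
    # pre-fix: assumes the earlier candidate never overlaps the later one
    gap = abs(p_i - p_j)
    gap -= length - 1
    return 1.0 / gap  # ZeroDivisionError once gap reaches 0

def gap_weight_new(p_i, p_j, length):
    # post-fix: overlapping candidates collapse to gap = 1
    gap = abs(p_i - p_j)
    if gap < length:
        gap = 1
    else:
        gap -= length - 1
    return 1.0 / gap

# A 2-word candidate at offset 3 overlaps a candidate at offset 4:
# abs(3 - 4) = 1 and 1 - (2 - 1) = 0, so the old code divided by zero.
print(gap_weight_new(3, 4, 2))  # 1.0
# gap_weight_old(3, 4, 2)       # would raise ZeroDivisionError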
@@ -154,8 +164,8 @@ def weight_adjustment(self, alpha=1.1):
        """ Adjust edge weights for boosting some candidates.

        Args:
-            alpha (float): hyper-parameter that controls the strength of the
-                weight adjustment, defaults to 1.1.
+            alpha (float): hyper-parameter that controls the strength of
+                the weight adjustment, defaults to 1.1.
        """

        # weighted_edges = defaultdict(list)
@@ -195,7 +205,8 @@ def weight_adjustment(self, alpha=1.1):
            node_i, node_j = nodes
            position_i = 1.0 / (1 + self.candidates[node_i].offsets[0])
            position_i = math.exp(position_i)
-            self.graph[node_j][node_i]['weight'] += (boosters * alpha * position_i)
+            self.graph[node_j][node_i]['weight'] += (
+                boosters * alpha * position_i)

    def candidate_weighting(self,
                            threshold=0.74,
@@ -207,8 +218,8 @@ def candidate_weighting(self,
            threshold (float): the minimum similarity for clustering,
                defaults to 0.25.
            method (str): the linkage method, defaults to average.
-            alpha (float): hyper-parameter that controls the strength of the
-                weight adjustment, defaults to 1.1.
+            alpha (float): hyper-parameter that controls the strength of
+                the weight adjustment, defaults to 1.1.
        """
        if not self.candidates:
            return

pke/unsupervised/graph_based/topicrank.py (+23 -8)
@@ -97,9 +97,11 @@ def candidate_selection(self, pos=None, stoplist=None):
            stoplist = self.stoplist

        # filter candidates containing stopwords or punctuation marks
-        self.candidate_filtering(stoplist=list(string.punctuation) +
-                                 ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-'] +
-                                 stoplist)
+        self.candidate_filtering(stoplist=(
+            list(string.punctuation)
+            + ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+            + stoplist
+        ))

    def vectorize_candidates(self):
        """Vectorize the keyphrase candidates.
@@ -175,11 +177,24 @@ def build_topic_graph(self):
                for c_j in self.topics[j]:
                    for p_i in self.candidates[c_i].offsets:
                        for p_j in self.candidates[c_j].offsets:
+                            # compute gap
                            gap = abs(p_i - p_j)
+
+                            # alter gap according to candidate length
+                            # if candidates overlap gap is 1
                            if p_i < p_j:
-                                gap -= len(self.candidates[c_i].lexical_form) - 1
+                                len_i = len(self.candidates[c_i].lexical_form)
+                                if gap < len_i:
+                                    gap = 1
+                                else:
+                                    gap -= len_i - 1
                            if p_j < p_i:
-                                gap -= len(self.candidates[c_j].lexical_form) - 1
+                                len_j = len(self.candidates[c_j].lexical_form)
+                                if gap < len_j:
+                                    gap = 1
+                                else:
+                                    gap -= len_j - 1
+
                            self.graph[i][j]['weight'] += 1.0 / gap

    def candidate_weighting(self,
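
TopicRank receives the same guard as MultipartiteRank. Since both files now share this arithmetic, a quick standalone property check (illustrative names, not part of the patch) can confirm that the adjusted gap never reaches zero, whatever the offsets and candidate length:

import random

def adjusted_gap(p_i, p_j, length):
    # 'length' stands in for the lexical length of the earlier candidate
    gap = abs(p_i - p_j)
    if gap < length:
        return 1
    return gap - (length - 1)

for _ in range(10_000):
    p_i, p_j = sorted(random.sample(range(100), 2))
    assert adjusted_gap(p_i, p_j, random.randint(1, 10)) >= 1

If gap < length the function returns 1 directly; otherwise gap >= length implies gap - (length - 1) >= 1, so 1.0 / gap is always well defined.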
@@ -193,9 +208,9 @@ def candidate_weighting(self,
                to 0.74.
            method (str): the linkage method, defaults to average.
            heuristic (str): the heuristic for selecting the best candidate for
-                each topic, defaults to first occurring candidate. Other options
-                are 'frequent' (most frequent candidate, position is used for
-                ties).
+                each topic, defaults to first occurring candidate. Other
+                options are 'frequent' (most frequent candidate, position is
+                used for ties).

        """
        if not self.candidates: