Skip to content

Support for Python2.7 #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ To train a deep learning model, say Seq2SeqSummarizer, run the following command
```bash
pip install -r requirements.txt

cd demo
python seq2seq_train.py
python demo/seq2seq_train.py
```

The training code in seq2seq_train.py is quite straightforward and illustrated below:
Expand Down
10 changes: 10 additions & 0 deletions demo/data/test1
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
An Indian-American woman has been elected as the president of the powerful students' body of the prestigious Harvard University.
Sruthi Palaniappan, 20, whose parents migrated to the US from Chennai in 1992, was elected as president of the Harvard University Undergraduate Council.
Her running mate Julia Huesa, 20, was elected as vice president, according to an announcement by the Undergraduate Council Election Commission.
Ms Palaniappan said that she and Ms Huesa planned to work on improving the Council's communication with the student body in their initial days in office.
"I think from the onset, better structuring the way we communicate with students is something that we need to already set the tone and plan for," she said.
"I think we're going to work on it before we even leave for break and just get off the ground running," she told Harvard Crimson, the student newspaper of the varsity.
Ms Palaniappan was the youngest delegate at the Democratic National Convention in Philadelphia in July 2016.
According to the report, Ms Palaniappan and Ms Huesa garnered nearly 41.5 per cent of the votes as against their nearest opponent Nadine M Khoury and Arnav Agrawal, who received 26.6 per cent of the votes.
They ran their campaign under the slogan "Make Harvard Home".
The duo, both long-time members of the Undergraduate Council, are scheduled to take over from the outgoing president Catherine L Zhang '19 and vice president Nicholas D Boucher '19.
201 changes: 201 additions & 0 deletions demo/data/wiki_v_small.csv

Large diffs are not rendered by default.

68 changes: 34 additions & 34 deletions demo/seq2seq_predict.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
from __future__ import print_function

import pandas as pd
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
import numpy as np


def main():
np.random.seed(42)
data_dir_path = './data'
model_dir_path = './models'

print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
X = df['text']
Y = df.title

config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item()

summarizer = Seq2SeqSummarizer(config)
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

print('start predicting ...')
for i in np.random.permutation(np.arange(len(X)))[0:20]:
x = X[i]
actual_headline = Y[i]
headline = summarizer.summarize(x)
# print('Article: ', x)
print('Generated Headline: ', headline)
print('Original Headline: ', actual_headline)


if __name__ == '__main__':
main()
from __future__ import print_function
import pandas as pd
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
import numpy as np
def main():
np.random.seed(42)
data_dir_path = './demo/data'
model_dir_path = './demo/models'
print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/wiki_v_small.csv") # "/fake_or_real_news.csv")
X = df['text']
Y = df['summary']
config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item()
summarizer = Seq2SeqSummarizer(config)
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))
print('start predicting ...')
for i in np.random.permutation(np.arange(len(X)))[0:20]:
x = X[i]
actual_headline = Y[i]
headline = summarizer.summarize(x)
# print('Article: ', x)
print('Generated Headline: ', headline)
print('Original Headline: ', actual_headline)
if __name__ == '__main__':
main()
96 changes: 48 additions & 48 deletions demo/seq2seq_train.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,48 @@
from __future__ import print_function

import pandas as pd
from sklearn.model_selection import train_test_split
from keras_text_summarization.library.utility.plot_utils import plot_and_save_history
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
from keras_text_summarization.library.applications.fake_news_loader import fit_text
import numpy as np

LOAD_EXISTING_WEIGHTS = False


def main():
np.random.seed(42)
data_dir_path = './data'
report_dir_path = './reports'
model_dir_path = './models'

print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

print('extract configuration from input texts ...')
Y = df.title
X = df['text']

config = fit_text(X, Y)

summarizer = Seq2SeqSummarizer(config)

if LOAD_EXISTING_WEIGHTS:
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

print('demo size: ', len(Xtrain))
print('testing size: ', len(Xtest))

print('start fitting ...')
history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
if LOAD_EXISTING_WEIGHTS:
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})


if __name__ == '__main__':
main()
from __future__ import print_function
import pandas as pd
from sklearn.model_selection import train_test_split
from keras_text_summarization.library.utility.plot_utils import plot_and_save_history
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
from keras_text_summarization.library.applications.fake_news_loader import fit_text
import numpy as np
LOAD_EXISTING_WEIGHTS = False
def main():
np.random.seed(42)
data_dir_path = './demo/data'
report_dir_path = './demo/reports'
model_dir_path = './demo/models'
print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/wikihowAll.csv")
print('extract configuration from input texts ...')
Y = df['summary']
X = df['text']
config = fit_text(X, Y)
summarizer = Seq2SeqSummarizer(config)
if LOAD_EXISTING_WEIGHTS:
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
print('demo size: ', len(Xtrain))
print('testing size: ', len(Xtest))
print('start fitting ...')
history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
if LOAD_EXISTING_WEIGHTS:
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
if __name__ == '__main__':
main()
134 changes: 67 additions & 67 deletions keras_text_summarization/library/applications/fake_news_loader.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,67 @@
from collections import Counter

MAX_INPUT_SEQ_LENGTH = 500
MAX_TARGET_SEQ_LENGTH = 50
MAX_INPUT_VOCAB_SIZE = 5000
MAX_TARGET_VOCAB_SIZE = 2000


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None):
    """Scan source texts and target summaries and build the seq2seq config.

    Parameters
    ----------
    X : iterable of str
        Source texts (articles); tokenised by splitting on single spaces
        and lowercased.
    Y : iterable of str
        Target summaries/headlines; each is wrapped as ``START ... END``
        (markers kept uppercase so they cannot collide with lowercased
        content words).
    input_seq_max_length : int, optional
        Truncation length for source token sequences
        (default ``MAX_INPUT_SEQ_LENGTH``).
    target_seq_max_length : int, optional
        Truncation length for target token sequences
        (default ``MAX_TARGET_SEQ_LENGTH``).

    Returns
    -------
    dict
        Keys: ``input_word2idx``, ``input_idx2word``, ``target_word2idx``,
        ``target_idx2word``, ``num_input_tokens``, ``num_target_tokens``,
        ``max_input_seq_length``, ``max_target_seq_length``.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH
    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    for line in X:
        # Lowercase, then truncate to the input length cap; the max length
        # recorded below is therefore post-truncation.
        text = [word.lower() for word in line.split(' ')][:input_seq_max_length]
        input_counter.update(text)
        max_input_seq_length = max(max_input_seq_length, len(text))

    for line in Y:
        # Wrap with START/END markers before truncation so END may be
        # dropped on over-long targets (same behaviour as before).
        text = ('START ' + line.lower() + ' END').split(' ')[:target_seq_max_length]
        target_counter.update(text)
        max_target_seq_length = max(max_target_seq_length, len(text))

    # Input vocabulary: indices 0/1 reserved for PAD/UNK, most frequent
    # words start at 2.  Assigning PAD/UNK after the comprehension keeps
    # the original overwrite semantics if those literals occur as words.
    input_word2idx = {word: idx + 2
                      for idx, (word, _) in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE))}
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {idx: word for word, idx in input_word2idx.items()}

    # Target vocabulary: only index 0 reserved (UNK); no PAD entry.
    target_word2idx = {word: idx + 1
                       for idx, (word, _) in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE))}
    target_word2idx['UNK'] = 0
    target_idx2word = {idx: word for word, idx in target_word2idx.items()}

    return {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }
from collections import Counter

MAX_INPUT_SEQ_LENGTH = 500
MAX_TARGET_SEQ_LENGTH = 50
MAX_INPUT_VOCAB_SIZE = 5000
MAX_TARGET_VOCAB_SIZE = 2000


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None):
    """Count vocabulary over texts (X) and summaries (Y) and return the
    configuration dict consumed by the seq2seq summarizer models.

    Targets are wrapped as ``START ... END``; input indices reserve 0/1
    for PAD/UNK, target indices reserve 0 for UNK.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH

    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    # Tally lowercased source tokens, truncated to the input cap.
    for article in X:
        tokens = [w.lower() for w in article.split(' ')]
        if len(tokens) > input_seq_max_length:
            tokens = tokens[:input_seq_max_length]
        input_counter.update(tokens)
        if len(tokens) > max_input_seq_length:
            max_input_seq_length = len(tokens)

    # Tally target tokens with START/END markers, truncated to the target cap.
    for summary in Y:
        tokens = ('START ' + summary.lower() + ' END').split(' ')
        if len(tokens) > target_seq_max_length:
            tokens = tokens[:target_seq_max_length]
        target_counter.update(tokens)
        if len(tokens) > max_target_seq_length:
            max_target_seq_length = len(tokens)

    # Frequent input words get indices starting at 2; PAD/UNK claim 0/1.
    input_word2idx = dict()
    for rank, (w, _) in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE)):
        input_word2idx[w] = rank + 2
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {i: w for w, i in input_word2idx.items()}

    # Frequent target words get indices starting at 1; UNK claims 0.
    target_word2idx = dict()
    for rank, (w, _) in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE)):
        target_word2idx[w] = rank + 1
    target_word2idx['UNK'] = 0
    target_idx2word = {i: w for w, i in target_word2idx.items()}

    config = {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }
    return config
Loading