Skip to content

Support for Python2.7 #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ To train a deep learning model, say Seq2SeqSummarizer, run the following command
```bash
pip install -r requirements.txt

cd demo
python seq2seq_train.py
python demo/seq2seq_train.py
```

The training code in seq2seq_train.py is quite straightforward and illustrated below:
Expand Down
10 changes: 10 additions & 0 deletions demo/data/test1
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
An Indian-American woman has been elected as the president of the powerful students' body of the prestigious Harvard University.
Sruthi Palaniappan, 20, whose parents migrated to the US from Chennai in 1992, was elected as president of the Harvard University Undergraduate Council.
Her running mate Julia Huesa, 20, was elected as vice president, according to an announcement by the Undergraduate Council Election Commission.
Ms Palaniappan said that she and Ms Huesa planned to work on improving the Council's communication with the student body in their initial days in office.
"I think from the onset, better structuring the way we communicate with students is something that we need to already set the tone and plan for," she said.
"I think we're going to work on it before we even leave for break and just get off the ground running," she told Harvard Crimson, the student newspaper of the varsity.
Ms Palaniappan was the youngest delegate at the Democratic National Convention in Philadelphia in July 2016.
According to the report, Ms Palaniappan and Ms Huesa garnered nearly 41.5 per cent of the votes as against their nearest opponent Nadine M Khoury and Arnav Agrawal, who received 26.6 per cent of the votes.
They ran their campaign under the slogan "Make Harvard Home".
The duo, both long-time members of the Undergraduate Council, are scheduled to take over from the outgoing president Catherine L Zhang '19 and vice president Nicholas D Boucher '19.
201 changes: 201 additions & 0 deletions demo/data/wiki_v_small.csv

Large diffs are not rendered by default.

68 changes: 34 additions & 34 deletions demo/seq2seq_predict.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
from __future__ import print_function

import pandas as pd
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
import numpy as np


def main():
np.random.seed(42)
data_dir_path = './data'
model_dir_path = './models'

print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
X = df['text']
Y = df.title

config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item()

summarizer = Seq2SeqSummarizer(config)
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

print('start predicting ...')
for i in np.random.permutation(np.arange(len(X)))[0:20]:
x = X[i]
actual_headline = Y[i]
headline = summarizer.summarize(x)
# print('Article: ', x)
print('Generated Headline: ', headline)
print('Original Headline: ', actual_headline)


if __name__ == '__main__':
main()
from __future__ import print_function
import pandas as pd
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
import numpy as np
def main():
np.random.seed(42)
data_dir_path = './demo/data'
model_dir_path = './demo/models'
print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/wiki_v_small.csv") # "/fake_or_real_news.csv")
X = df['text']
Y = df['summary']
config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item()
summarizer = Seq2SeqSummarizer(config)
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))
print('start predicting ...')
for i in np.random.permutation(np.arange(len(X)))[0:20]:
x = X[i]
actual_headline = Y[i]
headline = summarizer.summarize(x)
# print('Article: ', x)
print('Generated Headline: ', headline)
print('Original Headline: ', actual_headline)
if __name__ == '__main__':
main()
96 changes: 48 additions & 48 deletions demo/seq2seq_train.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,48 @@
from __future__ import print_function

import pandas as pd
from sklearn.model_selection import train_test_split
from keras_text_summarization.library.utility.plot_utils import plot_and_save_history
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
from keras_text_summarization.library.applications.fake_news_loader import fit_text
import numpy as np

LOAD_EXISTING_WEIGHTS = False


def main():
np.random.seed(42)
data_dir_path = './data'
report_dir_path = './reports'
model_dir_path = './models'

print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

print('extract configuration from input texts ...')
Y = df.title
X = df['text']

config = fit_text(X, Y)

summarizer = Seq2SeqSummarizer(config)

if LOAD_EXISTING_WEIGHTS:
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

print('demo size: ', len(Xtrain))
print('testing size: ', len(Xtest))

print('start fitting ...')
history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
if LOAD_EXISTING_WEIGHTS:
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})


if __name__ == '__main__':
main()
from __future__ import print_function
import pandas as pd
from sklearn.model_selection import train_test_split
from keras_text_summarization.library.utility.plot_utils import plot_and_save_history
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
from keras_text_summarization.library.applications.fake_news_loader import fit_text
import numpy as np
LOAD_EXISTING_WEIGHTS = False
def main():
np.random.seed(42)
data_dir_path = './demo/data'
report_dir_path = './demo/reports'
model_dir_path = './demo/models'
print('loading csv file ...')
df = pd.read_csv(data_dir_path + "/wikihowAll.csv")
print('extract configuration from input texts ...')
Y = df['summary']
X = df['text']
config = fit_text(X, Y)
summarizer = Seq2SeqSummarizer(config)
if LOAD_EXISTING_WEIGHTS:
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
print('demo size: ', len(Xtrain))
print('testing size: ', len(Xtest))
print('start fitting ...')
history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
if LOAD_EXISTING_WEIGHTS:
history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
if __name__ == '__main__':
main()
134 changes: 67 additions & 67 deletions keras_text_summarization/library/applications/fake_news_loader.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,67 @@
from collections import Counter

MAX_INPUT_SEQ_LENGTH = 500
MAX_TARGET_SEQ_LENGTH = 50
MAX_INPUT_VOCAB_SIZE = 5000
MAX_TARGET_VOCAB_SIZE = 2000


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None):
    """Scan source texts and target summaries and build the seq2seq config.

    Parameters
    ----------
    X : iterable of str
        Source texts (articles); tokenised by splitting on single spaces
        and lowercased.
    Y : iterable of str
        Target summaries/headlines; each is wrapped as ``START ... END``
        (markers kept uppercase so they cannot collide with lowercased
        content words).
    input_seq_max_length : int, optional
        Truncation length for source token sequences
        (default ``MAX_INPUT_SEQ_LENGTH``).
    target_seq_max_length : int, optional
        Truncation length for target token sequences
        (default ``MAX_TARGET_SEQ_LENGTH``).

    Returns
    -------
    dict
        Keys: ``input_word2idx``, ``input_idx2word``, ``target_word2idx``,
        ``target_idx2word``, ``num_input_tokens``, ``num_target_tokens``,
        ``max_input_seq_length``, ``max_target_seq_length``.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH
    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    for line in X:
        # Lowercase, then truncate to the input length cap; the max length
        # recorded below is therefore post-truncation.
        text = [word.lower() for word in line.split(' ')][:input_seq_max_length]
        input_counter.update(text)
        max_input_seq_length = max(max_input_seq_length, len(text))

    for line in Y:
        # Wrap with START/END markers before truncation so END may be
        # dropped on over-long targets (same behaviour as before).
        text = ('START ' + line.lower() + ' END').split(' ')[:target_seq_max_length]
        target_counter.update(text)
        max_target_seq_length = max(max_target_seq_length, len(text))

    # Input vocabulary: indices 0/1 reserved for PAD/UNK, most frequent
    # words start at 2.  Assigning PAD/UNK after the comprehension keeps
    # the original overwrite semantics if those literals occur as words.
    input_word2idx = {word: idx + 2
                      for idx, (word, _) in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE))}
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {idx: word for word, idx in input_word2idx.items()}

    # Target vocabulary: only index 0 reserved (UNK); no PAD entry.
    target_word2idx = {word: idx + 1
                       for idx, (word, _) in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE))}
    target_word2idx['UNK'] = 0
    target_idx2word = {idx: word for word, idx in target_word2idx.items()}

    return {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }
from collections import Counter

MAX_INPUT_SEQ_LENGTH = 500
MAX_TARGET_SEQ_LENGTH = 50
MAX_INPUT_VOCAB_SIZE = 5000
MAX_TARGET_VOCAB_SIZE = 2000


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None):
    """Count vocabulary over texts (X) and summaries (Y) and return the
    configuration dict consumed by the seq2seq summarizer models.

    Targets are wrapped as ``START ... END``; input indices reserve 0/1
    for PAD/UNK, target indices reserve 0 for UNK.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH

    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    # Tally lowercased source tokens, truncated to the input cap.
    for article in X:
        tokens = [w.lower() for w in article.split(' ')]
        if len(tokens) > input_seq_max_length:
            tokens = tokens[:input_seq_max_length]
        input_counter.update(tokens)
        if len(tokens) > max_input_seq_length:
            max_input_seq_length = len(tokens)

    # Tally target tokens with START/END markers, truncated to the target cap.
    for summary in Y:
        tokens = ('START ' + summary.lower() + ' END').split(' ')
        if len(tokens) > target_seq_max_length:
            tokens = tokens[:target_seq_max_length]
        target_counter.update(tokens)
        if len(tokens) > max_target_seq_length:
            max_target_seq_length = len(tokens)

    # Frequent input words get indices starting at 2; PAD/UNK claim 0/1.
    input_word2idx = dict()
    for rank, (w, _) in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE)):
        input_word2idx[w] = rank + 2
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {i: w for w, i in input_word2idx.items()}

    # Frequent target words get indices starting at 1; UNK claims 0.
    target_word2idx = dict()
    for rank, (w, _) in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE)):
        target_word2idx[w] = rank + 1
    target_word2idx['UNK'] = 0
    target_idx2word = {i: w for w, i in target_word2idx.items()}

    config = {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }
    return config
Loading