From 3e285abf3bfe2cc21cbfbc82f69004b76ee7128e Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 20:42:55 +0100 Subject: [PATCH 01/23] Extract style tags and non dialogue words. --- srtparser.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/srtparser.h b/srtparser.h index 3a94495..bc97b7f 100644 --- a/srtparser.h +++ b/srtparser.h @@ -91,7 +91,8 @@ class SubtitleItem std::vector getSpeakerNames(); //return string vector of speaker names std::vector getNonDialogueWords(); //return string vector of non dialogue words std::vector getStyleTags(); //return string vector of style tags - + std::vector _nonDialogue; + std::vector _styleTag; void setStartTime(long int startTime); //set starting time void setEndTime(long int endTime); //set ending time @@ -393,9 +394,12 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue int countP = 0; for(char& c : output) // replacing <...> with ~~~~ { + string tag; + if(c=='<') { countP++; + tag += '<' c = '~'; } @@ -404,16 +408,22 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != '>') + tag += c; c = '~'; else if(c == '>') { + tag += '>'; c = '~'; countP--; } } } + + _styleTag.push_back(tag); } + + } //stripping non dialogue data e.g. (applause) @@ -431,8 +441,11 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue int countP = 0; for(char& c : output) // replacing (...) with ~~~~ { + string tag; + if(c=='(') { + tag += '<'; countP++; c = '~'; } @@ -442,15 +455,19 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != ')') + tag += c; c = '~'; else if(c == ')') { + tag += '>'; c = '~'; countP--; } } } + + _nonDialogue.push_back(tag); } } @@ -581,10 +598,12 @@ inline int SubtitleItem::getSpeakerCount() const } inline int SubtitleItem::getNonDialogueCount() const { + _nonDialogueCount = _nonDialogue.size() return _nonDialogueCount; } inline int SubtitleItem::getStyleTagCount() const { + _styleTagCount = _styleTag.size() return _styleTagCount; } inline int SubtitleItem::getWordCount() const @@ -654,4 +673,4 @@ inline SubtitleWord::~SubtitleWord(void) } -#endif //SRTPARSER_H \ No newline at end of file +#endif //SRTPARSER_H From 8c97741626dd005f35e5bcc114aa0c6c704f9dff Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 20:47:21 +0100 Subject: [PATCH 02/23] Update srtparser.h --- srtparser.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/srtparser.h b/srtparser.h index bc97b7f..b73c02c 100644 --- a/srtparser.h +++ b/srtparser.h @@ -383,13 +383,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue //stripping HTML tags if(!keepHTML) { - /* - * TODO : Before erasing, extract the words. - * std::vector getStyleTags(); - * int getStyleTagCount() const; - * std::vector _styleTag; - * int _styleTagCount; - */ int countP = 0; for(char& c : output) // replacing <...> with ~~~~ @@ -399,7 +392,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(c=='<') { countP++; - tag += '<' + tag += '<'; c = '~'; } @@ -598,12 +591,12 @@ inline int SubtitleItem::getSpeakerCount() const } inline int SubtitleItem::getNonDialogueCount() const { - _nonDialogueCount = _nonDialogue.size() + _nonDialogueCount = _nonDialogue.size(); return _nonDialogueCount; } inline int SubtitleItem::getStyleTagCount() const { - _styleTagCount = _styleTag.size() + _styleTagCount = _styleTag.size(); return _styleTagCount; } inline int SubtitleItem::getWordCount() const From b2e35c5ef00a15c35a759ec54ba0c420f37a3a67 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 20:47:54 +0100 Subject: [PATCH 03/23] Update srtparser.h --- srtparser.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/srtparser.h b/srtparser.h index b73c02c..c41012a 100644 --- a/srtparser.h +++ b/srtparser.h @@ -423,14 +423,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { - /* - * TODO : Before erasing, extract the words. - * std::vector getNonDialogueWords(); - * int getNonDialogueCount() const; - * std::vector _nonDialogue; - * int _nonDialogueCount; - */ - int countP = 0; for(char& c : output) // replacing (...) with ~~~~ { From 72560fd000ebdd12e3c6f705c7ef7a4c85f57e31 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 22:16:59 +0100 Subject: [PATCH 04/23] Update srtparser.h --- srtparser.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/srtparser.h b/srtparser.h index c41012a..251dbd8 100644 --- a/srtparser.h +++ b/srtparser.h @@ -383,16 +383,23 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue //stripping HTML tags if(!keepHTML) { + /* + * TODO : Before erasing, extract the words. + * std::vector getStyleTags(); + * int getStyleTagCount() const; + * std::vector _styleTag; + * int _styleTagCount; + */ int countP = 0; for(char& c : output) // replacing <...> with ~~~~ { - string tag; + std::string tag; if(c=='<') { countP++; - tag += '<'; + tag += '<' c = '~'; } @@ -423,10 +430,18 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { + /* + * TODO : Before erasing, extract the words. + * std::vector getNonDialogueWords(); + * int getNonDialogueCount() const; + * std::vector _nonDialogue; + * int _nonDialogueCount; + */ + int countP = 0; for(char& c : output) // replacing (...) with ~~~~ { - string tag; + std::string tag; if(c=='(') { @@ -583,12 +598,12 @@ inline int SubtitleItem::getSpeakerCount() const } inline int SubtitleItem::getNonDialogueCount() const { - _nonDialogueCount = _nonDialogue.size(); + _nonDialogueCount = _nonDialogue.size() return _nonDialogueCount; } inline int SubtitleItem::getStyleTagCount() const { - _styleTagCount = _styleTag.size(); + _styleTagCount = _styleTag.size() return _styleTagCount; } inline int SubtitleItem::getWordCount() const From 24fbf16e2d083842ef11ff35c72022b8f7d69dda Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 22:19:06 +0100 Subject: [PATCH 05/23] Update srtparser.h --- srtparser.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/srtparser.h b/srtparser.h index 251dbd8..a16abb9 100644 --- a/srtparser.h +++ b/srtparser.h @@ -383,13 +383,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue //stripping HTML tags if(!keepHTML) { - /* - * TODO : Before erasing, extract the words. - * std::vector getStyleTags(); - * int getStyleTagCount() const; - * std::vector _styleTag; - * int _styleTagCount; - */ int countP = 0; for(char& c : output) // replacing <...> with ~~~~ @@ -399,7 +392,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(c=='<') { countP++; - tag += '<' + tag += '<'; c = '~'; } @@ -430,14 +423,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { - /* - * TODO : Before erasing, extract the words. - * std::vector getNonDialogueWords(); - * int getNonDialogueCount() const; - * std::vector _nonDialogue; - * int _nonDialogueCount; - */ - int countP = 0; for(char& c : output) // replacing (...) with ~~~~ { From 7defad5dffc8b910c184e07c6b4ddc423e1d7def Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 22:25:56 +0100 Subject: [PATCH 06/23] tricked by a sign --- srtparser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srtparser.h b/srtparser.h index a16abb9..461a515 100644 --- a/srtparser.h +++ b/srtparser.h @@ -430,7 +430,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(c=='(') { - tag += '<'; + tag += '('; countP++; c = '~'; } From 617bde8603cf13d03df41b0a5d94b4d872f9f66b Mon Sep 17 00:00:00 2001 From: MatejMecka Date: Tue, 2 Jan 2018 22:28:40 +0100 Subject: [PATCH 07/23] fixed some ifs --- srtparser.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/srtparser.h b/srtparser.h index 461a515..6f578db 100644 --- a/srtparser.h +++ b/srtparser.h @@ -400,10 +400,10 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue { if(countP!=0) { - if(c != '>') + if(c != '>'){ tag += c; c = '~'; - + } else if(c == '>') { tag += '>'; @@ -439,10 +439,10 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue { if(countP!=0) { - if(c != ')') + if(c != ')'){ tag += c; c = '~'; - + } else if(c == ')') { tag += '>'; @@ -659,3 +659,4 @@ inline SubtitleWord::~SubtitleWord(void) #endif //SRTPARSER_H + From e1511e6e2545d20a23e5d14266a917474cfb69ce Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 22:29:49 +0100 Subject: [PATCH 08/23] Update srtparser.h --- srtparser.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/srtparser.h b/srtparser.h index 6f578db..011ef90 100644 --- a/srtparser.h +++ b/srtparser.h @@ -583,12 +583,12 @@ inline int SubtitleItem::getSpeakerCount() const } inline int SubtitleItem::getNonDialogueCount() const { - _nonDialogueCount = _nonDialogue.size() + _nonDialogueCount = _nonDialogue.size(); return _nonDialogueCount; } inline int SubtitleItem::getStyleTagCount() const { - _styleTagCount = _styleTag.size() + _styleTagCount = _styleTag.size(); return _styleTagCount; } inline int SubtitleItem::getWordCount() const From 779f07b0bcd22abf7b126c03dcd701466f91a265 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 22:31:20 +0100 Subject: [PATCH 09/23] remove empty lines --- srtparser.h | 1 - 1 file changed, 1 deletion(-) diff --git a/srtparser.h b/srtparser.h index 011ef90..3d7ba7d 100644 --- a/srtparser.h +++ b/srtparser.h @@ -659,4 +659,3 @@ inline SubtitleWord::~SubtitleWord(void) #endif //SRTPARSER_H - From 80b60bcd4ae45d9a2d943dad3f57b4670581684f Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 23:32:57 +0100 Subject: [PATCH 10/23] increment variable From 7fde062459b404aa02b33fd027fbd462f77f20df Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 2 Jan 2018 23:33:53 +0100 Subject: [PATCH 11/23] Update srtparser.h --- srtparser.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/srtparser.h b/srtparser.h index 3d7ba7d..1214aac 100644 --- a/srtparser.h +++ b/srtparser.h @@ -583,12 +583,10 @@ inline int SubtitleItem::getSpeakerCount() const } inline int SubtitleItem::getNonDialogueCount() const { - _nonDialogueCount = _nonDialogue.size(); return _nonDialogueCount; } inline int SubtitleItem::getStyleTagCount() const { - _styleTagCount = _styleTag.size(); return _styleTagCount; } inline int SubtitleItem::getWordCount() const From fdc92120dfd26837cb27d5f0e8fb009b2c6cf025 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Sun, 14 Jan 2018 15:51:10 +0700 Subject: [PATCH 12/23] Update srtparser.h --- srtparser.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/srtparser.h b/srtparser.h index 1214aac..6970b4e 100644 --- a/srtparser.h +++ b/srtparser.h @@ -91,8 +91,6 @@ class SubtitleItem std::vector getSpeakerNames(); //return string vector of speaker names std::vector getNonDialogueWords(); //return string vector of non dialogue words std::vector getStyleTags(); //return string vector of style tags - std::vector _nonDialogue; - std::vector _styleTag; void setStartTime(long int startTime); //set starting time void setEndTime(long int endTime); //set ending time @@ -414,8 +412,11 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } _styleTag.push_back(tag); + + } + _styleTagCount++; } @@ -453,7 +454,10 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } _nonDialogue.push_back(tag); + } + + _nonDialogueCount++; } output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~ From c9ce550d5b5a9ecd1b6fe4a71cc6f56d0b30a400 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Sun, 14 Jan 2018 16:00:12 +0700 Subject: [PATCH 13/23] Update srtparser.h --- srtparser.h | 662 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 662 insertions(+) diff --git a/srtparser.h b/srtparser.h index 6970b4e..6d01259 100644 --- a/srtparser.h +++ b/srtparser.h @@ -1,4 +1,12 @@ /* + * Author : Saurabh Shrivastava + * Email : saurabh.shrivastava54@gmail.com + * Link : https://github.com/saurabhshri + * + * Based on subtitle-parser by Oleksii Maryshchenko. + * Email : young_developer@mail.ru + * Link : https://github.com/young-developer/subtitle-parser + *//* * Author : Saurabh Shrivastava * Email : saurabh.shrivastava54@gmail.com * Link : https://github.com/saurabhshri @@ -374,6 +382,660 @@ inline bool SubtitleItem::getIgnoreStatus() const } +inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) //process subtitle +{ + std::string output = _text; + + //stripping HTML tags + if(!keepHTML) + { + + int countP = 0; + for(char& c : output) // replacing <...> with ~~~~ + { + std::string tag; + + if(c=='<') + { + countP++; + tag += '<'; + c = '~'; + } + + else + { + if(countP!=0) + { + if(c != '>'){ + tag += c; + c = '~'; + } + else if(c == '>') + { + tag += '>'; + c = '~'; + countP--; + _styleTagCount++; + + } + } + } + + _styleTag.push_back(tag); + + } + + } + + //stripping non dialogue data e.g. (applause) + + if(!doNotIgnoreNonDialogues) + { + int countP = 0; + for(char& c : output) // replacing (...) with ~~~~ + { + std::string tag; + + if(c=='(') + { + tag += '('; + countP++; + c = '~'; + } + + else + { + if(countP!=0) + { + if(c != ')'){ + tag += c; + c = '~'; + } + else if(c == ')') + { + tag += ')'; + c = '~'; + countP--; + _nonDialogueCount++; + } + } + } + + _nonDialogue.push_back(tag); + + } + + } + + output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~ + + //Extracting speaker names + if(!doNotRemoveSpeakerNames) + { + for(int i=0; output[i]!='\0';i++) + { + int colonIndex = 0, nameBeginIndex = 0; + if(output[i]==':') //speaker found; travel back + { + _speakerCount++; + colonIndex = i; + + int tempIndex = 0, foundEvilColon = 0, continueFlag = 0, spaceBeforeColon = 0; + + if(output[i-1] == ' ') + spaceBeforeColon = 2; + + /* + Possible Cases : + + Elon Musk: Hey Saurabh, you are pretty smart. // First and Last Name + Saurabh: *_* What? Elon Musk: Yes! // Two names in single line + Saurabh : OMG OMG! // Space before colon + Elon: LOL World: LAMAO + Saurabh: ._. // normal + + */ + + for(int j=i - spaceBeforeColon; j>=0;j--) + { + if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || output[j] == '\n' + || output[j] == ' ' || j== 0) + { + + if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || j == 0) + { + if((continueFlag && j == 0)) + { + if(!isupper(output[j])) + { + nameBeginIndex = tempIndex; + break; + } + + else + tempIndex = j; + + } + + else if(j!=0) + tempIndex = j + 1; + } + + else if(output[j] == ' ' && isupper(output[j+1])) + { + tempIndex = j; + continueFlag = 1; + + continue; + } + + else if(output[j] == ' ' && !isupper(output[j+1] && tempIndex == 0)) + { + _speakerCount--; + foundEvilColon = 1; + break; + } + + nameBeginIndex = tempIndex; + break; + } + } + + if(foundEvilColon) + continue; + + i = nameBeginIndex; //compensating the removal and changes in index + + //check if there's a space after colon i.e. A: Hello vs A:Hello + int removeSpace = 0; + if(output[colonIndex + 1]==' ') + removeSpace = 1; + + _speaker.push_back(output.substr(nameBeginIndex, colonIndex - nameBeginIndex)); + output.erase(nameBeginIndex, colonIndex - nameBeginIndex + removeSpace); + } + + } + + } + + // removing more than one whitespaces with one space + unique_copy (output.begin(), output.end(), std::back_insert_iterator(_justDialogue), + [](char a,char b) + { + return isspace(a) && isspace(b); + }); + + // trimming whitespaces + const char* whiteSpaces = " \t\n\r\f\v"; + _justDialogue.erase(0, _justDialogue.find_first_not_of(whiteSpaces)); + _justDialogue.erase(_justDialogue.find_last_not_of(whiteSpaces) + 1); + + if(_justDialogue.empty() || _justDialogue == " ") + _ignore = true; + + else + { + _word = split(_justDialogue, ' ', _word); //extracting individual words + _wordCount = _word.size(); + } +} + +inline std::string SubtitleItem::getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) +{ + if(_justDialogue.empty()) + extractInfo(keepHTML, doNotIgnoreNonDialogues, doNotRemoveSpeakerNames); + + return _justDialogue; +} +inline int SubtitleItem::getSpeakerCount() const +{ + return _speakerCount; +} +inline int SubtitleItem::getNonDialogueCount() const +{ + return _nonDialogueCount; +} +inline int SubtitleItem::getStyleTagCount() const +{ + return _styleTagCount; +} +inline int SubtitleItem::getWordCount() const +{ + return _wordCount; +} +inline std::vector SubtitleItem::getSpeakerNames() +{ + return _speaker; +} +inline std::vector SubtitleItem::getNonDialogueWords() +{ + return _nonDialogue; +} +inline std::vector SubtitleItem::getIndividualWords() +{ + return _word; +} +inline std::string SubtitleItem::getWordByIndex(int index) +{ + return _word[index]; +} +inline std::vector SubtitleItem::getWordStartTimes() +{ + return _wordStartTime; +} +inline std::vector SubtitleItem::getWordEndTimes() +{ + return _wordEndTime; +} +inline long int SubtitleItem::getWordStartTimeByIndex(int index) +{ + return _wordStartTime[index]; +} +inline long int SubtitleItem::getWordEndTimeByIndex(int index) +{ + return _wordEndTime[index]; +} +inline std::vector SubtitleItem::getStyleTags() +{ + return _styleTag; +} +inline SubtitleItem::~SubtitleItem(void) +{ + +} + +//5. SubtitleWordclass + +inline SubtitleWord::SubtitleWord(void) +{ + _text = ""; +} + +inline SubtitleWord::SubtitleWord(std::string text) +{ + _text = text; +} + +inline std::string SubtitleWord::getText() const +{ + return _text; +} + +inline SubtitleWord::~SubtitleWord(void) +{ +} + + +#endif //SRTPARSER_H + + +#ifndef SRTPARSER_H +#define SRTPARSER_H + +#include +#include +#include +#include +#include +#include + +//function for splitting sentences based on supplied delimiter +inline std::vector &split(const std::string &s, char delim, std::vector &elems) { + std::stringstream ss(s); + std::string item; + + while (getline(ss, item, delim)) { + elems.push_back(item); + } + return elems; +} + +/**** Class definitions ****/ + + +class SubtitleWord +{ +private: + std::string _text; +public: + SubtitleWord(void); + SubtitleWord(std::string text); + virtual std::string getText() const; + ~SubtitleWord(void); +}; + +class SubtitleItem +{ +private: + long int _startTime; //in milliseconds + long int _endTime; + std::string _text; //actual line, as present in subtitle file + long int timeMSec(std::string value); //converts time string into ms + + int _subNo; //subtitle number + std::string _startTimeString; //time as in srt format + std::string _endTimeString; + bool _ignore; //should subtitle be ignore; used when the subtitle is empty after processing + std::string _justDialogue; //contains processed subtitle - stripped style, non dialogue text removal etc. + int _speakerCount; //count of number of speakers + std::vector _speaker; //list of speakers in a single subtitle + int _nonDialogueCount; //count of non spoken words in a subtitle + std::vector _nonDialogue; //list of non dialogue words, e.g. (applause) + int _wordCount; //number of words in _justDialogue + std::vector _word; //list of words in dialogue + std::vector _wordStartTime; //start time of each word in dialogue + std::vector _wordEndTime; //end time of each word in dialogue + std::vector _wordDuration; //actual duration of each word without silence + int _styleTagCount; //count of style tags in a single subtitle + std::vector _styleTag; //list of style tags in that subtitle + void extractInfo(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //process subtitle +public: + long int getStartTime() const; //returns starting time in ms + long int getEndTime() const; //returns ending time in ms + std::string getText() const; //returns subtitle text as present in .srt file + + int getSubNo() const; //returns subtitle number + std::string getStartTimeString() const; //returns sarting time as present in .srt file + std::string getEndTimeString() const; //returns ending time as present in .srt file + bool getIgnoreStatus() const; //returns status, whether the subtitle is ignorable or not after processing + std::string getDialogue(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //returns processed subtitle + int getSpeakerCount() const; //return speaker count + int getNonDialogueCount() const; //return non dialogue words count + int getStyleTagCount() const; //return style tags count + int getWordCount() const; //return words count + std::vector getIndividualWords(); //return string vector of individual words + std::string getWordByIndex(int index); //return word stored at 'index' + std::vector getWordStartTimes(); //return long int vector of start time of individual words + std::vector getWordEndTimes(); //return long int vector of end time of individual words + long int getWordStartTimeByIndex(int index); //return the start time of a word based on index + long int getWordEndTimeByIndex (int index); //return the end time of a word based on index + std::vector getSpeakerNames(); //return string vector of speaker names + std::vector getNonDialogueWords(); //return string vector of non dialogue words + std::vector getStyleTags(); //return string vector of style tags + + void setStartTime(long int startTime); //set starting time + void setEndTime(long int endTime); //set ending time + void setText(std::string text); //set subtitle text + void setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration); //assign time to individual words + + SubtitleItem(void); + SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore = false, + std::string justDialogue = "" , int speakerCount = 0, int nonDialogueCount = 0, + int styleTagCount = 0, int wordCount = 0, std::vector speaker = std::vector(), + std::vector nonDialogue = std::vector(), + std::vector styleTags = std::vector(), + std::vector word = std::vector()); //default constructor + ~SubtitleItem(void); +}; + +class SubtitleParser +{ +protected: + std::vector _subtitles; //stores subtitles + std::string _fileName; //supplied filename + virtual void parse(std::string fileName) = 0; +public: + virtual std::vector getSubtitles(); //returns subtitles + std::string getFileData(); + SubtitleParser(void); + virtual ~SubtitleParser(void); +}; + +class SubtitleParserFactory +{ +private: + std::string _fileName; +public: + SubtitleParser* getParser(); + SubtitleParserFactory(std::string fileName); + ~SubtitleParserFactory(void); +}; + +class SubRipParser : public SubtitleParser +{ + void parse(std::string fileName); +public: + SubRipParser(void); + SubRipParser(std::string fileName); + ~SubRipParser(void); +}; + + +/**** Function definitions ****/ + +//1. SubtitleParserFactory class + +inline SubtitleParserFactory::SubtitleParserFactory(std::string fileName) +{ + _fileName = fileName; +} + +inline SubtitleParser* SubtitleParserFactory::getParser() +{ + return new SubRipParser(_fileName); //creates and returns SubRipParser obj +} + +inline SubtitleParserFactory::~SubtitleParserFactory(void) +{ +} + +//2. SubtitleParser class + +inline std::vector SubtitleParser::getSubtitles() +{ + return _subtitles; +} + +inline std::string SubtitleParser::getFileData() //returns whole read file i.e. contents of input.srt +{ + std::ifstream infile(_fileName); + std::string allData = ""; + std::string line; + while (std::getline(infile, line)) + { + std::istringstream iss(line); + allData += line + "\n"; + } + return allData; + +} + +inline SubtitleParser::SubtitleParser(void) +{ + +} + +inline SubtitleParser::~SubtitleParser(void) +{ +} + +//3. SubRipParser class + +inline SubRipParser::SubRipParser(void) +{ +} + +inline void SubRipParser::parse(std::string fileName) //srt parser +{ + + std::ifstream infile(fileName); + std::string line, start, end, completeLine = "", timeLine = ""; + int subNo, turn = 0; + + /* + * turn = 0 -> Add subtitle number + * turn = 1 -> Add string to timeLine + * turn > 1 -> Add string to completeLine + */ + + while (std::getline(infile, line)) + { + line.erase(remove(line.begin(), line.end(), '\r'), line.end()); + + if (line.compare("")) + { + if(!turn) + { + subNo=atoi(line.c_str()); + turn++; + continue; + } + + if (line.find("-->") != std::string::npos) + { + timeLine += line; + + std::vector srtTime; + srtTime = split(timeLine, ' ', srtTime); + start = srtTime[0]; + end = srtTime[2]; + + } + else + { + if (completeLine != "") + completeLine += " "; + + completeLine += line; + } + + turn++; + } + + else + { + turn = 0; + _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); + completeLine = timeLine = ""; + } + + if(infile.eof()) //insert last remaining subtitle + { + _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); + } + } +} + +inline SubRipParser::SubRipParser(std::string fileName) +{ + _fileName = fileName; + parse(fileName); +} + +inline SubRipParser::~SubRipParser(void) +{ + for(int i=0;i != _subtitles.size();++i) + { + if(_subtitles[i]) + delete _subtitles[i]; + } +} + +//4. SubtitleItem class + +inline SubtitleItem::SubtitleItem(void) +{ +} + +inline SubtitleItem::SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore, + std::string justDialogue, int speakerCount, int nonDialogueCount, + int styleTagCount, int wordCount, std::vector speaker, std::vector nonDialogue, + std::vector styleTags, std::vector word) +{ + _startTime = timeMSec(startTime); + _endTime = timeMSec(endTime); + _text = text; + + _subNo = subNo; + _startTimeString = startTime; + _endTimeString = endTime; + _ignore = ignore; + _justDialogue = justDialogue; + _speakerCount = speakerCount; + _nonDialogueCount = nonDialogueCount; + _wordCount = wordCount; + _speaker = speaker; + _styleTagCount = styleTagCount; + _styleTag = styleTags; + _nonDialogue = nonDialogue; + _word = word; + + extractInfo(); +} + +inline long int SubtitleItem::timeMSec(std::string value) +{ + std::vector t, secs; + int hours, mins, seconds, milliseconds; + + t = split(value, ':', t); + hours = atoi(t[0].c_str()); + mins = atoi(t[1].c_str()); + + secs = split(t[2], ',', secs); + seconds = atoi(secs[0].c_str()); + milliseconds = atoi(secs[1].c_str()); + + return hours * 3600000 + mins * 60000 + seconds * 1000 + milliseconds; +} + +inline long int SubtitleItem::getStartTime() const +{ + return _startTime; +} +inline long int SubtitleItem::getEndTime() const +{ + return _endTime; +} + +inline std::string SubtitleItem::getText() const +{ + return _text; +} + +inline void SubtitleItem::setStartTime(long int startTime) +{ + _startTime = startTime; +} +inline void SubtitleItem::setEndTime(long int endTime) +{ + _endTime = endTime; +} +inline void SubtitleItem::setText(std::string text) +{ + _text = text; +} +inline void SubtitleItem::setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration) +{ + _wordStartTime = wordStartTime; + _wordEndTime = wordEndTime; + _wordDuration = wordDuration; +} +inline int SubtitleItem::getSubNo() const +{ + return _subNo; +} +inline std::string SubtitleItem::getStartTimeString() const +{ + return _startTimeString; +} + +inline std::string SubtitleItem::getEndTimeString() const +{ + return _endTimeString; +} + +inline bool SubtitleItem::getIgnoreStatus() const +{ + if(_ignore) + return true; + + else + return false; + +} + inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) //process subtitle { std::string output = _text; From 796ea56e7b6e08351dd0f73aeabf28ad2ab68fad Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Sun, 14 Jan 2018 16:00:51 +0700 Subject: [PATCH 14/23] Update srtparser.h --- srtparser.h | 663 ---------------------------------------------------- 1 file changed, 663 deletions(-) diff --git a/srtparser.h b/srtparser.h index 6d01259..7a06d98 100644 --- a/srtparser.h +++ b/srtparser.h @@ -1,12 +1,4 @@ /* - * Author : Saurabh Shrivastava - * Email : saurabh.shrivastava54@gmail.com - * Link : https://github.com/saurabhshri - * - * Based on subtitle-parser by Oleksii Maryshchenko. - * Email : young_developer@mail.ru - * Link : https://github.com/young-developer/subtitle-parser - *//* * Author : Saurabh Shrivastava * Email : saurabh.shrivastava54@gmail.com * Link : https://github.com/saurabhshri @@ -667,659 +659,4 @@ inline SubtitleWord::~SubtitleWord(void) } -#endif //SRTPARSER_H - - -#ifndef SRTPARSER_H -#define SRTPARSER_H - -#include -#include -#include -#include -#include -#include - -//function for splitting sentences based on supplied delimiter -inline std::vector &split(const std::string &s, char delim, std::vector &elems) { - std::stringstream ss(s); - std::string item; - - while (getline(ss, item, delim)) { - elems.push_back(item); - } - return elems; -} - -/**** Class definitions ****/ - - -class SubtitleWord -{ -private: - std::string _text; -public: - SubtitleWord(void); - SubtitleWord(std::string text); - virtual std::string getText() const; - ~SubtitleWord(void); -}; - -class SubtitleItem -{ -private: - long int _startTime; //in milliseconds - long int _endTime; - std::string _text; //actual line, as present in subtitle file - long int timeMSec(std::string value); //converts time string into ms - - int _subNo; //subtitle number - std::string _startTimeString; //time as in srt format - std::string _endTimeString; - bool _ignore; //should subtitle be ignore; used when the subtitle is empty after processing - std::string _justDialogue; //contains processed subtitle - stripped style, non dialogue text removal etc. - int _speakerCount; //count of number of speakers - std::vector _speaker; //list of speakers in a single subtitle - int _nonDialogueCount; //count of non spoken words in a subtitle - std::vector _nonDialogue; //list of non dialogue words, e.g. (applause) - int _wordCount; //number of words in _justDialogue - std::vector _word; //list of words in dialogue - std::vector _wordStartTime; //start time of each word in dialogue - std::vector _wordEndTime; //end time of each word in dialogue - std::vector _wordDuration; //actual duration of each word without silence - int _styleTagCount; //count of style tags in a single subtitle - std::vector _styleTag; //list of style tags in that subtitle - void extractInfo(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //process subtitle -public: - long int getStartTime() const; //returns starting time in ms - long int getEndTime() const; //returns ending time in ms - std::string getText() const; //returns subtitle text as present in .srt file - - int getSubNo() const; //returns subtitle number - std::string getStartTimeString() const; //returns sarting time as present in .srt file - std::string getEndTimeString() const; //returns ending time as present in .srt file - bool getIgnoreStatus() const; //returns status, whether the subtitle is ignorable or not after processing - std::string getDialogue(bool keepHTML = 0, bool doNotIgnoreNonDialogues = 0, bool doNotRemoveSpeakerNames = 0); //returns processed subtitle - int getSpeakerCount() const; //return speaker count - int getNonDialogueCount() const; //return non dialogue words count - int getStyleTagCount() const; //return style tags count - int getWordCount() const; //return words count - std::vector getIndividualWords(); //return string vector of individual words - std::string getWordByIndex(int index); //return word stored at 'index' - std::vector getWordStartTimes(); //return long int vector of start time of individual words - std::vector getWordEndTimes(); //return long int vector of end time of individual words - long int getWordStartTimeByIndex(int index); //return the start time of a word based on index - long int getWordEndTimeByIndex (int index); //return the end time of a word based on index - std::vector getSpeakerNames(); //return string vector of speaker names - std::vector getNonDialogueWords(); //return string vector of non dialogue words - std::vector getStyleTags(); //return string vector of style tags - - void setStartTime(long int startTime); //set starting time - void setEndTime(long int endTime); //set ending time - void setText(std::string text); //set subtitle text - void setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration); //assign time to individual words - - SubtitleItem(void); - SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore = false, - std::string justDialogue = "" , int speakerCount = 0, int nonDialogueCount = 0, - int styleTagCount = 0, int wordCount = 0, std::vector speaker = std::vector(), - std::vector nonDialogue = std::vector(), - std::vector styleTags = std::vector(), - std::vector word = std::vector()); //default constructor - ~SubtitleItem(void); -}; - -class SubtitleParser -{ -protected: - std::vector _subtitles; //stores subtitles - std::string _fileName; //supplied filename - virtual void parse(std::string fileName) = 0; -public: - virtual std::vector getSubtitles(); //returns subtitles - std::string getFileData(); - SubtitleParser(void); - virtual ~SubtitleParser(void); -}; - -class SubtitleParserFactory -{ -private: - std::string _fileName; -public: - SubtitleParser* getParser(); - SubtitleParserFactory(std::string fileName); - ~SubtitleParserFactory(void); -}; - -class SubRipParser : public SubtitleParser -{ - void parse(std::string fileName); -public: - SubRipParser(void); - SubRipParser(std::string fileName); - ~SubRipParser(void); -}; - - -/**** Function definitions ****/ - -//1. SubtitleParserFactory class - -inline SubtitleParserFactory::SubtitleParserFactory(std::string fileName) -{ - _fileName = fileName; -} - -inline SubtitleParser* SubtitleParserFactory::getParser() -{ - return new SubRipParser(_fileName); //creates and returns SubRipParser obj -} - -inline SubtitleParserFactory::~SubtitleParserFactory(void) -{ -} - -//2. SubtitleParser class - -inline std::vector SubtitleParser::getSubtitles() -{ - return _subtitles; -} - -inline std::string SubtitleParser::getFileData() //returns whole read file i.e. contents of input.srt -{ - std::ifstream infile(_fileName); - std::string allData = ""; - std::string line; - while (std::getline(infile, line)) - { - std::istringstream iss(line); - allData += line + "\n"; - } - return allData; - -} - -inline SubtitleParser::SubtitleParser(void) -{ - -} - -inline SubtitleParser::~SubtitleParser(void) -{ -} - -//3. SubRipParser class - -inline SubRipParser::SubRipParser(void) -{ -} - -inline void SubRipParser::parse(std::string fileName) //srt parser -{ - - std::ifstream infile(fileName); - std::string line, start, end, completeLine = "", timeLine = ""; - int subNo, turn = 0; - - /* - * turn = 0 -> Add subtitle number - * turn = 1 -> Add string to timeLine - * turn > 1 -> Add string to completeLine - */ - - while (std::getline(infile, line)) - { - line.erase(remove(line.begin(), line.end(), '\r'), line.end()); - - if (line.compare("")) - { - if(!turn) - { - subNo=atoi(line.c_str()); - turn++; - continue; - } - - if (line.find("-->") != std::string::npos) - { - timeLine += line; - - std::vector srtTime; - srtTime = split(timeLine, ' ', srtTime); - start = srtTime[0]; - end = srtTime[2]; - - } - else - { - if (completeLine != "") - completeLine += " "; - - completeLine += line; - } - - turn++; - } - - else - { - turn = 0; - _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); - completeLine = timeLine = ""; - } - - if(infile.eof()) //insert last remaining subtitle - { - _subtitles.push_back(new SubtitleItem(subNo,start,end,completeLine)); - } - } -} - -inline SubRipParser::SubRipParser(std::string fileName) -{ - _fileName = fileName; - parse(fileName); -} - -inline SubRipParser::~SubRipParser(void) -{ - for(int i=0;i != _subtitles.size();++i) - { - if(_subtitles[i]) - delete _subtitles[i]; - } -} - -//4. SubtitleItem class - -inline SubtitleItem::SubtitleItem(void) -{ -} - -inline SubtitleItem::SubtitleItem(int subNo, std::string startTime,std::string endTime, std::string text, bool ignore, - std::string justDialogue, int speakerCount, int nonDialogueCount, - int styleTagCount, int wordCount, std::vector speaker, std::vector nonDialogue, - std::vector styleTags, std::vector word) -{ - _startTime = timeMSec(startTime); - _endTime = timeMSec(endTime); - _text = text; - - _subNo = subNo; - _startTimeString = startTime; - _endTimeString = endTime; - _ignore = ignore; - _justDialogue = justDialogue; - _speakerCount = speakerCount; - _nonDialogueCount = nonDialogueCount; - _wordCount = wordCount; - _speaker = speaker; - _styleTagCount = styleTagCount; - _styleTag = styleTags; - _nonDialogue = nonDialogue; - _word = word; - - extractInfo(); -} - -inline long int SubtitleItem::timeMSec(std::string value) -{ - std::vector t, secs; - int hours, mins, seconds, milliseconds; - - t = split(value, ':', t); - hours = atoi(t[0].c_str()); - mins = atoi(t[1].c_str()); - - secs = split(t[2], ',', secs); - seconds = atoi(secs[0].c_str()); - milliseconds = atoi(secs[1].c_str()); - - return hours * 3600000 + mins * 60000 + seconds * 1000 + milliseconds; -} - -inline long int SubtitleItem::getStartTime() const -{ - return _startTime; -} -inline long int SubtitleItem::getEndTime() const -{ - return _endTime; -} - -inline std::string SubtitleItem::getText() const -{ - return _text; -} - -inline void SubtitleItem::setStartTime(long int startTime) -{ - _startTime = startTime; -} -inline void SubtitleItem::setEndTime(long int endTime) -{ - _endTime = endTime; -} -inline void SubtitleItem::setText(std::string text) -{ - _text = text; -} -inline void SubtitleItem::setWordTimes(std::vector wordStartTime, std::vector wordEndTime, std::vector wordDuration) -{ - _wordStartTime = wordStartTime; - _wordEndTime = wordEndTime; - _wordDuration = wordDuration; -} -inline int SubtitleItem::getSubNo() const -{ - return _subNo; -} -inline std::string SubtitleItem::getStartTimeString() const -{ - return _startTimeString; -} - -inline std::string SubtitleItem::getEndTimeString() const -{ - return _endTimeString; -} - -inline bool SubtitleItem::getIgnoreStatus() const -{ - if(_ignore) - return true; - - else - return false; - -} - -inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) //process subtitle -{ - std::string output = _text; - - //stripping HTML tags - if(!keepHTML) - { - - int countP = 0; - for(char& c : output) // replacing <...> with ~~~~ - { - std::string tag; - - if(c=='<') - { - countP++; - tag += '<'; - c = '~'; - } - - else - { - if(countP!=0) - { - if(c != '>'){ - tag += c; - c = '~'; - } - else if(c == '>') - { - tag += '>'; - c = '~'; - countP--; - } - } - } - - _styleTag.push_back(tag); - - - } - - _styleTagCount++; - - } - - //stripping non dialogue data e.g. (applause) - - if(!doNotIgnoreNonDialogues) - { - int countP = 0; - for(char& c : output) // replacing (...) with ~~~~ - { - std::string tag; - - if(c=='(') - { - tag += '('; - countP++; - c = '~'; - } - - else - { - if(countP!=0) - { - if(c != ')'){ - tag += c; - c = '~'; - } - else if(c == ')') - { - tag += '>'; - c = '~'; - countP--; - } - } - } - - _nonDialogue.push_back(tag); - - } - - _nonDialogueCount++; - } - - output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~ - - //Extracting speaker names - if(!doNotRemoveSpeakerNames) - { - for(int i=0; output[i]!='\0';i++) - { - int colonIndex = 0, nameBeginIndex = 0; - if(output[i]==':') //speaker found; travel back - { - _speakerCount++; - colonIndex = i; - - int tempIndex = 0, foundEvilColon = 0, continueFlag = 0, spaceBeforeColon = 0; - - if(output[i-1] == ' ') - spaceBeforeColon = 2; - - /* - Possible Cases : - - Elon Musk: Hey Saurabh, you are pretty smart. // First and Last Name - Saurabh: *_* What? Elon Musk: Yes! // Two names in single line - Saurabh : OMG OMG! // Space before colon - Elon: LOL World: LAMAO - Saurabh: ._. // normal - - */ - - for(int j=i - spaceBeforeColon; j>=0;j--) - { - if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || output[j] == '\n' - || output[j] == ' ' || j== 0) - { - - if(output[j] == '.' || output[j] == '!' || output[j] == ',' || output[j] == '?' || j == 0) - { - if((continueFlag && j == 0)) - { - if(!isupper(output[j])) - { - nameBeginIndex = tempIndex; - break; - } - - else - tempIndex = j; - - } - - else if(j!=0) - tempIndex = j + 1; - } - - else if(output[j] == ' ' && isupper(output[j+1])) - { - tempIndex = j; - continueFlag = 1; - - continue; - } - - else if(output[j] == ' ' && !isupper(output[j+1] && tempIndex == 0)) - { - _speakerCount--; - foundEvilColon = 1; - break; - } - - nameBeginIndex = tempIndex; - break; - } - } - - if(foundEvilColon) - continue; - - i = nameBeginIndex; //compensating the removal and changes in index - - //check if there's a space after colon i.e. A: Hello vs A:Hello - int removeSpace = 0; - if(output[colonIndex + 1]==' ') - removeSpace = 1; - - _speaker.push_back(output.substr(nameBeginIndex, colonIndex - nameBeginIndex)); - output.erase(nameBeginIndex, colonIndex - nameBeginIndex + removeSpace); - } - - } - - } - - // removing more than one whitespaces with one space - unique_copy (output.begin(), output.end(), std::back_insert_iterator(_justDialogue), - [](char a,char b) - { - return isspace(a) && isspace(b); - }); - - // trimming whitespaces - const char* whiteSpaces = " \t\n\r\f\v"; - _justDialogue.erase(0, _justDialogue.find_first_not_of(whiteSpaces)); - _justDialogue.erase(_justDialogue.find_last_not_of(whiteSpaces) + 1); - - if(_justDialogue.empty() || _justDialogue == " ") - _ignore = true; - - else - { - _word = split(_justDialogue, ' ', _word); //extracting individual words - _wordCount = _word.size(); - } -} - -inline std::string SubtitleItem::getDialogue(bool keepHTML, bool doNotIgnoreNonDialogues, bool doNotRemoveSpeakerNames) -{ - if(_justDialogue.empty()) - extractInfo(keepHTML, doNotIgnoreNonDialogues, doNotRemoveSpeakerNames); - - return _justDialogue; -} -inline int SubtitleItem::getSpeakerCount() const -{ - return _speakerCount; -} -inline int SubtitleItem::getNonDialogueCount() const -{ - return _nonDialogueCount; -} -inline int SubtitleItem::getStyleTagCount() const -{ - return _styleTagCount; -} -inline int SubtitleItem::getWordCount() const -{ - return _wordCount; -} -inline std::vector SubtitleItem::getSpeakerNames() -{ - return _speaker; -} -inline std::vector SubtitleItem::getNonDialogueWords() -{ - return _nonDialogue; -} -inline std::vector SubtitleItem::getIndividualWords() -{ - return _word; -} -inline std::string SubtitleItem::getWordByIndex(int index) -{ - return _word[index]; -} -inline std::vector SubtitleItem::getWordStartTimes() -{ - return _wordStartTime; -} -inline std::vector SubtitleItem::getWordEndTimes() -{ - return _wordEndTime; -} -inline long int SubtitleItem::getWordStartTimeByIndex(int index) -{ - return _wordStartTime[index]; -} -inline long int SubtitleItem::getWordEndTimeByIndex(int index) -{ - return _wordEndTime[index]; -} -inline std::vector SubtitleItem::getStyleTags() -{ - return _styleTag; -} -inline SubtitleItem::~SubtitleItem(void) -{ - -} - -//5. SubtitleWordclass - -inline SubtitleWord::SubtitleWord(void) -{ - _text = ""; -} - -inline SubtitleWord::SubtitleWord(std::string text) -{ - _text = text; -} - -inline std::string SubtitleWord::getText() const -{ - return _text; -} - -inline SubtitleWord::~SubtitleWord(void) -{ -} - - #endif //SRTPARSER_H From 4241a6876442e6f6b1ffed0e1c79c3e538a4f476 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Sun, 14 Jan 2018 22:39:16 +0700 Subject: [PATCH 15/23] don't merge --- srtparser.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/srtparser.h b/srtparser.h index 7a06d98..94d0a55 100644 --- a/srtparser.h +++ b/srtparser.h @@ -390,7 +390,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(c=='<') { countP++; - tag += '<'; c = '~'; } @@ -404,7 +403,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } else if(c == '>') { - tag += '>'; c = '~'; countP--; _styleTagCount++; @@ -412,7 +410,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } } } - + tag.erase(0,1); // to fix _styleTag.push_back(tag); } @@ -430,7 +428,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(c=='(') { - tag += '('; countP++; c = '~'; } @@ -445,7 +442,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } else if(c == ')') { - tag += ')'; c = '~'; countP--; _nonDialogueCount++; From 8c6726628f621c2f98de94e031e1b0b6fccc46e8 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Sun, 14 Jan 2018 23:13:31 +0700 Subject: [PATCH 16/23] Update srtparser.h --- srtparser.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/srtparser.h b/srtparser.h index 94d0a55..54c666d 100644 --- a/srtparser.h +++ b/srtparser.h @@ -398,8 +398,13 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != '>'){ - tag += c; - c = '~'; + if(c == '/'){ + ; + } + else{ + tag += c; + c = '~'; + } } else if(c == '>') { @@ -410,7 +415,6 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } } } - tag.erase(0,1); // to fix _styleTag.push_back(tag); } From d89de381ef9ab4e6195287860e88deb7e950d2ae Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 11:05:04 +0700 Subject: [PATCH 17/23] Update srtparser.h --- srtparser.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/srtparser.h b/srtparser.h index 54c666d..ddd54c6 100644 --- a/srtparser.h +++ b/srtparser.h @@ -383,9 +383,9 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue { int countP = 0; + std::string tag; for(char& c : output) // replacing <...> with ~~~~ { - std::string tag; if(c=='<') { @@ -398,24 +398,22 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != '>'){ - if(c == '/'){ - ; - } - else{ tag += c; c = '~'; - } } else if(c == '>') { c = '~'; countP--; _styleTagCount++; - + if(tag[0] == '/'){ + tag.erase(0,1); + } + _nonDialogue.push_back(tag); + tag=""; } } } - _styleTag.push_back(tag); } @@ -426,9 +424,9 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { int countP = 0; + std::string tag; for(char& c : output) // replacing (...) with ~~~~ { - std::string tag; if(c=='(') { @@ -441,7 +439,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != ')'){ - tag += c; + tag.push_back(c); c = '~'; } else if(c == ')') @@ -449,14 +447,12 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue c = '~'; countP--; _nonDialogueCount++; + _nonDialogue.push_back(tag); + tag=""; } } } - - _nonDialogue.push_back(tag); - } - } output.erase(std::remove(output.begin(), output.end(), '~'), output.end()); // deleting all ~ From 586106b42178acd6dcb788bb56cf27f7b2b83194 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 20:39:15 +0700 Subject: [PATCH 18/23] fix saurabh's suggestions --- srtparser.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/srtparser.h b/srtparser.h index ddd54c6..911d8ef 100644 --- a/srtparser.h +++ b/srtparser.h @@ -409,7 +409,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(tag[0] == '/'){ tag.erase(0,1); } - _nonDialogue.push_back(tag); + _styleTag.push_back(tag); tag=""; } } @@ -424,7 +424,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { int countP = 0; - std::string tag; + std::string action; for(char& c : output) // replacing (...) with ~~~~ { @@ -439,7 +439,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != ')'){ - tag.push_back(c); + action.push_back(c); c = '~'; } else if(c == ')') @@ -447,7 +447,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue c = '~'; countP--; _nonDialogueCount++; - _nonDialogue.push_back(tag); + _nonDialogue.push_back(action); tag=""; } } From ff4d2260cf3e831a5098606ac14a9fd87d1e6a5f Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 21:22:20 +0700 Subject: [PATCH 19/23] update pairs --- srtparser.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/srtparser.h b/srtparser.h index 911d8ef..b1333b1 100644 --- a/srtparser.h +++ b/srtparser.h @@ -407,10 +407,13 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue countP--; _styleTagCount++; if(tag[0] == '/'){ - tag.erase(0,1); + tag="" + + } + else{ + _nonDialogue.push_back(tag); + tag=""; } - _styleTag.push_back(tag); - tag=""; } } } @@ -424,7 +427,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { int countP = 0; - std::string action; + std::string tag; for(char& c : output) // replacing (...) with ~~~~ { @@ -439,7 +442,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != ')'){ - action.push_back(c); + tag.push_back(c); c = '~'; } else if(c == ')') @@ -447,7 +450,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue c = '~'; countP--; _nonDialogueCount++; - _nonDialogue.push_back(action); + _nonDialogue.push_back(tag); tag=""; } } From 2d703164fc2220a7b29fbb58d15baaf8575eccba Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 21:38:52 +0700 Subject: [PATCH 20/23] Update srtparser.h --- srtparser.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/srtparser.h b/srtparser.h index b1333b1..078be1f 100644 --- a/srtparser.h +++ b/srtparser.h @@ -407,7 +407,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue countP--; _styleTagCount++; if(tag[0] == '/'){ - tag="" + tag=""; } else{ @@ -427,7 +427,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(!doNotIgnoreNonDialogues) { int countP = 0; - std::string tag; + std::string action; for(char& c : output) // replacing (...) with ~~~~ { @@ -442,7 +442,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue if(countP!=0) { if(c != ')'){ - tag.push_back(c); + action.push_back(c); c = '~'; } else if(c == ')') @@ -450,8 +450,8 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue c = '~'; countP--; _nonDialogueCount++; - _nonDialogue.push_back(tag); - tag=""; + _nonDialogue.push_back(action); + action=""; } } } From 1a67859c1f76fdb9912f7085808bc5f5a3dc68f1 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 23:39:09 +0700 Subject: [PATCH 21/23] Update srtparser.h --- srtparser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srtparser.h b/srtparser.h index 078be1f..f7ea30b 100644 --- a/srtparser.h +++ b/srtparser.h @@ -411,7 +411,7 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue } else{ - _nonDialogue.push_back(tag); + _styleTag.push_back(tag); tag=""; } } From b3a503db7bce09ebf670a276750347fc9601a652 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Mon, 15 Jan 2018 23:43:24 +0700 Subject: [PATCH 22/23] fix small bug From c6ca75c8c84d9cb71eeed57f1c2dee1a0008cf07 Mon Sep 17 00:00:00 2001 From: Matej Plavevski Date: Tue, 16 Jan 2018 21:26:25 +0700 Subject: [PATCH 23/23] Update srtparser.h --- srtparser.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/srtparser.h b/srtparser.h index f7ea30b..63b7696 100644 --- a/srtparser.h +++ b/srtparser.h @@ -91,7 +91,6 @@ class SubtitleItem std::vector getSpeakerNames(); //return string vector of speaker names std::vector getNonDialogueWords(); //return string vector of non dialogue words std::vector getStyleTags(); //return string vector of style tags - void setStartTime(long int startTime); //set starting time void setEndTime(long int endTime); //set ending time void setText(std::string text); //set subtitle text @@ -405,21 +404,15 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue { c = '~'; countP--; - _styleTagCount++; - if(tag[0] == '/'){ - tag=""; - - } - else{ + if(tag[0] != '/'){ + _styleTagCount++; _styleTag.push_back(tag); - tag=""; } + tag=""; } } } - } - } //stripping non dialogue data e.g. (applause) @@ -430,13 +423,11 @@ inline void SubtitleItem::extractInfo(bool keepHTML, bool doNotIgnoreNonDialogue std::string action; for(char& c : output) // replacing (...) with ~~~~ { - if(c=='(') { countP++; c = '~'; } - else { if(countP!=0)