Skip to content

Add support for Japanese subject #107

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions gmailutils/gmail.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -253,27 +254,48 @@ func Subject(m *gmail.MessagePart) string {

// NormalizeAndSplit normalizes the subject format and splits it into
// type/source parts.
//
// It tries, in order: splitOnDash, splitOnRuLocale, and finally the citation
// patterns citations.En and citations.ja. Each fallback runs only while the
// current result does not have exactly two elements. The last result is
// returned unchanged, so it may not have two elements if nothing matched.
func NormalizeAndSplit(subj string) []string {
// NOTE(review): the next three lines look like the removed and the added
// side of the same diff hunk shown without +/- markers — confirm against
// the actual file before relying on this rendering.
srcType, _ := splitOnDash(subj) // handles at least EN and FR locales
var srcType []string
srcType, _ = splitOnDash(subj) // handles at least EN and FR locales
if len(srcType) != 2 {
srcType = splitOnRuLocale(subj)
}

// normalizes citations
if len(srcType) != 2 {
// The two patterns are joined as alternatives, so capture groups are
// numbered across both: citations.En groups first, citations.ja after.
// The regexp is compiled on every call that reaches this branch.
re := regexp.MustCompile(citations.En + `|` + citations.ja)
substr := re.FindAllStringSubmatch(subj, -1)
if substr != nil {
// Only the first match is inspected; whichever capture group is
// non-empty becomes the first element of the result.
// NOTE(review): the second element is citations.En itself; if that
// field holds a regexp pattern, the pattern string (not a plain
// label) is what callers receive — confirm this is intended.
switch {
case substr[0][1] != "":
srcType = []string{substr[0][1], citations.En}
case substr[0][2] != "":
srcType = []string{substr[0][2], citations.En}
case substr[0][3] != "":
srcType = []string{substr[0][3], citations.En}
}
}
}

return srcType
}

// subjFormat holds the variants of one subject text, one string per field
// (ru, ja, En).
// NOTE(review): the two declarations below look like the old and new side of
// the diff (the second adds the ja field) — only one exists in the real file.
type subjFormat struct{ ru, En string }
type subjFormat struct{ ru, ja, En string }

var (
articles = subjFormat{
"Новые статьи пользователя ", "new articles",
"Новые статьи пользователя ", "新しい論文", "new articles",
}
citations = subjFormat{
": новые ссылки", "new citations",
": новые ссылки", `^(?:(.+) さん|(自分))の論文からの引用: \d+ 件$`, `^\d+ new citations? to articles by (.+)$`,
Copy link
Owner

@bzz bzz Sep 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if the first one also needs to be updated with a regex for a number - I'll check that before merging.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the check! I don't have knowledge and testing environment for the language, so the update will be fine.

}
citationsOld = subjFormat{
": новые ссылки", "新しい引用", "new citations",
}
related = subjFormat{
"Новые статьи, связанные с работами автора ", "new related research",
"Новые статьи, связанные с работами автора ", "関連する新しい研究", "new related research",
}
search = subjFormat{
"Новые результаты по запросу ", "new results",
"Новые результаты по запросу ", "新しい結果", "new results",
}
// TODO(bzz): add this as well
// recomended = subjFormat{
Expand Down Expand Up @@ -314,7 +336,19 @@ func splitOnDash(str string) ([]string, string) {
}
}
sep := fmt.Sprintf(" %s ", dash)
return strings.Split(str, sep), sep
result := strings.Split(str, sep)

if len(result) == 2 {
switch result[1] {
case articles.ru, articles.ja:
result[1] = articles.En
case related.ru, related.ja:
result[1] = related.En
case search.ru, search.ja:
result[1] = search.En
}
}
return result, sep
}

// MessageTextBody returns the text (if any) of a given message ID
Expand Down
30 changes: 29 additions & 1 deletion gmailutils/gmail_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@ func TestSubjSplit(t *testing.T) {
},
{
`"Learning to represent programs with graphs" - new citations`,
`"Learning to represent programs with graphs"`, citations.En,
`"Learning to represent programs with graphs"`, citationsOld.En,
},
{
`3 new citations to articles by Diomidis Spinellis`,
"Diomidis Spinellis", citations.En,
},
{
`1 new citation to articles by Diomidis Spinellis`,
"Diomidis Spinellis", citations.En,
},
{
`"machine learning on code" – de nouveaux résultats sont disponibles`,
Expand All @@ -31,6 +39,26 @@ func TestSubjSplit(t *testing.T) {
`Новые результаты по запросу "deep learning source code"`,
`"deep learning source code"`, search.En,
},
{
`Diomidis Spinellis さんの論文からの引用: 123 件`,
"Diomidis Spinellis", citations.En,
},
{
`自分の論文からの引用: 1 件`,
"自分", citations.En,
},
{
`Diomidis Spinellis - 関連する新しい研究`,
"Diomidis Spinellis", related.En,
},
{
`Diomidis Spinellis - 新しい論文`,
"Diomidis Spinellis", articles.En,
},
{
`Diomidis Spinellis - 新しい結果`,
"Diomidis Spinellis", search.En,
},
// {
// `Рекомендуемые статьи`, "", recomended.En
// }
Expand Down
10 changes: 10 additions & 0 deletions papers/papers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ func TestScholarURLExtraction(t *testing.T) {
"http://scholar.google.ru/scholar_url?url=https://www.jstage.jst.go.jp/article/transinf/E102.D/12/E102.D_2019MPP0005/_article/-char/ja/&hl=en",
"https://www.jstage.jst.go.jp/article/transinf/E102.D/12/E102.D_2019MPP0005/_article/-char/ja/", false,
},
{
".co.jp",
"http://scholar.google.co.jp/scholar_url?url=https://dl.acm.org/doi/abs/10.1145/3379337.3415831&hl=ja&sa=X&d=17323521467117279604&ei=2H-RX7X0BIKOygSV-YCoCQ&scisig=AAGBfm0sUgXNPcsegVW1Ds0b1UxEXge1OA&nossl=1&oi=scholaralrt",
"https://dl.acm.org/doi/abs/10.1145/3379337.3415831", false,
},
{
".com.au",
"http://scholar.google.com.au/scholar_url?url=https://dl.acm.org/doi/abs/10.1145/3379337.3415831&hl=ja&sa=X&d=17323521467117279604&ei=2H-RX7X0BIKOygSV-YCoCQ&scisig=AAGBfm0sUgXNPcsegVW1Ds0b1UxEXge1OA&nossl=1&oi=scholaralrt",
"https://dl.acm.org/doi/abs/10.1145/3379337.3415831", false,
},
{
"anothe TLD, short URL",
"https://scholar.google.au/scholar_url?url=http://www.test.com&hl=1",
Expand Down