Skip to content

Better args handling for Dedup #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 10 additions & 22 deletions process.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,29 +194,17 @@ func largestKMatchPairs(pairs MatchPairs, k int) MatchPairs {
return make(MatchPairs, 0)
}

func Dedupe(sliceWithDupes []string, args ...interface{}) ([]string, error) {
var scorer func(string, string) int
scorer = func(s1, s2 string) int {
return TokenSetRatio(s1, s2, true, true)
}
threshold := 70
var defaultThreshold = 70

for i, arg := range args {
switch i {
case 0:
t, err := arg.(int)
if err {
return nil, errors.New("expected first optional argument to be an integer")
}
threshold = t
case 1:
s, err := arg.(func(string, string) int)
if err {
return nil, errors.New("expected second optional argument to be a function of the form f(string,string)->int")
}
scorer = s
func Dedupe(sliceWithDupes []string, threshold *int, scorer func(string, string) int) ([]string, error) {
if scorer == nil {
scorer = func(s1, s2 string) int {
return TokenSetRatio(s1, s2, true, true)
}
}
if threshold == nil {
threshold = &defaultThreshold
}

extracted := []string{}
for _, elem := range sliceWithDupes {
Expand All @@ -226,13 +214,13 @@ func Dedupe(sliceWithDupes []string, args ...interface{}) ([]string, error) {
}
filtered := MatchPairs{}
for _, m := range matches {
if m.Score > threshold {
if m.Score > *threshold {
filtered = append(filtered, m)
}
}
if len(filtered) == 1 {
extracted = append(extracted, filtered[0].Match)
} else {
} else if len(filtered) > 0 {
altPoints := alphaLengthSortPairs(filtered)
sort.Sort(altPoints)
extracted = append(extracted, altPoints[0].Match)
Expand Down
34 changes: 32 additions & 2 deletions process_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package fuzzy

import (
"github.com/stretchr/testify/assert"
"testing"
)

Expand Down Expand Up @@ -97,16 +98,45 @@ func TestExtractOne(t *testing.T) {

// TestDedupe calls Dedupe with default arguments, with an explicit low and
// high threshold, and with a custom scorer, and checks the length of each
// returned slice.
//
// NOTE(review): this text comes from a diff view that has lost its +/- markers
// and indentation. The one-argument calls Dedupe(sliceWithDupes) and
// Dedupe(sliceWithoutDupes) appear to be the removed lines, each followed
// directly by its three-argument replacement — confirm against the real file.
func TestDedupe(t *testing.T) {
sliceWithDupes := []string{"Frodo Baggins", "Tom Sawyer", "Bilbo Baggin", "Samuel L. Jackson", "F. Baggins", "Frody Baggins", "Bilbo Baggins"}
// NOTE(review): presumably the pre-change call, superseded by the next line.
res, _ := Dedupe(sliceWithDupes)
// nil threshold and nil scorer: the call supplies neither optional value.
res, err := Dedupe(sliceWithDupes, nil, nil)
assert.Nil(t, err)
// The input contains near-duplicate names, so the result must be shorter.
if len(res) >= len(sliceWithDupes) {
t.Error("expecting Dedupe to remove at least one string from slice")
}

sliceWithoutDupes := []string{"Tom", "Dick", "Harry"}
// NOTE(review): presumably the pre-change call, superseded by the next line.
res2, _ := Dedupe(sliceWithoutDupes)
res2, err := Dedupe(sliceWithoutDupes, nil, nil)
assert.Nil(t, err)
// Distinct names: the result is expected to keep every element.
if len(res2) != len(sliceWithoutDupes) {
t.Error("not expecting Dedupe to remove any strings from slice")
}

// Threshold passed by pointer; a very low value is expected to collapse the
// whole slice to a single entry.
lowThreshold := 1
res3, err := Dedupe(sliceWithDupes, &lowThreshold, nil)
assert.Nil(t, err)
if len(res3) != 1 {
t.Error("expecting low threshold to dedupe all items")
}

// A very high threshold is expected to leave the slice length unchanged.
highThreshold := 99
res4, err := Dedupe(sliceWithDupes, &highThreshold, nil)
assert.Nil(t, err)
if len(res4) != len(sliceWithDupes) {
t.Error("expecting high threshold to maintain all items")
}

// Custom scorer: returns the absolute difference between the byte lengths
// of the two strings (len on a Go string counts bytes).
threshold := 1
res5, err := Dedupe(sliceWithDupes, &threshold, func(s1 string, s2 string) int {
diff := len(s1) - len(s2)
// Negate a negative difference to obtain the absolute value.
if diff < 0 {
diff *= -1
}
return diff
})
assert.Nil(t, err)
if len(res5) != 2 {
t.Error("expecting custom scorer to yield two results")
}
}

func assertMatch(t *testing.T, query, expectedMatch, actualMatch string) {
Expand Down