Skip to content

Dynamic optimization in levenshtein distance #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 101 additions & 12 deletions src/EditDistance.elm
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ module EditDistance
between two lists, or the actual edit steps required to go from one to the
other.


# Edit Steps

@docs EditStep, edits, editsFromStrings, editsWithCostFunc, editsWithCostFuncFromStrings


# Levenshtein

@docs levenshtein, levenshteinFromStrings

-}
Expand Down Expand Up @@ -47,10 +51,11 @@ required to turn one given list into another.
, Move 'r' 2 3
]

The resulting indices reflect edits where _deletions are made first_, before
The resulting indices reflect edits where *deletions are made first*, before
insertions and substitutions. That is, indices for deletions refer to the source
list, whereas indices for insertions and substitutions refer to the latter,
intermediate lists.

-}
edits : List comparable -> List comparable -> List (EditStep comparable)
edits source target =
Expand All @@ -77,9 +82,10 @@ Int).
, Delete 'b' 1
]

(Note that the cost function is applied _before_ insertions and deletions are
(Note that the cost function is applied *before* insertions and deletions are
converted into moves, meaning it will never receive an EditStep of type Move as
an argument.)

-}
editsWithCostFunc : (EditStep comparable -> Int) -> List comparable -> List comparable -> List (EditStep comparable)
editsWithCostFunc costFunc source target =
Expand Down Expand Up @@ -253,6 +259,7 @@ moveFromSteps editSteps step =
[ Delete 'g' 0
, Move 'r' 2 3
]

-}
editsFromStrings : String -> String -> List (EditStep Char)
editsFromStrings source target =
Expand All @@ -276,6 +283,7 @@ editsFromStrings source target =
[ Insert 'd' 1
, Delete 'b' 1
]

-}
editsWithCostFuncFromStrings : (EditStep Char -> Int) -> String -> String -> List (EditStep Char)
editsWithCostFuncFromStrings costFunc source target =
Expand All @@ -294,27 +302,107 @@ into another.

levenshtein (String.toList "garvey") (String.toList "avery")
== 3

-}
levenshtein : List comparable -> List comparable -> Int
levenshtein source target =
    case ( source, target ) of
        -- If either list is empty, the distance is the length of the other
        -- list.
        ( [], _ ) ->
            List.length target

        ( _, [] ) ->
            List.length source

        -- A single element against a non-empty list: the element can be kept
        -- if it occurs anywhere in the other list, which saves one edit.
        ( [ src ], _ ) ->
            if List.member src target then
                List.length target - 1
            else
                List.length target

        ( _, [ tgt ] ) ->
            if List.member tgt source then
                List.length source - 1
            else
                List.length source

        -- Equal heads cost nothing, so strip them and continue on the tails.
        -- On the first mismatch, hand both lists to the column-by-column
        -- computation in `initTextLoop`.
        ( src_hd :: src_tail, tgt_hd :: tgt_tail ) ->
            if src_hd == tgt_hd then
                levenshtein src_tail tgt_tail
            else
                initTextLoop source target


{-| Return the first element of a list of integers, or `0` when the list is
empty.
-}
hdOrZero : List Int -> Int
hdOrZero lst =
    List.head lst
        |> Maybe.withDefault 0


{-| Return the list without its first element. An empty list yields an empty
list.
-}
tlOrEmpty : List a -> List a
tlOrEmpty lst =
    List.drop 1 lst


{-| Compute one column of the edit-distance table for the element `tChar`.

  - `tChar`: the element this column corresponds to.
  - `pattern`: the list walked from top to bottom; one cell is produced per
    element.
  - `b0`: the value diagonally above the first cell. The value directly above
    the first cell is taken to be `b0 + 1`.
  - `prevCol`: the previous column, top to bottom. When it runs out, missing
    entries are read as `0` (via `hdOrZero`).

The result is the new column in *reverse* order (last cell first), because
cells are consed onto an accumulator as they are produced.

-}
patternLoop : comparable -> List comparable -> Int -> List Int -> List Int
patternLoop tChar pattern b0 prevCol =
    let
        -- diag:  cell up-left of the one being computed
        -- left:  cell to the left (same row, previous column)
        -- above: cell directly above (previous row, current column)
        step : List comparable -> Int -> Int -> Int -> List Int -> List Int -> List Int
        step remaining diag left above restOfPrevCol revCurCol =
            case remaining of
                [] ->
                    revCurCol

                pChar :: pTail ->
                    let
                        cell : Int
                        cell =
                            if pChar == tChar then
                                -- Matching elements carry the diagonal value
                                -- over unchanged.
                                diag
                            else
                                1 + (min left above |> min diag)
                    in
                        -- Shift the window down one row: the old `left`
                        -- becomes the new diagonal, and the fresh cell becomes
                        -- `above`.
                        step pTail
                            left
                            (hdOrZero restOfPrevCol)
                            cell
                            (tlOrEmpty restOfPrevCol)
                            (cell :: revCurCol)
    in
        step pattern b0 (hdOrZero prevCol) (b0 + 1) (tlOrEmpty prevCol) []


{-| Walk `text` one element at a time, replacing the current column with the
column computed by `patternLoop` for that element.

  - `idx`: counter incremented per element of `text`; `idx - 1` is passed to
    `patternLoop` as the seed for the top of the column.
  - `revCol`: the current column in reverse order. It is reversed before being
    passed to `patternLoop`, which returns its result reversed again.

Returns the final column, still in reverse order.

-}
textLoop : List comparable -> List comparable -> Int -> List Int -> List Int
textLoop text pattern idx revCol =
    case text of
        [] ->
            revCol

        tChar :: tTail ->
            let
                nextRevCol : List Int
                nextRevCol =
                    patternLoop tChar pattern (idx - 1) (List.reverse revCol)
            in
                textLoop tTail pattern (idx + 1) nextRevCol


{-| Run the column-by-column distance computation between `text` and
`pattern` and return the last cell of the final column.

The starting column is `1 .. List.length pattern`, stored in reverse because
`textLoop` keeps its column reversed; the head of the final reversed column is
therefore the bottom cell.

An empty `pattern` produces empty columns, so the distance is returned
directly as `List.length text` rather than falling through to `hdOrZero []`,
which would give `0`.

-}
initTextLoop : List comparable -> List comparable -> Int
initTextLoop text pattern =
    if List.isEmpty pattern then
        List.length text
    else
        let
            initCol : List Int
            initCol =
                List.range 1 (List.length pattern) |> List.reverse
        in
            textLoop text pattern 1 initCol |> hdOrZero


{-| Same as the `levenshtein` function, but for String values.
Expand All @@ -324,6 +412,7 @@ levenshtein source target =
levenshtein "preterit" "zeitgeist" == 6

levenshtein "garvey" "avery" == 3

-}
levenshteinFromStrings : String -> String -> Int
levenshteinFromStrings source target =
Expand Down
41 changes: 41 additions & 0 deletions tests/LevenshteinFuzz.elm
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
module LevenshteinFuzz exposing (all)

import EditDistance exposing (levenshtein)
import Test exposing (Test, describe, fuzz2)
import Fuzz exposing (list, char)
import Expect


{-| Longest list handed to the reference implementation.

`safeLevenshtein` is plain, un-memoised recursion and takes exponential time in
the input length, so fuzzed lists are truncated to keep each run fast.

-}
maxFuzzLength : Int
maxFuzzLength =
    6


{-| Fuzzer for character lists of at most `maxFuzzLength` elements.
-}
shortCharList : Fuzz.Fuzzer (List Char)
shortCharList =
    Fuzz.map (List.take maxFuzzLength) (list char)


{-| Fuzz test comparing the exposed `levenshtein` against the naive recursive
reference below.
-}
all : Test
all =
    describe "The optimized levenshtein implementation"
        [ fuzz2 shortCharList shortCharList "should match the old implementation" <|
            \source target ->
                Expect.equal (levenshtein source target)
                    (safeLevenshtein source target)
        ]


{-| Smallest of three values.
-}
min3 : comparable -> comparable -> comparable -> comparable
min3 a b c =
    min a b |> min c


{-| Reference edit distance: the straightforward recursive definition.

It recurses into itself, never into `levenshtein`, so that the comparison in
`all` checks the optimized implementation against an independent result at
every step rather than only at the first one.

-}
safeLevenshtein : List comparable -> List comparable -> Int
safeLevenshtein source target =
    case ( source, target ) of
        ( _, [] ) ->
            List.length source

        ( [], _ ) ->
            List.length target

        ( src_hd :: src_tail, tgt_hd :: tgt_tail ) ->
            if src_hd == tgt_hd then
                safeLevenshtein src_tail tgt_tail
            else
                min3
                    (safeLevenshtein src_tail target)
                    (safeLevenshtein source tgt_tail)
                    (safeLevenshtein src_tail tgt_tail)
                    + 1
13 changes: 0 additions & 13 deletions tests/Main.elm

This file was deleted.

14 changes: 13 additions & 1 deletion tests/Tests.elm
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ all =
Expect.equal (edits abc adc)
[ Substitute 'd' 1
]
, test "abc -> adc (without cost function) edit steps" <|
, test "abc -> adc (with cost function) edit steps" <|
\() ->
let
costFunc editStep =
Expand Down Expand Up @@ -170,6 +170,18 @@ all =
, describe "when getting Levenshtein distance between strings"
[ test "kitten <-> sitting Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "kitten" "sitting") 3
, test "a <-> abc Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "a" "abc") 2
, test "abc <-> a Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "abc" "a") 2
, test "ab <-> abc Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "ab" "abc") 1
, test "abc <-> ab Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "abc" "ab") 1
, test "a <-> xyz Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "a" "xyz") 3
, test "xyz <-> a Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "xyz" "a") 3
, test "sitting <-> kitten Levenshtein distance" <|
\() -> Expect.equal (levenshteinFromStrings "sitting" "kitten") 3
, test "kitten <-> empty string Levenshtein distance" <|
Expand Down
19 changes: 9 additions & 10 deletions tests/elm-package.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
{
"version": "1.0.0",
"summary": "Sample Elm Test",
"repository": "https://github.com/user/project.git",
"license": "BSD-3-Clause",
"summary": "Test Suites",
"repository": "https://github.com/erwald/elm-edit-distance.git",
"license": "MIT",
"source-directories": [
".",
"../src"
"..",
"../src",
"."
],
"exposed-modules": [],
"dependencies": {
"elm-community/json-extra": "2.0.0 <= v < 3.0.0",
"elm-lang/html": "2.0.0 <= v < 3.0.0",
"mgold/elm-random-pcg": "4.0.2 <= v < 5.0.0",
"eeue56/elm-html-test": "5.1.2 <= v < 6.0.0",
"elm-community/elm-test": "4.0.0 <= v < 5.0.0",
"elm-lang/core": "5.0.0 <= v < 6.0.0",
"elm-community/elm-test": "3.0.0 <= v < 4.0.0",
"rtfeldman/node-test-runner": "3.0.0 <= v < 4.0.0"
"elm-lang/html": "2.0.0 <= v < 3.0.0"
},
"elm-version": "0.18.0 <= v < 0.19.0"
}