Sunday, April 12, 2009

Email validation with FParsec

Email address validation is one of those topics that keep coming up again and again. It seems that we developers never get it quite right, but with good reason: the spec is downright insane. There are six RFCs involved, which obsolete some other RFCs, and in turn have some erratas. I hope the guys at the IETF had some very good reasons to make this so damn complex!

Anyway, you can validate all the RFCs you want and you can still get an invalid address. joe@example.com is syntactically correct yet there isn't any Joe that works at example.com :-)

What's interesting here is the different approaches taken to cope with such a monster:

The last one really caught my interest so I ported it (mostly as an exercise) to F# + FParsec. Then I grabbed Dominic's testcase and ran it with FsUnit. The result? It passes 84% of Dominic's tests (with no false negatives). Here's the code (updated to F# 2.0 / FParsec trunk 5/17/2010):

module EmailValidation.EmailValidator

open System
open FParsec
open FParsec.Primitives
open FParsec.CharParsers

let isValidEmail email =
    let wsp = anyOf " \t" >>% ()
    let crlf = pchar '\n' >>% ()
    let nullChar = pchar (char 0) >>% ()
    let ranges = Seq.map (Seq.map char) >> Seq.concat >> Seq.toArray >> (fun x -> String x) >> (fun x -> anyOf x >>% ())
    let vchar = ranges [{0x21..0x7e}]
    let obsNoWsCtl = ranges [{1..8};{11..12};{14..31};{127..127}]
    let atomText = digit <|> letter <|> anyOf "!#$%&'*+-/=?^_`{|}~"
    let atom = many1 atomText >>% ()
    let fws = (many1 wsp >>. optional (crlf >>. many1 wsp)) <|> (many1 (crlf >>. many1 wsp) >>% ())
    let commentText = ranges [{33..39};{42..91};{93..126}] <|> obsNoWsCtl
    let quotedPair = pchar '\\' >>. (vchar <|> wsp <|> crlf <|> obsNoWsCtl <|> nullChar)
    let rec commentContent x = (commentText <|> quotedPair <|> comment) x
    and comment = between (pchar '(') (pchar ')') (many (commentContent <|> fws)) >>% ()
    let cfws = many (comment <|> fws)
    let quotedText = ranges [{33..33};{35..91};{93..126}] <|> obsNoWsCtl
    let quotedContent = quotedText <|> quotedPair
    let quotedString = between (pchar '"') (pchar '"') (many (optional fws >>. quotedContent) >>. optional fws)
    let dottedAtoms = sepBy1 (optional cfws >>. (atom <|> quotedString) >>. optional cfws) (pchar '.') >>% ()
    let localPart = dottedAtoms
    let domainText = ranges [{33..90};{94..126}] <|> obsNoWsCtl
    let domainLiteral =  between (optional cfws >>. pchar '[') (pchar ']' >>. optional cfws) (many (optional fws >>. domainText) >>. optional fws)
    let domain = dottedAtoms <|> domainLiteral 
    let addrSpec = localPart >>. pchar '@' >>. domain >>. eof
    match run addrSpec email with
    | Failure (msg, _, _) -> false
    | Success _ -> true

Just for reference, here's the actual test output:

192 passed.
36 failed.
0 erred.
----
Failed: ID "21": "123456789012345678901234567890123456789012345678901234567890@1
2345678901234567890123456789012345678901234567890123456789.123456789012345678901
23456789012345678901234567890123456789.12345678901234567890123456789012345678901
234567890123456789.1234.example.com"
Expected: false
Actual: true
----
Failed: ID "23": "12345678901234567890123456789012345678901234567890123456789012
345@example.com"
Expected: false
Actual: true
----
Failed: ID "31": """@example.com"
Expected: false
Actual: true
----
Failed: ID "34": "x@x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.
x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.
x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.x23456789.
x23456789.x23456789.x23456789.x23456"
Expected: false
Actual: true
----
Failed: ID "35": "first.last@[.12.34.56.78]"
Expected: false
Actual: true
----
Failed: ID "36": "first.last@[12.34.56.789]"
Expected: false
Actual: true
----
Failed: ID "37": "first.last@[::12.34.56.78]"
Expected: false
Actual: true
----
Failed: ID "38": "first.last@[IPv5:::12.34.56.78]"
Expected: false
Actual: true
----
Failed: ID "39": "first.last@[IPv6:1111:2222:3333::4444:5555:12.34.56.78]"
Expected: false
Actual: true
----
Failed: ID "40": "first.last@[IPv6:1111:2222:3333:4444:5555:12.34.56.78]"
Expected: false
Actual: true
----
Failed: ID "41": "first.last@[IPv6:1111:2222:3333:4444:5555:6666:7777:12.34.56.7
8]"
Expected: false
Actual: true
----
Failed: ID "42": "first.last@[IPv6:1111:2222:3333:4444:5555:6666:7777]"
Expected: false
Actual: true
----
Failed: ID "43": "first.last@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888:9999]
"
Expected: false
Actual: true
----
Failed: ID "44": "first.last@[IPv6:1111:2222::3333::4444:5555:6666]"
Expected: false
Actual: true
----
Failed: ID "45": "first.last@[IPv6:1111:2222:3333::4444:5555:6666:7777]"
Expected: false
Actual: true
----
Failed: ID "46": "first.last@[IPv6:1111:2222:333x::4444:5555]"
Expected: false
Actual: true
----
Failed: ID "47": "first.last@[IPv6:1111:2222:33333::4444:5555]"
Expected: false
Actual: true
----
Failed: ID "48": "first.last@example.123"
Expected: false
Actual: true
----
Failed: ID "49": "first.last@com"
Expected: false
Actual: true
----
Failed: ID "50": "first.last@-xample.com"
Expected: false
Actual: true
----
Failed: ID "51": "first.last@exampl-.com"
Expected: false
Actual: true
----
Failed: ID "52": "first.last@x23456789012345678901234567890123456789012345678901
2345678901234.example.com"
Expected: false
Actual: true
----
Failed: ID "97": "test@123.123.123.123"
Expected: false
Actual: true
----
Failed: ID "115": "test@12345678901234567890123456789012345678901234567890123456
78901234567890123456789012345678901234567890123456789012345678901234567890123456
78901234567890123456789012345678901234567890123456789012345678901234567890123456
789012345678901234567890123456789012.com"
Expected: false
Actual: true
----
Failed: ID "116": "test@example"
Expected: false
Actual: true
----
Failed: ID "153": "first."".last@example.com"
Expected: false
Actual: true
----
Failed: ID "158": "first.last@[IPv6:1111:2222:3333:4444:5555:6666:12.34.567.89]"

Expected: false
Actual: true
----
Failed: ID "159": ""test\
 blah"@example.com"
Expected: false
Actual: true
----
Failed: ID "190": "a@b"
Expected: false
Actual: true
----
Failed: ID "199": "aaa@[123.123.123.333]"
Expected: false
Actual: true
----
Failed: ID "201": "a@bar"
Expected: false
Actual: true
----
Failed: ID "205": "a@-b.com"
Expected: false
Actual: true
----
Failed: ID "206": "a@b-.com"
Expected: false
Actual: true
----
Failed: ID "213": "invalid@special.museum-"
Expected: false
Actual: true
----
Failed: ID "216": "foobar@192.168.0.1"
Expected: false
Actual: true
----
Failed: ID "227": ""null \0"@char.com"
Expected: false
Actual: true

1 comment:

Macie said...

Great reaading your blog