blob: 194d1b53297ef64e9859a1afad8e42ed6051225f [file] [log] [blame] [edit]
# This file is derived from
#
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
#
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
#
# Lines beginning with # and blank lines are ignored.
#
# Beyond that, this file consists of a series of test cases. Each test case consists of
# 2 or 3 lines:
#
# 1. A UTF-8 string
# 2. A status
# VALID : The string is a valid UTF-8 representation of valid Unicode
# INCOMPLETE : The string has a partial character at the end
# NOTUNICODE : The string is valid UTF-8, but the characters represented
# are not valid Unicode (e.g. UTF-16 surrogate code points or values above 0x10FFFF)
# OVERLONG : The string includes overlong sequences
# MALFORMED : The string is not valid UTF-8
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
# as a series of hex numbers.
# 1 Some correct UTF-8 text
κόσμε
VALID
03ba 1f79 03c3 03bc 03b5
# 2.1 First possible sequence of a certain length
#
# FIXME - handle NULLS?
#
# [ NULL BYTE ]
#VALID
#0000
€
VALID
0080
ࠀ
VALID
0800
𐀀
VALID
00010000
øˆ€€€
NOTUNICODE
00200000
ü„€€€€
NOTUNICODE
04000000

VALID
0000007f
ß¿
VALID
000007ff
ï¿¿
VALID
0000ffff
÷¿¿¿
NOTUNICODE
001fffff
û¿¿¿¿
NOTUNICODE
03ffffff
ý¿¿¿¿¿
NOTUNICODE
7fffffff
# 2.3 Other boundary conditions
퟿
VALID
d7ff

VALID
e000
�
VALID
fffd
􏿽
VALID
0010fffd
􏿿
VALID
0010ffff
ô€€
NOTUNICODE
00110000
# 3.1 Unexpected continuation bytes
€
MALFORMED
¿
MALFORMED
€¿
MALFORMED
€¿€
MALFORMED
€¿€¿
MALFORMED
€¿€¿€
MALFORMED
€¿€¿€¿
MALFORMED
€¿€¿€¿€
MALFORMED
€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
MALFORMED
# 3.2 Lonely start characters
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
MALFORMED
à á â ã ä å æ ç è é ê ë ì í î ï
MALFORMED
ð ñ ò ó ô õ ö ÷
MALFORMED
ø ù ú û
MALFORMED
ü ý
MALFORMED
# 3.3 Sequences with last continuation byte missing
À
INCOMPLETE
à€
INCOMPLETE
ð€€
INCOMPLETE
ø€€€
INCOMPLETE
ü€€€€
INCOMPLETE
ß
INCOMPLETE
ï¿
INCOMPLETE
÷¿¿
INCOMPLETE
û¿¿¿
INCOMPLETE
ý¿¿¿¿
INCOMPLETE
# 3.4 Concatenation of incomplete sequences
Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
MALFORMED
# 3.5 Impossible bytes
þ
MALFORMED
ÿ
MALFORMED
þþÿÿ
MALFORMED
# Examples of an overlong ASCII character
À¯
OVERLONG
à€¯
OVERLONG
ð€€¯
OVERLONG
ø€€€¯
OVERLONG
ü€€€€¯
OVERLONG
# Maximum overlong sequences
Á¿
OVERLONG
àŸ¿
OVERLONG
ð¿¿
OVERLONG
ø‡¿¿¿
OVERLONG
üƒ¿¿¿¿
OVERLONG
# Overlong representation of the NUL character

OVERLONG
à€€
OVERLONG
ð€€€
OVERLONG
ø€€€€
OVERLONG
ü€€€€€
OVERLONG
# Illegal code positions
# Single UTF-16 surrogates
í €
NOTUNICODE
d800
í­¿
NOTUNICODE
db7f
í®€
NOTUNICODE
db80
í¯¿
NOTUNICODE
dbff
í°€
NOTUNICODE
dc00
í¾€
NOTUNICODE
df80
í¿¿
NOTUNICODE
dfff
# Paired UTF-16 surrogates
𐀀
NOTUNICODE
d800 dc00
𐏿
NOTUNICODE
d800 dfff
󯰀
NOTUNICODE
db7f dc00
í­¿í¿¿
NOTUNICODE
db7f dfff
󰀀
NOTUNICODE
db80 dc00
󰏿
NOTUNICODE
db80 dfff
􏰀
NOTUNICODE
dbff dc00
􏿿
NOTUNICODE
dbff dfff
################
#
# Some more tests, not from Markus Kuhn's file
#
# Mixed plane 0 and higher planes
A𐀀B􏿽C
VALID
41 00010000 42 10fffd 43