Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ UnicodeExtras
Installation
------------

julia> Pkg.clone("git://github.com/nolta/UnicodeExtras.git")
julia> Pkg.clone("git://github.com/nolta/UnicodeExtras.jl.git")

Usage
-----
Expand All @@ -14,17 +14,40 @@ Usage
```jlcon
julia> using UnicodeExtras

julia> b = encode("Ålborg", "iso-8859-1")
6-element Array{Uint8,1}:
julia> b = encode("Ålborg is eating apples", "iso-8859-1")
23-element Array{Uint8,1}:
0xc5
0x6c
0x62
0x6f
0x72
0x67
0x20
0x69
0x73
0x20
0x65
0x61
0x74
0x69
0x6e
0x67
0x20
0x61
0x70
0x70
0x6c
0x65
0x73

julia> decode(b, "iso-8859-1")
"Ålborg"
"Ålborg is eating apples"

julia> detect_encoding(b)
5-element Array{ASCIIString,1}:
"ISO-8859-1"
"ISO-8859-2"
"Shift_JIS"
"GB18030"
"Big5"
```

### Case handling
Expand Down
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
julia 0.2
ICU 0.3
Compat 0.7
36 changes: 24 additions & 12 deletions src/UnicodeExtras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ module UnicodeExtras

using ICU

using Compat

export
UnicodeText,
decode,
detect_encoding,
encode,
foldcase,
set_locale, # from ICU
Expand Down Expand Up @@ -51,7 +54,7 @@ titlecase(s::UTF16String) = u_strToTitle(s)
## UnicodeText ##

immutable UnicodeText
data::Array{Uint16,1}
data::Array{UInt16,1}
end

# Build a UnicodeText from a byte string by converting it to UTF-16 and
# wrapping the resulting code-unit array.
function UnicodeText(s::ByteString)
    utf16_units = utf16(s).data
    UnicodeText(utf16_units)
end
Expand All @@ -62,18 +65,18 @@ convert(::Type{UTF16String}, t::UnicodeText) = UTF16String(t.data)

cmp(a::UnicodeText, b::UnicodeText) = ucol_strcoll(ICU.collator, a.data, b.data)
# is this right?
cmp(t::UnicodeText, s::String) = cmp(UTF16String(t.data), s)
cmp(s::String, t::UnicodeText) = cmp(t, s)
cmp(t::UnicodeText, s::AbstractString) = cmp(UTF16String(t.data), s)
# Compare a plain string against a UnicodeText. The comparison is delegated to
# the (UnicodeText, AbstractString) method with the operands swapped, so the
# result must be negated to keep the sign relative to (s, t); without the
# negation `cmp(s, t)` and `isless(s, t)` report the opposite ordering.
cmp(s::AbstractString, t::UnicodeText) = -cmp(t, s)

# The last valid index of a UnicodeText is defined to be its length.
function endof(t::UnicodeText)
    length(t)
end

isequal(a::UnicodeText, b::UnicodeText) = cmp(a,b) == 0
isequal(a::UnicodeText, b::String) = cmp(a,b) == 0
isequal(a::String, b::UnicodeText) = cmp(a,b) == 0
isequal(a::UnicodeText, b::UnicodeText) = cmp(a,b) == 0
isequal(a::UnicodeText, b::AbstractString) = cmp(a,b) == 0
isequal(a::AbstractString, b::UnicodeText) = cmp(a,b) == 0

isless(a::UnicodeText, b::UnicodeText) = cmp(a,b) < 0
isless(a::UnicodeText, b::String) = cmp(a,b) < 0
isless(a::String, b::UnicodeText) = cmp(a,b) < 0
isless(a::UnicodeText, b::UnicodeText) = cmp(a,b) < 0
isless(a::UnicodeText, b::AbstractString) = cmp(a,b) < 0
isless(a::AbstractString, b::UnicodeText) = cmp(a,b) < 0

function length(t::UnicodeText)
bi = ubrk_open(UBRK_CHARACTER, ICU.locale, t.data)
Expand All @@ -86,7 +89,7 @@ function length(t::UnicodeText)
end

# Single-position indexing: delegate to the range method using the
# one-element range i:i.
function getindex(t::UnicodeText, i::Int)
    getindex(t, i:i)
end
function getindex(t::UnicodeText, r::Range1{Int})
function getindex(t::UnicodeText, r::UnitRange{Int})
bi = ubrk_open(UBRK_CHARACTER, ICU.locale, t.data)
offset = 0
for i = 1:first(r)-1
Expand All @@ -111,7 +114,7 @@ show(io::IO, t::UnicodeText) = show(io, UTF16String(t.data))

## encodings ##

function transcode(src::Array{Uint8,1}, from::ASCIIString, to::ASCIIString)
function transcode(src::Array{UInt8,1}, from::ASCIIString, to::ASCIIString)
src_cnv = ucnv_open(from)
dst_cnv = ucnv_open(to)
src_buf = IOBuffer(src)
Expand All @@ -126,12 +129,21 @@ function transcode(src::Array{Uint8,1}, from::ASCIIString, to::ASCIIString)
takebuf_array(dst_buf)
end

function decode(b::Array{Uint8,1}, encoding::ASCIIString)
# Decode the bytes `b`, interpreted in the named `encoding`, into a string.
# The bytes are first transcoded to UTF-8 and the result is wrapped with
# `bytestring`.
function decode(b::Array{UInt8,1}, encoding::ASCIIString)
    utf8_bytes = transcode(b, encoding, "utf8")
    return bytestring(utf8_bytes)
end

# Encode the byte string `s` into the named target `encoding`.
# The underlying bytes of `s` are handed to `transcode` as "utf8" input, and
# the transcoded byte array is returned.
function encode(s::ByteString, encoding::ASCIIString)
    source_bytes = s.data
    return transcode(source_bytes, "utf8", encoding)
end

# Guess the character encoding of the raw bytes in `b`.
#
# Opens a charset detector, hands it `b`, and collects the name of every match
# reported by `ucsdet_detectAll` into an `ASCIIString` array, in the order the
# detector returned them.
#
# The detector is closed in a `finally` clause so the handle is released even
# when one of the calls in between throws.
function detect_encoding(b::Array{UInt8,1})
    cs = ucsdet_open()
    try
        ucsdet_setText(cs, b)
        matches = ucsdet_detectAll(cs)
        return ASCIIString[ucsdet_getName(m) for m in matches]
    finally
        ucsdet_close(cs)
    end
end

end # module