From 3ca16f9702fd525ad0540233c0d9f457cbd014ed Mon Sep 17 00:00:00 2001 From: Matthijs Cox Date: Fri, 21 Nov 2025 16:58:52 +0100 Subject: [PATCH] fixing HDF5 char arrays --- src/MAT_HDF5.jl | 23 ++++++++++++++++++++--- src/MAT_v5.jl | 6 +++--- test/read.jl | 12 +++++++++++- test/v7.3/char_arrays.mat | Bin 0 -> 5464 bytes test/v7/char_arrays.mat | Bin 0 -> 429 bytes 5 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 test/v7.3/char_arrays.mat create mode 100644 test/v7/char_arrays.mat diff --git a/src/MAT_HDF5.jl b/src/MAT_HDF5.jl index 66f7361..b99917b 100644 --- a/src/MAT_HDF5.jl +++ b/src/MAT_HDF5.jl @@ -786,14 +786,31 @@ function read(obj::Union{HDF5.Dataset,HDF5.Attribute}, ::Type{MatlabString}) data = reshape(data, sz[2:end]) end if ndims(data) == 1 - return String(convert(Vector{Char}, data)) + return convert_string(data) elseif ndims(data) == 2 - return datap = String[rstrip(String(convert(Vector{Char}, vec(data[i, :])))) for i = 1:size(data, 1)] + return String[convert_string(c) for c in eachrow(data)] else - return data + return stringify_eachrow(data) end end +function stringify_eachrow(data) + dims = size(data) + fixed_dims = [1, 3:length(dims)...] # all except 2 + output_dims = dims[fixed_dims] + output = Array{String}(undef, output_dims...) + for I in CartesianIndices(output_dims) + idx = ntuple(d -> d == 2 ? Colon() : I[d < 2 ? d : d-1], length(dims)) + slice = view(data, idx...) + output[I] = convert_string(slice) + end + return output +end + +function convert_string(v::AbstractArray{UInt16}) + return String(convert(Vector{Char}, v)) +end + ## Utilities for handling complex numbers function build_datatype_complex(T::Type) memtype = create_datatype(HDF5.API.H5T_COMPOUND, 2*sizeof(T)) diff --git a/src/MAT_v5.jl b/src/MAT_v5.jl index 8f9eed7..d22b6b5 100644 --- a/src/MAT_v5.jl +++ b/src/MAT_v5.jl @@ -289,7 +289,7 @@ function read_string(f::IO, swap_bytes::Bool, dimensions::Vector{Int32}) else data = Vector{String}(undef, dimensions[1]) for i = 1:dimensions[1] - data[i] = rstrip(String(chars[i:dimensions[1]:end])) + data[i] = String(chars[i:dimensions[1]:end]) end end elseif dtype <= 4 || dtype == 17 @@ -316,7 +316,7 @@ function read_string(f::IO, swap_bytes::Bool, dimensions::Vector{Int32}) elseif dimensions[1] == 1 data = String(take!(bufs[1])) else - data = String[rstrip(String(take!(buf))) for buf in bufs] + data = String[String(take!(buf)) for buf in bufs] end else error("Unsupported string type") @@ -372,7 +372,7 @@ function read_matrix(f::IO, swap_bytes::Bool, subsys::Subsystem) data = read_struct(f, swap_bytes, dimensions, class == mxOBJECT_CLASS, subsys) elseif class == mxSPARSE_CLASS data = read_sparse(f, swap_bytes, dimensions, flags) - elseif class == mxCHAR_CLASS && length(dimensions) <= 2 + elseif class == mxCHAR_CLASS data = read_string(f, swap_bytes, dimensions) elseif class == mxFUNCTION_CLASS data = read_matrix(f, swap_bytes, subsys) diff --git a/test/read.jl b/test/read.jl index fe24fb3..c77f472 100644 --- a/test/read.jl +++ b/test/read.jl @@ -80,7 +80,7 @@ for _format in ["v6", "v7", "v7.3"] result = Dict( "simple_string" => "the quick brown fox", "accented_string" => "thé qüîck browñ fòx", - "concatenated_strings" => String["this is a string", "this is another string"], + "concatenated_strings" => String["this is a string ", "this is another string"], "cell_strings" => Any["this is a string" "this is another string"], "empty_string" => "" ) @@ -158,6 +158,16 @@ for _format in ["v6", "v7", "v7.3"] end +for _format in ["v7", "v7.3"] + result = Dict{String,Any}( + "s" => " aαβ ", # test α and β characters, not possible in v6 + "s2" => ["fòx", "aαβ", " ef", "ac "], + "s3" => reshape(["faò", "aeα", " xc", "fβ "], 2, 2), + "s4" => Any["fòx"; "aαβ"; " ef"; "ac ";;] + ) + check(joinpath(dirname(@__FILE__), _format, "char_arrays.mat"), result) +end + result = Dict( "index" => [8.8604784000000000e+04 9.8707212000000000e+04 1.0394035200000000e+05 1.1429712000000000e+05 1.5474923999999999e+05 1.5475435200000001e+05 1.5501074400000001e+05 1.5505315200000000e+05 1.5505718400000001e+05 1.5506168400000001e+05 1.5506524799999999e+05 5.4945741599999997e+05 5.6345896799999999e+05 5.9956981200000003e+05 7.0691623199999996e+05 7.9063657200000004e+05 8.4311938800000004e+05 9.2225131200000003e+05 1.1248994160000000e+06 1.2508148520000000e+06 1.4164141320000000e+06 1.4275988280000000e+06 1.4744331000000001e+06 1.4982212879999999e+06 1.5549058440000000e+06 1.5870300840000000e+06 1.6192005120000001e+06 1.6766071560000000e+06 1.9386816839999999e+06 1.9969427879999999e+06 2.0021861880000001e+06 2.3272494120000000e+06 2.5309351080000000e+06 2.6743788720000000e+06], "spikes" => [ diff --git a/test/v7.3/char_arrays.mat b/test/v7.3/char_arrays.mat new file mode 100644 index 0000000000000000000000000000000000000000..5f46f60d23a65537e52a215a96cb554aad7eea53 GIT binary patch literal 5464 zcmeHKUr!T35TEN&I1v;vM&i>XyzqcfS}LSZRQ{wy!A6_N8=lu|Xi{3!YYe{mh~L6T zAC*twBOk-hz!%Wjo0(o|dr+fE39y9D&g|~&?r-Pc?c_@@^UrZ*A%i>l($h+{=FelX z=C&%0=3x$t>u@o;y?ff1PEgCv;zmbUlkJkK1!JLy%k0SlC$!%wSzrOLuo5gt%6_V^o43i{&6x2jJ}UcH zQuHQ7kt2R#!#E(g@WVc_k=9JpJkyMh7e`o~0KT~elQiN01#ODo`(Dir!mB@|dB#=r zJl=uZBo-!M%@C4LQ@_2gT5s)_eXmjW!%;dXZO5*Lp+xmWF!On2I=tC^wP8V$&dlxC z7*kzGAth{c9AV@(0BEy3?w3&lI95PoCTd*yJ{mJY#B)NO$&2R=YgFxQMu6+KGlfgS zAoTKs+8Ksdv?(H2$j*$dosqTq;K31`*cExnt7muoG8T-5wvBC^R>J|yW`WwN;-kFM zZu#K;c^)?x{17)J6b)|P6~!$s|91Q3W%>8#UB+Mih`HO}n}R@G{w+vcR~%gFycg;T zEeq8=Bkn8NR0jJHjf+wK<%n-t=lwnM?RAi5_U2pGWBk8g>y3XBXDIJ>o?AoRqW?KH|MIQ?U%TGJoUOB|meqTm z(%wF!1Mt}#>Vk$ccB+bF=4#xSnvR(;WU<;=lvnt~TIcCIYX0r!ZK`aH>@oeDw@8vj z7~a%(hSzs7-&aQ7DI=D6-p>XEeH+DIvF*gJ*FIB54e1`RF7#`)aDmj5+#mc`@me*P zp%z!K$Z)$}iTKD**>~B1di|8;HjGlHJ)>O9?Vs=epX=hNE+3<}bfAze=1TZjc$X^%N14qp-!@v=#2COiQSMEFQZ|za@^DuC9>DvRZ365Id`~gX4 B7^wgN literal 0 HcmV?d00001 diff --git a/test/v7/char_arrays.mat b/test/v7/char_arrays.mat new file mode 100644 index 0000000000000000000000000000000000000000..8b307726feaf086b5fc8439f46bf697f98fa1adc GIT binary patch literal 429 zcmeZu4DoSvQZUssQ1EpO(M`+DN!3vZ$Vn_o%P-2cQV4Jk_w+L}(NSnFH}aZCnOXwB$+8Z zV@MKX<49=zB)KrhO+kn4Qj^0O)+1azojlSr%?-63t}-(?FJR^a>9mLI{D7p>^^q~N zp)#|u^F~4DW`zm5j7Ktonz+-0c91bHaAP=-jA2cZ zQ%GYTw9$XoxV&VaBNsK(jV8O$a}@q0@4vV^K*oV%AO^r2wA)a*p aB0XV2Jj01#4_{^P_@qrt%nYt?IXnPgoO@>g literal 0 HcmV?d00001