Skip to content

Commit

Permalink
New version: towards non-uniform datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
giopaglia committed Nov 29, 2023
1 parent d61bfff commit 1c87b4a
Show file tree
Hide file tree
Showing 15 changed files with 106 additions and 90 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ uuid = "e54bda2e-c571-11ec-9d64-0242ac120002"
license = "MIT"
desc = "Julia implementation of Modal Decision Trees and Random Forest algorithms"
authors = ["Giovanni PAGLIARINI"]
version = "0.2.1"
version = "0.3.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Expand Down Expand Up @@ -62,9 +62,9 @@ Reexport = "1"
ResumableFunctions = "0.6"
Revise = "3"
SoleBase = "0.11"
SoleData = "0.11"
SoleData = "0.12"
SoleLogics = "0.6"
SoleModels = "0.4"
SoleModels = "0.5"
StatsBase = "0.30 - 0.34"
Suppressor = "0.2"
Tables = "1"
Expand Down
2 changes: 1 addition & 1 deletion TODO
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Cool ideas:
☐ minified fwd structures: once computed the fwd, map each unique value to a UInt8/UInt16 value, while maintaining the relative order. Once a tree is learnt, remap the values to the old ones. tables contain a lot of redundancy: perhaps use PooledArrays.jl, IndirectArrays.jl, CategoricalArrays.jl for saving space, both for saving to file AND for learning (might be beneficial for caching)?

☐ feature-importance.jl: mean decreased impurity https://medium.com/the-artificial-impostor/feature-importance-measures-for-tree-models-part-i-47f187c1a2c3
☐ DATA: Instances with different channel size. find implementation of multi-dimensional array where one can specify that along one axis subArrays have non-uniform size?! Highlight difference between channelsize and maxchannelsize; define NonUniformDimensionalDataset using ArrayOfVectors https://juliaarrays.github.io/ArraysOfArrays.jl/stable/; then fwd can simply be a gigantic table with Inf and -Inf for non-existing worlds!
☐ DATA: Instances with different channel size. find implementation of multi-dimensional array where one can specify that along one axis subArrays have non-uniform size?! Highlight difference between channelsize and maxchannelsize; define non-uniform DimensionalDataset using ArrayOfVectors https://juliaarrays.github.io/ArraysOfArrays.jl/stable/; then fwd can simply be a gigantic table with Inf and -Inf for non-existing worlds!

☐ add a parameter controlling how many thresholds are (randomly) considered at each node. Check that this optimization doesn't degrade performances and improves training times
☐ test: create a proper test suite, and test reproducibility of the results
Expand Down
3 changes: 3 additions & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"

[compat]
Documenter = "1"
2 changes: 1 addition & 1 deletion src/interfaces/MLJ/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ $(n_subfeatures_str)
- `downsize=true` Whether to perform automatic downsizing, by means of moving average. In fact, this algorithm has high complexity
(both time and space), and can only handle small time-series (< 100 points) & small images (< 10 x 10 pixels).
When set to `true`, automatic downsizing is performed; when it is a NTuple of Integers, a downsizing of dimensional data
When set to `true`, automatic downsizing is performed; when it is an `NTuple` of `Integer`s, a downsizing of dimensional data
to match that size is performed.
- `print_progress=false`: set to `true` for a progress bar
Expand Down
125 changes: 65 additions & 60 deletions src/interfaces/MLJ/downsize.jl
Original file line number Diff line number Diff line change
@@ -1,120 +1,125 @@
using SoleBase: movingwindow
using SoleData: AbstractDimensionalDataset

DOWNSIZE_MSG = "If this process gets killed, please downsize your dataset beforehand."

function make_downsizing_function(channelsize::NTuple)
return function downsize(X)
return moving_average(X, channelsize)
return function downsize(instance)
return moving_average(instance, channelsize)
end
end

function make_downsizing_function(::TreeModel)
function downsize(X)
channelsize = SoleData.channelsize(X)
nvariables = SoleData.nvariables(X)
function downsize(instance)
channelsize = SoleData.instance_channelsize(instance)
nvariables = SoleData.instance_nvariables(instance)
channelndims = length(channelsize)
if channelndims == 1
n_points = channelsize[1]
if nvariables > 30 && n_points > 100
@warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 100)
# @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 100)
elseif n_points > 150
@warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 150)
# @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 150)
end
elseif channelndims == 2
if nvariables > 30 && prod(channelsize) > prod((7,7),)
new_channelsize = min.(channelsize, (7,7))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
elseif prod(channelsize) > prod((10,10),)
new_channelsize = min.(channelsize, (10,10))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
end
end
X
instance
end
end

function make_downsizing_function(::ForestModel)
function downsize(X)
channelsize = SoleData.channelsize(X)
nvariables = SoleData.nvariables(X)
function downsize(instance)
channelsize = SoleData.instance_channelsize(instance)
nvariables = SoleData.instance_nvariables(instance)
channelndims = length(channelsize)
if channelndims == 1
n_points = channelsize[1]
if nvariables > 30 && n_points > 100
@warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 100)
# @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 100)
elseif n_points > 150
@warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 150)
# @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 150)
end
elseif channelndims == 2
if nvariables > 30 && prod(channelsize) > prod((4,4),)
new_channelsize = min.(channelsize, (4,4))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
elseif prod(channelsize) > prod((7,7),)
new_channelsize = min.(channelsize, (7,7))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
end
end
X
instance
end
end

# Mean of `vals`, coerced to the requested element type.
# Same-type case: data and target type coincide, so `mean` is returned as-is.
function _mean(::Type{T}, vals::AbstractArray{T}) where {T<:Number}
    return mean(vals)
end
# Integer data averaged into a float target: a plain conversion suffices.
function _mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:AbstractFloat,T2<:Integer}
    return T1(mean(vals))
end
# Float data averaged into an integer target: round to the nearest integer.
function _mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:Integer,T2<:AbstractFloat}
    return round(T1, mean(vals))
end

function moving_average(
X::AbstractArray{T,1};
kwargs...
) where {T}
npoints = length(X)
return [_mean(T, X[idxs]) for idxs in movingwindow(npoints; kwargs...)]
end
# # 1D
# function moving_average(
# instance::AbstractArray{T,1};
# kwargs...
# ) where {T<:Union{Nothing,Number}}
# npoints = length(instance)
# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; kwargs...)]
# end

function moving_average(
X::AbstractArray{T,1},
nwindows::Integer,
relative_overlap::AbstractFloat = .5,
) where {T}
npoints = length(X)
return [_mean(T, X[idxs]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
# # 1D
# function moving_average(
# instance::AbstractArray{T,1},
# nwindows::Integer,
# relative_overlap::AbstractFloat = .5,
# ) where {T<:Union{Nothing,Number}}
# npoints = length(instance)
# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
# end

# 1D-instance
function moving_average(
X::AbstractArray{T,3},
instance::AbstractArray{T,2},
nwindows::Integer,
relative_overlap::AbstractFloat = .5,
) where {T}
npoints, n_variables, n_instances = size(X)
new_X = similar(X, (nwindows, n_variables, n_instances))
for i_instance in 1:n_instances
for i_variable in 1:n_variables
new_X[:, i_variable, i_instance] .= [_mean(T, X[idxs, i_variable, i_instance]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
) where {T<:Union{Nothing,Number}}
npoints, n_variables = size(instance)
new_instance = similar(instance, (nwindows, n_variables))
for i_variable in 1:n_variables
new_instance[:, i_variable] .= [_mean(T, instance[idxs, i_variable]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
return new_X
return new_instance
end

# 2D-instance
function moving_average(
X::AbstractArray{T,4},
instance::AbstractArray{T,3},
new_channelsize::Tuple{Integer,Integer},
relative_overlap::AbstractFloat = .5,
) where {T}
n_X, n_Y, n_variables, n_instances = size(X)
windows_1 = movingwindow(n_X; nwindows = new_channelsize[1], relative_overlap = relative_overlap)
) where {T<:Union{Nothing,Number}}
n_instance, n_Y, n_variables = size(instance)
windows_1 = movingwindow(n_instance; nwindows = new_channelsize[1], relative_overlap = relative_overlap)
windows_2 = movingwindow(n_Y; nwindows = new_channelsize[2], relative_overlap = relative_overlap)
new_X = similar(X, (new_channelsize..., n_variables, n_instances))
for i_instance in 1:n_instances
for i_variable in 1:n_variables
new_X[:, :, i_variable, i_instance] .= [_mean(T, X[idxs1, idxs2, i_variable, i_instance]) for idxs1 in windows_1, idxs2 in windows_2]
end
new_instance = similar(instance, (new_channelsize..., n_variables))
for i_variable in 1:n_variables
new_instance[:, :, i_variable] .= [_mean(T, instance[idxs1, idxs2, i_variable]) for idxs1 in windows_1, idxs2 in windows_2]
end
return new_X
return new_instance
end

# Downsize a (possibly non-uniform) dimensional dataset by applying
# `moving_average` to each instance independently; extra positional and
# keyword arguments are forwarded to the per-instance method.
function moving_average(dataset::AbstractDimensionalDataset, args...; kwargs...)
    return [moving_average(inst, args...; kwargs...) for inst in eachinstance(dataset)]
end
2 changes: 1 addition & 1 deletion src/interfaces/MLJ/printer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ function (c::ModelPrinter)(
end
# if haskey(kwargs, :variable_names_map) && kwargs.variable_names_map is not multimodal then fix... variable_names_map
if isnothing(X) && isnothing(y)
MDT.printmodel(io, model; more_kwargs..., kwargs...)
MDT.printmodel(io, model; silent = true, more_kwargs..., kwargs...)
elseif !isnothing(X) && !isnothing(y)
(X, y, var_grouping, classes_seen) = MMI.reformat(c.m, X, y)
MDT.printapply(io, model, X, y; silent = true, more_kwargs..., kwargs...)
Expand Down
30 changes: 17 additions & 13 deletions src/interfaces/MLJ/wrapdataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,20 @@ function wrapdataset(
X = collect(X')
end

X = begin
if X isa AbstractArray
if !(X isa Union{AbstractVector,AbstractMatrix})
@warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " *
"This will be interpreted as a dataset of $(size(X)[end]) instances, " *
"$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])."
# "datasets ($(typeof(X)) encountered)"
end
if X isa AbstractArray # Cube
if !(X isa Union{AbstractVector,AbstractMatrix})
@warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " *
"This will be interpreted as a dataset of $(size(X)[end]) instances, " *
"$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])."
# "datasets ($(typeof(X)) encountered)"
end

X = model.downsize(X)
X = eachslice(X; dims=ndims(X))
end

X = begin
if X isa AbstractDimensionalDataset
X = model.downsize.(eachinstance(X))

if !passive_mode
@info "Precomputing logiset..."
Expand All @@ -56,7 +60,7 @@ function wrapdataset(
print_progress = (ninstances(X) > 500)
)
else
SoleData.cube2dataframe(X)
SoleData.dimensional2dataframe(X)
end
elseif X isa SupportedLogiset
X
Expand Down Expand Up @@ -107,9 +111,9 @@ function wrapdataset(

# Downsize
md = MultiModalDataset([begin
mod, varnames = SoleData.dataframe2cube(mod)
mod = model.downsize(mod)
SoleData.cube2dataframe(mod, varnames)
mod, varnames = SoleData.dataframe2dimensional(mod)
mod = model.downsize.(eachinstance(mod))
SoleData.dimensional2dataframe(mod, varnames)
end for mod in eachmodality(md)])

md, var_grouping
Expand Down
2 changes: 2 additions & 0 deletions src/leaf-metrics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ function get_metrics(
n_tot_inst = nothing,
rel_confidence_class_counts = nothing,
train_or_valid = true,
silent = false,
)
metrics = (;)

Expand Down Expand Up @@ -125,6 +126,7 @@ function get_metrics(
n_tot_inst = nothing,
rel_confidence_class_counts = nothing,
train_or_valid = true,
silent = false,
)
@assert isnothing(rel_confidence_class_counts)

Expand Down
2 changes: 1 addition & 1 deletion test/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,6 @@ cls_node = DTInternal(2, _decision, cls_leaf, cls_leaf)
# Mixed tree
@test_throws AssertionError DTInternal(2, _decision, reg_leaf, cls_leaf)

cls_tree = @test_nowarn DTree(cls_node, [Interval], [ModalDecisionTrees.start_without_world])
cls_tree = @test_nowarn DTree(cls_node, [ModalDecisionTrees.Interval], [ModalDecisionTrees.start_without_world])
cls_forest = @test_nowarn DForest([cls_tree, cls_tree, cls_tree])

2 changes: 1 addition & 1 deletion test/classification/cifar10.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ using StatsBase

Xcube

# img = eachslice(Xcube, dims=4)[1]
# img = eachslice(Xcube; dims=4)[1]
Xcubergb = mapslices(c->RGB(c...), Xcube, dims=3)
Xcubehsv = HSV.(Xcubergb)
# Xcubergb = mapslices(c->(@show c), Xcubehsv, dims=3)
Expand Down
4 changes: 3 additions & 1 deletion test/classification/japanesevowels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ acc = sum(yhat .== y[test_idxs])/length(yhat)

@test_throws BoundsError report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = [["a", "b"]]))
@test_throws BoundsError report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = ["a", "b"]))
# @test_logs (:warn,) report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = 'A':('A'+nvars)))
@test_nowarn report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = 'A':('A'+nvars)))
@test_nowarn report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = collect('A':('A'+nvars))))

@test_nowarn printmodel(report(mach).model)
@test_nowarn listrules(report(mach).model)
Expand Down Expand Up @@ -118,6 +118,8 @@ yhat = MLJ.predict_mode(mach, rows=test_idxs)
acc = sum(yhat .== y[test_idxs])/length(yhat)
@test MLJ.kappa(yhat, y[test_idxs]) > 0.5



mach = @test_logs (:warn,) machine(t, multilogiset, y)

# Fit
Expand Down
2 changes: 1 addition & 1 deletion test/classification/mnist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ end
_s = collect(size(Xcube))
insert!(_s, length(_s), 1)
Xcube = reshape(Xcube, _s...)
X = SoleData.cube2dataframe(Xcube, ["white"])
X = SoleData.cube2dataframe(Xcube, ["black"])

X_train, y_train = X[p,:], y[p]
X_test, y_test = X[p_test,:], y[p_test]
Expand Down
8 changes: 4 additions & 4 deletions test/classification/satellite.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ _s = size(samplemap)
samplemap[[1,end],:] .= 0
samplemap[:,[1,end]] .= 0

samplemap = cat(moving_average(eachslice(samplemap, dims=1); window_size=2, window_step=1)...; dims=2)'
samplemap = cat(moving_average(eachslice(samplemap, dims=2); window_size=2, window_step=1)...; dims=2)
samplemap = cat(moving_average(eachslice(samplemap; dims=1); window_size=2, window_step=1)...; dims=2)'
samplemap = cat(moving_average(eachslice(samplemap; dims=2); window_size=2, window_step=1)...; dims=2)

samplemap = hcat(eachslice(samplemap, dims=2)..., zeros(size(samplemap, 1)))
samplemap = hcat(eachslice(samplemap, dims=1)..., zeros(size(samplemap, 2)))'
samplemap = hcat(eachslice(samplemap; dims=2)..., zeros(size(samplemap, 1)))
samplemap = hcat(eachslice(samplemap; dims=1)..., zeros(size(samplemap, 2)))'

samplemap = (samplemap .== 1.0)

Expand Down
4 changes: 2 additions & 2 deletions test/multimodal-datasets-multiformulas-construction.jl
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@ _succinct_y = [map(r->SoleModels.apply(r, multilogiset, i_instance), succinct_ru

preds = string.(predict_mode(mach, X_all))

@test_broken redundant_y == succinct_y
@test_broken _redundant_y == _succinct_y
@test redundant_y == succinct_y
@test _redundant_y == _succinct_y
@test eachcol(hcat(_redundant_y...)) == eachrow(hcat(redundant_y...))
@test eachcol(hcat(_succinct_y...)) == eachrow(hcat(succinct_y...))

Expand Down
2 changes: 1 addition & 1 deletion test/regression/digits-regression.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ mach = machine(ModalRandomForest(;
), Xnt, y) |> m->fit!(m, rows = train_idxs)

println(StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test))
@test StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test) > 0.55
@test StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test) > 0.30

mach = machine(ModalRandomForest(;
n_subfeatures = 0.6,
Expand Down

0 comments on commit 1c87b4a

Please sign in to comment.