Skip to content

Commit

Permalink
New version: towards non-uniform datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
giopaglia committed Nov 29, 2023
1 parent d61bfff commit 1c87b4a
Show file tree
Hide file tree
Showing 15 changed files with 106 additions and 90 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ uuid = "e54bda2e-c571-11ec-9d64-0242ac120002"
license = "MIT"
desc = "Julia implementation of Modal Decision Trees and Random Forest algorithms"
authors = ["Giovanni PAGLIARINI"]
version = "0.2.1"
version = "0.3.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Expand Down Expand Up @@ -62,9 +62,9 @@ Reexport = "1"
ResumableFunctions = "0.6"
Revise = "3"
SoleBase = "0.11"
SoleData = "0.11"
SoleData = "0.12"
SoleLogics = "0.6"
SoleModels = "0.4"
SoleModels = "0.5"
StatsBase = "0.30 - 0.34"
Suppressor = "0.2"
Tables = "1"
Expand Down
2 changes: 1 addition & 1 deletion TODO
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Cool ideas:
☐ minified fwd structures: once computed the fwd, map each unique value to a UInt8/UInt16 value, while maintaining the relative order. Once a tree is learnt, remap the values to the old ones. tables contain a lot of redundancy: perhaps use PooledArrays.jl, IndirectArrays.jl, CategoricalArrays.jl for saving space, both for saving to file AND for learning (might be beneficial for caching)?

☐ feature-importance.jl: mean decreased impurity https://medium.com/the-artificial-impostor/feature-importance-measures-for-tree-models-part-i-47f187c1a2c3
☐ DATA: Instances with different channel size. find implementation of multi-dimensional array where one can specify that along one axis subArrays have non-uniform size?! Highlight difference between channelsize and maxchannelsize; define NonUniformDimensionalDataset using ArrayOfVectors https://juliaarrays.github.io/ArraysOfArrays.jl/stable/; then fwd can simply be a gigantic table with Inf and -Inf for non-existing worlds!
☐ DATA: Instances with different channel size. find implementation of multi-dimensional array where one can specify that along one axis subArrays have non-uniform size?! Highlight difference between channelsize and maxchannelsize; define non-uniform DimensionalDataset using ArrayOfVectors https://juliaarrays.github.io/ArraysOfArrays.jl/stable/; then fwd can simply be a gigantic table with Inf and -Inf for non-existing worlds!

☐ add a parameter controlling how many thresholds are (randomly) considered at each node. Check that this optimization doesn't degrade performances and improves training times
☐ test: create a proper test suite, and test reproducibility of the results
Expand Down
3 changes: 3 additions & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"

[compat]
Documenter = "1"
2 changes: 1 addition & 1 deletion src/interfaces/MLJ/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ $(n_subfeatures_str)
- `downsize=true` Whether to perform automatic downsizing, by means of moving average. In fact, this algorithm has high complexity
(both time and space), and can only handle small time-series (< 100 points) & small images (< 10 x 10 pixels).
When set to `true`, automatic downsizing is performed; when it is a NTuple of Integers, a downsizing of dimensional data
When set to `true`, automatic downsizing is performed; when it is an `NTuple` of `Integer`s, a downsizing of dimensional data
to match that size is performed.
- `print_progress=false`: set to `true` for a progress bar
Expand Down
125 changes: 65 additions & 60 deletions src/interfaces/MLJ/downsize.jl
Original file line number Diff line number Diff line change
@@ -1,120 +1,125 @@
using SoleBase: movingwindow
using SoleData: AbstractDimensionalDataset

DOWNSIZE_MSG = "If this process gets killed, please downsize your dataset beforehand."

function make_downsizing_function(channelsize::NTuple)
return function downsize(X)
return moving_average(X, channelsize)
return function downsize(instance)
return moving_average(instance, channelsize)
end
end

function make_downsizing_function(::TreeModel)
function downsize(X)
channelsize = SoleData.channelsize(X)
nvariables = SoleData.nvariables(X)
function downsize(instance)
channelsize = SoleData.instance_channelsize(instance)
nvariables = SoleData.instance_nvariables(instance)
channelndims = length(channelsize)
if channelndims == 1
n_points = channelsize[1]
if nvariables > 30 && n_points > 100
@warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 100)
# @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 100)
elseif n_points > 150
@warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 150)
# @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 150)
end
elseif channelndims == 2
if nvariables > 30 && prod(channelsize) > prod((7,7),)
new_channelsize = min.(channelsize, (7,7))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
elseif prod(channelsize) > prod((10,10),)
new_channelsize = min.(channelsize, (10,10))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
end
end
X
instance
end
end

function make_downsizing_function(::ForestModel)
function downsize(X)
channelsize = SoleData.channelsize(X)
nvariables = SoleData.nvariables(X)
function downsize(instance)
channelsize = SoleData.instance_channelsize(instance)
nvariables = SoleData.instance_nvariables(instance)
channelndims = length(channelsize)
if channelndims == 1
n_points = channelsize[1]
if nvariables > 30 && n_points > 100
@warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 100)
# @warn "Downsizing series $(n_points) points to $(100) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 100)
elseif n_points > 150
@warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, 150)
# @warn "Downsizing series $(n_points) points to $(150) points ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, 150)
end
elseif channelndims == 2
if nvariables > 30 && prod(channelsize) > prod((4,4),)
new_channelsize = min.(channelsize, (4,4))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
elseif prod(channelsize) > prod((7,7),)
new_channelsize = min.(channelsize, (7,7))
@warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
X = moving_average(X, new_channelsize)
# @warn "Downsizing image of size $(channelsize) to $(new_channelsize) pixels ($(nvariables) variables). $DOWNSIZE_MSG"
instance = moving_average(instance, new_channelsize)
end
end
X
instance
end
end

# Mean of `vals`, coerced to the requested element type.
# Same-type case: data and target type coincide, so `mean` is returned as-is.
function _mean(::Type{T}, vals::AbstractArray{T}) where {T<:Number}
    return mean(vals)
end
# Integer data averaged into a float target: a plain conversion suffices.
function _mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:AbstractFloat,T2<:Integer}
    return T1(mean(vals))
end
# Float data averaged into an integer target: round to the nearest integer.
function _mean(::Type{T1}, vals::AbstractArray{T2}) where {T1<:Integer,T2<:AbstractFloat}
    return round(T1, mean(vals))
end

function moving_average(
X::AbstractArray{T,1};
kwargs...
) where {T}
npoints = length(X)
return [_mean(T, X[idxs]) for idxs in movingwindow(npoints; kwargs...)]
end
# # 1D
# function moving_average(
# instance::AbstractArray{T,1};
# kwargs...
# ) where {T<:Union{Nothing,Number}}
# npoints = length(instance)
# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; kwargs...)]
# end

function moving_average(
X::AbstractArray{T,1},
nwindows::Integer,
relative_overlap::AbstractFloat = .5,
) where {T}
npoints = length(X)
return [_mean(T, X[idxs]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
# # 1D
# function moving_average(
# instance::AbstractArray{T,1},
# nwindows::Integer,
# relative_overlap::AbstractFloat = .5,
# ) where {T<:Union{Nothing,Number}}
# npoints = length(instance)
# return [_mean(T, instance[idxs]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
# end

# 1D-instance
function moving_average(
X::AbstractArray{T,3},
instance::AbstractArray{T,2},
nwindows::Integer,
relative_overlap::AbstractFloat = .5,
) where {T}
npoints, n_variables, n_instances = size(X)
new_X = similar(X, (nwindows, n_variables, n_instances))
for i_instance in 1:n_instances
for i_variable in 1:n_variables
new_X[:, i_variable, i_instance] .= [_mean(T, X[idxs, i_variable, i_instance]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
) where {T<:Union{Nothing,Number}}
npoints, n_variables = size(instance)
new_instance = similar(instance, (nwindows, n_variables))
for i_variable in 1:n_variables
new_instance[:, i_variable] .= [_mean(T, instance[idxs, i_variable]) for idxs in movingwindow(npoints; nwindows = nwindows, relative_overlap = relative_overlap)]
end
return new_X
return new_instance
end

# 2D-instance
function moving_average(
X::AbstractArray{T,4},
instance::AbstractArray{T,3},
new_channelsize::Tuple{Integer,Integer},
relative_overlap::AbstractFloat = .5,
) where {T}
n_X, n_Y, n_variables, n_instances = size(X)
windows_1 = movingwindow(n_X; nwindows = new_channelsize[1], relative_overlap = relative_overlap)
) where {T<:Union{Nothing,Number}}
n_instance, n_Y, n_variables = size(instance)
windows_1 = movingwindow(n_instance; nwindows = new_channelsize[1], relative_overlap = relative_overlap)
windows_2 = movingwindow(n_Y; nwindows = new_channelsize[2], relative_overlap = relative_overlap)
new_X = similar(X, (new_channelsize..., n_variables, n_instances))
for i_instance in 1:n_instances
for i_variable in 1:n_variables
new_X[:, :, i_variable, i_instance] .= [_mean(T, X[idxs1, idxs2, i_variable, i_instance]) for idxs1 in windows_1, idxs2 in windows_2]
end
new_instance = similar(instance, (new_channelsize..., n_variables))
for i_variable in 1:n_variables
new_instance[:, :, i_variable] .= [_mean(T, instance[idxs1, idxs2, i_variable]) for idxs1 in windows_1, idxs2 in windows_2]
end
return new_X
return new_instance
end

# Downsize a (possibly non-uniform) dimensional dataset by applying
# `moving_average` to each instance independently; extra positional and
# keyword arguments are forwarded to the per-instance method.
function moving_average(dataset::AbstractDimensionalDataset, args...; kwargs...)
    return [moving_average(inst, args...; kwargs...) for inst in eachinstance(dataset)]
end
2 changes: 1 addition & 1 deletion src/interfaces/MLJ/printer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ function (c::ModelPrinter)(
end
# if haskey(kwargs, :variable_names_map) && kwargs.variable_names_map is not multimodal then fix... variable_names_map
if isnothing(X) && isnothing(y)
MDT.printmodel(io, model; more_kwargs..., kwargs...)
MDT.printmodel(io, model; silent = true, more_kwargs..., kwargs...)
elseif !isnothing(X) && !isnothing(y)
(X, y, var_grouping, classes_seen) = MMI.reformat(c.m, X, y)
MDT.printapply(io, model, X, y; silent = true, more_kwargs..., kwargs...)
Expand Down
30 changes: 17 additions & 13 deletions src/interfaces/MLJ/wrapdataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,20 @@ function wrapdataset(
X = collect(X')
end

X = begin
if X isa AbstractArray
if !(X isa Union{AbstractVector,AbstractMatrix})
@warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " *
"This will be interpreted as a dataset of $(size(X)[end]) instances, " *
"$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])."
# "datasets ($(typeof(X)) encountered)"
end
if X isa AbstractArray # Cube
if !(X isa Union{AbstractVector,AbstractMatrix})
@warn "AbstractArray of $(ndims(X)) dimensions and size $(size(X)) encountered. " *
"This will be interpreted as a dataset of $(size(X)[end]) instances, " *
"$(size(X)[end-1]) variables, and channel size $(size(X)[1:end-2])."
# "datasets ($(typeof(X)) encountered)"
end

X = model.downsize(X)
X = eachslice(X; dims=ndims(X))
end

X = begin
if X isa AbstractDimensionalDataset
X = model.downsize.(eachinstance(X))

if !passive_mode
@info "Precomputing logiset..."
Expand All @@ -56,7 +60,7 @@ function wrapdataset(
print_progress = (ninstances(X) > 500)
)
else
SoleData.cube2dataframe(X)
SoleData.dimensional2dataframe(X)
end
elseif X isa SupportedLogiset
X
Expand Down Expand Up @@ -107,9 +111,9 @@ function wrapdataset(

# Downsize
md = MultiModalDataset([begin
mod, varnames = SoleData.dataframe2cube(mod)
mod = model.downsize(mod)
SoleData.cube2dataframe(mod, varnames)
mod, varnames = SoleData.dataframe2dimensional(mod)
mod = model.downsize.(eachinstance(mod))
SoleData.dimensional2dataframe(mod, varnames)
end for mod in eachmodality(md)])

md, var_grouping
Expand Down
2 changes: 2 additions & 0 deletions src/leaf-metrics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ function get_metrics(
n_tot_inst = nothing,
rel_confidence_class_counts = nothing,
train_or_valid = true,
silent = false,
)
metrics = (;)

Expand Down Expand Up @@ -125,6 +126,7 @@ function get_metrics(
n_tot_inst = nothing,
rel_confidence_class_counts = nothing,
train_or_valid = true,
silent = false,
)
@assert isnothing(rel_confidence_class_counts)

Expand Down
2 changes: 1 addition & 1 deletion test/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,6 @@ cls_node = DTInternal(2, _decision, cls_leaf, cls_leaf)
# Mixed tree
@test_throws AssertionError DTInternal(2, _decision, reg_leaf, cls_leaf)

cls_tree = @test_nowarn DTree(cls_node, [Interval], [ModalDecisionTrees.start_without_world])
cls_tree = @test_nowarn DTree(cls_node, [ModalDecisionTrees.Interval], [ModalDecisionTrees.start_without_world])
cls_forest = @test_nowarn DForest([cls_tree, cls_tree, cls_tree])

2 changes: 1 addition & 1 deletion test/classification/cifar10.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ using StatsBase

Xcube

# img = eachslice(Xcube, dims=4)[1]
# img = eachslice(Xcube; dims=4)[1]
Xcubergb = mapslices(c->RGB(c...), Xcube, dims=3)
Xcubehsv = HSV.(Xcubergb)
# Xcubergb = mapslices(c->(@show c), Xcubehsv, dims=3)
Expand Down
4 changes: 3 additions & 1 deletion test/classification/japanesevowels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ acc = sum(yhat .== y[test_idxs])/length(yhat)

@test_throws BoundsError report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = [["a", "b"]]))
@test_throws BoundsError report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = ["a", "b"]))
# @test_logs (:warn,) report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = 'A':('A'+nvars)))
@test_nowarn report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = 'A':('A'+nvars)))
@test_nowarn report(mach).printmodel(syntaxstring_kwargs = (; variable_names_map = collect('A':('A'+nvars))))

@test_nowarn printmodel(report(mach).model)
@test_nowarn listrules(report(mach).model)
Expand Down Expand Up @@ -118,6 +118,8 @@ yhat = MLJ.predict_mode(mach, rows=test_idxs)
acc = sum(yhat .== y[test_idxs])/length(yhat)
@test MLJ.kappa(yhat, y[test_idxs]) > 0.5



mach = @test_logs (:warn,) machine(t, multilogiset, y)

# Fit
Expand Down
2 changes: 1 addition & 1 deletion test/classification/mnist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ end
_s = collect(size(Xcube))
insert!(_s, length(_s), 1)
Xcube = reshape(Xcube, _s...)
X = SoleData.cube2dataframe(Xcube, ["white"])
X = SoleData.cube2dataframe(Xcube, ["black"])

X_train, y_train = X[p,:], y[p]
X_test, y_test = X[p_test,:], y[p_test]
Expand Down
8 changes: 4 additions & 4 deletions test/classification/satellite.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ _s = size(samplemap)
samplemap[[1,end],:] .= 0
samplemap[:,[1,end]] .= 0

samplemap = cat(moving_average(eachslice(samplemap, dims=1); window_size=2, window_step=1)...; dims=2)'
samplemap = cat(moving_average(eachslice(samplemap, dims=2); window_size=2, window_step=1)...; dims=2)
samplemap = cat(moving_average(eachslice(samplemap; dims=1); window_size=2, window_step=1)...; dims=2)'
samplemap = cat(moving_average(eachslice(samplemap; dims=2); window_size=2, window_step=1)...; dims=2)

samplemap = hcat(eachslice(samplemap, dims=2)..., zeros(size(samplemap, 1)))
samplemap = hcat(eachslice(samplemap, dims=1)..., zeros(size(samplemap, 2)))'
samplemap = hcat(eachslice(samplemap; dims=2)..., zeros(size(samplemap, 1)))
samplemap = hcat(eachslice(samplemap; dims=1)..., zeros(size(samplemap, 2)))'

samplemap = (samplemap .== 1.0)

Expand Down
4 changes: 2 additions & 2 deletions test/multimodal-datasets-multiformulas-construction.jl
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@ _succinct_y = [map(r->SoleModels.apply(r, multilogiset, i_instance), succinct_ru

preds = string.(predict_mode(mach, X_all))

@test_broken redundant_y == succinct_y
@test_broken _redundant_y == _succinct_y
@test redundant_y == succinct_y
@test _redundant_y == _succinct_y
@test eachcol(hcat(_redundant_y...)) == eachrow(hcat(redundant_y...))
@test eachcol(hcat(_succinct_y...)) == eachrow(hcat(succinct_y...))

Expand Down
2 changes: 1 addition & 1 deletion test/regression/digits-regression.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ mach = machine(ModalRandomForest(;
), Xnt, y) |> m->fit!(m, rows = train_idxs)

println(StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test))
@test StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test) > 0.55
@test StatsBase.cor(MLJ.predict_mean(mach, X_testnt), y_test) > 0.30

mach = machine(ModalRandomForest(;
n_subfeatures = 0.6,
Expand Down

0 comments on commit 1c87b4a

Please sign in to comment.