Merge pull request #3 from PharmCat/dev_sm0.7

StatsModels 0.7
PharmCat · Mar 14, 2023 · 31de92f · 31de92f · PharmCat · Mar 14, 2023
2 parents 7fcb7d7 + 3f0820b
commit 31de92f
Show file tree

Hide file tree

Showing 10 changed files with 525 additions and 469 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,34 +1,35 @@
 name = "MetidaBase"
 uuid = "075456b7-4006-432f-9324-2f8453996c49"
 authors = ["PharmCat <[email protected]> and contributors"]
-version = "0.10.2"
+version = "0.11.0"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
+PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
-CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
+
 
 [compat]
-CategoricalArrays = "0.8, 0.9, 0.10"
-DataFrames = "1"
-Distributions = "0.20, 0.21, 0.22, 0.23, 0.24, 0.25"
+CategoricalArrays = "0.9, 0.10"
+CPUSummary = "0.1"
+PrettyTables = "2"
+Requires = "1"
+SnoopPrecompile = "1"
 StatsBase = "0.29, 0.30, 0.31, 0.32, 0.33"
-StatsModels = "0.6"
+StatsModels = "0.6, 0.7"
 Tables = "1"
-PrettyTables = "2"
-TypedTables = "1, 2"
-CPUSummary = "0.1"
 julia = "1"
 
 [extras]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 
 [targets]
-test = ["Test", "CSV"]
+test = ["DataFrames", "Test", "TypedTables", "CSV"]
diff --git a/src/MetidaBase.jl b/src/MetidaBase.jl
@@ -1,22 +1,27 @@
 # Metida
 # Copyright © 2019-2020 Vladimir Arnautov aka PharmCat <[email protected]>
-
-__precompile__(true)
 module MetidaBase
 
-    using Tables, PrettyTables, StatsBase, StatsModels, Distributions, CategoricalArrays#, Reexport
+    using Tables, PrettyTables, StatsModels, CategoricalArrays, Requires#, Reexport
 
-    #@reexport using StatsModels
-    import DataFrames, TypedTables
+    import StatsBase
     import StatsModels: StatisticalModel, RegressionModel
     import Tables: istable, columnaccess, columns, getcolumn, columnnames, schema, rowaccess, rows
     import CPUSummary: num_cores
 
     import Base: getindex, length, ht_keyindex, show, pushfirst!, iterate, size, findfirst
 
     include("abstracttype.jl")
+    include("m_tables.jl")
+    include("dataset.jl")
     include("types.jl")
     include("utils.jl")
     include("iterators.jl")
+    include("precompile.jl")
+
+    function __init__()
+        @require DataFrames="a93c6f00-e57d-5684-b7b6-d8193f3e46c0" include("dataframes.jl")
+        @require TypedTables="9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" include("typedtables.jl")
+    end
 
 end
diff --git a/src/dataframes.jl b/src/dataframes.jl
@@ -0,0 +1,8 @@
+# DataFrames.jl interface
+function DataFrames.DataFrame(obj::AbstractDataSet; kwargs...)
+    DataFrames.DataFrame(metida_table_(obj; kwargs...))
+end
+
+function DataFrames.DataFrame(obj::MetidaTable)
+    DataFrames.DataFrame(obj.table)
+end
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -0,0 +1,229 @@
+################################################################################
+# DATASET
+################################################################################
+struct DataSet{T <: AbstractData} <: AbstractDataSet{AbstractData}
+    ds::Vector{T}
+end
+
+function getdata(d::DataSet)
+    d.ds
+end
+
+@inline function getindormiss(d::Dict{K}, i::K) where K
+    ind::Int = ht_keyindex(d, i)
+    if ind > 0 return d.vals[ind]  end
+    missing
+end
+
+Tables.istable(::AbstractDataSet) = false
+
+Tables.rowaccess(::AbstractDataSet) = false
+################################################################################
+# BASE
+################################################################################
+
+function Base.getindex(d::DataSet, ind::Int)
+    d.ds[ind]
+end
+
+Base.getindex(d::DataSet, inds::UnitRange{Int64}) = subset(d, inds)
+
+
+@inline function getresultindex_safe(rd::T, ind::Symbol) where T <: AbstractResultData
+    getindormiss(rd.result, ind)
+end
+@inline function getresultindex_unsafe(rd::T, ind::Symbol) where T <: AbstractResultData
+    rd.result[ind]
+end
+
+function Base.getindex(d::DataSet{T}, col::Int, ind) where T <: AbstractResultData
+    getresultindex_safe(d[col], ind)
+end
+function Base.getindex(d::DataSet{T}, col::Colon, ind) where T <: AbstractResultData
+    @inbounds for i in Base.OneTo(length(d))
+        if Base.ht_keyindex(d.ds[i].result, ind) < 1 return getresultindex_safe.(d.ds, ind) end
+    end
+    getresultindex_unsafe.(d.ds, ind)
+end
+
+Base.first(d::DataSet) = first(getdata(d))
+
+function Base.length(d::DataSet)
+    length(getdata(d))
+end
+
+function Base.iterate(d::DataSet)
+    return Base.iterate(getdata(d))
+end
+
+function Base.iterate(d::DataSet, i::Int)
+    return Base.iterate(getdata(d), i)
+end
+
+function Base.map(f, d::DataSet)
+    DataSet(map(f, getdata(d)))
+end
+
+################################################################################
+# BASE
+################################################################################
+# sort!
+################################################################################
+function islessdict(a::Dict{A1,A2}, b::Dict{B1,B2}, k::Union{AbstractVector, Set}) where A1 where A2 where B1 where B2
+    l = length(k)
+    av = Vector{Union{Missing, A2}}(undef, l)
+    bv = Vector{Union{Missing, B2}}(undef, l)
+    @inbounds for i = 1:l
+        av[i] = getindormiss(a, k[i])
+        bv[i] = getindormiss(b, k[i])
+    end
+    isless(av, bv)
+end
+function islessdict(a::Dict, b::Dict, k)
+    isless(getindormiss(a, k), getindormiss(b, k))
+end
+function Base.sort!(d::DataSet{T}, k; alg::Base.Algorithm = QuickSort, lt=nothing, by=nothing, rev::Bool=false, order::Base.Ordering = Base.Forward) where T <: Union{AbstractIdData, AbstractIDResult}
+    if isnothing(by) by = x -> getid(x) end
+    if isnothing(lt) lt = (x, y) -> islessdict(x, y, k) end
+    sort!(d.ds;  alg = alg, lt = lt, by = by, rev = rev, order = order)
+    d
+end
+
+################################################################################
+# filter
+# filter!
+################################################################################
+function Base.filter(f::Function, d::DataSet)
+    ds   =  getdata(d)
+    inds  = findall(f, ds)
+    DataSet(ds[inds])
+end
+function Base.filter!(f::Function, d::DataSet)
+    filter!(f, getdata(d))
+    d
+end
+
+################################################################################
+# Base.findfirst
+################################################################################
+
+function Base.findfirst(d::DataSet{<: AbstractIdData}, sort::Dict)
+    findfirst(x-> sort ⊆ getid(x), getdata(d))
+end
+
+################################################################################
+# SELF
+################################################################################
+
+getid_safe(idd::AbstractIdData, ind) = getindormiss(idd.id, ind)
+
+getid_unsafe(idd::AbstractIdData, ind) = idd.id[ind]
+
+getid_safe(asr::AbstractIDResult, ind) = getindormiss(asr.data.id, ind)
+
+getid_unsafe(asr::AbstractIDResult, ind) = asr.data.id[ind]
+
+getid(idd::AbstractIdData, ind) = getid_safe(idd, ind)
+
+getid(asr::AbstractIDResult, ind) = getid_safe(asr, ind)
+
+getid(idd::AbstractIdData) = idd.id
+
+getid(asr::AbstractIDResult) = asr.data.id
+
+function getid(d::DataSet{T}, col::Int, ind) where T <: Union{AbstractIdData, AbstractIDResult}
+    getid(d[col], ind)
+end
+function getid(d::DataSet{T}, col::Colon, ind) where T <: AbstractIdData
+    @inbounds for i in Base.OneTo(length(d))
+        if Base.ht_keyindex(d.ds[i].id, ind) < 1 return getid_safe.(d.ds, ind) end
+    end
+    getid_unsafe.(d.ds, ind)
+end
+function getid(d::DataSet{T}, col::Colon, ind) where T <: AbstractIDResult
+    @inbounds for i in Base.OneTo(length(d))
+        if Base.ht_keyindex(d.ds[i].data.id, ind) < 1 return getid_safe.(d.ds, ind) end
+    end
+    getid_unsafe.(d.ds, ind)
+end
+
+
+function uniqueidlist(d::DataSet{T}, list::AbstractVector{Symbol}) where T <: AbstractIdData
+    dl = Vector{Dict}(undef, 0)
+    for i in d
+        if list ⊆ keys(getid(i))
+            subd = Dict(k => getid(i)[k] for k in list)
+            if subd ∉ dl push!(dl, subd) end
+        end
+    end
+    dl
+end
+
+function uniqueidlist(d::DataSet{T}, list::Symbol) where T <: AbstractIdData
+    dl = Vector{Dict}(undef, 0)
+    for i in d
+        if list in keys(getid(i))
+            subd = Dict(list => getid(i)[list])
+            if subd ∉ dl push!(dl, subd) end
+        end
+    end
+    dl
+end
+#=
+function uniqueidlist(d::DataSet{T}) where T <: AbstractIdData
+    dl = Vector{Dict}(undef, 0)
+    for i in d
+        id = getid(i)
+        if id ∉ dl push!(dl, id) end
+    end
+    dl
+end
+=#
+function uniqueidlist(::DataSet{T}, ::Nothing) where T <: AbstractIdData
+    nothing
+end
+
+
+function subset(d::DataSet{T}, sort::Dict) where T <: AbstractIdData
+    inds = findall(x-> sort ⊆ getid(x), getdata(d))
+    if length(inds) > 0 return DataSet(getdata(d)[inds]) end
+    DataSet(Vector{T}(undef, 0))
+end
+function subset(d::DataSet{T}, sort::Dict) where T <: AbstractIDResult
+    inds = findall(x-> sort ⊆ getid(x), getdata(d))
+    if length(inds) > 0 return DataSet(getdata(d)[inds]) end
+    DataSet(Vector{T}(undef, 0))
+end
+function subset(d::DataSet, inds)
+    DataSet(getdata(d)[inds])
+end
+################################################################################
+# metida_table from DataSet{AbstractIDResult}
+################################################################################
+function metida_table_(obj::DataSet{RD}; order = nothing, results = nothing, ids = nothing) where RD <: AbstractIDResult
+    idset  = Set(keys(first(obj).data.id))
+    resset = Set(keys(first(obj).result))
+    if length(obj) > 1
+        for i = 2:length(obj)
+            union!(idset,  Set(keys(obj[i].data.id)))
+            union!(resset, Set(keys(obj[i].result)))
+        end
+    end
+    if !isnothing(results)
+        if isa(results, Symbol) results = [results] end
+        if isa(results, String) results = [Symbol(results)] end
+        ressetl = isnothing(order) ? collect(intersect(resset, results)) : sortbyvec!(collect(intersect(resset, results)), order)
+    else
+        ressetl = isnothing(order) ? collect(resset) : sortbyvec!(collect(resset), order)
+    end
+    if !isnothing(ids)
+        if isa(ids, Symbol) ids = [ids] end
+        if isa(ids, String) ids = [Symbol(ids)] end
+        ids ⊆ idset || error("Some id not in dataset!")
+        idset = intersect(idset, ids)
+    end
+    mt1 = metida_table_((getid(obj, :, c) for c in idset)...; names = idset)
+    mt2 = metida_table_((obj[:, c] for c in ressetl)...; names = ressetl)
+    merge(mt1, mt2)
+end
+################################################################################