Skip to content

Commit

Permalink
Deprecate RDD API
Browse files (browse the repository at this point in the history)
  • Loading branch information
dfdx committed May 9, 2022
1 parent 5bc6289 commit e752e4e
Show file tree
Hide file tree
Showing 37 changed files with 193 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*.jl.mem
*~
.idea/
.vscode/
target/
project/
*.class
Expand Down
5 changes: 5 additions & 0 deletions jvm/sparkjl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.mdkt.compiler</groupId>
<artifactId>InMemoryJavaCompiler</artifactId>
<version>1.3.0</version>
</dependency>

<!-- additional data formats -->

Expand Down
69 changes: 35 additions & 34 deletions src/Spark.jl
Original file line number Diff line number Diff line change
@@ -1,50 +1,51 @@
module Spark

export
SparkConf,
SparkContext,
RDD,
JuliaRDD,
JavaRDD,
text_file,
parallelize,
map,
map_pair,
map_partitions,
map_partitions_pair,
map_partitions_with_index,
reduce,
filter,
collect,
count,
id,
num_partitions,
close,
@attach,
share_variable,
@share,
flat_map,
flat_map_pair,
cartesian,
group_by_key,
reduce_by_key,
cache,
repartition,
coalesce,
pipe,
# SparkConf,
# SparkContext,
# RDD,
# JuliaRDD,
# JavaRDD,
# text_file,
# parallelize,
# map,
# map_pair,
# map_partitions,
# map_partitions_pair,
# map_partitions_with_index,
# reduce,
# filter,
# collect,
# count,
# id,
# num_partitions,
# close,
# @attach,
# share_variable,
# @share,
# flat_map,
# flat_map_pair,
# cartesian,
# group_by_key,
# reduce_by_key,
# cache,
# repartition,
# coalesce,
# pipe,
# SQL
SparkSession,
Dataset,
sql,
count,
read_json,
write_json,
read_parquet,
write_parquet,
read_df,
write_df



include("core.jl")

end
end
60 changes: 60 additions & 0 deletions src/compiler.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Experimental runtime Java compilation support.
# The constants below record earlier attempts to drive javax.tools directly;
# they are kept commented out, and only the InMemoryJavaCompiler path is active.
# # const JFile = @jimport java.io.File
# const JToolProvider = @jimport javax.tools.ToolProvider
# const JJavaCompiler = @jimport javax.tools.JavaCompiler
# const JInputStream = @jimport java.io.InputStream
# const JOutputStream = @jimport java.io.OutputStream
# const JArray = @jimport java.lang.reflect.Array

# JavaCall proxy for org.mdkt.compiler.InMemoryJavaCompiler (dependency added in
# jvm/sparkjl/pom.xml); used by `mkclass` below to compile Java source strings
# at runtime.
const JInMemoryJavaCompiler = @jimport org.mdkt.compiler.InMemoryJavaCompiler

# const JUDF1 = @jimport org.apache.spark.sql.api.java.UDF1

"""
    mkclass(name::String, src::String)

Compile the Java source `src`, which declares the class `name`, and return
the resulting `JClass`.
"""
function mkclass(name::String, src::String)
    # `newInstance` is InMemoryJavaCompiler's static factory method.
    compiler = jcall(JInMemoryJavaCompiler, "newInstance", JInMemoryJavaCompiler, ())
    compiled = jcall(compiler, "compile", JClass, (JString, JString), name, src)
    return compiled
end


"""
    instantiate(name::String, src::String)

Compile `src` (declaring Java class `name`) and return a fresh instance of
the compiled class as a `JObject`.
"""
instantiate(name::String, src::String) =
    jcall(mkclass(name, src), "newInstance", JObject, ())


# Manual smoke test for the in-memory compiler: compiles a small Java class,
# instantiates it, and exercises the two method-call paths (`jcall` vs the
# reflective `jcall2`). Not wired into any test suite — run by hand.
function main()
    # `init()` presumably boots the JVM/Spark runtime (see src/init.jl) —
    # TODO confirm which init is in scope here.
    init()
    name = "julia.compiled.Dummy"
    # Java source compiled at runtime. `Dummy` both implements the
    # `Function` interface (`apply`) and declares its own `hello` method,
    # so inherited-vs-declared method lookup can be compared.
    src = """
    package julia.compiled;
    import java.util.function.Function;
    public class Dummy implements Function<String, String> {
        @Override
        public String apply(String name) {
            return "Hello, " + name;
        }
        public void hello() {
            System.out.println("Hello!");
        }
    }
    """
    jc = mkclass(name, src)
    jo = jcall(jc, "newInstance", JObject, ())
    # jo = instantiate(name, src)
    # can't call inherited methods like this?
    jcall(jo, "apply", JString, (JString,), "Lee")
    # Reflective lookup via Class.getMethod — see `jcall2` below.
    jcall2(jo, "hello", Nothing, ())
    # Last expression: the list of public methods, returned for inspection.
    jcall(jc, "getMethods", Vector{JMethod}, ())
end


# Reflective alternative to `jcall`: resolves the method on the receiver's
# runtime class via java.lang.Class.getMethod and invokes it through the
# resulting JMethod. Apparently intended to reach methods that plain `jcall`
# cannot (see the "can't call inherited methods" note in `main`) — TODO confirm.
# NOTE(review): `ret_type` is accepted but never used in this body; the result
# type is whatever the JMethod invocation produces. Confirm intent or drop it.
function jcall2(jobj::JavaObject, name::String, ret_type, arg_types, args...)
    jclass = getclass(jobj)
    # Broadcasting over the argument tuple yields a tuple; the comprehension
    # materializes it as a Vector for the varargs invocation below.
    jargs = [a for a in convert.(arg_types, args)] # convert to Vector
    # getMethod(name, parameterTypes) — parameter classes derived from the
    # converted arguments, not from `arg_types` directly.
    meth = jcall(jclass, "getMethod", JMethod, (JString, Vector{JClass}), name, getclass.(jargs))
    return meth(jobj, jargs...)
end
37 changes: 8 additions & 29 deletions src/core.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@

using JavaCall
import Base: map, reduce, count, collect, close
import Base.Iterators

# config
const JSparkConf = @jimport org.apache.spark.SparkConf
# context
const JSparkContext = @jimport org.apache.spark.SparkContext
const JJavaSparkContext = @jimport org.apache.spark.api.java.JavaSparkContext
# SQL
const JSparkSession = @jimport org.apache.spark.sql.SparkSession
const JStructType = @jimport org.apache.spark.sql.types.StructType
Expand All @@ -21,27 +13,14 @@ const JRowFactory = @jimport org.apache.spark.sql.RowFactory
const JRow = @jimport org.apache.spark.sql.Row
const JColumn = @jimport org.apache.spark.sql.Column
const JSQLFunctions = @jimport org.apache.spark.sql.functions
# RDD
const JRDD = @jimport org.apache.spark.rdd.RDD
const JJavaRDD = @jimport org.apache.spark.api.java.JavaRDD
const JJavaPairRDD = @jimport org.apache.spark.api.java.JavaPairRDD
const JJuliaRDD = @jimport org.apache.spark.api.julia.JuliaRDD
const JJuliaPairRDD = @jimport org.apache.spark.api.julia.JuliaPairRDD
# utils
const JRDDUtils = @jimport org.apache.spark.api.julia.RDDUtils
# Java utils
const JIterator = @jimport java.util.Iterator
const JList = @jimport java.util.List
const JMap = @jimport java.util.Map
const JArrayList = @jimport java.util.ArrayList
const JHashMap = @jimport java.util.HashMap
const JSystem = @jimport java.lang.System


include("dotcaller.jl")
include("init.jl")
include("serialization.jl")
include("config.jl")
include("context.jl")
include("compiler.jl")
include("sql.jl")
include("rdd.jl")
include("attach.jl")
include("worker.jl")

# mostly unsupported RDD interface
include("rdd/core.jl")


23 changes: 23 additions & 0 deletions src/dotcaller.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Binds a receiver object together with a function so that calling the
# result forwards the receiver as the first argument:
# `DotCaller(x, f)(args...) == f(x, args...)`.
# This is the mechanism behind `@dot_call`'s `x.f(args...)` syntax.
struct DotCaller{O, Fn}
    obj::O   # receiver, prepended to every call
    fn::Fn   # function being forwarded to
end

# Explicit outer constructor (same behavior as the implicit one).
function DotCaller(obj, fn)
    return DotCaller{typeof(obj), typeof(fn)}(obj, fn)
end

# Make DotCaller callable: invoke the bound function with the bound
# receiver followed by any additional arguments.
function (caller::DotCaller)(args...)
    return caller.fn(caller.obj, args...)
end



# Install a `Base.getproperty` overload for type `T` so that `x.foo(args...)`
# dispatches to the module-level function `foo(x, args...)` through a
# `DotCaller`; any other property falls back to plain field access.
# The `@__MODULE__` inside the quote resolves where the macro is *used*.
macro dot_call(T)
    return quote
        function Base.getproperty(obj::$T, prop::Symbol)
            # Module names take priority over struct fields: a field whose
            # name matches a module-level name becomes unreachable via dot
            # syntax. NOTE(review): confirm this shadowing is intended.
            # NOTE(review): `names(...)` lists only exported names by
            # default, so unexported helpers are not callable this way.
            if prop in names(@__MODULE__)
                fn = getfield(@__MODULE__, prop)
                return DotCaller(obj, fn)
            else
                # Ordinary field access (getfield bypasses getproperty).
                return getfield(obj, prop)
            end
        end
    end
end
File renamed without changes.
File renamed without changes.
File renamed without changes.
33 changes: 33 additions & 0 deletions src/rdd/core.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@

# Legacy RDD interface, split out of src/core.jl as part of the RDD API
# deprecation; core.jl includes this file as "mostly unsupported".
import Base: map, reduce, count, collect, close
import Base.Iterators

# config
const JSparkConf = @jimport org.apache.spark.SparkConf
# context
const JSparkContext = @jimport org.apache.spark.SparkContext
const JJavaSparkContext = @jimport org.apache.spark.api.java.JavaSparkContext

# RDD
const JRDD = @jimport org.apache.spark.rdd.RDD
const JJavaRDD = @jimport org.apache.spark.api.java.JavaRDD
const JJavaPairRDD = @jimport org.apache.spark.api.java.JavaPairRDD
const JJuliaRDD = @jimport org.apache.spark.api.julia.JuliaRDD
const JJuliaPairRDD = @jimport org.apache.spark.api.julia.JuliaPairRDD
# utils
const JRDDUtils = @jimport org.apache.spark.api.julia.RDDUtils
# Java utils
const JIterator = @jimport java.util.Iterator
const JList = @jimport java.util.List
const JMap = @jimport java.util.Map
const JArrayList = @jimport java.util.ArrayList
const JHashMap = @jimport java.util.HashMap
const JSystem = @jimport java.lang.System

# Implementation files for the legacy RDD API (moved into src/rdd/).
include("serialization.jl")
include("config.jl")
include("context.jl")
include("rdd.jl")
include("attach.jl")
include("worker.jl")
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 9 additions & 2 deletions src/sql.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ struct SparkSession
appname::AbstractString
end

@dot_call SparkSession

function SparkSession(;master="local",
appname="Julia App on Spark",
config=Dict{String, String}())
Expand All @@ -28,6 +30,7 @@ function SparkSession(;master="local",
return sess
end


Base.show(io::IO, sess::SparkSession) = print(io, "SparkSession($(sess.master),$(sess.appname))")
Base.close(sess::SparkSession) = jcall(sess.jsess, "close", Nothing, ())

Expand All @@ -44,6 +47,7 @@ struct Dataset
jdf::JDataset
end

@dot_call Dataset

struct DatasetIterator{T}
itr::JavaObject{Symbol("java.util.Iterator")}
Expand All @@ -63,7 +67,7 @@ type_map = Dict(
"ObjectType" => JObject
)

function mapped_type(x::String)
function mapped_type(x::String)
if x in keys(type_map)
return type_map[x]
end
Expand All @@ -79,7 +83,7 @@ function TableTraits.getiterator(ds::Dataset)
mtypes = mapped_type.(unsafe_string.(map(x -> convert(JString, jcall(x, "_2", JObject, ())), jtypes)))

T = NamedTuple{Tuple(mnames),Tuple{mtypes...}}

jit = jcall(ds.jdf, "toLocalIterator", JavaObject{Symbol("java.util.Iterator")}, ())

l = count(ds)
Expand Down Expand Up @@ -205,6 +209,8 @@ struct Row
jrow::JRow
end

@dot_call Row

Row(objs...) = Row(jcall(JRowFactory, "create", JRow, (Vector{JObject},), [objs...]))


Expand Down Expand Up @@ -313,6 +319,7 @@ struct RelationalGroupedDataset
jrgd::JRelationalGroupedDataset
end

@dot_call RelationalGroupedDataset

function group_by(ds::Dataset, col_names...)
@assert length(col_names) > 0 "group_by requires at least one column name"
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
17 changes: 17 additions & 0 deletions test/rdd/test_rdd.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
include("basic.jl")
include("map.jl")
include("map_partitions.jl")
include("attach.jl")
include("reduce.jl")
include("text_file.jl")
include("share_variable.jl")
include("flat_map.jl")
include("cartesian.jl")
include("group_by_key.jl")
include("reduce_by_key.jl")
include("collect_pair.jl")
include("map_pair.jl")
include("julian_versions.jl")
include("repartition_coalesce.jl")
include("filter.jl")
include("pipe.jl")
File renamed without changes.
19 changes: 2 additions & 17 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,8 @@ Spark.init()

@testset "Spark" begin

include("basic.jl")
include("map.jl")
include("map_partitions.jl")
include("attach.jl")
include("reduce.jl")
include("text_file.jl")
include("share_variable.jl")
include("flat_map.jl")
include("cartesian.jl")
include("group_by_key.jl")
include("reduce_by_key.jl")
include("collect_pair.jl")
include("map_pair.jl")
include("julian_versions.jl")
include("repartition_coalesce.jl")
include("filter.jl")
include("pipe.jl")
include("sql.jl")

# include("rdd/test_rdd.jl")

end

0 comments on commit e752e4e

Please sign in to comment.