From 93a95817560c8ae4c9f904ecbfe842eab4d8baec Mon Sep 17 00:00:00 2001 From: Jian Li Date: Wed, 8 Jan 2025 01:09:03 +0000 Subject: [PATCH 1/2] Non-atomic for UnsortedSegmentCustomKernel --- tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h index a9bf175f205f4e..8e2efd84549000 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h @@ -902,7 +902,7 @@ struct UnsortedSegmentFunctor { config = GetGpuLaunchConfig(data_size, d); TF_CHECK_OK(GpuLaunchKernel( UnsortedSegmentCustomKernel< - T, Index, typename ReduceUpdateOpFor<ReductionF>::atomic_op>, + T, Index, typename ReduceUpdateOpFor<ReductionF>::nonatomic_op>, config.block_count, config.thread_per_block, 0, d.stream(), input_outer_dim_size, input_inner_dim_size, output_outer_dim_size, unsorted_segment_ids.data(), data.data(), output.data())); From a2c4ba4bb13ad77769f7b8aa67c3a97c55ac11ed Mon Sep 17 00:00:00 2001 From: Jian Li Date: Wed, 8 Jan 2025 01:11:30 +0000 Subject: [PATCH 2/2] Abandon cluster with small size regardless of _XlaCompile attr --- tensorflow/compiler/jit/mark_for_compilation_pass.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 96cf169b914285..37a2aa72d9feb8 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -990,8 +990,7 @@ Status MarkForCompilationPassImpl::CreateClusters() { // trouble. 
if (cluster->effective_cluster_size() >= debug_options_.min_cluster_size || - cluster->has_functional_control_flow() || - cluster->is_xla_compile_attr_true()) { + cluster->has_functional_control_flow()) { string& name = cluster_names[cluster->cycles_graph_node_id()]; if (name.empty()) {