Skip to content

Commit

Permalink
Split out python runtime from proglang select ray runtime.
Browse files Browse the repository at this point in the history
Signed-off-by: David Wood <[email protected]>
  • Loading branch information
daw3rd committed Jun 17, 2024
1 parent cca39c4 commit 67e9990
Show file tree
Hide file tree
Showing 24 changed files with 650 additions and 227 deletions.
1 change: 1 addition & 0 deletions transforms/code/proglang_select/python/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Keep the local Python virtual environment directory out of the docker build context.
venv/
37 changes: 37 additions & 0 deletions transforms/code/proglang_select/python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Transform run outputs
test-data/output
output/*
/output/

# Library sources staged into this directory for image builds.
# The Dockerfile copies data-processing-lib-python/ from the build context,
# so ignore that staged copy as well as the older directory name.
data-processing-lib/
data-processing-lib-python/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class


# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
htmlcov
.coverage
.cache
nosetests.xml
coverage.xml
40 changes: 40 additions & 0 deletions transforms/code/proglang_select/python/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Image used to run and test the pure-python proglang_select transform.
FROM docker.io/python:3.10.14-slim-bullseye

# --no-cache-dir keeps pip's download cache out of the image layer,
# matching the other pip invocations in this file.
RUN pip install --no-cache-dir --upgrade pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .


# copy source data
# --chown is applied to every COPY so that all files under /home/dpk belong to
# the dpk user the container runs as, consistent with the COPY lines above.
COPY --chown=dpk:root ./src/proglang_select_transform_python.py .
COPY --chown=dpk:root ./src/proglang_select_local.py local/

# copy test
COPY --chown=dpk:root test/ test/
COPY --chown=dpk:root test-data/ test-data/

# Set environment
# Use the key=value form; the space-separated "ENV key value" form is legacy syntax.
ENV PYTHONPATH=/home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
54 changes: 54 additions & 0 deletions transforms/code/proglang_select/python/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Define the root of the local git clone for the common rules to be able
# know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse.  However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=proglang_select
# $(REPOROOT)/.make.versions file contains the versions
# NOTE(review): this is the python runtime's Makefile but it reads the
# *_RAY_VERSION variable - confirm whether a python-specific version
# variable is intended here.
DOCKER_IMAGE_VERSION=${PROGLANG_SELECT_RAY_VERSION}

# The single-colon targets defined in this file are commands, not files;
# declare them phony so a same-named file can never mask them.
.PHONY: set-versions run-cli-sample run-local-sample run-local-python-sample

# The double-colon targets below only attach prerequisites; the recipes are
# expected to come from the included common rules.
venv::	.transforms.python-venv

test::	.transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

test-image:: .transforms.python-test-image

build:: build-dist image

publish:: publish-dist publish-image

publish-image:: .transforms.publish-image-python

# distribution versions is the same as image version.
# Re-invokes make so TOML_VERSION is passed to the .defaults.update-toml target.
set-versions:
	$(MAKE) TOML_VERSION=$(DOCKER_IMAGE_VERSION) .defaults.update-toml

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist


# Runs the transform's command line entry point against the test data by
# handing RUN_FILE and RUN_ARGS to the .transforms.run-src-file target.
run-cli-sample: #.transforms.run-cli-python-sample
	$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \
	    RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
	    --proglang_select_language_column language \
	    --proglang_select_output_column lang_selected \
	    --proglang_select_allowed_langs_file ../test-data/languages/allowed-code-languages.txt " \
	    .transforms.run-src-file

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

load-image:: .transforms.load-image
74 changes: 74 additions & 0 deletions transforms/code/proglang_select/python/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Programming Language Select

Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary

This is a transform which can be used while preprocessing code data. It allows the
user to specify the programming languages for which the data should be identified as matching
a defined set of programming languages.
It adds a new annotation column which can specify boolean True/False based on whether the rows belong to the
specified programming languages. The rows which belong to the programming languages which are
not matched are annotated as False.

It requires a text file specifying the allowed languages. It is specified by the
command line param `proglang_select_allowed_langs_file`.
A sample file is included at `test-data/languages/allowed-code-languages.txt`.
The column specifying programming languages is to be specified by the
command line param `proglang_select_language_column`.

## Configuration and command line Options

The set of dictionary keys holding configuration values is as follows:

* _proglang_select_allowed_langs_file_ - specifies the location of the list of supported languages
* _proglang_select_language_column_ - specifies the name of the column containing the language
* _proglang_select_output_column_ - specifies the name of the annotation column appended to the parquet.
* _proglang_select_return_known_ - specifies whether to return supported or unsupported languages

## Running

### Launched Command Line Options
The following command line arguments are available in addition to
the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).

```
--proglang_select_allowed_langs_file PROGLANG_MATCH_ALLOWED_LANGS_FILE
Path to file containing the list of languages to be matched.
--proglang_select_language_column PROGLANG_MATCH_LANGUAGE_COLUMN
The column name holding the name of the programming language assigned to the document
--proglang_select_output_column PROGLANG_MATCH_OUTPUT_COLUMN
The column name to add and that contains the matching information
--proglang_select_s3_cred PROGLANG_MATCH_S3_CRED
AST string of options for s3 credentials. Only required for S3 data access.
access_key: access key help text
secret_key: secret key help text
url: optional s3 url
region: optional s3 region
```


### Running the samples
To run the samples, use the following `make` targets

* `run-cli-sample` - runs src/proglang_select_transform_python.py using command line args
* `run-local-sample` - runs src/proglang_select_local.py
* `run-local-python-sample` - runs src/proglang_select_local_python.py

These targets will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.
44 changes: 44 additions & 0 deletions transforms/code/proglang_select/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Packaging metadata for the pure-python proglang_select transform.
[project]
name = "dpk_proglang_select_transform_python"
# NOTE(review): the Makefile's set-versions target passes TOML_VERSION to
# .defaults.update-toml, which presumably rewrites this value - confirm before editing by hand.
version = "0.4.0.dev6"
requires-python = ">=3.10"
description = "Programming Language Selection Python Transform"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Shivdeep Singh", email = "[email protected]" },
]
# Runtime dependency, pinned to an exact version.
dependencies = [
"data-prep-toolkit==0.2.0.dev6",
]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

# Packages installed only when the "dev" extra is requested.
[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]

# NOTE(review): [options] and [options.packages.find] are setup.cfg section names;
# setuptools takes its pyproject.toml configuration from [tool.setuptools...] tables.
# Confirm whether these two tables have any effect before relying on them.
[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

# Limit coverage measurement to files under src/.
[tool.coverage.run]
include = ["src/*"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os

from data_processing.data_access import DataAccessFactory, DataAccessLocal
from proglang_select_transform_ray import (
from proglang_select_transform import (
ProgLangSelectTransform,
lang_allowed_langs_file_key,
lang_data_factory_key,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from proglang_select_transform_ray import (
ProgLangSelectRuntime,
from proglang_select_transform import (
ProgLangSelectTransformConfiguration,
lang_allowed_langs_file_key,
lang_lang_column_key,
Expand Down
Loading

0 comments on commit 67e9990

Please sign in to comment.