-
Notifications
You must be signed in to change notification settings - Fork 172
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split out python runtime from proglang select ray runtime.
Signed-off-by: David Wood <[email protected]>
- Loading branch information
Showing
24 changed files
with
650 additions
and
227 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
venv/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
test-data/output | ||
output/* | ||
/output/ | ||
data-processing-lib/ | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
|
||
# Distribution / packaging | ||
bin/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
.tox/ | ||
htmlcov | ||
.coverage | ||
.cache | ||
nosetests.xml | ||
coverage.xml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Image for the python runtime of the proglang_select transform. | ||
# Build context is expected to contain data-processing-lib-python/, src/, test/, test-data/ and pyproject.toml. | ||
FROM docker.io/python:3.10.14-slim-bullseye | ||
|
||
RUN pip install --upgrade pip | ||
|
||
# install pytest | ||
RUN pip install --no-cache-dir pytest | ||
|
||
# Create a user and use it to run the transform | ||
RUN useradd -ms /bin/bash dpk | ||
USER dpk | ||
WORKDIR /home/dpk | ||
|
||
# Copy and install data processing libraries | ||
# These are expected to be placed in the docker context before this is run (see the make image). | ||
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ | ||
RUN cd data-processing-lib-python && pip install --no-cache-dir -e . | ||
|
||
# END OF STEPS destined for a data-prep-kit base image | ||
|
||
# Install the transform itself in editable mode from its sources. | ||
COPY --chown=dpk:root src/ src/ | ||
COPY --chown=dpk:root pyproject.toml pyproject.toml | ||
RUN pip install --no-cache-dir -e . | ||
|
||
|
||
# copy source data | ||
# --chown keeps these files owned by the dpk user, consistent with the COPY steps above | ||
# (without it COPY creates them owned by root even though USER is dpk). | ||
COPY --chown=dpk:root ./src/proglang_select_transform_python.py . | ||
COPY --chown=dpk:root ./src/proglang_select_local.py local/ | ||
|
||
# copy test | ||
COPY --chown=dpk:root test/ test/ | ||
COPY --chown=dpk:root test-data/ test-data/ | ||
|
||
# Set environment | ||
# Use the key=value form; the space-separated ENV form is a legacy syntax. | ||
ENV PYTHONPATH=/home/dpk | ||
|
||
# Put these at the end since they seem to upset the docker cache. | ||
ARG BUILD_DATE | ||
ARG GIT_COMMIT | ||
LABEL build-date=$BUILD_DATE | ||
LABEL git-commit=$GIT_COMMIT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Define the root of the local git clone so that the common rules are able | ||
# to know where they are running from. | ||
REPOROOT=../../../.. | ||
# Include a library of common .transform.* targets which most | ||
# transforms should be able to reuse. However, feel free | ||
# to override/redefine the rules below. | ||
include $(REPOROOT)/transforms/.make.transforms | ||
|
||
# Base name of this transform; run-cli-sample below derives the script name from it. | ||
TRANSFORM_NAME=proglang_select | ||
# $(REPOROOT)/.make.versions file contains the versions | ||
# NOTE(review): the targets in this file use the python-flavoured shared rules | ||
# (.transforms.python-*), yet the version is read from the *_RAY_VERSION variable. | ||
# Confirm whether a python-specific version variable should be used here instead. | ||
DOCKER_IMAGE_VERSION=${PROGLANG_SELECT_RAY_VERSION} | ||
|
||
# The targets below are double-colon rules without recipes of their own: each one only | ||
# names a shared rule as its prerequisite. The .transforms.* rules are expected to come | ||
# from the .make.transforms file included above (not visible here -- confirm). | ||
venv:: .transforms.python-venv | ||
|
||
test:: .transforms.python-test | ||
|
||
clean:: .transforms.clean | ||
|
||
image:: .transforms.python-image | ||
|
||
test-src:: .transforms.test-src | ||
|
||
setup:: .transforms.setup | ||
|
||
test-image:: .transforms.python-test-image | ||
|
||
# build = build the distribution (build-dist) and then the image. | ||
build:: build-dist image | ||
|
||
# publish = publish the distribution (publish-dist) and the image (publish-image). | ||
publish:: publish-dist publish-image | ||
|
||
publish-image:: .transforms.publish-image-python | ||
|
||
# The distribution version is the same as the image version. | ||
# set-versions re-invokes make on the .defaults.update-toml target with TOML_VERSION | ||
# set to DOCKER_IMAGE_VERSION (presumably this rewrites the version in pyproject.toml -- confirm). | ||
set-versions: | ||
$(MAKE) TOML_VERSION=$(DOCKER_IMAGE_VERSION) .defaults.update-toml | ||
|
||
# build-dist lists set-versions ahead of the shared .defaults.build-dist rule. | ||
build-dist:: set-versions .defaults.build-dist | ||
|
||
publish-dist:: .defaults.publish-dist | ||
|
||
|
||
# Runs $(TRANSFORM_NAME)_transform_python.py by re-invoking make on the shared | ||
# .transforms.run-src-file target with RUN_FILE and RUN_ARGS set on the command line. | ||
# RUN_ARGS points the transform at ../test-data/input, writes to ../output and passes the | ||
# language column, output column and allowed-languages file options. | ||
# Everything after '#' on the rule line is a comment, so this target has no prerequisites. | ||
run-cli-sample: #.transforms.run-cli-python-sample | ||
$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ | ||
RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ | ||
--proglang_select_language_column language \ | ||
--proglang_select_output_column lang_selected \ | ||
--proglang_select_allowed_langs_file ../test-data/languages/allowed-code-languages.txt " \ | ||
.transforms.run-src-file | ||
|
||
# Sample runners and image loading: each target only names the shared rule of the | ||
# matching name as its prerequisite (rules expected to come from .make.transforms -- confirm). | ||
run-local-sample: .transforms.run-local-sample | ||
|
||
run-local-python-sample: .transforms.run-local-python-sample | ||
|
||
load-image:: .transforms.load-image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Programming Language Select | ||
|
||
Please see the set of | ||
[transform project conventions](../../../README.md) | ||
for details on general project conventions, transform configuration, | ||
testing and IDE set up. | ||
|
||
## Summary | ||
|
||
This is a transform which can be used while preprocessing code data. It allows the | ||
user to specify a set of programming languages and identifies which rows of the data match | ||
that set. | ||
It adds a new annotation column holding a boolean True/False based on whether each row belongs to one of the | ||
specified programming languages. Rows whose programming language is | ||
not matched are annotated as False. | ||
|
||
It requires a text file specifying the allowed languages. It is specified by the | ||
command line parameter `proglang_select_allowed_langs_file`. | ||
A sample file is included at `test-data/languages/allowed-code-languages.txt`. | ||
The column specifying programming languages is to be specified by the | ||
command line parameter `proglang_select_language_column`. | ||
|
||
## Configuration and command line Options | ||
|
||
The set of dictionary keys holding configuration values is as follows: | ||
|
||
* _proglang_select_allowed_langs_file_ - specifies the location of the list of supported languages | ||
* _proglang_select_language_column_ - specifies the name of the column containing the language | ||
* _proglang_select_output_column_ - specifies the name of the annotation column appended to the parquet. | ||
* _proglang_select_return_known_ - specifies whether to return supported or unsupported languages | ||
|
||
## Running | ||
|
||
### Launched Command Line Options | ||
The following command line arguments are available in addition to | ||
the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). | ||
|
||
``` | ||
--proglang_select_allowed_langs_file PROGLANG_MATCH_ALLOWED_LANGS_FILE | ||
Path to file containing the list of languages to be matched. | ||
--proglang_select_language_column PROGLANG_MATCH_LANGUAGE_COLUMN | ||
The column name holding the name of the programming language assigned to the document | ||
--proglang_select_output_column PROGLANG_MATCH_OUTPUT_COLUMN | ||
The column name to add and that contains the matching information | ||
--proglang_select_s3_cred PROGLANG_MATCH_S3_CRED | ||
AST string of options for s3 credentials. Only required for S3 data access. | ||
access_key: access key help text | ||
secret_key: secret key help text | ||
url: optional s3 url | ||
region: optional s3 region | ||
``` | ||
|
||
|
||
### Running the samples | ||
To run the samples, use the following `make` targets | ||
|
||
* `run-cli-sample` - runs src/proglang_select_transform_python.py using command line args | ||
* `run-local-sample` - runs src/proglang_select_local.py | ||
* `run-local-python-sample` - runs src/proglang_select_local_python.py | ||
|
||
These targets will activate the virtual environment and set up any configuration needed. | ||
Use the `-n` option of `make` to see the detail of what is done to run the sample. | ||
|
||
For example, | ||
```shell | ||
make run-cli-sample | ||
... | ||
``` | ||
Then | ||
```shell | ||
ls output | ||
``` | ||
to see the results of the transform. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Packaging metadata for the python runtime of the proglang_select transform. | ||
[project] | ||
name = "dpk_proglang_select_transform_python" | ||
# NOTE(review): the Makefile's set-versions target passes TOML_VERSION to .defaults.update-toml; | ||
# presumably that rewrites this field -- confirm before editing it by hand. | ||
version = "0.4.0.dev6" | ||
requires-python = ">=3.10" | ||
description = "Programming Language Selection Python Transform" | ||
license = {text = "Apache-2.0"} | ||
readme = {file = "README.md", content-type = "text/markdown"} | ||
authors = [ | ||
{ name = "Shivdeep Singh", email = "[email protected]" }, | ||
] | ||
# Runtime dependency, pinned to an exact version of the data-prep-toolkit library. | ||
dependencies = [ | ||
"data-prep-toolkit==0.2.0.dev6", | ||
] | ||
|
||
[build-system] | ||
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
# Extra packages for development and testing (installed via the "dev" extra). | ||
[project.optional-dependencies] | ||
dev = [ | ||
"twine", | ||
"pytest>=7.3.2", | ||
"pytest-dotenv>=0.5.2", | ||
"pytest-env>=1.0.0", | ||
"pre-commit>=3.3.2", | ||
"pytest-cov>=4.1.0", | ||
"pytest-mock>=3.10.0", | ||
"moto==5.0.5", | ||
"markupsafe==2.0.1", | ||
] | ||
|
||
# NOTE(review): [options] and [options.packages.find] are setup.cfg section names. In | ||
# pyproject.toml, setuptools reads its settings from [tool.setuptools...] tables, so these | ||
# two tables are likely ignored -- confirm before relying on them for package discovery. | ||
[options] | ||
package_dir = ["src","test"] | ||
|
||
[options.packages.find] | ||
where = ["src/"] | ||
|
||
[tool.pytest.ini_options] | ||
# Currently we use low coverage since we have to run tests separately (see makefile) | ||
#addopts = "--cov --cov-report term-missing --cov-fail-under 25" | ||
markers = ["unit: unit tests", "integration: integration tests"] | ||
|
||
# Restrict coverage measurement to files under src/. | ||
[tool.coverage.run] | ||
include = ["src/*"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.