diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..460ff559
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,65 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+    tags:
+      - v*
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  check-code-format:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          # Quoted so YAML keeps the version a string (a bare 3.10 would load as 3.1).
+          python-version: "3.8"
+
+      - name: Install module
+        run: |
+          pip install wheel
+          pip install .[dev] --extra-index-url https://download.pytorch.org/whl/cpu
+
+      - name: Check code format with Black
+        run: |
+          black --check .
+
+      - name: Check imports order with isort
+        run: |
+          isort --check-only .
+
+      # always() runs Flake8 even if an earlier step in this job failed.
+      - name: Check code style with Flake8
+        if: ${{ always() }}
+        run: |
+          flake8 .
+
+
+  run-tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          # Quoted so YAML keeps the version a string (a bare 3.10 would load as 3.1).
+          python-version: "3.8"
+
+      - name: Install module
+        run: |
+          pip install wheel
+          pip install .[dev] --extra-index-url https://download.pytorch.org/whl/cpu
+
+      - name: Run pytest
+        run: |
+          pytest -v tests/test.py
diff --git a/faster_whisper/__init__.py b/faster_whisper/__init__.py
index 93ebfcec..eea2e649 100644
--- a/faster_whisper/__init__.py
+++ b/faster_whisper/__init__.py
@@ -1,2 +1,9 @@
+from faster_whisper.audio import decode_audio
 from faster_whisper.transcribe import WhisperModel
 from faster_whisper.utils import format_timestamp
+
+__all__ = [
+    "decode_audio",
+    "WhisperModel",
+    "format_timestamp",
+]
diff --git a/faster_whisper/feature_extractor.py b/faster_whisper/feature_extractor.py
index a525d535..e2bc2312 100644
--- a/faster_whisper/feature_extractor.py
+++ b/faster_whisper/feature_extractor.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 
-# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
+# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py  # noqa: E501
 class FeatureExtractor:
     def __init__(
         self,
diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
index 6b030505..140ea9eb 100644
--- a/faster_whisper/utils.py
+++ b/faster_whisper/utils.py
@@ -2,7 +2,7 @@ def format_timestamp(
     seconds: float,
     always_include_hours: bool = False,
     decimal_marker: str = ".",
-):
+) -> str:
     assert seconds >= 0, "non-negative timestamp expected"
     milliseconds = round(seconds * 1000.0)
 
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..bf2da868
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,9 @@
+[flake8]
+max-line-length = 100
+ignore =
+    E203,
+    W503,
+
+[isort]
+profile=black
+lines_between_types=1
diff --git a/setup.py b/setup.py
index 0b7994ae..d7ced254 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,13 @@ def get_requirements(path):
     install_requires=install_requires,
     extras_require={
         "conversion": conversion_requires,
+        "dev": conversion_requires
+        + [
+            "black==23.*",
+            "flake8==6.*",
+            "isort==5.*",
+            "pytest==7.*",
+        ],
     },
     packages=find_packages(),
 )
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..be0d44f8
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,38 @@
+import os
+
+import ctranslate2
+import pytest
+
+
+@pytest.fixture
+def data_dir():
+    """Return the absolute path of the ``data`` directory next to this file."""
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.fixture
+def jfk_path(data_dir):
+    """Return the path of the ``jfk.flac`` sample inside the data directory."""
+    return os.path.join(data_dir, "jfk.flac")
+
+
+@pytest.fixture(scope="session")
+def tiny_model_dir(tmp_path_factory):
+    """Convert the "tiny" model once per session and return its directory."""
+    model_path = str(tmp_path_factory.mktemp("data") / "model")
+    convert_model("tiny", model_path)
+    return model_path
+
+
+def convert_model(size, output_dir):
+    """Convert the ``openai/whisper-<size>`` checkpoint to CTranslate2 format.
+
+    The converted float16 model is written to ``output_dir``.
+    """
+    name = f"openai/whisper-{size}"
+
+    ctranslate2.converters.TransformersConverter(
+        name,
+        copy_files=["tokenizer.json"],
+        load_as_float16=True,
+    ).convert(output_dir, quantization="float16")
diff --git a/tests/data/jfk.flac b/tests/data/jfk.flac
new file mode 100644
index 00000000..e44b7c13
Binary files /dev/null and b/tests/data/jfk.flac differ
diff --git a/tests/test.py b/tests/test.py
new file mode 100644
index 00000000..de53577a
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,26 @@
+from faster_whisper import WhisperModel
+
+
+def test_transcribe(tiny_model_dir, jfk_path):
+    """Transcribe the JFK sample with word timestamps and check text and timings."""
+    model = WhisperModel(tiny_model_dir)
+    segments, info = model.transcribe(jfk_path, word_timestamps=True)
+
+    assert info.language == "en"
+    assert info.language_probability > 0.9
+    assert info.duration == 11
+
+    segments = list(segments)
+
+    assert len(segments) == 1
+
+    segment = segments[0]
+
+    assert segment.text == (
+        " And so my fellow Americans ask not what your country can do for you, "
+        "ask what you can do for your country."
+    )
+
+    assert segment.text == "".join(word.word for word in segment.words)
+    assert segment.start == segment.words[0].start
+    assert segment.end == segment.words[-1].end