-
Notifications
You must be signed in to change notification settings - Fork 160
/
Copy pathtest_pdf2parquet.py
70 lines (63 loc) · 2.68 KB
/
test_pdf2parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import os
import pyarrow as pa
from data_processing.data_access.data_access_local import DataAccessLocal
from data_processing.test_support import get_files_in_folder
from data_processing.test_support.transform import AbstractBinaryTransformTest
from data_processing.utils import TransformUtils
from dpk_pdf2parquet.transform import Pdf2ParquetTransform
class TestPdf2ParquetTransform(AbstractBinaryTransformTest):
    """
    Supplies the pdf2parquet test data to the tests defined in the super-class.
    The class name MUST start with the word Test, otherwise pytest does not treat it as a test class.
    """

    def get_test_transform_fixtures(self) -> list[tuple]:
        """
        Assemble the fixture ingredients from the ``../test-data`` folder next to this file.

        Returns:
            The list of fixtures to run. The only fixture tuple is currently commented out
            (see the note at the return statement), so the list is empty.
        """
        data_access = DataAccessLocal()
        test_data_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))

        # Every ".pdf" found under <test-data>/input, as (name, binary) pairs.
        pdf_folder = os.path.join(test_data_root, "input")
        pdfs_by_name = get_files_in_folder(pdf_folder, ".pdf")
        input_files = list(pdfs_by_name.items())

        # The names below (config, expected_metadata_list, expected_files) are only
        # referenced by the disabled fixture; they are kept so it can be re-enabled as-is.
        config = {
            "double_precision": 0,
        }
        expected_metadata_list = [
            {"nrows": 1, "nsuccess": 1, "nfail": 0, "nskip": 0},
            {},
        ]

        # For each input PDF, load <test-data>/expected/<basename with .pdf -> .parquet>.
        expected_dir = os.path.join(test_data_root, "expected")
        expected_files = []
        for pdf_name, _ in input_files:
            parquet_name = TransformUtils.get_file_basename(pdf_name).replace(".pdf", ".parquet")
            parquet_path = os.path.join(expected_dir, parquet_name)
            # Pair element [0] of get_file()'s result with element [1] of
            # get_file_extension()'s result (presumably content and extension - TODO confirm).
            loaded = data_access.get_file(parquet_path)[0]
            suffix = TransformUtils.get_file_extension(parquet_path)[1]
            expected_files.append((loaded, suffix))

        return [
            # TEST DISABLED.
            # This fails because AbstractBinaryTransformTest checks the byte size of the parquet file.
            # Since some columns need to be ignored, that check is no longer valid.
            # (
            #     Pdf2ParquetTransform(config),
            #     input_files,
            #     expected_files,
            #     expected_metadata_list,
            # )
        ]