diff --git a/README.md b/README.md index 8fbde2e8..6a40229b 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Medacy can be installed for general use or for pipeline development / research p After installing medaCy and [medaCy's clinical model](examples/models/clinical_notes_model.md), simply run: ```python -from medacy.model import Model +from medacy.ner.model import Model model = Model.load_external('medacy_model_clinical_notes') annotation = model.predict("The patient was prescribed 1 capsule of Advil for 5 days.") diff --git a/docs/source/medacy.model.feature_extractor.rst b/docs/source/medacy.model.feature_extractor.rst deleted file mode 100644 index 013a99c7..00000000 --- a/docs/source/medacy.model.feature_extractor.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.model.feature\_extractor module -====================================== - -.. automodule:: medacy.model.feature_extractor - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.model.rst b/docs/source/medacy.model.rst deleted file mode 100644 index 0695ea40..00000000 --- a/docs/source/medacy.model.rst +++ /dev/null @@ -1,8 +0,0 @@ -medacy.model package -==================== - -.. toctree:: - - medacy.model.feature_extractor - medacy.model.model - medacy.model.stratified_k_fold diff --git a/docs/source/medacy.model.model.rst b/docs/source/medacy.ner.model.model.rst similarity index 54% rename from docs/source/medacy.model.model.rst rename to docs/source/medacy.ner.model.model.rst index 0eddf1fb..e5b4fc06 100644 --- a/docs/source/medacy.model.model.rst +++ b/docs/source/medacy.ner.model.model.rst @@ -1,7 +1,7 @@ -medacy.model.model module +medacy.ner.model.model module ========================= -.. automodule:: medacy.model.model +.. 
automodule:: medacy.ner.model.model :members: :undoc-members: :show-inheritance: diff --git a/docs/source/medacy.ner.model.rst b/docs/source/medacy.ner.model.rst new file mode 100644 index 00000000..1a48d393 --- /dev/null +++ b/docs/source/medacy.ner.model.rst @@ -0,0 +1,7 @@ +medacy.ner.model package +==================== + +.. toctree:: + + medacy.ner.model.model + medacy.ner.model.stratified_k_fold \ No newline at end of file diff --git a/docs/source/medacy.model.stratified_k_fold.rst b/docs/source/medacy.ner.model.stratified_k_fold.rst similarity index 50% rename from docs/source/medacy.model.stratified_k_fold.rst rename to docs/source/medacy.ner.model.stratified_k_fold.rst index cc3213cf..4f1cb965 100644 --- a/docs/source/medacy.model.stratified_k_fold.rst +++ b/docs/source/medacy.ner.model.stratified_k_fold.rst @@ -1,7 +1,7 @@ -medacy.model.stratified\_k\_fold module +medacy.ner.model.stratified\_k\_fold module ======================================= -.. automodule:: medacy.model.stratified_k_fold +.. automodule:: medacy.ner.model.stratified_k_fold :members: :undoc-members: :show-inheritance: diff --git a/docs/source/medacy.ner.pipelines.base.base_pipeline.rst b/docs/source/medacy.ner.pipelines.base.base_pipeline.rst new file mode 100644 index 00000000..4d487e7e --- /dev/null +++ b/docs/source/medacy.ner.pipelines.base.base_pipeline.rst @@ -0,0 +1,7 @@ +medacy.ner.pipelines.base.base\_pipeline module +=========================================== + +.. automodule:: medacy.ner.pipelines.base.base_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.ner.pipelines.base.rst b/docs/source/medacy.ner.pipelines.base.rst new file mode 100644 index 00000000..7e073844 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.base.rst @@ -0,0 +1,6 @@ +medacy.ner.pipelines.base package +============================= + +.. 
toctree:: + + medacy.ner.pipelines.base.base_pipeline diff --git a/docs/source/medacy.ner.pipelines.clinical_pipeline.rst b/docs/source/medacy.ner.pipelines.clinical_pipeline.rst new file mode 100644 index 00000000..c6a766c2 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.clinical_pipeline.rst @@ -0,0 +1,7 @@ +medacy.ner.pipelines.clinical\_pipeline module +========================================== + +.. automodule:: medacy.ner.pipelines.clinical_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.ner.pipelines.drug_event_pipeline.rst b/docs/source/medacy.ner.pipelines.drug_event_pipeline.rst new file mode 100644 index 00000000..4d26fd29 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.drug_event_pipeline.rst @@ -0,0 +1,7 @@ +medacy.ner.pipelines.drug\_event\_pipeline module +============================================= + +.. automodule:: medacy.ner.pipelines.drug_event_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.ner.pipelines.fda_nano_drug_label_pipeline.rst b/docs/source/medacy.ner.pipelines.fda_nano_drug_label_pipeline.rst new file mode 100644 index 00000000..13a74898 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.fda_nano_drug_label_pipeline.rst @@ -0,0 +1,7 @@ +medacy.ner.pipelines.fda\_nano\_drug\_label\_pipeline module +======================================================== + +.. automodule:: medacy.ner.pipelines.fda_nano_drug_label_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.ner.pipelines.rst b/docs/source/medacy.ner.pipelines.rst new file mode 100644 index 00000000..ff8eacc2 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.rst @@ -0,0 +1,11 @@ +medacy.ner.pipelines package +======================== + +.. 
toctree:: + + medacy.ner.pipelines.base + medacy.ner.pipelines.clinical_pipeline + medacy.ner.pipelines.drug_event_pipeline + medacy.ner.pipelines.fda_nano_drug_label_pipeline + medacy.ner.pipelines.systematic_review_pipeline + medacy.ner.pipelines.testing_pipeline diff --git a/docs/source/medacy.ner.pipelines.systematic_review_pipeline.rst b/docs/source/medacy.ner.pipelines.systematic_review_pipeline.rst new file mode 100644 index 00000000..ebf76cd0 --- /dev/null +++ b/docs/source/medacy.ner.pipelines.systematic_review_pipeline.rst @@ -0,0 +1,7 @@ +medacy.ner.pipelines.systematic\_review\_pipeline module +==================================================== + +.. automodule:: medacy.ner.pipelines.systematic_review_pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.pipelines.testing_pipeline.rst b/docs/source/medacy.ner.pipelines.testing_pipeline.rst similarity index 50% rename from docs/source/medacy.pipelines.testing_pipeline.rst rename to docs/source/medacy.ner.pipelines.testing_pipeline.rst index 5e40839b..4038af9d 100644 --- a/docs/source/medacy.pipelines.testing_pipeline.rst +++ b/docs/source/medacy.ner.pipelines.testing_pipeline.rst @@ -1,7 +1,7 @@ -medacy.pipelines.testing\_pipeline module +medacy.ner.pipelines.testing\_pipeline module ========================================= -.. automodule:: medacy.pipelines.testing_pipeline +.. automodule:: medacy.ner.pipelines.testing_pipeline :members: :undoc-members: :show-inheritance: diff --git a/docs/source/medacy.ner.rst b/docs/source/medacy.ner.rst new file mode 100644 index 00000000..727b4ec3 --- /dev/null +++ b/docs/source/medacy.ner.rst @@ -0,0 +1,7 @@ +medacy.ner package +==================== + +.. 
toctree:: + + medacy.ner.model + medacy.ner.pipelines diff --git a/docs/source/medacy.pipeline_components.feature_extraction.discrete_feature_extractor.rst b/docs/source/medacy.pipeline_components.feature_extraction.discrete_feature_extractor.rst new file mode 100644 index 00000000..fd39e43f --- /dev/null +++ b/docs/source/medacy.pipeline_components.feature_extraction.discrete_feature_extractor.rst @@ -0,0 +1,7 @@ +medacy.pipeline\_components.feature\_extraction.discrete\_feature\_extractor module +=================================================================================== + +.. automodule:: medacy.pipeline_components.feature_extraction.discrete_feature_extractor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/medacy.pipeline_components.feature_extraction.rst b/docs/source/medacy.pipeline_components.feature_extraction.rst new file mode 100644 index 00000000..ac91fb07 --- /dev/null +++ b/docs/source/medacy.pipeline_components.feature_extraction.rst @@ -0,0 +1,6 @@ +medacy.pipeline\_components.feature\_extraction package +======================================================= + +.. toctree:: + + medacy.pipeline_components.feature_extraction.discrete_feature_extractor diff --git a/docs/source/medacy.pipeline_components.rst b/docs/source/medacy.pipeline_components.rst index 8ec5b41a..8ba68141 100644 --- a/docs/source/medacy.pipeline_components.rst +++ b/docs/source/medacy.pipeline_components.rst @@ -9,3 +9,4 @@ medacy.pipeline\_components package medacy.pipeline_components.metamap medacy.pipeline_components.tokenization medacy.pipeline_components.units + medacy.pipeline_components.feature_extraction diff --git a/docs/source/medacy.pipelines.base.base_pipeline.rst b/docs/source/medacy.pipelines.base.base_pipeline.rst deleted file mode 100644 index e6e6cbe3..00000000 --- a/docs/source/medacy.pipelines.base.base_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.pipelines.base.base\_pipeline module -=========================================== - -.. 
automodule:: medacy.pipelines.base.base_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.pipelines.base.rst b/docs/source/medacy.pipelines.base.rst deleted file mode 100644 index 498205b5..00000000 --- a/docs/source/medacy.pipelines.base.rst +++ /dev/null @@ -1,6 +0,0 @@ -medacy.pipelines.base package -============================= - -.. toctree:: - - medacy.pipelines.base.base_pipeline diff --git a/docs/source/medacy.pipelines.clinical_pipeline.rst b/docs/source/medacy.pipelines.clinical_pipeline.rst deleted file mode 100644 index d168800f..00000000 --- a/docs/source/medacy.pipelines.clinical_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.pipelines.clinical\_pipeline module -========================================== - -.. automodule:: medacy.pipelines.clinical_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.pipelines.drug_event_pipeline.rst b/docs/source/medacy.pipelines.drug_event_pipeline.rst deleted file mode 100644 index 3dedd7ec..00000000 --- a/docs/source/medacy.pipelines.drug_event_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.pipelines.drug\_event\_pipeline module -============================================= - -.. automodule:: medacy.pipelines.drug_event_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.pipelines.fda_nano_drug_label_pipeline.rst b/docs/source/medacy.pipelines.fda_nano_drug_label_pipeline.rst deleted file mode 100644 index e5adca71..00000000 --- a/docs/source/medacy.pipelines.fda_nano_drug_label_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.pipelines.fda\_nano\_drug\_label\_pipeline module -======================================================== - -.. 
automodule:: medacy.pipelines.fda_nano_drug_label_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.pipelines.rst b/docs/source/medacy.pipelines.rst deleted file mode 100644 index 75e56cb6..00000000 --- a/docs/source/medacy.pipelines.rst +++ /dev/null @@ -1,11 +0,0 @@ -medacy.pipelines package -======================== - -.. toctree:: - - medacy.pipelines.base - medacy.pipelines.clinical_pipeline - medacy.pipelines.drug_event_pipeline - medacy.pipelines.fda_nano_drug_label_pipeline - medacy.pipelines.systematic_review_pipeline - medacy.pipelines.testing_pipeline diff --git a/docs/source/medacy.pipelines.systematic_review_pipeline.rst b/docs/source/medacy.pipelines.systematic_review_pipeline.rst deleted file mode 100644 index 98eec876..00000000 --- a/docs/source/medacy.pipelines.systematic_review_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -medacy.pipelines.systematic\_review\_pipeline module -==================================================== - -.. automodule:: medacy.pipelines.systematic_review_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/medacy.relation.rst b/docs/source/medacy.relation.rst new file mode 100644 index 00000000..a482f355 --- /dev/null +++ b/docs/source/medacy.relation.rst @@ -0,0 +1,4 @@ +medacy.relation package +==================== + +.. toctree:: diff --git a/docs/source/medacy.rst b/docs/source/medacy.rst index 954db934..00d2068a 100644 --- a/docs/source/medacy.rst +++ b/docs/source/medacy.rst @@ -4,7 +4,7 @@ medacy package .. 
toctree:: medacy.data - medacy.model + medacy.ner + medacy.relation medacy.pipeline_components - medacy.pipelines medacy.tools diff --git a/examples/guide/data_management.md b/examples/guide/data_management.md index 943c58c0..429073a5 100644 --- a/examples/guide/data_management.md +++ b/examples/guide/data_management.md @@ -157,7 +157,7 @@ Once you have a trained or imported a model, pass in a Dataset object for bulk p ```python from medacy.data import Dataset -from medacy.model import Model +from medacy.ner.model import Model dataset = Dataset('/home/medacy/data') model = Model.load_external('medacy_model_clinical_notes') diff --git a/examples/guide/model_training.md b/examples/guide/model_training.md index b5460ad6..8e3ccc22 100644 --- a/examples/guide/model_training.md +++ b/examples/guide/model_training.md @@ -66,7 +66,7 @@ The previously mentioned components make up a medaCy model. In summary training import os from medacy.data import Dataset from medacy.pipelines import ClinicalPipeline -from medacy.model import Model +from medacy.ner import Model entities = ['Drug', 'Strength'] @@ -91,7 +91,7 @@ The `ClinicalPipeline` source looks like this: import spacy, sklearn_crfsuite from .base import BasePipeline from ..pipeline_components import ClinicalTokenizer -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, UnitComponent, MetaMap diff --git a/examples/guide/model_utilization.md b/examples/guide/model_utilization.md index 3d0e1482..b6fb00b2 100644 --- a/examples/guide/model_utilization.md +++ b/examples/guide/model_utilization.md @@ -9,7 +9,7 @@ Once a CRF model has been trained and saved to disk, it can be loaded again for ```python from medacy.pipelines import ClinicalPipeline -from medacy.model import Model +from medacy.ner import Model pipeline = ClinicalPipeline(metamap=None, entities=['Drug']) model 
= Model(pipeline) @@ -30,7 +30,7 @@ Once a model has been [packaged](packaging_a_medacy_model.md) and installed it c ```python import medacy_model_clinical_notes #import the python package wrapping the model -from medacy.model import Model +from medacy.ner import Model model = Model.load_external('medacy_model_clinical_notes') diff --git a/examples/scripts/training_predicting.py b/examples/scripts/training_predicting.py index c906580d..8de05973 100644 --- a/examples/scripts/training_predicting.py +++ b/examples/scripts/training_predicting.py @@ -4,7 +4,7 @@ # it's own directory along the models build log and model/pipeline parameters to keep results easily referencable during run time. # Once a sufficent model is produced, consider wrapping it up into a medaCy compatible model as defined the example guide. -from medacy.model import Model +from medacy.ner import Model from medacy.pipelines import SystematicReviewPipeline from medacy.data import Dataset from medacy.pipeline_components import MetaMap diff --git a/medacy/__init__.py b/medacy/__init__.py index 535d6213..d1f710d0 100644 --- a/medacy/__init__.py +++ b/medacy/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.0.9' +__version__ = '0.1.0' __authors__ = "Andriy Mulyar, Corey Sutphin, Bobby Best, Steele Farnsworth, Bridget McInnes" diff --git a/medacy/data/dataset.py b/medacy/data/dataset.py index 0ae34968..3181c619 100644 --- a/medacy/data/dataset.py +++ b/medacy/data/dataset.py @@ -92,6 +92,10 @@ def __init__(self, data_directory, """ Manages directory of training data along with other medaCy generated files. + Only text files: considers a directory for managing metamapping. + Only ann files: considers a directory of predictions. + Both text and ann files: considers a directory for training. + :param data_directory: Directory containing data for training or prediction. 
:param raw_text_file_extension: The file extension of raw text files in the data_directory (default: *.txt*) :param annotation_file_extension: The file extension of annotation files in the data_directory (default: *.ann*) @@ -112,42 +116,59 @@ def __init__(self, data_directory, # start by filtering all raw_text files, both training and prediction directories will have these raw_text_files = sorted([file for file in all_files_in_directory if file.endswith(raw_text_file_extension)]) - if raw_text_files is None: - raise ValueError("No raw text files exist in directory: %s" % self.data_directory) - if data_limit is not None: - self.data_limit = data_limit - else: - self.data_limit = len(raw_text_files) + if not raw_text_files: #detected a prediction directory + ann_files = sorted([file for file in all_files_in_directory if file.endswith(annotation_file_extension)]) + self.is_training_directory = False - if self.data_limit < 1 or self.data_limit > len(raw_text_files): - raise ValueError("Parameter 'data_limit' must be between 1 and number of raw text files in data_directory") + if data_limit is not None: + self.data_limit = data_limit + else: + self.data_limit = len(ann_files) - # required ann files for this to be a training directory - ann_files = [file.replace(".%s" % raw_text_file_extension, ".%s" % annotation_file_extension) for file in - raw_text_files] + for file in ann_files: + annotation_path = os.path.join(data_directory, file) + file_name = file[:-len(annotation_file_extension) - 1] + self.all_data_files.append(DataFile(file_name, None, annotation_path)) - # only a training directory if every text file has a corresponding ann_file - self.is_training_directory = all([os.path.isfile(os.path.join(data_directory, ann_file)) for ann_file in ann_files]) - # set all file attributes except metamap_path as it is optional. 
- for file in raw_text_files: - file_name = file[:-len(raw_text_file_extension) - 1] - raw_text_path = os.path.join(data_directory, file) + else: #detected a training directory (raw text files exist) - if self.is_training_directory: - annotation_path = os.path.join(data_directory, file.replace(".%s" % raw_text_file_extension, - ".%s" % annotation_file_extension)) + if data_limit is not None: + self.data_limit = data_limit else: - annotation_path = None - self.all_data_files.append(DataFile(file_name, raw_text_path, annotation_path)) + self.data_limit = len(raw_text_files) - #If directory is already metamapped, use it. - if self.is_metamapped(): - for data_file in self.all_data_files: - data_file.metamapped_path = os.path.join(self.metamapped_files_directory, - data_file.raw_path.split(os.path.sep)[-1] - .replace(".%s" % self.raw_text_file_extension, ".metamapped")) + if self.data_limit < 1 or self.data_limit > len(raw_text_files): + raise ValueError( + "Parameter 'data_limit' must be between 1 and number of raw text files in data_directory") + + # required ann files for this to be a training directory + ann_files = [file.replace(".%s" % raw_text_file_extension, ".%s" % annotation_file_extension) for file + in + raw_text_files] + # only a training directory if every text file has a corresponding ann_file + self.is_training_directory = all([os.path.isfile(os.path.join(data_directory, ann_file)) for ann_file in ann_files]) + + + # set all file attributes except metamap_path as it is optional. + for file in raw_text_files: + file_name = file[:-len(raw_text_file_extension) - 1] + raw_text_path = os.path.join(data_directory, file) + + if self.is_training_directory: + annotation_path = os.path.join(data_directory, file.replace(".%s" % raw_text_file_extension, + ".%s" % annotation_file_extension)) + else: + annotation_path = None + self.all_data_files.append(DataFile(file_name, raw_text_path, annotation_path)) + + #If directory is already metamapped, use it. 
+ if self.is_metamapped(): + for data_file in self.all_data_files: + data_file.metamapped_path = os.path.join(self.metamapped_files_directory, + data_file.raw_path.split(os.path.sep)[-1] + .replace(".%s" % self.raw_text_file_extension, ".metamapped")) def get_data_files(self): @@ -330,7 +351,7 @@ def compute_confusion_matrix(self, dataset, leniency=0): raise ValueError("dataset must be instance of Dataset") #verify files are consistent - diff = set([file.ann_path for file in self]).difference(set([file.ann_path for file in dataset])) + diff = set([file.ann_path.split(os.sep)[-1] for file in self]).difference(set([file.ann_path.split(os.sep)[-1] for file in dataset])) if diff: raise ValueError("Dataset of predictions is missing the files: "+str(list(diff))) @@ -362,13 +383,14 @@ def compute_ambiguity(self, dataset): of a model's in-ability to dis-ambiguate between entities. For a full analysis, compute a confusion matrix. :param dataset: a Dataset object containing a predicted version of this dataset. + :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side. 
:return: a dictionary containing the ambiguity computations on each gold, predicted file pair """ if not isinstance(dataset, Dataset): raise ValueError("dataset must be instance of Dataset") # verify files are consistent - diff = set([file.ann_path for file in self]).difference(set([file.ann_path for file in dataset])) + diff = set([file.ann_path.split(os.sep)[-1] for file in self]).difference(set([file.ann_path.split(os.sep)[-1] for file in dataset])) if diff: raise ValueError("Dataset of predictions is missing the files: " + str(list(diff))) diff --git a/medacy/ner/__init__.py b/medacy/ner/__init__.py new file mode 100644 index 00000000..0ac52b69 --- /dev/null +++ b/medacy/ner/__init__.py @@ -0,0 +1 @@ +from .model.model import Model diff --git a/medacy/model/__init__.py b/medacy/ner/model/__init__.py similarity index 62% rename from medacy/model/__init__.py rename to medacy/ner/model/__init__.py index eaaa1316..b26bfe83 100644 --- a/medacy/model/__init__.py +++ b/medacy/ner/model/__init__.py @@ -1,3 +1,2 @@ from .model import Model -from .feature_extractor import FeatureExtractor from .stratified_k_fold import SequenceStratifiedKFold \ No newline at end of file diff --git a/medacy/model/_model.py b/medacy/ner/model/_model.py similarity index 100% rename from medacy/model/_model.py rename to medacy/ner/model/_model.py diff --git a/medacy/model/model.py b/medacy/ner/model/model.py similarity index 90% rename from medacy/model/model.py rename to medacy/ner/model/model.py index a60f073c..2acc372c 100644 --- a/medacy/model/model.py +++ b/medacy/ner/model/model.py @@ -5,7 +5,7 @@ import logging, os, joblib, time, importlib from medacy.data import Dataset from .stratified_k_fold import SequenceStratifiedKFold -from medacy.pipelines.base.base_pipeline import BasePipeline +from medacy.ner.pipelines import BasePipeline from pathos.multiprocessing import ProcessingPool as Pool, cpu_count from ._model import predict_document, construct_annotations_from_tuples from 
sklearn_crfsuite import metrics @@ -126,17 +126,27 @@ def predict(self, dataset, prediction_directory = None): annotations = predict_document(model, doc, medacy_pipeline) return annotations - def cross_validate(self, num_folds=10, dataset=None, write_predictions=False): + def cross_validate(self, num_folds=10, training_dataset=None, prediction_directory=None): """ Performs k-fold stratified cross-validation using our model and pipeline. + If the training dataset and prediction_directory are passed, intermediate predictions during cross validation + are written to the directory `prediction_directory`. This allows one to construct a confusion matrix or to compute + the prediction ambiguity with the methods present in the Dataset class to support pipeline development without + a designated evaluation set. + :param num_folds: number of folds to split training data into for cross validation - :param dataset: Dataset that sequences were extracted from - :return: Prints out performance metrics + :param training_dataset: Dataset that is being cross validated (optional) + :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory. + :return: Prints out performance metrics; if prediction_directory is given, returns a Dataset of the written predictions """ if num_folds <= 1: raise ValueError("Number of folds for cross validation must be greater than 1") + if prediction_directory is not None and training_dataset is None: + raise ValueError("Cannot generated predictions during cross validation if training dataset is not given." 
+ " Please pass the training dataset in the 'training_dataset' parameter.") + assert self.model is not None, "Cannot cross validate a un-fit model" assert self.X_data is not None and self.y_data is not None, \ "Must have features and labels extracted for cross validation" @@ -168,7 +178,7 @@ def cross_validate(self, num_folds=10, dataset=None, write_predictions=False): learner.fit(train_data, y_train) y_pred = learner.predict(test_data) - if write_predictions: + if prediction_directory is not None: # Dict for storing mapping of sequences to their corresponding file preds_by_document = {filename: [] for filename in list(set([x[2] for x in X_data]))} @@ -264,16 +274,24 @@ def cross_validate(self, num_folds=10, dataset=None, write_predictions=False): logging.info("\n"+tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max'], tablefmt='orgtbl')) - if write_predictions: + if prediction_directory: # Write annotations generated from cross-validation - prediction_directory = dataset.data_directory + "/predictions/" - for data_file in dataset.get_data_files(): + if isinstance(prediction_directory, str): + prediction_directory = prediction_directory + else: + prediction_directory = training_dataset.data_directory + "/predictions/" + if os.path.isdir(prediction_directory): + logging.warning("Overwritting existing predictions") + else: + os.makedirs(prediction_directory) + for data_file in training_dataset.get_data_files(): logging.info("Predicting file: %s", data_file.file_name) with open(data_file.raw_path, 'r') as raw_text: doc = medacy_pipeline.spacy_pipeline.make_doc(raw_text.read()) preds = preds_by_document[data_file.file_name] annotations = construct_annotations_from_tuples(doc, preds) annotations.to_ann(write_location=os.path.join(prediction_directory, data_file.file_name + ".ann")) + return Dataset(data_directory=prediction_directory) def _extract_features(self, data_file, medacy_pipeline, is_metamapped): """ @@ -378,4 +396,3 @@ def 
load_external(package_name): def __str__(self): return self.get_info() - diff --git a/medacy/model/stratified_k_fold.py b/medacy/ner/model/stratified_k_fold.py similarity index 100% rename from medacy/model/stratified_k_fold.py rename to medacy/ner/model/stratified_k_fold.py diff --git a/medacy/pipelines/__init__.py b/medacy/ner/pipelines/__init__.py similarity index 71% rename from medacy/pipelines/__init__.py rename to medacy/ner/pipelines/__init__.py index 04e07315..162442e9 100644 --- a/medacy/pipelines/__init__.py +++ b/medacy/ner/pipelines/__init__.py @@ -2,4 +2,5 @@ from .systematic_review_pipeline import SystematicReviewPipeline from .fda_nano_drug_label_pipeline import FDANanoDrugLabelPipeline from .drug_event_pipeline import DrugEventPipeline -from .testing_pipeline import TestingPipeline \ No newline at end of file +from .testing_pipeline import TestingPipeline +from .base.base_pipeline import BasePipeline diff --git a/medacy/pipelines/base/__init__.py b/medacy/ner/pipelines/base/__init__.py similarity index 100% rename from medacy/pipelines/base/__init__.py rename to medacy/ner/pipelines/base/__init__.py diff --git a/medacy/pipelines/base/base_pipeline.py b/medacy/ner/pipelines/base/base_pipeline.py similarity index 98% rename from medacy/pipelines/base/base_pipeline.py rename to medacy/ner/pipelines/base/base_pipeline.py index 894b385d..1d04b3df 100644 --- a/medacy/pipelines/base/base_pipeline.py +++ b/medacy/ner/pipelines/base/base_pipeline.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from ...pipeline_components.base import BaseComponent +from medacy.pipeline_components.base import BaseComponent class BasePipeline(ABC): """ diff --git a/medacy/pipelines/clinical_pipeline.py b/medacy/ner/pipelines/clinical_pipeline.py similarity index 88% rename from medacy/pipelines/clinical_pipeline.py rename to medacy/ner/pipelines/clinical_pipeline.py index 073dc29d..cfef2cb0 100644 --- a/medacy/pipelines/clinical_pipeline.py +++ 
b/medacy/ner/pipelines/clinical_pipeline.py @@ -1,9 +1,9 @@ import spacy, sklearn_crfsuite from .base import BasePipeline -from ..pipeline_components import ClinicalTokenizer -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components import ClinicalTokenizer +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor -from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, UnitComponent, MetaMap +from medacy.pipeline_components import GoldAnnotatorComponent, MetaMapComponent, MetaMap class ClinicalPipeline(BasePipeline): diff --git a/medacy/pipelines/drug_event_pipeline.py b/medacy/ner/pipelines/drug_event_pipeline.py similarity index 89% rename from medacy/pipelines/drug_event_pipeline.py rename to medacy/ner/pipelines/drug_event_pipeline.py index 7f2d4ee9..80a5c802 100644 --- a/medacy/pipelines/drug_event_pipeline.py +++ b/medacy/ner/pipelines/drug_event_pipeline.py @@ -1,10 +1,10 @@ import spacy, sklearn_crfsuite from .base import BasePipeline -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor -from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, CharacterTokenizer -from ..pipeline_components.lexicon import LexiconComponent -from ..pipeline_components.patterns import TableMatcherComponent +from medacy.pipeline_components import GoldAnnotatorComponent, MetaMapComponent, CharacterTokenizer +from medacy.pipeline_components.lexicon import LexiconComponent +from medacy.pipeline_components.patterns import TableMatcherComponent class DrugEventPipeline(BasePipeline): @@ -174,4 +174,4 @@ def get_tokenizer(self): def get_feature_extractor(self): extractor = FeatureExtractor(window_size=3, spacy_features=['pos_', 'shape_', 'prefix_', 'suffix_', 'like_num', 'text', 'head']) - return extractor + return extractor \ No newline at end of file diff 
--git a/medacy/pipelines/fda_nano_drug_label_pipeline.py b/medacy/ner/pipelines/fda_nano_drug_label_pipeline.py similarity index 88% rename from medacy/pipelines/fda_nano_drug_label_pipeline.py rename to medacy/ner/pipelines/fda_nano_drug_label_pipeline.py index a5b014b1..d663c58d 100644 --- a/medacy/pipelines/fda_nano_drug_label_pipeline.py +++ b/medacy/ner/pipelines/fda_nano_drug_label_pipeline.py @@ -1,9 +1,9 @@ import spacy, sklearn_crfsuite from .base import BasePipeline -from ..pipeline_components import SystematicReviewTokenizer, ClinicalTokenizer -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components import ClinicalTokenizer +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor -from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, UnitComponent +from medacy.pipeline_components import GoldAnnotatorComponent, MetaMapComponent class FDANanoDrugLabelPipeline(BasePipeline): diff --git a/medacy/pipelines/systematic_review_pipeline.py b/medacy/ner/pipelines/systematic_review_pipeline.py similarity index 88% rename from medacy/pipelines/systematic_review_pipeline.py rename to medacy/ner/pipelines/systematic_review_pipeline.py index f007e850..84a797e9 100644 --- a/medacy/pipelines/systematic_review_pipeline.py +++ b/medacy/ner/pipelines/systematic_review_pipeline.py @@ -1,9 +1,9 @@ import spacy, sklearn_crfsuite from .base import BasePipeline -from ..pipeline_components import MetaMap, SystematicReviewTokenizer -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components import MetaMap, SystematicReviewTokenizer +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor -from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, UnitComponent +from medacy.pipeline_components import GoldAnnotatorComponent, MetaMapComponent class 
SystematicReviewPipeline(BasePipeline): diff --git a/medacy/pipelines/testing_pipeline.py b/medacy/ner/pipelines/testing_pipeline.py similarity index 87% rename from medacy/pipelines/testing_pipeline.py rename to medacy/ner/pipelines/testing_pipeline.py index 471abb77..57dc661b 100644 --- a/medacy/pipelines/testing_pipeline.py +++ b/medacy/ner/pipelines/testing_pipeline.py @@ -1,9 +1,9 @@ import spacy, sklearn_crfsuite from .base import BasePipeline -from ..pipeline_components import ClinicalTokenizer -from medacy.model.feature_extractor import FeatureExtractor +from medacy.pipeline_components import ClinicalTokenizer +from medacy.pipeline_components.feature_extraction.discrete_feature_extractor import FeatureExtractor -from ..pipeline_components import GoldAnnotatorComponent +from medacy.pipeline_components import GoldAnnotatorComponent class TestingPipeline(BasePipeline): diff --git a/medacy/pipeline_components/__init__.py b/medacy/pipeline_components/__init__.py index ad2b6e22..f768d364 100644 --- a/medacy/pipeline_components/__init__.py +++ b/medacy/pipeline_components/__init__.py @@ -17,3 +17,6 @@ from .units.time_unit_component import TimeUnitComponent from .units.frequency_unit_component import FrequencyUnitComponent from .units.measurement_unit_component import MeasurementUnitComponent + + +from .feature_extraction.discrete_feature_extractor import FeatureExtractor diff --git a/medacy/tests/pipelines/__init__.py b/medacy/pipeline_components/feature_extraction/__init__.py similarity index 100% rename from medacy/tests/pipelines/__init__.py rename to medacy/pipeline_components/feature_extraction/__init__.py diff --git a/medacy/model/feature_extractor.py b/medacy/pipeline_components/feature_extraction/discrete_feature_extractor.py similarity index 100% rename from medacy/model/feature_extractor.py rename to medacy/pipeline_components/feature_extraction/discrete_feature_extractor.py diff --git a/medacy/pipeline_components/patterns/table_matcher_component.py 
b/medacy/pipeline_components/patterns/table_matcher_component.py index 4db91681..9a1ab75b 100644 --- a/medacy/pipeline_components/patterns/table_matcher_component.py +++ b/medacy/pipeline_components/patterns/table_matcher_component.py @@ -31,8 +31,9 @@ def __call__(self, doc): for match in re.finditer(TABLE_PATTERN, doc.text): start, end = match.span() span = doc.char_span(start, end) + if span is None: + continue for token in span: token._.set('feature_is_from_table', True) return doc - diff --git a/medacy/relation/__init__.py b/medacy/relation/__init__.py new file mode 100644 index 00000000..0ac52b69 --- /dev/null +++ b/medacy/relation/__init__.py @@ -0,0 +1 @@ +from .model.model import Model diff --git a/medacy/tests/tools/con_form/__init__.py b/medacy/tests/ner/__init__.py similarity index 100% rename from medacy/tests/tools/con_form/__init__.py rename to medacy/tests/ner/__init__.py diff --git a/medacy/tools/con_form/__init__.py b/medacy/tests/ner/model/__init__.py similarity index 100% rename from medacy/tools/con_form/__init__.py rename to medacy/tests/ner/model/__init__.py diff --git a/medacy/tests/model/test_model_prediction.py b/medacy/tests/ner/model/test_model_prediction.py similarity index 95% rename from medacy/tests/model/test_model_prediction.py rename to medacy/tests/ner/model/test_model_prediction.py index fa8fe20e..d06d90de 100644 --- a/medacy/tests/model/test_model_prediction.py +++ b/medacy/tests/ner/model/test_model_prediction.py @@ -1,6 +1,6 @@ from unittest import TestCase -from medacy.model import Model -from medacy.pipelines import TestingPipeline +from medacy.ner.model import Model +from medacy.ner.pipelines import TestingPipeline from medacy.tools import Annotations from medacy.data import Dataset import os, importlib, pkg_resources, tempfile, shutil diff --git a/medacy/tests/ner/pipelines/__init__.py b/medacy/tests/ner/pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/medacy/tests/pipelines/test_clinical_pipeline.py b/medacy/tests/ner/pipelines/test_clinical_pipeline.py similarity index 92% rename from medacy/tests/pipelines/test_clinical_pipeline.py rename to medacy/tests/ner/pipelines/test_clinical_pipeline.py index 28490c6d..63a494d6 100644 --- a/medacy/tests/pipelines/test_clinical_pipeline.py +++ b/medacy/tests/ner/pipelines/test_clinical_pipeline.py @@ -1,5 +1,5 @@ from unittest import TestCase -from medacy.pipelines import ClinicalPipeline +from medacy.ner.pipelines import ClinicalPipeline from medacy.pipeline_components import GoldAnnotatorComponent, MetaMap diff --git a/medacy/tests/tools/__init__.py b/medacy/tests/tools/__init__.py index 6253b287..d0799406 100644 --- a/medacy/tests/tools/__init__.py +++ b/medacy/tests/tools/__init__.py @@ -1 +1 @@ -from .con_test_data.con_test import con_text, source_text +from medacy.tests.tools.converters.con_test_data.con_test import con_text, source_text diff --git a/medacy/tests/tools/converters/__init__.py b/medacy/tests/tools/converters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/medacy/tests/tools/converters/con_test_data/__init__.py b/medacy/tests/tools/converters/con_test_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/medacy/tests/tools/con_test_data/con_test.py b/medacy/tests/tools/converters/con_test_data/con_test.py similarity index 100% rename from medacy/tests/tools/con_test_data/con_test.py rename to medacy/tests/tools/converters/con_test_data/con_test.py diff --git a/medacy/tests/tools/converters/conversion_tools/test_line.py b/medacy/tests/tools/converters/conversion_tools/test_line.py new file mode 100644 index 00000000..a6409b12 --- /dev/null +++ b/medacy/tests/tools/converters/conversion_tools/test_line.py @@ -0,0 +1,45 @@ +import unittest +from medacy.tools.converters.conversion_tools.line import Line + + +# Sample text must be on the lowest level of indentation so that +# the indentation is not counted 
towards the indices. + +sample_text_1 = """ABELCET (Amphotericin B Lipid Complex Injection)DESCRIPTIONABELCET is a sterile, pyrogen-free suspension for intravenous infusion. +ABELCET consists of ampho-tericin B complexed with two phospholipids in a 1:1 drug-to-lipid molar ratio. +The two phospholipids,L-α-dimyristoylphosphatidylcholine (DMPC) and L-α-dimyristoylphosphatidylglycerol (DMPG), are pre-sent in a 7:3 molar ratio. +ABELCET is yellow and opaque in appearance, with a pH of 5 - 7. +NOTE: Liposomal encapsulation or incorporation in a lipid complex can substantially affect adrug's functional properties relative to those of the unencapsulated or nonlipid-associated drug. +Inaddition, different liposomal or lipid-complexed products with a common active ingredient mayvary from one another in the chemical composition and physical form of the lipid component. +Suchdifferences may affect functional properties of these drug products.Amphotericin B is a polyene, antifungal antibiotic produced from a strain of Streptomyces nodosus.Amphotericin B is designated chemically as [1R-(1R*, 3S*, 5R*, 6R*, 9R*, 11R*, 15S*, 16R*, 17R*,18S*, 19E, 21E, 23E, 25E, 27E, 29E, 31E, 33R*, 35S*, 36R*, 37S*)]-33-[(3-Amino-3, 6- D-mannopyranosyl) oxy]-1,3,5,6,9,11,17,37-octahydroxy-15,16,18-trimethyl-13-oxo-14,39-dioxabicy-clo[33.3.1] nonatriaconta-19, 21, 23, 25, 27, 29, 31-heptaene-36-carboxylic acid. +It has a molecular weight of 924.09 and a molecular formula of C47H73NO17. 
+The structural formula is: +ABELCET is provided as a sterile, opaque suspension in 20 mL glass, single-use vials.""" + +sample_text_2 = """This is the first sample line +This is the second line +Also this line +This is another line +Also this line +The previous line is a repeat on purpose +Also this line +This is so much fun""" + + +class TestLine(unittest.TestCase): + """Unit tests for line.py""" + + def test_init_lines_no_repeats(self): + """Test that indices are accurate when there are no repeated lines.""" + text_lines = sample_text_1.split('\n') + line_objs = Line.init_lines(sample_text_1) + expected = [sample_text_1.index(line) for line in text_lines] + actual = [line.index for line in line_objs] + self.assertListEqual(actual, expected) + + def test_init_lines_with_repeats(self): + """Test that indices are accurate even when lines are repeated.""" + line_objs = Line.init_lines(sample_text_2) + expected = [0, 30, 54, 69, 90, 105, 146, 161] + actual = [line.index for line in line_objs] + self.assertListEqual(actual, expected) diff --git a/medacy/tests/tools/con_form/test_brat_to_con.py b/medacy/tests/tools/converters/test_brat_to_con.py similarity index 92% rename from medacy/tests/tools/con_form/test_brat_to_con.py rename to medacy/tests/tools/converters/test_brat_to_con.py index ed2feb06..d5521b85 100644 --- a/medacy/tests/tools/con_form/test_brat_to_con.py +++ b/medacy/tests/tools/converters/test_brat_to_con.py @@ -1,10 +1,10 @@ """ :author: Steele W. 
Farnsworth -:date: 28 December, 2018 +:date: 13 March, 2019 """ -import unittest, tempfile, os, shutil -from medacy.tools.con_form.brat_to_con import * +import unittest, tempfile +from medacy.tools.converters.brat_to_con import * brat_text = """T1 tradename 0 7 ABELCET T2 activeingredient 9 23 Amphotericin B @@ -92,33 +92,35 @@ def setUpClass(cls): cls.output_file_path = os.path.join(cls.test_dir, "output_file.txt") + cls.lines = Line.init_lines(source_text) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.test_dir) - def is_valid_brat_valid_1(self): + def test_is_valid_brat_valid_1(self): """Tests that when is_valid_brat() gets called on a valid line without a new line character, it returns True.""" sample = "T3 nanoparticle 24 37 Lipid Complex" result = is_valid_brat(sample) self.assertTrue(result) - def is_valid_brat_valid_2(self): + def test_is_valid_brat_valid_2(self): """Tests that when is_valid_brat() is called on a valid line with a new line character, it returns True.""" sample = "T12 nanoparticle 674 683 liposomal\n" result = is_valid_brat(sample) self.assertTrue(result) - def is_valid_brat_invalid_1(self): + def test_is_valid_brat_invalid_1(self): """Tests what when is_valid_brat() is called on an invalid line without a new line character, it returns False.""" sample = "T3 nanoparticle s 37 Lipid Complex" result = is_valid_brat(sample) self.assertFalse(result) - def is_valid_brat_invalid_2(self): + def test_is_valid_brat_invalid_2(self): """Tests what when is_valid_brat() is called on an invalid line with a new line character, it returns False.""" sample = "T12 674 683 liposomal\n" result = is_valid_brat(sample) - self.assertTrue(result) + self.assertFalse(result) def test_line_to_dict(self): """Tests that line_to_dict() accurately converts a line of input text to an expected dict format.""" @@ -142,10 +144,10 @@ def test_get_word_num_1(self): """ # The annotation used is "T5 tradename 132 139 ABELCET" sample_line = "ABELCET consists of 
ampho-tericin B complexed with two phospholipids in a 1:1 drug-to-lipid molar ratio." - line_index = get_line_index(source_text, sample_line) + this_line = self.lines[1] expected = 0 - actual = get_word_num(source_text, line_index, 132) - self.assertEqual(expected, actual) + actual = get_word_num(this_line, 132) + self.assertEqual(actual, expected) def test_get_word_num_2(self): """ @@ -154,17 +156,16 @@ def test_get_word_num_2(self): """ # The annotation used is "T16 activeingredient 1009 1023 Amphotericin B" sample_line = "Suchdifferences may affect functional properties of these drug products.Amphotericin B is a polyene, antifungal antibiotic produced from a strain of Streptomyces nodosus.Amphotericin B is designated chemically as [1R-(1R*, 3S*, 5R*, 6R*, 9R*, 11R*, 15S*, 16R*, 17R*,18S*, 19E, 21E, 23E, 25E, 27E, 29E, 31E, 33R*, 35S*, 36R*, 37S*)]-33-[(3-Amino-3, 6- D-mannopyranosyl) oxy]-1,3,5,6,9,11,17,37-octahydroxy-15,16,18-trimethyl-13-oxo-14,39-dioxabicy-clo[33.3.1] nonatriaconta-19, 21, 23, 25, 27, 29, 31-heptaene-36-carboxylic acid." - line_index = get_line_index(source_text, sample_line) + this_line = self.lines[6] expected = 21 - actual = get_word_num(source_text, line_index, 1009) + actual = get_word_num(this_line, 1009) self.assertEqual(expected, actual) - - @unittest.skip("Not currently working") + def test_valid_brat_to_con(self): """Convert the test file from brat to con. 
Assert that the con output matches the sample con text.""" con_output = convert_brat_to_con(self.brat_file_path, self.text_file_path) - self.assertEqual(con_output, con_text) - + self.assertEqual(con_text, con_output) + def test_invalid_file_path(self): """Passes an invalid file path to convert_brat_to_con().""" with self.assertRaises(FileNotFoundError): @@ -175,7 +176,7 @@ def test_valid_brat_matching_text_name(self): Assert that the con output matches the sample con text when the automatic text-file-finding feature is utilized """ con_output = convert_brat_to_con(self.brat_file_path) - self.assertEqual(con_output, con_text) + self.assertEqual(con_text, con_output) def test_invalid_brat_text(self): """Assert that invalid brat text produces no output.""" diff --git a/medacy/tests/tools/con_form/test_con_to_brat.py b/medacy/tests/tools/converters/test_con_to_brat.py similarity index 95% rename from medacy/tests/tools/con_form/test_con_to_brat.py rename to medacy/tests/tools/converters/test_con_to_brat.py index 1bc0703d..3e3d0e2b 100644 --- a/medacy/tests/tools/con_form/test_con_to_brat.py +++ b/medacy/tests/tools/converters/test_con_to_brat.py @@ -1,10 +1,10 @@ """ :author: Steele W. Farnsworth -:date: 17 February, 2019 +:date: 13 March, 2019 """ -import unittest, tempfile, os, shutil -from medacy.tools.con_form.con_to_brat import * +import unittest, tempfile +from medacy.tools.converters.con_to_brat import * brat_text = """T1 tradename 0 7 ABELCET T2 activeingredient 9 23 Amphotericin B @@ -126,25 +126,25 @@ def test_line_to_dict(self): expected = {"data_item": "Amphotericin B", "start_ind": "7:8", "end_ind": "7:9", "data_type": "activeingredient"} actual = line_to_dict(sample) self.assertDictEqual(expected, actual) - - @unittest.skip("Not currently working") + + @unittest.skip def test_valid_brat_to_con(self): """Convert the test file from brat to con. 
Assert that the con output matches the sample con text.""" brat_output = convert_con_to_brat(self.con_file_path, self.text_file_path) - self.assertEqual(brat_output, brat_text) + self.assertEqual(brat_text, brat_output) def test_invalid_file_path(self): """Passes an invalid file path to convert_con_to_brat().""" with self.assertRaises(FileNotFoundError): convert_con_to_brat("this isn't a valid file path", "neither is this") - - @unittest.skip("Not currently working") + + @unittest.skip def test_valid_con_matching_text_name(self): """ Assert that the con output matches the sample con text when the automatic text-file-finding feature is utilized """ brat_output = convert_con_to_brat(self.con_file_path) - self.assertEqual(brat_output, brat_text) + self.assertEqual(brat_text, brat_output) def test_invalid_brat_text(self): """Assert that invalid brat text produces no output.""" diff --git a/medacy/tests/tools/test_annotation.py b/medacy/tests/tools/test_annotation.py index cbcf1dc7..48bd385d 100644 --- a/medacy/tests/tools/test_annotation.py +++ b/medacy/tests/tools/test_annotation.py @@ -2,8 +2,8 @@ from unittest import TestCase from medacy.data import Dataset from medacy.tools import Annotations, InvalidAnnotationError -from os.path import join, isfile -from medacy.tests.tools.con_test_data.con_test import con_text, source_text as con_source_text +from os.path import join +from medacy.tests.tools.converters.con_test_data.con_test import con_text, source_text as con_source_text class TestAnnotation(TestCase): diff --git a/medacy/tools/__init__.py b/medacy/tools/__init__.py index 95516efd..5d95883f 100644 --- a/medacy/tools/__init__.py +++ b/medacy/tools/__init__.py @@ -1,3 +1,3 @@ from .annotations import Annotations, InvalidAnnotationError from .data_file import DataFile -from .con_form import brat_to_con, con_to_brat +from medacy.tools.converters import brat_to_con, con_to_brat diff --git a/medacy/tools/annotations.py b/medacy/tools/annotations.py index 
8f48232e..1076bad5 100644 --- a/medacy/tools/annotations.py +++ b/medacy/tools/annotations.py @@ -9,8 +9,8 @@ elements correspond to keys in the 'entities' dictionary. """ import os, logging, tempfile -from medacy.tools.con_form.con_to_brat import convert_con_to_brat -from medacy.tools.con_form.brat_to_con import convert_brat_to_con +from medacy.tools.converters.con_to_brat import convert_con_to_brat +from medacy.tools.converters.brat_to_con import convert_brat_to_con from math import floor, ceil import numpy as np from spacy.displacy import EntityRenderer diff --git a/medacy/tools/converters/__init__.py b/medacy/tools/converters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/medacy/tools/ade_to_brat.py b/medacy/tools/converters/ade_to_brat.py similarity index 100% rename from medacy/tools/ade_to_brat.py rename to medacy/tools/converters/ade_to_brat.py diff --git a/medacy/tools/con_form/brat_to_con.py b/medacy/tools/converters/brat_to_con.py similarity index 75% rename from medacy/tools/con_form/brat_to_con.py rename to medacy/tools/converters/brat_to_con.py index 25ac8b61..e8fe0b40 100644 --- a/medacy/tools/con_form/brat_to_con.py +++ b/medacy/tools/converters/brat_to_con.py @@ -3,31 +3,40 @@ Each '.ann' file must have a '.txt' file in the same directory with the same name, minus the extension. Use '-c' (without quotes) as an optional final command-line argument to copy the text files used in the conversion process to the output directory. + Also possible to import 'convert_brat_to_con()' directly and pass the paths to the ann and txt files for individual conversion. :author: Steele W. 
Farnsworth -:date: 16 February, 2019 +:date: 13 March, 2019 """ from sys import argv from re import split, fullmatch, DOTALL, findall +from medacy.tools.converters.conversion_tools.line import Line +import re import os import shutil import logging +import tabulate # A regex pattern for consecutive whitespace other than a new line character -whitespace_pattern = "( +|\t+)+" +whitespace_pattern = re.compile("( +|\t+)+") +# Regex pattern for BRAT T annotations +brat_pattern_T = r"T\d+\t\S+ \d+ \d+\t.+" + +# Used for stats at the end +num_lines = 0 +num_skipped_regex = 0 def is_valid_brat(item: str): """Returns a boolean value for whether or not a given line is in the BRAT format.""" # Define the regex pattern for BRAT. # Note that this pattern allows for three to six spaces to count as a tab - brat_pattern = r"[TREAMN]\d+(\t| {3,6})\S+ \d+ \d+(\t| {3,6}).+" if not isinstance(item, str): return False - if fullmatch(brat_pattern, item, DOTALL): return True + if fullmatch(brat_pattern_T, item, DOTALL): return True else: return False @@ -41,7 +50,7 @@ def line_to_dict(item): split1 = split("\t", item) split2 = split(" ", split1[1]) split3 = [split1[0]] + split2 + [split1[2]] - s = [i.rstrip() for i in split3] # remove whitespace + s = [i.strip() for i in split3] # remove whitespace return {"id_type": s[0][0], "id_num": int(s[0][1:]), "data_type": s[1], "start_ind": int(s[2]), "end_ind": int(s[3]), "data_item": s[4]} @@ -54,12 +63,6 @@ def switch_extension(name, ext): return os.path.splitext(name)[0] + ext -def get_line_index(text_, line_): - """Returns the index of the start of a given line. Assumes that the line_ - argument is long enough that (and thus so specific that) it only occurs once.""" - return text_.index(line_) - - def find_line_num(text_, start): """ :param text_: The text of the file, ex. 
f.read() @@ -69,19 +72,18 @@ def find_line_num(text_, start): return text_[:int(start)].count("\n") -def get_word_num(text_, line_index, entity_index): +def get_word_num(line_obj: Line, entity_index): """ - Returns the word number starting at zero that a given BRAT entity start index is within its line. - In the previous line, "Returns" is word 0 and "starting" is word 4. Words are counted by the number of consecutive - white spaces. - :param text_: The text of the document that the word occurs in. - :param line_index: The index of the first char of the line the word occurs in. - :param entity_index: The index of the first char of the word relative to the start of the document. - :return: The word number (see above explanation for what a word number is) of the given index within its line. + Returns the word number relative to the start of the line, with counting starting at 0, + of the first word of the entity. + :param line_obj: The Line that the entity occurs in. + :param entity_index: The absolute index of the entity, given by the annotation. + :return: The word index of the entity. """ - substring_before_entity = text_[line_index:entity_index] + index_within_line = entity_index - line_obj.index + substring_before_entity = line_obj.text[:index_within_line] matched_spaces = findall(whitespace_pattern, substring_before_entity) - return matched_spaces.__len__() + return len(matched_spaces) def convert_brat_to_con(brat_file_path, text_file_path=None): @@ -94,6 +96,8 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): :return: A string (not a file) of the con equivalent of the brat file. 
""" + global num_lines, num_skipped_regex + # By default, find txt file with equivalent name if text_file_path is None: text_file_path = switch_extension(brat_file_path, ".txt") @@ -102,12 +106,12 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): " directory") with open(text_file_path, 'r') as text_file: text = text_file.read() - text_lines = text.split('\n') + text_lines = Line.init_lines(text) # Otherwise open the file with the path passed to the function elif os.path.isfile(text_file_path): with open(text_file_path, 'r') as text_file: text = text_file.read() - text_lines = text.split('\n') + text_lines = Line.init_lines(text) else: raise FileNotFoundError("No text file path was provided or the file was not found." " Note that direct string input of the source text is not supported.") @@ -129,25 +133,25 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): continue elif not is_valid_brat(line): logging.warning("Incorrectly formatted line in %s was skipped: \"%s\"." 
% (brat_file_path, line)) + num_skipped_regex += 1 continue d = line_to_dict(line) start_line_num = find_line_num(text, d["start_ind"]) - start_text_line = text_lines[start_line_num] - start_line_index = get_line_index(text, start_text_line) - start_word_num = get_word_num(text, start_line_index, d["start_ind"]) + start_source_line = text_lines[start_line_num] + start_word_num = get_word_num(start_source_line, d["start_ind"]) start_str = str(start_line_num + 1) + ':' + str(start_word_num) end_line_num = find_line_num(text, d["end_ind"]) - end_text_line = text_lines[end_line_num] - end_line_index = get_line_index(text, end_text_line) - end_word_num = get_word_num(text, end_line_index, d["end_ind"]) + end_word_num = start_word_num + len(re.findall(whitespace_pattern, d["data_item"])) end_str = str(end_line_num + 1) + ':' + str(end_word_num) con_line = "c=\"%s\" %s %s||t=\"%s\"\n" % (d["data_item"], start_str, end_str, d['data_type']) output_lines += con_line + num_lines += 1 + return output_lines @@ -155,13 +159,11 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): # Get the input and output directories from the command line. - if not argv.__len__() >= 3: + if len(argv) < 3: # Command-line arguments must be provided for the input and output directories. - # Else, prints instructions and aborts the program. - print("Please run the program again, entering the input and output directories as command-line arguments" - " in that order. Optionally, enter '-c' as a final command line argument if you want to copy" - " the text files used in the conversion over to the output directory.") - exit() + raise IOError("Please run the program again, entering the input and output directories as command-line" + " arguments in that order. 
Optionally, enter '-c' as a final command line argument if you want" + " to copy the text files used in the conversion over to the output directory.") try: input_dir_name = argv[1] @@ -188,11 +190,10 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): raise FileNotFoundError("There were no ann files in the input directory with a corresponding text file. " "Please ensure that the input directory contains ann files and that each file has " "a corresponding txt file (see help for this program).") - exit() - # Create the log file - log_file_path = os.path.join(output_dir_name + "conversion.log") - logging.basicConfig(filename=log_file_path, level=logging.WARNING) + # Create the log + log_path = os.path.join(output_dir_name, "conversion.log") + logging.basicConfig(filename=log_path, level=logging.WARNING) for input_file_name in ann_files: full_file_path = os.path.join(input_dir_name, input_file_name) @@ -203,8 +204,22 @@ def convert_brat_to_con(brat_file_path, text_file_path=None): # Paste all the text files used in the conversion process to the output directory # if there's a fourth command line argument and that argument is -c - if argv.__len__() == 4 and argv[3] == "-c": + if len(argv) >= 4 and argv[3] == "-c": text_files_with_match = [f for f in text_files if switch_extension(f, ".ann") in ann_files] for f in text_files_with_match: full_name = os.path.join(input_dir_name, f) shutil.copy(full_name, output_dir_name) + + # Compile and print stats to log + stat_headers = ["Total lines", "Total converted", + "Skipped did not match regex", "Percent converted"] + + stat_data = [ + num_lines, + num_lines - num_skipped_regex, + num_skipped_regex, + (num_lines - num_skipped_regex) / num_lines + ] + + conversion_stats = tabulate.tabulate(headers=stat_headers, tabular_data=[stat_data]) + logging.warning("\n" + conversion_stats) diff --git a/medacy/tools/con_form/con_to_brat.py b/medacy/tools/converters/con_to_brat.py similarity index 56% rename from 
medacy/tools/con_form/con_to_brat.py rename to medacy/tools/converters/con_to_brat.py index 7460715c..1daa91ca 100644 --- a/medacy/tools/con_form/con_to_brat.py +++ b/medacy/tools/converters/con_to_brat.py @@ -6,16 +6,31 @@ Function 'convert_con_to_brat()' can be imported independently and run on individual files. -This version does not produce accurate output. Revisions are underway. +This program can be used for conversion independently from medaCy if the Line class is copied +and pasted into a copy of this program. :author: Steele W. Farnsworth -:date: 18 February, 2019 +:date: 13 March, 2019 """ -from sys import argv as cmd_arg, exit -from re import split, findall, fullmatch, DOTALL +from sys import argv, exit +from re import split, findall, fullmatch +from medacy.tools.converters.conversion_tools.line import Line +import re import os import shutil +import logging +import tabulate + + +# Regex patterns +whitespace_pattern = "( +|\t+)+" +con_pattern = "c=\".+?\" \d+:\d+ \d+:\d+\|\|t=\".+?\"(|\n)" + +# Used for stats at the end +num_lines = 0 +num_skipped_regex = 0 +num_skipped_value_error = 0 def is_valid_con(item: str): @@ -25,7 +40,6 @@ def is_valid_con(item: str): :return: Boolean of whether or not the line matches a con regular expression. """ if not isinstance(item, str): return False - con_pattern = "c=\".+?\" \d+:\d+ \d+:\d+\|\|t=\".+?\"(|\n)" if fullmatch(con_pattern, item): return True else: return False @@ -49,25 +63,58 @@ def switch_extension(name, ext): return os.path.splitext(name)[0] + ext -def get_absolute_index(txt, txt_lns, ind): +def get_absolute_index(txt_lns, ind, entity): """ - Given one of the \d+:\d+ spans, which represent the index of a char relative to the start of the line it's on, + Given one of the \d+:\d+ spans, which represent the index of a word relative to the start of the line it's on, returns the index of that char relative to the start of the file. - :param txt: The text file associated with the annotation. 
- :param txt_lns: The same text file as a list broken by lines + :param txt_lns: The list of Line objects for that file. :param ind: The string in format \d+:\d+ + :param entity: The text of the entity :return: The absolute index """ - # convert ind to line_num and char_num + # Convert ind to line_num and char_num nums = split(":", ind) line_num = int(nums[0]) - 1 # line nums in con start at 1 and not 0 - char_num = int(nums[1]) + word_num = int(nums[1]) this_line = txt_lns[line_num] - line_index = txt.index(this_line) # get the absolute index of the entire line - abs_index = line_index + char_num - return abs_index + line_index = this_line.index + + # Get index of word following n space + split_by_whitespace = split(whitespace_pattern, this_line.text) + split_by_whitespace = [s for s in split_by_whitespace if s != ''] + split_by_ws_no_ws = [s for s in split_by_whitespace if not s.isspace()] + all_whitespace = [s for s in split_by_whitespace if s.isspace()] + + # Adjust word_num if first character cluster is whitespace + if split_by_whitespace[0].isspace(): + line_to_target_word = split_by_ws_no_ws[:word_num - 1] + else: + line_to_target_word = split_by_ws_no_ws[:word_num] + + num_non_whitespace = sum([len(w) for w in line_to_target_word]) + num_whitespace = sum([len(w) for w in all_whitespace[:word_num]]) + + index_within_line = num_whitespace + num_non_whitespace + line_to_start_index = this_line.text[index_within_line:] + entity_pattern_escaped = re.escape(entity) + entity_pattern_spaced = re.sub(r"\\\s+", r"\s+", entity_pattern_escaped) + + try: + # Search for entity regardless of case or composition of intermediate spaces + # match = re.search(entity_pattern_spaced, this_line.text, re.IGNORECASE)[0] + match = re.search(entity_pattern_spaced, line_to_start_index, re.IGNORECASE)[0] + offset = line_to_start_index.index(match) # adjusts if entity is not the first char in its "word" + except (ValueError, TypeError): + logging.warning("""Entity not found in its 
expected line: + \t"%s" + \t"%s" + \tRevision of input data may be required; conversion for this item was skipped""" % (entity, this_line) + ) + return -1 + + return index_within_line + line_index + offset def convert_con_to_brat(con_file_path, text_file_path=None): @@ -81,6 +128,8 @@ def convert_con_to_brat(con_file_path, text_file_path=None): :return: A string representation of the brat file, which can then be written to file if desired. """ + global num_lines, num_skipped_regex, num_skipped_value_error + # By default, find txt file with equivalent name if text_file_path is None: text_file_path = switch_extension(con_file_path, ".txt") @@ -89,15 +138,17 @@ def convert_con_to_brat(con_file_path, text_file_path=None): " directory") with open(text_file_path, 'r') as text_file: text = text_file.read() - text_lines = text.split('\n') + text_lines = Line.init_lines(text) # Else, open the file with the path passed to the function elif os.path.isfile(text_file_path): with open(text_file_path, 'r') as text_file: text = text_file.read() - text_lines = text.split('\n') + text_lines = Line.init_lines(text) else: raise FileNotFoundError("No text file path was provided or the file was not found." " Note that direct string input of the source text is not supported.") + num_lines += len(text_lines) + # If con_file_path is actually a path, open it and split it into lines if os.path.isfile(con_file_path): with open(con_file_path, 'r') as con_file: @@ -111,10 +162,17 @@ def convert_con_to_brat(con_file_path, text_file_path=None): output_text = "" t = 1 for line in con_text_lines: - if not is_valid_con(line): continue + if line == "" or line.startswith("#"): continue + elif not is_valid_con(line): + logging.warning("Incorrectly formatted line in %s was skipped: \"%s\"." 
% (con_file_path, line)) + num_skipped_regex += 1 + continue d = line_to_dict(line) - start_ind = get_absolute_index(text, text_lines, d["start_ind"]) - span_length = d["data_item"].__len__() + start_ind = get_absolute_index(text_lines, d["start_ind"], d["data_item"]) + if start_ind == -1: + num_skipped_value_error += 1 + continue # skips data that could not be converted + span_length = len(d["data_item"]) end_ind = start_ind + span_length output_line = "T%s\t%s %s %s\t%s\n" % (str(t), d["data_type"], str(start_ind), str(end_ind), d["data_item"]) output_text += output_line @@ -127,34 +185,42 @@ def convert_con_to_brat(con_file_path, text_file_path=None): # Get the input and output directories from the command line. - if not cmd_arg.__len__() >= 3: + if len(argv) < 3: # Command-line arguments must be provided for the input and output directories. - # Else, prints instructions and aborts the program. - print("Please run the program again, entering the input and output directories as command-line arguments" - " in that order. Optionally, enter '-c' as a final command line argument if you want to copy" - " the text files used in the conversion over to the output directory.") - exit() + raise IOError("Please run the program again, entering the input and output directories as command-line" + " arguments in that order. 
Optionally, enter '-c' as a final command line argument if you want" + " to copy the text files used in the conversion over to the output directory.") try: - input_dir_name = cmd_arg[1] + input_dir_name = argv[1] input_dir = os.listdir(input_dir_name) except FileNotFoundError: # dir doesn't exist while not os.path.isdir(input_dir_name): input_dir_name = input("Input directory not found; please try another directory:") input_dir = os.listdir(input_dir_name) try: - output_dir_name = cmd_arg[2] + output_dir_name = argv[2] output_dir = os.listdir(output_dir_name) except FileNotFoundError: while not os.path.isdir(output_dir_name): output_dir_name = input("Output directory not found; please try another directory:") output_dir = os.listdir(output_dir_name) + # Create the log + log_path = os.path.join(output_dir_name, "conversion.log") + logging.basicConfig(filename=log_path) + # Get only the text files in input_dir text_files = [f for f in input_dir if f.endswith(".txt")] # Get only the con files in input_dir that have a ".txt" equivalent con_files = [f for f in input_dir if f.endswith(".con") and switch_extension(f, ".txt") in text_files] + # Ensure user is aware if there are no files to convert + if len(con_files) < 1: + raise FileNotFoundError("There were no con files in the input directory with a corresponding text file. 
" + "Please ensure that the input directory contains ann files and that each file has " + "a corresponding txt file (see help for this program).") + for input_file_name in con_files: full_file_path = os.path.join(input_dir_name, input_file_name) output_file_name = switch_extension(input_file_name, ".ann") @@ -165,8 +231,23 @@ def convert_con_to_brat(con_file_path, text_file_path=None): # Paste all the text files used in the conversion process to the output directory # if there's a fourth command line argument and that argument is -c - if cmd_arg.__len__() == 4 and cmd_arg[3] == "-c": + if len(argv) >= 4 and argv[3] == "-c": text_files_with_match = [f for f in text_files if switch_extension(f, ".con") in con_files] for f in text_files_with_match: full_name = os.path.join(input_dir_name, f) shutil.copy(full_name, output_dir_name) + + # Compile and print stats to log + stat_headers = ["Total lines", "Total converted", "Lines skipped", "Skipped due to value error", + "Skipped did not match regex", "Percent converted"] + stat_data = [ + num_lines, + num_lines - num_skipped_regex - num_skipped_value_error, + num_skipped_regex + num_skipped_value_error, + num_skipped_value_error, + num_skipped_regex, + (num_lines - num_skipped_regex - num_skipped_value_error) / num_lines + ] + + conversion_stats = tabulate.tabulate(headers=stat_headers, tabular_data=[stat_data]) + logging.warning("\n" + conversion_stats) diff --git a/medacy/tools/converters/conversion_tools/__init__.py b/medacy/tools/converters/conversion_tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/medacy/tools/converters/conversion_tools/line.py b/medacy/tools/converters/conversion_tools/line.py new file mode 100644 index 00000000..72af12de --- /dev/null +++ b/medacy/tools/converters/conversion_tools/line.py @@ -0,0 +1,60 @@ +""" +:author: Steele Farnsworth +:date: 13 March, 2019 +""" + + +class Line: + """ + Represents a line of text in the text file related to an annotation file, 
ensuring that each line has an accurate + start index as one of its attributes regardless of whether that line appears more than once + """ + + def __init__(self, line_text: str, line_num: int, line_index: int): + self.text = line_text + self.num = line_num + self.index = line_index + + @staticmethod + def init_lines(full_text: str): + """ + Creates all the Line objects for a given text file, storing them in a list where index n is the nth - 1 + line of the document. + :param full_text: The entire text of the document. + :return: The list of Lines. + """ + global_start_ind = 0 + global_line_num = 0 + + full_text_lines = full_text.split('\n') + text_lines = [] + + for given_line in full_text_lines: + + sub_index = 0 + matches = [] + while sub_index < global_start_ind: + for previous_line in text_lines: + if given_line == previous_line.text: + matches.append(previous_line) + sub_index += previous_line.index + + if matches: + # Get the text from the end of the last match onward + search_text_start = matches[-1].index + len(matches[-1].text) + search_text = full_text[search_text_start:] + start_ind = search_text.index(given_line) + search_text_start + else: # The line is unique so str.index() will be accurate + start_ind = full_text.index(given_line) + + new_line = Line(given_line, global_line_num, start_ind) + text_lines.append(new_line) + + global_start_ind = text_lines[-1].index + global_line_num += 1 + + return text_lines + + def __str__(self): + """String representation of a line, with its index and text separated by a pipe.""" + return "%i | %s" % (self.index, self.text)