Skip to content

Commit

Permalink
Added option to lowercase tokens for pretrained embedding compatibility if necessary
Browse files Browse the repository at this point in the history
  • Loading branch information
swabhs committed Jun 30, 2016
1 parent cfb3062 commit 3bb3abf
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
7 changes: 7 additions & 0 deletions parser/c2.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace cpyp {
class Corpus {
public:
bool USE_SPELLING = false;
bool USE_LOWERWV = false;
// String literals
static constexpr const char* UNK = "UNK";
static constexpr const char* BAD0 = "<BAD0>";
Expand Down Expand Up @@ -163,6 +164,9 @@ class Corpus {

// Token
token = tok_pos_pair.substr(0, postag_char_idx);
if (USE_LOWERWV) {
transform(token.begin(), token.end(), token.begin(), ::tolower);
}
if (!tok_dict.Contains(token)) {
// character stuff // TODO(Swabha): look into later
unsigned j = 0;
Expand Down Expand Up @@ -322,6 +326,9 @@ class Corpus {

// Token
token = tok_pos_pair.substr(0, postag_charpos);
if (USE_LOWERWV) {
transform(token.begin(), token.end(), token.begin(), ::tolower);
}
unsigned tok_id = tok_dict.Convert(token);
token_vocab_size = tok_dict.size();
++num_tokens;
Expand Down
5 changes: 4 additions & 1 deletion parser/lstm-parse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ void init_command_line(int argc, char** argv, po::variables_map* conf) {
po::value<unsigned>()->default_value(100), "LSTM input dimension")(
"dropout", po::value<float>()->default_value(0.2f), "Dropout rate")(
"train,t", "Should training be run?")("words,w",
po::value<string>(), "pretrained word embeddings")("use_spelling,S",
po::value<string>(), "pretrained word embeddings")("use_lowerwv",
"Lowercase tokens for wv compatibility")("use_spelling,S",
"Use spelling model")("gold_conll,g", po::value<string>(),
"Gold dev/test conll file for eval")("output_conll,s",
po::value<string>(), "Predicted dev/test conll file for eval")(
Expand Down Expand Up @@ -1458,7 +1459,9 @@ int main(int argc, char** argv) {

USE_POS = conf.count("use_pos_tags");
USE_SPELLING = conf.count("use_spelling"); //Miguel

corpus.USE_SPELLING = USE_SPELLING;
corpus.USE_LOWERWV = conf.count("use_lowerwv");

LAYERS = conf["layers"].as<unsigned>();
INPUT_DIM = conf["input_dim"].as<unsigned>();
Expand Down

0 comments on commit 3bb3abf

Please sign in to comment.