-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathagents.proto
1028 lines (795 loc) · 37.8 KB
/
agents.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
Copyright (c) Meta Platforms, Inc. and affiliates.
This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.
*/
// Format this file with clang after editing:
// clang-format-8 conf/*.proto -i
syntax = "proto2";
package fairdiplomacy;
import public "conf/common.proto";
// Agent with no configuration options; presumably issues random orders
// (per the name — confirm against the agent implementation).
message RandomAgent {}
// Agent that samples orders directly from a BaseStrategyModel checkpoint
// (no search on top).
message BaseStrategyModelAgent {
  // Required. Path to BaseStrategyModel checkpoint.
  optional string model_path = 1;
  // Required. Softmax temperature used when sampling orders.
  optional float temperature = 2;
  // Optional. Share of probability mass to keep for (nucleus) sampling.
  optional float top_p = 3 [ default = 1.0 ];
  // Optional. Predict orders for a full-press (vs no-press) game.
  optional bool has_press = 4 [ default = false ];
  // Optional. Use FP16 at inference time for everything.
  optional bool half_precision = 5 [ default = false ];
  // CUDA device to use. Set to -1 to use CPU.
  optional int32 device = 6 [ default = 0 ];
}
// Shared configuration for model-driven game rollouts used by the search
// agents (see rollouts_cfg fields below).
message BaseStrategyModelRollouts {
  // Number of rollout worker threads.
  optional uint32 n_threads = 1 [ default = 70 ];
  // Temperature used for rollouts
  optional float temperature = 2 [ default = 1.0 ];
  // Nucleus ratio used for rollouts. During nucleus sampling only
  // the smallest subset of actions that has probability at least top_p is
  // considered. All other actions are never sampled.
  optional float top_p = 3 [ default = 1.0 ];
  // Maximum rollout length in MOVEMENT PHASES before heuristically
  // evaluating the game.
  optional int32 max_rollout_length = 4 [ default = -1 ];
  // # of rollouts to run in parallel for each possible action
  optional uint32 average_n_rollouts = 5 [ default = 1 ];
  // Optional float 0 - 1 to mix in raw sum of squares ratio
  optional float mix_square_ratio_scoring = 6 [ default = 0 ];
  // Optional. Enables optimization on cloning dipcc.Game objects.
  optional bool clear_old_all_possible_orders = 7 [ default = false ];
  // Optional. A string of "year,prob;year,prob;..."
  // "year,prob" indicates that at the start of SPRING of that year or later
  // years there is a probability of the game ending instantly and being scored
  // as-is, therefore we average the rollout with prob * raw score whenever the
  // rollout reaches a new year.
  optional string year_spring_prob_of_ending = 8;
  // DEPRECATED (now computed automatically)
  // Predict orders for a full-press (vs no-press) game.
  optional bool has_press = 900 [ default = false ];
}
// Configuration for building the set of plausible order-sets (actions) that
// search considers for each power, via base_strategy_model and/or parlai.
message PlausibleOrderSampling {
  // Number of order-sets (actions) to consider at each step in search code
  optional uint32 n_plausible_orders = 1;
  // Optional, limit number of actions to consider as a ratio of # units:
  // # plausible actions = min(ceil(max_actions_units_ratio * #units),
  //                           n_plausible_orders)
  optional float max_actions_units_ratio = 2 [ default = -1 ];
  // Optional. Excludes all-hold orders of length >=N from plausible orders.
  optional int32 exclude_n_holds = 3 [ default = -1 ];
  // Total number of samples requested from the model.
  optional uint32 req_size = 4 [ default = 700 ];
  // Optional. Batch size for sample queries.
  // If not set, will be equal to req_size.
  optional uint32 batch_size = 5 [ default = 0 ];
  // Optional. If true, then take the first n_plausible_orders
  // order-sets returned by parlai, rather than the top N.
  optional bool parlai_take_first = 6 [ default = false ];
  // Optional. If true, candidate orders are generated by parlai, but then
  // "rescored" by parlai to produce the blueprint policy.
  optional bool do_parlai_rescoring = 7 [ default = false ];
  // Optional. If non-zero, only rescore the top N orders, discard the rest.
  optional uint32 n_rescore = 8 [ default = 0 ];
  // Request size used for the parlai model.
  optional uint32 parlai_req_size = 9 [ default = 200 ];
  // Batch size used for the parlai model.
  optional uint32 parlai_batch_size = 10 [ default = 20 ];
  // Optional. If rescoring with parlai, consider the top frac*limit
  // orders from base_strategy_model and add them to the plausible set if
  // missing
  optional float augment_base_strategy_model_frac = 11 [ default = 0 ];
  // Optional. If true and multiple GPUs are available, will parallelize over
  // GPUs. This may slow down smaller runs as it disables the parlai cache and
  // so the models will be reloaded on each agent initialization.
  optional bool allow_multi_gpu = 12;
};
// Double-oracle (DO) exploration: iteratively augments the plausible action
// set with candidate actions that achieve higher EV against the current
// equilibrium.
message DoubleOracleExploration {
  // Required. How many iterations to do. Use at least as many as powers you
  // have.
  optional int32 max_iters = 1;
  // Optional. Min EV diff to choose a new action over existing one.
  optional float min_diff = 2 [ default = 0.0 ];
  // Optional. Min EV relative diff in percentage. Must be in [1.0, 100].
  optional float min_diff_percentage = 8 [ default = 0.0 ];
  // Optional. Maximum number of opponent actions to consider for EV
  // computation. If not set, will be infinite.
  optional int32 max_op_actions = 4;
  // Optional. If set, will use the full policy of the opponent to compute
  // Q(s, a). Only supported if there is a single opponent. If false, will
  // sample some number of actions for the opponent.
  optional bool use_exact_op_policy = 10 [ default = true ];
  // Optional. If set, a couple of games will be considered identical if the
  // final board state is the same. This flag should be relatively safe for 2p
  // games.
  optional bool use_board_state_hashing = 5;
  // Optional. If set, will process powers in random order rather
  // than in lexicographic order, aka in order of the POWERS list.
  optional bool shuffle_powers = 6;
  // Optional. If set, will regenerate the set of orders after each DO
  // iteration. Otherwise will generate set of orders once.
  optional bool regenerate_every_iter = 7;
  // Optional. If set, will only run DO for the agent's own power.
  // NOTE(review): the original comment was truncated ("will only run") —
  // confirm semantics against the implementation.
  optional bool only_agent_power = 9;
  // Optional. If set, then the algorithm will do fewer iterations if it has
  // worked more than this number of seconds.
  optional int32 max_seconds = 11 [ default = 0 ];
  // Optional. If set, will redefine the number of iterations used to compute
  // the equilibrium in the search agent.
  optional int32 n_rollouts = 12;
  // How candidate actions are generated on each DO iteration. Exactly the
  // oneof generators set below are used.
  message Generation {
    // Required. How many random actions to sample at each iteration.
    optional int32 max_actions = 1;
    // Samples orders uniformly at random.
    message UniformGenerator {
      // Optional. Consider orders that support foreign units.
      optional bool allow_foreign_supports = 1 [ default = true ];
    }
    message ColumnGenerator {
      // Optional. If set, will use this model instead of one in the agent.
      optional string model_path = 1;
      // Optional. Sampling temperature.
      // NOTE(review): original comment was a copy-paste of
      // allow_foreign_supports's comment.
      optional float temperature = 2 [ default = 1.0 ];
    }
    message BaseStrategyModelGenerator {
      // Optional. If set, will use this model instead of one in the agent.
      optional string model_path = 1;
      // Optional. Sampling temperature.
      optional float temperature = 2 [ default = 1.0 ];
      // Optional. Prediction order: default or random.
      optional string location_order = 3 [ default = "default" ];
    }
    message LocalUniformGenerator {
      // Required. How many actions to take from the policy and try to
      // modify.
      optional int32 num_base_actions = 1;
      // Optional. If set, will sample actions from the policy. Otherwise
      // will take top actions (default).
      optional bool use_sampling = 2;
      // Optional. Whether to use blueprint policy (default) or search policy
      // to select base actions.
      optional bool use_search_policy = 3;
      // Optional. Replace supports with hold in the base action, if the action
      // is not coordinated.
      optional bool fix_uncoordinated_base = 4;
      // Optional. If set, will search not only over locations adjacent to
      // existing units, but over all locations. In other words, this will
      // group units by points of influence.
      optional bool with_holes = 5;
    }
    oneof maybe_uniform { UniformGenerator uniform = 2; }
    oneof maybe_column { ColumnGenerator column = 3; }
    oneof maybe_base_strategy_model {
      BaseStrategyModelGenerator base_strategy_model = 4;
    }
    oneof maybe_local_uniform { LocalUniformGenerator local_uniform = 5; }
  }
  // Candidate-action generation settings.
  optional Generation generation = 3;
}
// Configuration for the CFR/QRE-based search agent ("SearchBot").
message SearchBotAgent {
  // Required. Path to BaseStrategyModel checkpoint. This model is always used
  // to select plausible actions; it's also used for rollouts and value
  // computation, unless these are specified explicitly.
  optional string model_path = 1;
  // Number of postman server processes to launch
  optional uint32 n_server_procs = 3;
  // Distribute server processes over multiple GPUs
  optional uint32 n_gpu = 4;
  // Model server maximum batch size
  optional uint32 max_batch_size = 5 [ default = 700 ];
  // Number of CFR iterations
  optional int32 n_rollouts = 6 [ default = -1 ];
  // CUDA device to use, if > 0
  optional int32 device = 13 [ default = 0 ];
  // Optional, if True, sample from final iter instead of average iter
  optional bool use_final_iter = 18 [ default = true ];
  // Optional: separate model path to compute the value function. If not set
  // model_path is used.
  optional string value_model_path = 19;
  // Optional: separate model path for rollouts. If not set model_path is used.
  optional string rollout_model_path = 30;
  // Optional host:port for value model server
  optional string use_value_server_addr = 20;
  // Optional, if >0 then play BP strategy for this many iters
  optional int32 bp_iters = 23 [ default = 0 ];
  // Optional, if >0 then play BP strategy with this probability.
  // NOTE(review): original comment duplicated bp_iters's — confirm semantics.
  optional float bp_prob = 24 [ default = 0 ];
  // Optional, if >0, then at each rollout step will use the current
  // model-predicted value as this fraction of the final estimate
  // (i.e. exponentially decaying effect of rollouts of increasing length)
  optional float rollout_value_frac = 25 [ default = 0 ];
  // If true, cache rollout results keyed by the rolled-out actions.
  optional bool cache_rollout_results = 26 [ default = false ];
  // Optional. If set, will compute the cache for all possible joint actions
  // before running CFR. The precomputation will be skipped if there are more
  // than 2 alive powers.
  optional bool precompute_cache = 43 [ default = false ];
  // Debugging for situation check tests only:
  // use the seed for plausible actions, then pick a random seed for rollouts.
  optional bool reset_seed_on_rollout = 27 [ default = false ];
  // If this power is a loser, i.e. its action values are less
  // than this value, then this power plays blueprint.
  optional float loser_bp_value = 28 [ default = 0 ];
  // First CFR iteration to start following loser_bp_value
  // (we need a few iterations to accurately estimate the action values)
  optional float loser_bp_iter = 29 [ default = 64 ];
  // If true, then exploitability is calculated at regular intervals.
  optional bool enable_compute_nash_conv = 32 [ default = false ];
  // Optional. If set, then this model will be used for plausible orders.
  optional ParlaiModel parlai_model = 34;
  // Optional. If set, then the agent will do fewer iterations if it has
  // worked more than this number of seconds.
  optional int32 max_seconds = 37 [ default = 0 ];
  // Optional. By default, use optimistic cfr
  optional bool use_optimistic_cfr = 38 [ default = true ];
  // Rollout parameters
  optional BaseStrategyModelRollouts rollouts_cfg = 39;
  // Strategies for augmenting the plausible-order set beyond plain sampling.
  message PlausibleOrderAugmentation {
    // Fills plausible orders up to the limit with random diverse actions.
    message RandomExploration {
      // Optional. How many top plausible orders always to keep. Has priority
      // over max_actions_to_drop.
      optional int32 min_actions_to_keep = 1 [ default = 0 ];
      // Optional. Make sure we replace at least this number of orders.
      optional int32 max_actions_to_drop = 2 [ default = 0 ];
    }
    oneof augmentation_type {
      // Fills plausible orders up to limit with random diverse actions.
      RandomExploration random = 1;
      // Iteratively tries to find actions that have higher EV from a pool of
      // random actions.
      DoubleOracleExploration do = 2;
    }
  }
  // Defines plausible order augmentation.
  optional PlausibleOrderAugmentation order_aug = 40;
  // Plausible-order sampling parameters.
  optional PlausibleOrderSampling plausible_orders_cfg = 41;
  // Optional. Use FP16 at inference time for everything.
  optional bool half_precision = 44 [ default = false ];
  // Optional. Dialogue parameters
  oneof maybe_dialogue { ParlaiDialogue dialogue = 50; }
  // Parlai model used for orders.
  optional ParlaiModel parlai_model_orders = 51;
  // Optional. If true, generate messages based on CFR-predicted orders
  optional bool cfr_messages = 52 [ default = false ];
  enum PolicyToPseudo {
    // sample an action from the policy
    SAMPLE = 0;
    // take the argmax action
    ARGMAX = 1;
  };
  // This section configures different strategies on how to generate message
  // pseudo-orders for other powers. If strategy=NONE, then pseudo-orders are
  // sampled from the computed equilibrium.
  // Other strategies described in the enum.
  message BilateralDialogue {
    // heyhi can only handle enum overrides if the enum is defined in the
    // same subcfg as it's used. See conf.py:208
    enum BilateralDialogueStrategy {
      // Sample pseudo-orders for all powers from the computed equilibrium
      NONE = 0;
      // From the set of opponent actions where blueprint probability >=
      // min_order_prob, pick the action which maximizes agent_power's value
      EXPLOIT = 1;
      // From the set of opponent actions where (average) equilibrium
      // probability >= min_order_prob, pick the action which maximizes
      // agent_power's value
      BEST_EQ = 2;
      // From the set of opponent actions where population
      // probability >= min_order_prob, pick the action which maximizes
      // agent_power's value
      BEST_POP = 3;
    };
    optional BilateralDialogueStrategy strategy = 1 [ default = NONE ];
    // The minimum order probability (in either the BP or equilibrium,
    // depending on strategy) that will be considered for pseudo-orders.
    optional float min_order_prob = 2 [ default = 0.02 ];
  }
  optional BilateralDialogue bilateral_dialogue = 54;
  // If specified, will simulate this SearchBot agent end-to-end with the
  // specified number of samples, and will compute a best response to their
  // average policy. Exploited agent MUST have use_final_iter = false.
  oneof maybe_exploited_searchbot_cfg {
    SearchBotAgent exploited_searchbot_cfg = 57;
  }
  // Number of end-to-end samples for the exploited agent simulation.
  optional int32 exploited_agent_num_samples = 58 [ default = 1 ];
  // Power played by the exploited agent.
  optional string exploited_agent_power = 59;
  // Set to true to enable logging of CFR average policies and utilities on
  // iterations other than the final iteration.
  optional bool log_intermediate_iterations = 60 [ default = false ];
  // Set to true to enable logging of bilateral CFR values, showing the effects
  // of other powers' actions on the agent_power's utility.
  optional bool log_bilateral_values = 61 [ default = false ];
  // Optional. If true, enables setting player ratings
  optional bool set_player_rating = 62 [ default = false ];
  // Optional. Player rating to be used. Only used when set_player_rating
  // is set to true
  optional float player_rating = 63;
  // Parameters for quantal-response-equilibrium (QRE) search.
  message QRE {
    // QRE Hedge parameter eta: roughly, the learning rate. We believe that 10
    // is a reasonable value.
    optional float eta = 1 [ default = 10.0 ];
    // QRE parameter lambda: roughly, the temperature of the policy, in units
    // of SoS score
    optional float qre_lambda = 2 [ default = 0.0 ];
    enum QRETargetPi {
      // Uses the uniform policy as the target policy for QRE
      UNIFORM = 0;
      // Uses blueprint policy as the target policy for QRE
      BLUEPRINT = 1;
    }
    // QRE target pi to be used
    optional QRETargetPi target_pi = 3 [ default = UNIFORM ];
    // If set, uses agent_qre_lambda for agent
    // and qre_lambda for opponents
    optional float agent_qre_lambda = 4;
    // Multiply the weight on the entropy part of KL term by this.
    optional float qre_entropy_factor = 5 [ default = 1.0 ];
    // If set, uses agent_qre_entropy_factor for the agent and
    // qre_entropy_factor for opponents
    optional float agent_qre_entropy_factor = 6;
  }
  // Optional. Uses QRE instead of CFR
  oneof maybe_qre { QRE qre = 64; }
  // Optional. If set, the plausible actions will be rescored with model below
  // to get a blueprint policy.
  optional string rescoring_blueprint_model_path = 65;
  // Optional. If true, do an incremental BP update after each message.
  // Only relevant if cfr_messages=true
  optional bool do_incremental_search = 66;
  // If true, then always use your most likely orders as pseudo-orders,
  // even if PPO model says they are unlikely.
  optional bool use_truthful_pseudoorders = 67 [ default = false ];
  // Same as above, but for the recipient's pseudo-orders.
  optional bool use_truthful_pseudoorders_recipient = 68 [ default = false ];
  // If one of use_truthful_pseudoorders or use_truthful_pseudoorders_recipient
  // is set, we don't need to compute probabilities of all pseudo-orders for
  // either us or the recipient. But we still compute them for logging
  // purposes. Turning this flag on will remove these logs, but will speed
  // things up.
  optional bool skip_policy_evaluation_for_truthful_pseudoorders = 69;
  optional bool use_greedy_po_for_rollout = 70;
  // Best-response-to-correlated-bilateral-search parameters.
  message BRCorrBilateralSearch {
    // whether to use pair search in pseudo order generation
    optional bool enable_for_pseudo_order = 1 [ default = false ];
    // use br to correlated bilateral search to generate final orders
    optional bool enable_for_final_order = 3 [ default = false ];
    // number of conditional joint action samples used to estimate evs for
    // actions given a pair of powers
    optional int32 bilateral_search_num_cond_sample = 4 [ default = 10 ];
    // num of samples to compute best response against correlated bilateral br
    optional int32 br_num_sample = 5 [ default = 1000 ];
    // whether to use all power model to compute p_joint
    // e.g. if set to false, the weights for opponent joint actions should all
    // be 1
    optional bool use_all_power_for_p_joint = 6 [ default = true ];
    // a small probability to be added to the p_joint(a1, a2, ..., a6) and
    // prod_i p_bp(ai) to prevent the weight from overflowing
    optional float joint_action_min_prob = 7 [ default = 0 ];
    // coefficient to regularize br policy towards bp, ev_a + lambda log(bp_a)
    optional float br_regularize_lambda = 8 [ default = 0 ];
    // clip unnormalized weight (i.e. importance ratio) with [min, max]
    optional float min_unnormalized_weight = 9 [ default = -1 ];
    optional float max_unnormalized_weight = 10 [ default = -1 ];
    // this should be specified by searchbot.model_path
    optional string all_power_model_path = 101 [ deprecated = true ];
  };
  // Optional. If set, use best response to correlated bilateral search
  // to generate final order and maybe pseudo orders
  oneof maybe_br_corr_bilateral_search {
    BRCorrBilateralSearch br_corr_bilateral_search = 71;
  };
  // Parameters for searching over candidate outgoing messages.
  message MessageSearch {
    // Number of messages to (maybe) sample
    optional int32 n_messages = 1 [ default = 1 ];
    // Absolute score differential threshold
    optional float max_score_diff_threshold = 5 [ default = 0.007 ];
    // Relative score differential threshold
    optional float max_rel_score_diff_threshold = 6 [ default = 0.1 ];
    enum MessageSearchStrategy {
      // Dummy strategy; returns random message
      NONE = 0;
      // Selects best message
      BEST = 1;
      // Selects message using softmax with temperature `softmax_temperature`
      SOFTMAX = 2;
      // Selects random message among the top `filter_top_k`
      FILTER = 3;
    }
    optional MessageSearchStrategy strategy = 7 [ default = BEST ];
    // Only applies for SOFTMAX strategy.
    optional float softmax_temperature = 8 [ default = 0.003 ];
    // Only applies for FILTER strategy.
    optional int32 filter_top_k = 9 [ default = 5 ];
  }
  // Optional. If set, perform message search when generating messages
  oneof maybe_message_search { MessageSearch message_search = 72; }
  // Has no effect.
  optional bool use_predicted_final_scores = 8 [ deprecated = true ];
  // Has no effect.
  optional bool postman_sync_batches = 11 [ deprecated = true ];
  // Has no effect.
  optional string use_server_addr = 12 [ deprecated = true ];
  // Has no effect.
  optional PolicyToPseudo policy_to_pseudo = 53 [ deprecated = true ];
  // Has no effect.
  optional int32 n_message_search = 55 [ default = 0, deprecated = true ];
  // Has no effect.
  optional string message_search_reply = 56 [ deprecated = true ];
}
// Agent that extends SearchBotAgent with a distribution over player "types"
// that differ by QRE lambda (and possibly the rescoring policy).
message BQRE1PAgent {
  // SearchBotAgent configuration that will be inherited to initialize the
  // BQRE1PAgent.
  optional SearchBotAgent base_searchbot_cfg = 1;
  // Number of player types we will be modelling
  optional int32 num_player_types = 2;
  // Current agent player type
  // Note: agent_type=1 will have the lowest lambda
  // and thus would be the strongest player
  optional int32 agent_type = 3;
  // Describes how the set of player types is constructed.
  message PlayerTypes {
    message Policy {
      // Leave both fields blank to indicate no-rescoring.
      optional string model_path = 1;
      optional string name = 2;
    }
    // LogUniformLambdas allows to create some number of player types that
    // differ by lambda and maybe the policy we use to rescore.
    //
    // The following invariant must be true:
    //   num_player_types = num_lambda * max(len(policies), 1) +
    //                      int(include_zero_lambda)
    // If no such num_lambda exists, then an exception will be raised.
    //
    // If no `policies` are provided then the default policy is used:
    //   - if target_pi=UNIFORM, then it will be uniform
    //   - otherwise the policy from plausible_orders as input to run_search is
    //     used (= rescoring_blueprint_model_path if set, or model_path
    //     otherwise)
    //
    // If policies are provided, then they will be used to assign probabilities
    // to the plausible_actions. It's still possible to use the incoming
    // probabilities by passing an empty model_path.
    message LogUniformLambdas {
      optional double min_lambda = 1;
      optional double max_lambda = 2;
      repeated Policy policies = 3;
      // If true, will include a special type with lambda=0. Note the 0-lambda
      // type will be added only for one policy, as the policy doesn't matter
      // for such lambda. This type will be the first type.
      optional bool include_zero_lambda = 5;
    }
    oneof player_types { LogUniformLambdas log_uniform = 1; }
  }
  // The player types to use.
  optional PlayerTypes player_types = 9;
  // If true, then the agent type is public, meaning it has a singleton
  // distribution from the beginning of the game.
  optional bool agent_type_is_public = 8 [ default = false ];
  // Raise all lambdas to this power and scale by this value in 1901.
  optional float pow_lambdas_1901 = 10;
  optional float scale_lambdas_1901 = 11;
  // Raise all lambdas to this power and scale by this value in 1901 spring.
  // Takes precedence over pow_lambdas_1901 and scale_lambdas_1901 for 1901
  // spring.
  optional float pow_lambdas_1901_spring = 12;
  optional float scale_lambdas_1901_spring = 13;
  // Scale lambdas of everyone based on the sqrt(variance) of their values when
  // simply uniformly sampling actions for everyone from the blueprint rolling
  // out through one movement phase.
  // Always adds a minimum epsilon*epsilon to the variance estimate.
  // dynamic_lambda_stdev_baseline indicates the standard deviation at which
  // no lambda scaling will be applied.
  // NOTE(review): the field name below misspells "epsilon"; kept as-is for
  // wire/text-format compatibility.
  optional float dynamic_lambda_stdev_espilon = 14;
  optional float dynamic_lambda_stdev_baseline = 15;
  optional int32 dynamic_lambda_stdev_num_samples = 16 [ default = 0 ];
  // DEPRECATED.
  optional float lambda_min = 105 [ deprecated = true ];
  optional float lambda_multiplier = 106 [ deprecated = true ];
  optional bool do_bayesian_updates = 107 [ deprecated = true ];
  optional int32 update_past_n_phases = 108 [ deprecated = true ];
}
// Search agent built around a condition-aware all-power policy model plus a
// separate value model.
message TheBestAgent {
  // Required. Model to predict responses to our orders by other powers. Must
  // be a condition-aware all-power model. Will be used for everything else by
  // default.
  optional string conditional_policy_model_path = 2;
  // Optional. Plausible order model.
  optional string plausible_model_path = 1;
  // Optional. Anchor joint-policy model.
  optional string anchor_joint_policy_model_path = 20;
  // Required. Model to compute values.
  optional string value_model_path = 3;
  // CUDA device to use, if > 0
  optional int32 device = 13 [ default = 0 ];
  // Use FP16 at inference time.
  optional bool half_precision = 44;
  // Model server maximum batch size.
  optional uint32 max_batch_size = 8 [ default = 512 ];
  // Optional. If set, this batch size will be used to sample from the
  // conditional policy, otherwise max_batch_size is used.
  optional uint32 conditional_max_batch_size = 14;
  // Required. Number of BR samples to use to estimate EVs.
  optional int32 num_br_samples = 4;
  // Optional. Samples to draw from the independent-pikl to rescore.
  // By default 10 * num_br_samples.
  optional int32 num_importance_samples = 15;
  // QRE lambda (policy temperature in SoS-score units; see SearchBotAgent.QRE).
  optional float qre_lambda = 10;
  // QRE eta (learning rate; see SearchBotAgent.QRE).
  optional float qre_eta = 11 [ default = 10000 ];
  // If set, will compute probability P(a in plausible | a ~ joint policy).
  optional bool compute_inside_ratio = 12 [ default = true ];
  optional PlausibleOrderSampling plausible_orders_cfg = 5;
  // Note, the agent will die if the max_rollout_length is not equal to 0.
  optional BaseStrategyModelRollouts rollouts_cfg = 6;
  // Use up to this number of samples from the joint policy to compute value.
  optional int32 num_value_computation_samples = 17 [ default = 1000 ];
  // How joint actions are sampled when estimating values.
  enum SamplingType {
    INDEPENDENT_PIKL = 1;
    JOINT_CONDITIONAL = 2;
    HYBRID_JOINT_AND_INDEP_PIKL = 3;
  }
  optional SamplingType sampling_type = 16 [ default = INDEPENDENT_PIKL ];
  // In HYBRID_JOINT_AND_INDEP_PIKL what is the probability to sample from the
  // independent PiKL policy.
  optional float hybrid_independent_pikl_prob = 18 [ default = 0.5 ];
  // In HYBRID_JOINT_AND_INDEP_PIKL what is the temperature of sampling from
  // the joint.
  optional float hybrid_joint_temp = 19 [ default = 1.0 ];
}
// Best-response (BR) search agent configuration.
message BRSearchAgent {
  // Path to BaseStrategyModel checkpoint.
  optional string model_path = 1;
  // Model server maximum batch size
  optional uint32 max_batch_size = 5 [ default = 700 ];
  // If true, use model predicted final scores in heuristic evaluation.
  // If false, use current SC counts after max_rollout_length steps.
  optional bool use_predicted_final_scores = 8;
  // CUDA device to use
  optional int32 device = 13 [ default = 0 ];
  // Optional: separate model path to compute the value function
  optional string value_model_path = 14;
  // Rollout params
  optional BaseStrategyModelRollouts rollouts_cfg = 15;
  // Plausible-order sampling parameters.
  optional PlausibleOrderSampling plausible_orders_cfg = 16;
}
// Agent configured from a saved game file, presumably to reproduce the orders
// recorded in it (per the name — confirm against the implementation).
message ReproAgent {
  // Required, path to game.json file
  optional string game_path = 1;
}
// Inference-time and miscellaneous flag overrides forwarded to ParlAI agents.
message ParlaiFlags {
  // ------------------------------
  // INFERENCE FLAGS
  // This is a subset of flags related to model inference from:
  // https://github.com/facebookresearch/ParlAI/blob/62a3f4546048997a80f05ce4b00c31e93cb694d1/parlai/core/torch_generator_agent.py#L383
  // ------------------------------
  // Optional. Inference type: beam, greedy, topk, nucleus, delayedbeam.
  // Default: greedy.
  optional string inference = 1;
  // Optional. Beam size. Default: 1.
  optional int32 beam_size = 2;
  // K used in Top K sampling. Default: 10.
  optional int32 topk = 3;
  // p used in nucleus sampling. Default: 0.9
  optional float topp = 4;
  // Temperature to add during decoding. Default: 1.0;
  optional float temperature = 5;
  // Minimum beam length for decoding. Default is -1.
  optional int32 beam_min_length = 6;
  // Intra-sequence n-grams to block during decoding. Default is -1.
  optional int32 beam_block_ngram = 7;
  // N-gram length to block from the context (original comment said
  // "conference", apparently a typo).
  optional int32 beam_context_block_ngram = 8;
  // ------------------------------
  // MISCELLANEOUS FLAGS
  // ------------------------------
  // GPU to set. Default is -1.
  optional int32 gpu = 9;
  // Set the player rating (1-5), if applicable to the model. Default is False.
  optional int32 set_player_rating = 10;
  // Only relevant to the silence classifier when `sample_classifier` is False;
  // threshold for when to speak
  optional float threshold = 11;
  // Only relevant to the classifier; whether or not to sample
  optional bool sample_classifier = 13 [ default = false ];
  // Only relevant to silence classifier with sample_classifier=true.
  // Rescales the speak probability by this multiplier. >1 to speak more.
  optional float prob_speak_multiplier = 14 [ default = 1 ];
  // Set the player chattiness (1-20), if applicable to the model (currently
  // it's only applicable to the SILENCE classifier). Default is False.
  optional int32 set_player_chattiness = 12;
  // Use model parallel
  optional bool model_parallel = 16 [ default = false ];
  // Use special nucleus sampling
  optional bool topp_special = 17 [ default = false ];
  // For special nucleus sampling
  optional float topp_special_threshold = 18 [ default = 0.9 ];
  // Set pot type
  optional string pot_type = 19 [ default = "Sum-of-squares" ];
  // DEPRECATED -- DO NOT USE
  optional int64 phase_minutes = 20 [ default = 1440 ];
  // Nucleus sampling probability cut off
  optional float probability_cutoff = 21 [ default = 0.0 ];
}
// A ParlAI model reference: checkpoint path plus optional flag overrides,
// or an address of a remote model server.
message ParlaiModel {
  // Required. Path to the ParlAI model.
  optional string model_path = 1;
  // Optional. Flag overrides applied to the model.
  optional ParlaiFlags overrides = 2;
  // Optional. Remote server address to query instead of loading the model
  // locally (presumably host:port — confirm against the client code).
  optional string remote_addr = 3;
}
// Dialogue generation using a generator model whose nucleus-sampled
// candidates are re-scored with a separate language model
// ("discriminative nucleus" decoding).
message ParlaiDiscriminativeNucleusModel {
// Required. Path to the dialogue (generation) model checkpoint.
optional string dialogue_model_file = 1;
// Required. Path to the language-model checkpoint used for re-scoring.
optional string lm_model_file = 2;
// Optional. GPU index for the dialogue agent.
optional int32 dialogue_agent_gpu = 3 [ default = 0 ];
// Optional. GPU index for the language-model agent.
optional int32 lm_agent_gpu = 4 [ default = 1 ];
// Optional. Truncation length for labels, presumably in tokens -- verify.
optional int32 label_truncate = 5 [ default = 50 ];
// Optional. Nucleus (top-p) sampling probability mass.
optional float topp = 6 [ default = 0.5 ];
// Optional. Beam size used during decoding.
optional int32 beam_size = 7 [ default = 1 ];
// Optional. Minimum beam length for decoding.
optional int32 beam_min_length = 8 [ default = 1 ];
// Optional. Context n-gram blocking length; -1 disables blocking.
optional int32 beam_context_block_ngram = 9 [ default = -1 ];
// Optional. Player rating (1-5) to condition on, as in ParlaiFlags.
optional int32 set_player_rating = 10 [ default = 5 ];
// Optional. Whether to shard the model across GPUs (model parallelism).
optional bool model_parallel = 11 [ default = true ];
// Optional. Threshold applied when scoring tokens; exact semantics depend on
// scoring_method -- TODO confirm.
optional float token_scoring_threshold = 12 [ default = 0.7 ];
// Optional. Name of the method used to re-score candidates.
optional string scoring_method = 13 [ default = "speaker_listener" ];
}
// A single named nonsense classifier, used as one member of
// ParlaiNonsenseDetectionEnsemble.
message NonsenseClassifier {
// Identifier for this classifier within the ensemble.
optional string name = 1;
// The ParlAI model that performs the nonsense classification.
optional ParlaiModel nonsense_classifier = 2;
}
// An ensemble of nonsense classifiers applied to candidate messages.
message ParlaiNonsenseDetectionEnsemble {
// The classifiers making up the ensemble.
repeated NonsenseClassifier nonsense_classifiers = 1;
}
// A no-press ParlAI agent: produces orders only, no dialogue.
message ParlaiNoPressAgent {
// Required. Orders model
optional ParlaiModel model_orders = 1;
}
// Configuration for the dialogue (press) component of a ParlAI agent:
// the dialogue generation model plus optional auxiliary classifiers and the
// heuristics/filters applied to candidate messages.
message ParlaiDialogue {
// Required. Dialogue model or discriminative nucleus model
oneof dialogue_model {
ParlaiModel model_dialogue = 1;
ParlaiDiscriminativeNucleusModel model_discriminativenucleus_dialogue = 2;
}
// Optional. Filter offensive dialogue language
optional bool filter_offensive_dialogue = 3 [ default = true ];
// Optional. Pseudo-orders model
optional ParlaiModel model_pseudo_orders = 4;
// Optional. Sleep classifier (decides whether/when to stay silent).
optional ParlaiModel model_sleep_classifier = 5;
// Optional. Recipient classifier (chooses whom to message).
optional ParlaiModel model_recipient_classifier = 6;
// Optional. Draw classifier
optional ParlaiModel model_draw_classifier = 13;
// Optional. Zero-shot nonsense classifier
optional ParlaiModel model_zshot_nonsense_classifier = 7;
// Optional. Nonsense classifier ensemble
optional ParlaiNonsenseDetectionEnsemble ensemble_nonsense_classifier = 8;
// Optional. If true, reuse pseudo-orders for consecutive messages.
// (But recompute pseudo-orders when the agent receives a new message).
optional bool reuse_pseudo_for_consecutive_messages = 9 [ default = false ];
// Optional. If true, reuse pseudo-orders for all messages in a phase.
optional bool reuse_pseudo_for_phase = 12 [ default = false ];
// Optional. Number of times to resample dialogue when a candidate message is
// filtered; 0 means no resampling.
optional int32 resample_dialogue_on_filter = 11 [ default = 0 ];
// Optional. Only allow sampling inf if p(inf) >= threshold
optional float sleep_inf_threshold = 14 [ default = 0 ];
// Optional. Same as above but only for replies.
optional float sleep_inf_threshold_reply = 15 [ default = 0 ];
// Filter the first message an agent sends in the game if the likelihood of
// being from a weak player is above threshold
optional float rating_threshold_first_message = 16 [ default = 1.0 ];
// Filter other messages if likelihood of being from a weak player is above
// threshold
optional float rating_threshold_other = 17 [ default = 1.0 ];
// DEPRECATED -- use use_initiate_sleep_heuristic_n_years instead.
optional bool initiate_sleep_heuristic_every_phase = 18 [ default = false ];
// If set, use initial message prompting
optional string initial_message_prompts_path = 19;
// Number of prompt messages to prepend to the dialogue history
optional int32 initial_message_prompts_count = 20 [ default = 1 ];
// Timestamp spacing between prompts
optional int32 initial_message_prompt_spacing_seconds = 21 [ default = 900 ];
// Whether the dialogue models may be spread across multiple GPUs.
optional bool allow_multi_gpu = 22;
// If True, in 5m games sleep times will be constrained to a short sleep time
// (15s) or inf
optional bool binarize_sleep_times_in_5m_games = 23 [ default = false ];
// If True, filter messages containing matching regexes related to grounding
// issues (new players, times of day, etc.)
optional bool should_filter_grounding = 24 [ default = false ];
// If >0, force an inf sleep time for a recipient to whom we have sent
// multiple consecutive messages this phase with no response
optional int32 limit_consecutive_outbound_messages = 25 [ default = -1 ];
// Send a message to everybody every phase until this many game years have
// elapsed; 0 disables the heuristic (inferred from the field name -- verify).
optional int32 use_initiate_sleep_heuristic_n_years = 26 [ default = 0 ];
// Apply some hardcoded heuristics based on start-of-phase pseudos
// to determine whether we need to message someone.
optional bool use_pseudoorders_initiate_sleep_heuristic = 29
[ default = false ];
// Optional. If set, will use this batch size when queries dialogue model for
// multiple responses. If not set, the batch size will be equal to the request
// size.
optional int32 dialogue_batch_size = 27;
// If set, filters a message when the difference of the pseudo order
// likelihood for the agent after and before the message is less than this
// value.
optional float pseudo_orders_correspondence_threshold = 28;
// If set, filters for issues relating to the model not knowing when the game
// is scheduled to end.
optional int32 grounding_last_playable_year = 30;
// If base_strategy_model-predicted score is below this value, block message
// initiation
optional float block_initiation_if_pred_value_below = 31 [ default = 0 ];
// If true and grounding_last_playable_year is set, block messages in the
// last season except to powers with whom we are coordinating a support or
// convoy.
optional bool use_last_phase_silence_except_coordination_heuristic = 32;
}
// A full-press ParlAI agent: an orders model plus an optional dialogue stack.
message ParlaiFullPressAgent {
// Orders model. Required if no order_handler specified. Exists outside of
// order_handler oneof for backwards compat.
optional ParlaiModel model_orders = 1;
// Optional. Dialogue models
oneof maybe_dialogue { ParlaiDialogue dialogue = 3; }
}
// Order handler that samples plausible orders from a base strategy model and
// re-scores (re-ranks) them with a ParlAI model.
message ParlAIRescoringBaseStrategyModelOrderHandler {
// Required. Path to base_strategy_model model.
optional string base_strategy_model_model_path = 1;
// Required. Parlai model used to re-score base_strategy_model orders.
optional ParlaiModel model_orders = 2;
// Options controlling base_strategy_model sampling.
optional PlausibleOrderSampling plausible_orders_cfg = 3;
// TODO: add ranking-algorithm-specific options here if/when needed.
}
message ParlAIBestResponseOrderHandler {
// Required. Order model
optional ParlaiModel model_orders = 1;
// Required. Path to BaseStrategyModel checkpoint.
optional string model_path_base_strategy_model = 3;
// Optional, default provided.
// - beam: use top actions from beam search
// - sample_topk10_top: sample with topk=10 and take top by probability
// - sample_topk10_random: sample with topk=10 and take first sampled N
// - sample_topk10_t0.5_random: sample with temp=0.5 and take first sampled N
// All modes respect plausible_orders_req_size and
// plausible_orders_batch_size. For beam the two must match.
optional string parlai_method = 4 [ default = "sample_topk10_top" ];
// Size of rollout thread pool
optional uint32 n_rollout_procs = 5 [ default = 70 ];
// Number of rollouts per plausible order
optional uint32 rollouts_per_plausible_order = 6;
// Maximum rollout length in MOVEMENT PHASES heuristically evaluating the game
optional int32 max_rollout_length = 7 [ default = -1 ]; // required
// Number of order-sets (actions) to consider at each step in search code
optional int32 n_plausible_orders = 8 [ default = -1 ];
// Optional, cap n_plausible_orders as a ratio of # orderable units
optional float max_actions_units_ratio = 9 [ default = -1 ];
// Temperature used for rollouts
optional float rollout_temperature = 10 [ default = -1 ];
// Optional. Nucleus ratio used for rollouts. During nucleus sampling only the
// smallest subset of actions that has probability at least top_p is
// considered. All other actions are never sampled.