From 43b1955619221a1b378aba163ef6a87e854eacec Mon Sep 17 00:00:00 2001
From: Bin Wang <25761459+wkk@users.noreply.github.com>
Date: Tue, 17 Aug 2021 14:38:31 -0400
Subject: [PATCH] Add scripts for setting up hdfs (#738)

* Add scripts for setting up hdfs

Co-authored-by: Bin Wang <25761459+wkk@users.noreply.github.com>
---
 bin/README.md                         |  5 +-
 bin/deploy-hdfs.py                    | 71 +++++++++++++++++++++++++++
 deployment/ansible.cfg                |  3 ++
 deployment/install-hdfs.yml           | 71 +++++++++++++++++++++++++++
 deployment/templates/core-site.xml.j2 |  9 ++++
 deployment/templates/hdfs-site.xml.j2 | 17 +++++++
 deployment/templates/workers.j2       |  3 ++
 7 files changed, 177 insertions(+), 2 deletions(-)
 create mode 100644 bin/deploy-hdfs.py
 create mode 100644 deployment/ansible.cfg
 create mode 100644 deployment/install-hdfs.yml
 create mode 100644 deployment/templates/core-site.xml.j2
 create mode 100644 deployment/templates/hdfs-site.xml.j2
 create mode 100644 deployment/templates/workers.j2

diff --git a/bin/README.md b/bin/README.md
index 95b144de5..d83f6c1df 100644
--- a/bin/README.md
+++ b/bin/README.md
@@ -53,7 +53,8 @@ on a remote cluster when Hillview is installed
 * `delete-data.py`: delete a folder from all machines in a Hillview cluster
 * `deploy.py`: copy the Hillview binaries to all machines in a Hillview cluster
-* `download-data.py`: downloads the specified files from all machines in a cluster
+* `deploy-hdfs.py`: download and install HDFS on all machines in a Hillview cluster
+* `download-data.py`: download the specified files from all machines in a cluster
 * `hillviewCommon.py`: common library used by other Python programs
 * `run-on-all.py`: run a command on all machines in a Hillview cluster
 * `start.py`: start the Hillview service on a remote cluster
@@ -138,4 +139,4 @@ $ ./download-data.py mycluster.json data/x
 ```
 
 When downloading the files this utility will create a local folder
-for each machine in the cluster.
\ No newline at end of file
+for each machine in the cluster.
diff --git a/bin/deploy-hdfs.py b/bin/deploy-hdfs.py
new file mode 100644
index 000000000..50b1760e1
--- /dev/null
+++ b/bin/deploy-hdfs.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+import os
+from argparse import ArgumentParser
+from configparser import ConfigParser
+from tempfile import NamedTemporaryFile
+
+try:
+    import ansible_runner
+except ModuleNotFoundError:
+    import textwrap
+    import sys
+    print(textwrap.dedent("""\
+        This script requires ansible and ansible-runner.
+        You can install these modules using the following command:
+        pip install --user ansible ansible-runner
+        """), file=sys.stderr)
+    sys.exit(-1)
+
+from hillviewCommon import get_config
+
+# section names in the generated Ansible inventory
+NAMENODE = "namenode"
+DATANODE = "datanode"
+DEFAULT_VARS = "all:vars"
+
+# specifies which Hadoop version to install
+HADOOP_VERSION = "3.3.1"
+
+
+def write_inventory_file(config, file):
+    inventory = ConfigParser(allow_no_value=True)
+
+    # use the web server node as the namenode
+    inventory.add_section(NAMENODE)
+    inventory.set(NAMENODE, config.get_webserver().host)
+
+    # use the workers as datanodes
+    inventory.add_section(DATANODE)
+    for worker in config.get_workers():
+        inventory.set(DATANODE, worker.host)
+
+    inventory.add_section(DEFAULT_VARS)
+    inventory.set(DEFAULT_VARS, "ansible_user", config.get_user())
+    inventory.set(DEFAULT_VARS, "hadoop_version", HADOOP_VERSION)
+
+    inventory.write(file)
+    file.flush()
+
+
+def get_deployment_dir():
+    """
+    Assumes there is a deployment folder in the project root that
+    contains the needed Ansible files.
+    :return: The absolute path to the deployment folder.
+    """
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    return os.path.join(project_root, "deployment")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("config", help="JSON cluster configuration file")
+    args = parser.parse_args()
+    config = get_config(parser, args)
+
+    with NamedTemporaryFile(mode="w") as inventory_file:
+        write_inventory_file(config, inventory_file)
+        ansible_runner.run(
+            project_dir=get_deployment_dir(),
+            inventory=inventory_file.name,
+            playbook="install-hdfs.yml"
+        )
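For reference, the inventory that `write_inventory_file` hands to `ansible_runner` is a plain INI file in Ansible's inventory format. The sketch below is a standalone illustration, not part of the patch: the hosts and user are hypothetical stand-ins for the values read from the cluster JSON.

```python
#!/usr/bin/env python3
# Standalone sketch of the inventory deploy-hdfs.py generates.
# All hosts and the user below are hypothetical placeholders; the
# real values come from the cluster configuration JSON.
import sys
from configparser import ConfigParser

inventory = ConfigParser(allow_no_value=True)

inventory.add_section("namenode")
inventory.set("namenode", "10.0.0.1")            # hypothetical web-server host

inventory.add_section("datanode")
for worker in ("10.0.0.2", "10.0.0.3"):          # hypothetical worker hosts
    inventory.set("datanode", worker)

inventory.add_section("all:vars")
inventory.set("all:vars", "ansible_user", "hillview")  # hypothetical SSH user
inventory.set("all:vars", "hadoop_version", "3.3.1")

inventory.write(sys.stdout)
```

Running this prints a `[namenode]` section holding the web-server host, a `[datanode]` section with one line per worker, and an `[all:vars]` section that carries `ansible_user` and `hadoop_version` into the playbook below.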
diff --git a/deployment/ansible.cfg b/deployment/ansible.cfg
new file mode 100644
index 000000000..beead76aa
--- /dev/null
+++ b/deployment/ansible.cfg
@@ -0,0 +1,3 @@
+[defaults]
+host_key_checking = false
+interpreter_python = /usr/bin/python3
diff --git a/deployment/install-hdfs.yml b/deployment/install-hdfs.yml
new file mode 100644
index 000000000..307b84f06
--- /dev/null
+++ b/deployment/install-hdfs.yml
@@ -0,0 +1,71 @@
+---
+
+- hosts: all
+  tasks:
+    - name: Download hadoop
+      get_url:
+        url: "https://mirrors.ocf.berkeley.edu/apache/hadoop/common/hadoop-{{ hadoop_version }}/hadoop-{{ hadoop_version }}.tar.gz"
+        dest: "/tmp/hadoop-{{ hadoop_version }}.tar.gz"
+
+    - set_fact: HADOOP_HOME="{{ ansible_env.HOME }}/hadoop-{{ hadoop_version }}"
+
+    - name: Unarchive hadoop
+      unarchive:
+        src: "/tmp/hadoop-{{ hadoop_version }}.tar.gz"
+        dest: "{{ ansible_env.HOME }}"
+        remote_src: true
+        creates: "{{ HADOOP_HOME }}"
+
+    - name: Set HADOOP_HOME environment variable
+      lineinfile:
+        path: "{{ ansible_env.HOME }}/.profile"
+        line: "export HADOOP_HOME=$HOME/hadoop-{{ hadoop_version }}"
+
+    - name: Set PATH environment variable
+      lineinfile:
+        path: "{{ ansible_env.HOME }}/.profile"
+        line: "export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH"
+
+    - name: Get JAVA_HOME
+      shell: bash -lc 'echo $JAVA_HOME'
+      register: echo_java_home
+      changed_when: false
+      failed_when: echo_java_home.stdout == ""
+
+    - name: Set JAVA_HOME for hadoop environment
+      lineinfile:
+        path: "{{ HADOOP_HOME }}/etc/hadoop/hadoop-env.sh"
+        regexp: "^# export JAVA_HOME=$"
+        line: "export JAVA_HOME={{ echo_java_home.stdout }}"
+
+    - name: Write workers config
+      template:
+        src: templates/workers.j2
+        dest: "{{ HADOOP_HOME }}/etc/hadoop/workers"
+
+    - name: Write core-site.xml
+      template:
+        src: templates/core-site.xml.j2
+        dest: "{{ HADOOP_HOME }}/etc/hadoop/core-site.xml"
+
+    - name: Write hdfs-site.xml
+      template:
+        src: templates/hdfs-site.xml.j2
+        dest: "{{ HADOOP_HOME }}/etc/hadoop/hdfs-site.xml"
+
+- hosts: namenode
+  tasks:
+    - name: Format hdfs if it hasn't been formatted
+      shell:
+        cmd: "{{ HADOOP_HOME }}/bin/hdfs namenode -format"
+        creates: "{{ HADOOP_HOME }}/data/namenode/current/VERSION"
+
+    - name: Check hdfs status
+      shell: "{{ HADOOP_HOME }}/bin/hdfs dfsadmin -report"
+      register: hdfs_report
+      changed_when: false
+      failed_when: false
+
+    - name: Start hdfs
+      shell: "{{ HADOOP_HOME }}/sbin/start-dfs.sh"
+      when: hdfs_report.rc != 0
diff --git a/deployment/templates/core-site.xml.j2 b/deployment/templates/core-site.xml.j2
new file mode 100644
index 000000000..199e44cda
--- /dev/null
+++ b/deployment/templates/core-site.xml.j2
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://{{ hostvars[groups['namenode'][0]]['ansible_default_ipv4']['address'] }}:9000</value>
+  </property>
+</configuration>
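The template above pins `fs.defaultFS` to port 9000 on the namenode's default IPv4 address. A minimal post-deployment probe of that port, assuming a hypothetical namenode at `10.0.0.1` (stdlib Python only, not part of the patch):

```python
#!/usr/bin/env python3
# Hedged smoke test: check that the namenode RPC port from
# fs.defaultFS (hdfs://<namenode>:9000) accepts connections
# after the playbook has run. The host is a hypothetical placeholder.
import socket
import sys

NAMENODE_HOST = "10.0.0.1"   # hypothetical namenode address
NAMENODE_PORT = 9000         # matches core-site.xml.j2

try:
    with socket.create_connection((NAMENODE_HOST, NAMENODE_PORT), timeout=5):
        print(f"namenode reachable at {NAMENODE_HOST}:{NAMENODE_PORT}")
except OSError as err:
    print(f"cannot reach namenode: {err}", file=sys.stderr)
    sys.exit(1)
```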
diff --git a/deployment/templates/hdfs-site.xml.j2 b/deployment/templates/hdfs-site.xml.j2
new file mode 100644
index 000000000..acb50c3eb
--- /dev/null
+++ b/deployment/templates/hdfs-site.xml.j2
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>{{ HADOOP_HOME }}/data/namenode</value>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>{{ HADOOP_HOME }}/data/datanode</value>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>2</value>
+  </property>
+</configuration>
diff --git a/deployment/templates/workers.j2 b/deployment/templates/workers.j2
new file mode 100644
index 000000000..1a8d33f20
--- /dev/null
+++ b/deployment/templates/workers.j2
@@ -0,0 +1,3 @@
+{% for node in groups["datanode"] %}
+{{ node }}
+{% endfor %}
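Once `start-dfs.sh` has run, a quick write-and-read round trip through the `hdfs` CLI confirms that the namenode and the datanodes listed in `workers` agree on this configuration. A hedged sketch, assuming `$HADOOP_HOME/bin` is on `PATH` (the playbook appends it to `.profile`) and HDFS is up:

```python
#!/usr/bin/env python3
# Hedged end-to-end check: put a small file into HDFS, read it back,
# then remove it. Assumes the hdfs binary is on PATH and the cluster
# is running; the /smoke-test.txt path is an arbitrary choice.
import subprocess
import tempfile

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write("hello hdfs\n")
    f.flush()
    # -f overwrites the destination if a previous run left it behind
    subprocess.run(["hdfs", "dfs", "-put", "-f", f.name, "/smoke-test.txt"],
                   check=True)

out = subprocess.run(["hdfs", "dfs", "-cat", "/smoke-test.txt"],
                     check=True, capture_output=True, text=True)
assert out.stdout == "hello hdfs\n"
print("HDFS round trip OK")

subprocess.run(["hdfs", "dfs", "-rm", "/smoke-test.txt"], check=True)
```

With `dfs.replication` set to 2 in `hdfs-site.xml.j2`, the test file ends up stored on two of the datanodes generated from `workers.j2`.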