Skip to content
This repository has been archived by the owner on Aug 30, 2022. It is now read-only.

Commit

Permalink
Add scripts for setting up hdfs (#738)
Browse files Browse the repository at this point in the history
* Add scripts for setting up hdfs
Co-authored-by: Bin Wang <[email protected]>
  • Loading branch information
bin-wang authored Aug 17, 2021
1 parent 11dbb58 commit 43b1955
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 2 deletions.
5 changes: 3 additions & 2 deletions bin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ on a remote cluster when Hillview is installed

* `delete-data.py`: delete a folder from all machines in a Hillview cluster
* `deploy.py`: copy the Hillview binaries to all machines in a Hillview cluster
* `download-data.py`: downloads the specified files from all machines in a cluster
* `deploy-hdfs.py`: download and install HDFS on all machines in a Hillview cluster
* `download-data.py`: download the specified files from all machines in a cluster
* `hillviewCommon.py`: common library used by other Python programs
* `run-on-all.py`: run a command on all machines in a Hillview cluster
* `start.py`: start the Hillview service on a remote cluster
Expand Down Expand Up @@ -138,4 +139,4 @@ $ ./download-data.py mycluster.json data/x
```

When downloading the files this utility will create locally a folder
for each machine in the cluster.
for each machine in the cluster.
71 changes: 71 additions & 0 deletions bin/deploy-hdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# Deploy HDFS on a Hillview cluster: builds an ansible inventory from the
# cluster configuration file and runs the install-hdfs.yml playbook.
import os
from argparse import ArgumentParser
from configparser import ConfigParser
from tempfile import NamedTemporaryFile

# ansible-runner is an optional third-party dependency; fail early with an
# actionable install hint instead of a bare ImportError traceback.
try:
    import ansible_runner
except ModuleNotFoundError:
    import textwrap
    import sys
    print(textwrap.dedent("""\
        This script requires ansible and ansible-runner.
        You can install these modules using the following command:
            pip install --user ansible ansible-runner
        """), file=sys.stderr)
    sys.exit(-1)

from hillviewCommon import get_config

# Inventory section names: the webserver host becomes the HDFS namenode and
# the worker hosts become the datanodes.
NAMENODE = "namenode"
DATANODE = "datanode"
DEFAULT_VARS = "all:vars"

# specifies which hadoop version to use
HADOOP_VERSION = "3.3.1"


def write_inventory_file(config, file):
    """Write an ansible INI inventory for the cluster described by *config*.

    The inventory has a [namenode] section (the Hillview webserver host),
    a [datanode] section (all worker hosts), and an [all:vars] section
    carrying the ssh user and the hadoop version.

    :param config: Hillview cluster configuration (see hillviewCommon.get_config).
    :param file: writable text file; flushed but left open so callers such as
                 NamedTemporaryFile keep a valid handle afterwards.
    """
    inventory = ConfigParser(allow_no_value=True)
    # ConfigParser lowercases option names by default (optionxform), which
    # would silently mangle mixed-case hostnames written as valueless
    # options; keep them verbatim.
    inventory.optionxform = str

    # use the webserver node as namenode
    inventory.add_section(NAMENODE)
    inventory.set(NAMENODE, config.get_webserver().host)

    # use the workers as datanodes
    inventory.add_section(DATANODE)
    for worker in config.get_workers():
        inventory.set(DATANODE, worker.host)

    inventory.add_section(DEFAULT_VARS)
    inventory.set(DEFAULT_VARS, "ansible_user", config.get_user())
    inventory.set(DEFAULT_VARS, "hadoop_version", HADOOP_VERSION)

    inventory.write(file)
    file.flush()


def get_deployment_dir():
    """Return the absolute path of the project's ``deployment`` folder.

    Assumes the project root (the parent of this script's directory)
    contains a ``deployment`` folder with the required ansible files.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(this_dir)
    return os.path.join(project_root, "deployment")


def _main():
    """Parse the cluster configuration and run the install-hdfs playbook."""
    parser = ArgumentParser()
    parser.add_argument("config", help="json cluster configuration file")
    args = parser.parse_args()
    config = get_config(parser, args)

    # The inventory only needs to exist while ansible runs; a temporary
    # file is deleted automatically when the `with` block exits.
    with NamedTemporaryFile(mode="w") as inventory_file:
        write_inventory_file(config, inventory_file)
        ansible_runner.run(
            project_dir=get_deployment_dir(),
            inventory=inventory_file.name,
            playbook="install-hdfs.yml"
        )


if __name__ == "__main__":
    _main()
3 changes: 3 additions & 0 deletions deployment/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Ansible defaults used when deploy-hdfs.py runs install-hdfs.yml.
[defaults]
# Do not prompt to accept new ssh host keys on first contact.
host_key_checking = false
# Cluster machines are expected to provide python3 at this path.
interpreter_python = /usr/bin/python3
71 changes: 71 additions & 0 deletions deployment/install-hdfs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
---
# Installs and configures HDFS on every machine in the cluster.
# The namenode/datanode groups and the ansible_user / hadoop_version
# variables come from the inventory generated by bin/deploy-hdfs.py.

- hosts: all
  tasks:
    - name: Download hadoop
      get_url:
        url: "https://mirrors.ocf.berkeley.edu/apache/hadoop/common/hadoop-{{ hadoop_version }}/hadoop-{{ hadoop_version}}.tar.gz"
        dest: "/tmp/hadoop-{{ hadoop_version }}.tar.gz"

    # Remember the installation directory for all later tasks.
    - set_fact: HADOOP_HOME="{{ ansible_env.HOME }}/hadoop-{{ hadoop_version }}"

    - name: Unarchive hadoop
      unarchive:
        src: "/tmp/hadoop-{{ hadoop_version }}.tar.gz"
        dest: "{{ ansible_env.HOME }}"
        remote_src: true
        # Skip extraction if the target directory already exists (idempotence).
        creates: "{{ HADOOP_HOME }}"

    - name: Set HADOOP_HOME environment variable
      lineinfile:
        path: "{{ ansible_env.HOME }}/.profile"
        line: "export HADOOP_HOME=$HOME/hadoop-{{ hadoop_version }}"

    - name: Set PATH environment variable
      lineinfile:
        path: "{{ ansible_env.HOME }}/.profile"
        line: "export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH"

    # hadoop-env.sh requires an explicit JAVA_HOME; read it from a login
    # shell and fail early when it is not set.
    - name: Get JAVA_HOME
      shell: bash -lc 'echo $JAVA_HOME'
      register: echo_java_home
      changed_when: false
      failed_when: echo_java_home.stdout == ""

    - name: Set JAVA_HOME for hadoop environment
      lineinfile:
        path: "{{ HADOOP_HOME }}/etc/hadoop/hadoop-env.sh"
        regexp: "^# export JAVA_HOME=$"
        line: "export JAVA_HOME={{ echo_java_home.stdout }}"

    - name: Write workers config
      template:
        src: templates/workers.j2
        dest: "{{ HADOOP_HOME }}/etc/hadoop/workers"

    - name: Write core-site.xml
      template:
        src: templates/core-site.xml.j2
        dest: "{{ HADOOP_HOME }}/etc/hadoop/core-site.xml"

    - name: Write hdfs-site.xml
      template:
        src: templates/hdfs-site.xml.j2
        dest: "{{ HADOOP_HOME }}/etc/hadoop/hdfs-site.xml"

# Namenode-only tasks: format the filesystem once, then start HDFS if it
# is not already running.
- hosts: namenode
  tasks:
    # Formatting is destructive, so only run it on a fresh install
    # (detected by the absence of the namenode VERSION file).
    - name: Format hdfs if it hasn't been formatted
      shell:
        cmd: "{{ HADOOP_HOME }}/bin/hdfs namenode -format"
        creates: "{{ HADOOP_HOME }}/data/namenode/current/VERSION"

    # dfsadmin -report succeeds only when HDFS is already up; a non-zero
    # return code here just means "not running", so never fail the play.
    - name: Check hdfs status
      shell: "{{ HADOOP_HOME }}/bin/hdfs dfsadmin -report"
      register: hdfs_report
      changed_when: false
      failed_when: false

    - name: Start hdfs
      shell: "{{ HADOOP_HOME }}/sbin/start-dfs.sh"
      when: hdfs_report.rc != 0
9 changes: 9 additions & 0 deletions deployment/templates/core-site.xml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Rendered by ansible on every node: points HDFS clients at the
     namenode's RPC endpoint. The address is the default IPv4 of the
     first host in the "namenode" inventory group. -->
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://{{ hostvars[groups['namenode'][0]]['ansible_default_ipv4']['address'] }}:9000</value>
    </property>
</configuration>
17 changes: 17 additions & 0 deletions deployment/templates/hdfs-site.xml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Rendered by ansible: per-node HDFS storage locations and replication. -->
<configuration>
    <property>
        <!-- Where the namenode keeps its metadata. -->
        <name>dfs.namenode.name.dir</name>
        <value>{{ HADOOP_HOME }}/data/namenode</value>
    </property>
    <property>
        <!-- Where each datanode stores block data. -->
        <name>dfs.datanode.data.dir</name>
        <value>{{ HADOOP_HOME }}/data/datanode</value>
    </property>
    <property>
        <!-- Each block is replicated on 2 datanodes. -->
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
3 changes: 3 additions & 0 deletions deployment/templates/workers.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{# One datanode host per line; installed as $HADOOP_HOME/etc/hadoop/workers
   and read by hadoop's start/stop scripts. #}
{% for node in groups["datanode"] %}
{{ node }}
{% endfor %}

0 comments on commit 43b1955

Please sign in to comment.