Apache Hadoop(həˈduːp)은 안정적이고 확장 가능한 분산 컴퓨팅을 위한 오픈 소스 소프트웨어 유틸리티 모음이다. MapReduce 프로그래밍 모델을 사용하여 빅데이터의 분산 저장 및 처리를 위한 소프트웨어 프레임워크를 제공한다.
Word Count Using MapReduce
map(key, value):
    // key: document name; value: text of the document
    for each word w in value:
        emit(w, 1)

reduce(key, values):
    // key: a word; values: an iterator over counts
    result = 0
    for each count v in values:
        result += v
    emit(key, result)
~/home$ sudo apt update
~/home$ sudo apt install openjdk-11-jdk
~/home$ java -version
~/home$ wget https://downloads.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz
~/home$ tar -xzf hadoop-3.4.1.tar.gz
~/home$ mv hadoop-3.4.1 ~/hadoop
~/home$ vi ~/.bashrc
export HADOOP_HOME=~/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
~/home$ source ~/.bashrc
~/hadoop/etc/hadoop$ vi core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
~/home$ vi ~/hadoop/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
~/home$ echo $JAVA_HOME
~/home$ hadoop version
~/home$ hdfs namenode -format
# hdfs namenode -format
# 2025-04-04 00:56:29,202 INFO namenode.NameNode: STARTUP_MSG:
# /************************************************************
# STARTUP_MSG: Starting NameNode
# STARTUP_MSG: host = GramPro/127.0.1.1
# STARTUP_MSG: args = [-format]
# STARTUP_MSG: version = 3.4.1
~/home$ start-dfs.sh
# Starting namenodes on [localhost]
# Starting datanodes
# Starting secondary namenodes [GramPro]
~/home$
~/home$ hadoop version
# Hadoop 3.4.1
# Source code repository https://github.com/apache/hadoop.git -r 4d7825309348956336b8f06a08322b78422849b1
# Compiled by mthakur on 2024-10-09T14:57Z
# Compiled on platform linux-x86_64
# Compiled with protoc 3.23.4
# From source with checksum 7292fe9dba5e2e44e3a9f763fce3e680
# This command was run using /home/shyim/hadoop/share/hadoop/common/hadoop-common-3.4.1.jar
~/home$
~/home$ mapred streaming
# No Arguments Given!
# Usage: mapred streaming [options]
# Options:
# dumptb <glob-pattern> Dumps all files that match the given pattern to
# standard output as typed bytes.
# loadtb <path> Reads typed bytes from standard input and stores them in
# a sequence file in the specified path
# [streamjob] <args> Runs streaming job with given arguments
~/home$
~/home$ hdfs dfs -mkdir -p /user/yimsh/test
~/home$
~/home$ hdfs dfs -ls /
# Found 1 items
# drwxr-xr-x - yimsh supergroup 0 2025-04-07 21:29 /user
~/home$ hdfs dfs -put ../input_small.txt /user/yimsh/test
~/home$
~/home$ hdfs dfs -ls /user/yimsh/test
# Found 1 items
# -rw-r--r-- 3 yimsh supergroup 31 2025-04-07 21:53 /user/yimsh/test/input_small.txt