python으로 wordcount mapper, reducer 구현
# Create two small local input files (>> appends, so re-running grows them).
> echo "Hello World Bye World" >> part-00000.txt
> echo "Hello Hadoop Goodbye Hadoop" >> part-00001.txt
# Stage them into HDFS; -mkdir -p creates the whole directory hierarchy.
> hdfs dfs -mkdir -p /user/somnode/wordcount/input/
> hdfs dfs -put part-*.txt /user/somnode/wordcount/input/
#!/usr/bin/env python
"""WordCount mapper for Hadoop Streaming.

Reads text lines from stdin and emits one tab-separated
"word<TAB>1" record per whitespace-delimited token, which the
streaming framework then sorts and feeds to the reducer.
"""
import sys


def map_words(line):
    """Return a list of (word, 1) pairs for each token in *line*.

    Tokens are produced by str.split() with no argument, so runs of
    arbitrary whitespace separate words and an empty/blank line
    yields an empty list.
    """
    return [(word, 1) for word in line.strip().split()]


if __name__ == '__main__':
    # Guarded so importing this module (e.g. for tests) does not
    # consume stdin.
    for line in sys.stdin:
        for word, count in map_words(line):
            print('{}\t{}'.format(word, count))
#!/usr/bin/env python
"""WordCount reducer for Hadoop Streaming.

Reads sorted "word<TAB>count" records from stdin (the framework
groups identical keys into consecutive lines) and emits one
"word<TAB>total" record per distinct word.
"""
import sys


def print_output(word, count):
    """Emit one tab-separated result record on stdout."""
    print('{}\t{}'.format(word, count))


def reduce_counts(lines, emit=print_output):
    """Aggregate sorted "word<TAB>count" lines, calling emit(word, total).

    Fixes over the original:
    - sums int(fields[1]) instead of a hard-coded +1, so pre-aggregated
      counts (e.g. from a combiner) are handled correctly; plain
      mapper output ("word\t1") behaves identically,
    - skips malformed lines that do not have exactly two fields,
    - emits nothing (instead of "None\t0") when the input is empty.
    """
    word, count = None, 0
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) != 2:
            continue  # blank or malformed record — ignore
        key, value = fields
        if key != word:
            # Key changed: flush the previous word's total, start fresh.
            if word is not None:
                emit(word, count)
            word, count = key, 0
        count += int(value)
    if word is not None:
        emit(word, count)  # flush the final group


if __name__ == '__main__':
    # Guarded so importing this module does not consume stdin.
    reduce_counts(sys.stdin)
# Remove any previous run's output — the streaming job refuses to start
# if the -output directory already exists.
> hdfs dfs -rm -r /user/somnode/wordcount/output
# Run the streaming job: -files ships both scripts to every task node;
# the shebang lines let them execute directly as -mapper / -reducer.
# NOTE(review): the unquoted /input/*.txt glob relies on the local shell
# leaving it literal (no local match) so HDFS expands it — verify on
# shells with nullglob/failglob enabled.
> hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar -files "wordcount_mapper.py,wordcount_reducer.py" -input /user/somnode/wordcount/input/*.txt -output /user/somnode/wordcount/output -mapper wordcount_mapper.py -reducer wordcount_reducer.py
# Print the job's result files (one "word<TAB>count" line per word).
> hdfs dfs -text /user/somnode/wordcount/output/part-*