wget http://www.gutenberg.org/cache/epub/2701/pg2701.txt
hadoop dfs -mkdir wordcount
#!/usr/bin/php
<?php
// Hadoop Streaming mapper for the word-count job.
//
// Input  (STDIN):  arbitrary text, one line at a time.
// Output (STDOUT): one record per word, formatted as "word<TAB>1".
// The records are summed per word by the reduce step (reducer.php).

// Compare against false explicitly: a plain truthiness test would end the
// loop early on a final line that is just "0" without a trailing newline.
while (($line = fgets(STDIN)) !== false) {
    // strip leading and trailing whitespace, including the line ending
    $line = trim($line);

    // split the line into words on runs of whitespace, dropping empty pieces
    $words = preg_split('/\s+/', $line, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        // preg_split reported an error for this line; nothing to emit
        continue;
    }

    foreach ($words as $word) {
        // word (key), tab, count (always 1 at this stage)
        printf("%s\t%d\n", $word, 1);
    }
}
?>
#!/usr/bin/php
<?php
// Hadoop Streaming reducer for the word-count job.
//
// Input  (STDIN):  "key<TAB>count" records, expected to arrive grouped by
//                  key (the local test pipeline pipes the mapper output
//                  through `sort` for the same reason).
// Output (STDOUT): one "key<TAB>total" record per distinct key.

$last_key = null;
$running_total = 0;

// Compare against false explicitly so a falsy line such as "0" does not
// terminate the loop.
while (($line = fgets(STDIN)) !== false) {
    // strip leading and trailing whitespace, including the line ending
    $line = trim($line);
    if ($line === '') {
        // a blank line carries no key; ignore it
        continue;
    }

    // split into key and count; a record without a count contributes 0
    $parts = explode("\t", $line, 2);
    $key = $parts[0];
    $count = isset($parts[1]) ? (int) $parts[1] : 0;

    if ($last_key === $key) {
        // same key as the previous record: keep accumulating
        $running_total += $count;
        continue;
    }

    // key changed: emit the finished key (if any) and start a new total
    if ($last_key !== null) {
        printf("%s\t%d\n", $last_key, $running_total);
    }
    $last_key = $key;
    $running_total = $count;
}

// Emit the total of the final key; without this the last group of the
// input would never be written.
if ($last_key !== null) {
    printf("%s\t%d\n", $last_key, $running_total);
}
?>
head -n1000 pg2701.txt | ./mapper.php | sort | ./reducer.php
我们在Apache Hadoop集群上运行它:
hadoop jar /usr/hadoop/2.5.1/libexec/lib/hadoop-streaming-2.5.1.jar \
-mapper "./mapper.php" \
-reducer "./reducer.php" \
-input "hello/mobydick.txt" \
-output "hello/result"
hdfs dfs -cat hello/result/part-00000
wget https://raw.githubusercontent.com/datasets/gold-prices/master/data/data.csv
hadoop dfs -mkdir goldprice
hadoop dfs -copyFromLocal ./data.csv goldprice/data.csv
#!/usr/bin/php
<?php
// Hadoop Streaming mapper for the gold-price job.
//
// Reads CSV lines from STDIN and, for every line the pattern below matches,
// writes "<text before the first dash><TAB><text after the last comma>" to
// STDOUT, with the second field formatted as a float with three decimals.
// Lines that do not match are dropped silently.

for (;;) {
    $raw = fgets(STDIN);
    if (!$raw) {
        // end of input
        break;
    }

    // strip surrounding whitespace, including the line ending
    $record = trim($raw);

    // group 1: everything before the first "-" (the year)
    // group 2: everything after the last "," (the gold price)
    $captured = array();
    preg_match("/^(.*?)\-(?:.*),(.*)$/", $record, $captured);
    if (empty($captured)) {
        continue;
    }

    // key: year, value: gold price
    echo sprintf("%s\t%.3f\n", $captured[1], $captured[2]);
}
?>
#!/usr/bin/php
<?php
// Hadoop Streaming reducer for the gold-price job.
//
// Input  (STDIN):  "key<TAB>value" records, expected to arrive grouped by
//                  key (the local test pipeline pipes the mapper output
//                  through `sort` for the same reason).
// Output (STDOUT): one "key<TAB>average" record per distinct key, where the
//                  average is the arithmetic mean of that key's values.

// Writes one result record. Keeping the format string in a single place
// guarantees every key, including the last one, is printed with the same
// precision (four decimals, as used for all keys emitted inside the loop).
function emit_average($key, $total, $items)
{
    printf("%s\t%.4f\n", $key, $total / $items);
}

$last_key = null;
$running_total = 0.0;
$number_of_items = 0;

// Compare against false explicitly so a falsy line such as "0" does not
// terminate the loop.
while (($line = fgets(STDIN)) !== false) {
    // strip leading and trailing whitespace, including the line ending
    $line = trim($line);

    // split into key and value; skip records that have no value, since
    // counting them as 0 would drag the average down
    $parts = explode("\t", $line, 2);
    if (count($parts) < 2) {
        continue;
    }
    $key = $parts[0];
    $value = (float) $parts[1];

    if ($last_key === $key) {
        // same key as the previous record: keep accumulating
        $number_of_items++;
        $running_total += $value;
        continue;
    }

    // key changed: emit the finished key (if any) and start a new group
    if ($last_key !== null) {
        emit_average($last_key, $running_total, $number_of_items);
    }
    $last_key = $key;
    $number_of_items = 1;
    $running_total = $value;
}

// emit the average of the final key
if ($last_key !== null) {
    emit_average($last_key, $running_total, $number_of_items);
}
?>
head -n1000 data.csv | ./mapper.php | sort | ./reducer.php
hadoop jar /usr/hadoop/2.5.1/libexec/lib/hadoop-streaming-2.5.1.jar \
-mapper "./mapper.php" \
-reducer "./reducer.php" \
-input "goldprice/data.csv" \
-output "goldprice/result"
查看平均值:
hdfs dfs -cat goldprice/result/part-00000
hdfs dfs -get goldprice/result/part-00000 gold.dat
创建一个 gnuplot 配置文件(gold.plot)并复制以下内容:
# Gnuplot script file for generating gold prices
set terminal png
set output "chart.jpg"
set style data lines
set nokey
set grid
set title "Gold prices"
set xlabel "Year"
set ylabel "Price"
plot "gold.dat"
生成图表:
gnuplot gold.plot
这会生成一个名为 chart.jpg 的文件。看起来像这样:
译者:杜江
作者:Glenn De Backer
原文:https://www.simplicity.be/article/big-data-php/
本文为 @ 21CTO 创作并授权 21CTO 发布,未经许可,请勿转载。
内容授权事宜请您联系 webmaster@21cto.com或关注 21CTO 公众号。
该文观点仅代表作者本人,21CTO 平台仅提供信息存储空间服务。