A recruitment (job-posting) dataset is provided. The cleaning rules, as implemented by the MapReduce job below, are: skip the header record, drop any record that contains an empty field, normalize the salary field to a single average value ending in "k", and remove duplicate records.
package 招聘数据清洗;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Map extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Skip the header record (the first line starts with a BOM, \uFEFF, followed by the field names)
        if (value.toString().startsWith("\uFEFFpositionName")) {
            return;
        }

        // Split on commas that are outside double quotes; the -1 limit keeps trailing empty fields
        String[] fields = value.toString().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)", -1);

        // Drop records that contain any empty field
        if (valid(fields)) {
            // Normalize the salary field to a single average value in "k"
            if (fields[1].contains("*")) {
                // Salary range plus an integer multiplier after '*' (e.g. months of pay)
                String[] salary = fields[1].split("\\*");
                String[] salarys = salary[0].split("-");
                String minPart = salarys[0].trim();
                String maxPart = salarys[1].trim();
                // Average of (upper bound * multiplier) and the lower bound, dropping the trailing "k"
                int max = Integer.parseInt(maxPart.substring(0, maxPart.length() - 1)) * Integer.parseInt(salary[1].trim());
                int min = Integer.parseInt(minPart.substring(0, minPart.length() - 1));
                fields[1] = ((max + min) / 2) + "k";
            } else {
                // Plain salary range such as lower-upper
                String[] salary = fields[1].split("-");
                String minPart = salary[0].trim();
                String maxPart = salary[1].trim();
                int max = Integer.parseInt(maxPart.substring(0, maxPart.length() - 1));
                int min = Integer.parseInt(minPart.substring(0, minPart.length() - 1));
                fields[1] = ((max + min) / 2) + "k";
            }

            // Emit the cleaned record as a tab-separated line
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < fields.length; i++) {
                sb.append(fields[i]).append("\t");
            }
            context.write(new Text(sb.toString()), NullWritable.get());
        }
    }

    // A record is valid only if none of its fields is empty after trimming
    private boolean valid(String[] fields) {
        for (String item : fields) {
            if (item.trim().isEmpty()) {
                return false;
            }
        }
        return true;
    }
}
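To make the salary normalization easier to follow in isolation, here is a minimal standalone sketch of the same calculation. The class name SalaryDemo, the helper normalize, and the sample values are made up for illustration; the assumed field formats are "10k-20k" and "10k-20k*15", which is what the Mapper's parsing implies.

// Standalone sketch of the Mapper's salary normalization (assumed formats: "10k-20k" or "10k-20k*15").
public class SalaryDemo {

    static String normalize(String raw) {
        int multiplier = 1;
        String range = raw;
        if (raw.contains("*")) {
            // Split off the integer multiplier after '*'
            String[] parts = raw.split("\\*");
            range = parts[0];
            multiplier = Integer.parseInt(parts[1].trim());
        }
        String[] bounds = range.split("-");
        String minPart = bounds[0].trim();
        String maxPart = bounds[1].trim();
        int min = Integer.parseInt(minPart.substring(0, minPart.length() - 1));                 // drop trailing "k"
        int max = Integer.parseInt(maxPart.substring(0, maxPart.length() - 1)) * multiplier;    // only the upper bound is multiplied, as in the Mapper
        return ((max + min) / 2) + "k";
    }

    public static void main(String[] args) {
        System.out.println(normalize("10k-20k"));     // (10 + 20) / 2  -> 15k
        System.out.println(normalize("10k-20k*15"));  // (10 + 300) / 2 -> 155k
    }
}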
package 招聘数据清洗;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // The whole cleaned record is the key, so identical records collapse into a single output line
        context.write(key, NullWritable.get());
    }
}
package 招聘数据清洗;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setJarByClass(Driver.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        Path in = new Path("G:\\Projects\\IdeaProject-C\\MapReduce\\src\\main\\java\\招聘数据清洗\\data\\zhaopin.txt");
        Path out = new Path("G:\\Projects\\IdeaProject-C\\MapReduce\\src\\main\\java\\招聘数据清洗\\output");
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);

        // Delete the output directory if it already exists, so the job can be re-run
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Points to note in this cleaning job:

BOM encoding: the input file is saved as UTF-8 with a BOM, so the header line begins with \uFEFF; this is why the Mapper checks for "\uFEFFpositionName" rather than "positionName" when skipping the header record.
Field splitting: some field values contain commas inside double quotes, so the record cannot be split with a plain split(","). The Mapper instead splits only on commas that lie outside quoted sections, using the regex ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)", as shown in the short sketch below.
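Here is a minimal, self-contained sketch of that quote-aware split. The class name SplitDemo and the sample record are invented for illustration; the real column layout of zhaopin.txt may differ.

import java.util.Arrays;

// Demonstrates the quote-aware split used in the Mapper: commas inside double quotes are not treated as delimiters.
public class SplitDemo {
    public static void main(String[] args) {
        String line = "数据分析师,15k-25k,\"负责报表, 数据清洗\",本科";
        String[] fields = line.split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)", -1);
        System.out.println(Arrays.toString(fields));
        // [数据分析师, 15k-25k, "负责报表, 数据清洗", 本科] -- the comma inside the quotes is preserved
    }
}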
" , "和","的不同处理方式转载地址:http://lceq.baihongyu.com/