public class SolrDeleteDuplicates extends org.apache.hadoop.mapreduce.Reducer<org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord,org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord> implements org.apache.hadoop.util.Tool
Records are represented as SolrDeleteDuplicates.SolrRecord instances (which contain id, boost and timestamp). SolrRecords with the same digest are grouped together for the reduce step. Of the documents sharing a digest, all are deleted except the one with the highest score (boost field). If two (or more) documents have the same score, the document with the latest timestamp is kept; every other one is deleted from the Solr index.
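The selection rule above can be made concrete with a short sketch. This is illustrative only: the Rec class below is a stand-in for SolrDeleteDuplicates.SolrRecord (id, boost, timestamp), not the real class, and the actual reduce implementation may differ.

```java
import java.util.ArrayList;
import java.util.List;

/** Illustrative sketch of the keep/delete rule described above. */
public class DedupRuleSketch {

  /** Stand-in for SolrDeleteDuplicates.SolrRecord: id, boost and timestamp. */
  static final class Rec {
    final String id;
    final float boost;
    final long tstamp;
    Rec(String id, float boost, long tstamp) {
      this.id = id; this.boost = boost; this.tstamp = tstamp;
    }
  }

  /**
   * Given all records that share one digest, return the ids to delete:
   * everything except the record with the highest boost, ties broken by
   * the latest timestamp.
   */
  static List<String> idsToDelete(Iterable<Rec> recordsWithSameDigest) {
    Rec keep = null;
    List<String> toDelete = new ArrayList<>();
    for (Rec rec : recordsWithSameDigest) {
      if (keep == null) {          // first record is the provisional winner
        keep = rec;
        continue;
      }
      boolean recWins = rec.boost > keep.boost
          || (rec.boost == keep.boost && rec.tstamp > keep.tstamp);
      if (recWins) {
        toDelete.add(keep.id);     // previous winner loses: schedule for deletion
        keep = rec;
      } else {
        toDelete.add(rec.id);      // current record loses: schedule for deletion
      }
    }
    return toDelete;               // everything except the single kept document
  }
}
```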
Nested Class Summary

Modifier and Type | Class and Description
---|---
static class | SolrDeleteDuplicates.SolrInputFormat
static class | SolrDeleteDuplicates.SolrInputSplit
static class | SolrDeleteDuplicates.SolrRecord
static class | SolrDeleteDuplicates.SolrRecordReader
Field Summary

Modifier and Type | Field and Description
---|---
static org.slf4j.Logger | LOG
Constructor Summary

Constructor and Description |
---|
SolrDeleteDuplicates() |
Method Summary

Modifier and Type | Method and Description
---|---
void | cleanup(org.apache.hadoop.mapreduce.Reducer.Context context)
boolean | dedup(String solrUrl)
org.apache.hadoop.conf.Configuration | getConf()
static void | main(String[] args)
void | reduce(org.apache.hadoop.io.Text key, Iterable<SolrDeleteDuplicates.SolrRecord> values, org.apache.hadoop.mapreduce.Reducer.Context context)
int | run(String[] args)
void | setConf(org.apache.hadoop.conf.Configuration conf)
void | setup(org.apache.hadoop.mapreduce.Reducer.Context job)
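For orientation only, here is a hedged sketch of how the pieces summarized above (the reducer, SolrInputFormat and SolrRecord) could be assembled into a Hadoop job. The actual job setup performed inside dedup()/run() is not shown on this page, so the property name "solr.server.url", the omitted import of SolrDeleteDuplicates and the exact job settings below are assumptions, not the real implementation.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class DedupJobSketch {
  public static void main(String[] args) throws Exception {
    // import of SolrDeleteDuplicates omitted: its package is not shown on this page.
    Configuration conf = new Configuration();
    // Assumed property name for this sketch; the real configuration key may differ.
    conf.set("solr.server.url", "http://localhost:8983/solr");

    Job job = Job.getInstance(conf, "solr-delete-duplicates-sketch");
    job.setJarByClass(SolrDeleteDuplicates.class);
    // SolrInputFormat reads the documents to examine directly from the Solr index.
    job.setInputFormatClass(SolrDeleteDuplicates.SolrInputFormat.class);
    job.setReducerClass(SolrDeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrDeleteDuplicates.SolrRecord.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
```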
Method Detail

public org.apache.hadoop.conf.Configuration getConf()
Specified by: getConf in interface org.apache.hadoop.conf.Configurable
public void setConf(org.apache.hadoop.conf.Configuration conf)
Specified by: setConf in interface org.apache.hadoop.conf.Configurable
public void setup(org.apache.hadoop.mapreduce.Reducer.Context job) throws IOException
Overrides: setup in class org.apache.hadoop.mapreduce.Reducer<org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord,org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord>
Throws: IOException
public void cleanup(org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException
Overrides: cleanup in class org.apache.hadoop.mapreduce.Reducer<org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord,org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord>
Throws: IOException
public void reduce(org.apache.hadoop.io.Text key, Iterable<SolrDeleteDuplicates.SolrRecord> values, org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException
Overrides: reduce in class org.apache.hadoop.mapreduce.Reducer<org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord,org.apache.hadoop.io.Text,SolrDeleteDuplicates.SolrRecord>
Throws: IOException
public boolean dedup(String solrUrl) throws IOException, InterruptedException, ClassNotFoundException
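A small usage sketch for the dedup(String solrUrl) entry point. The Solr URL is a placeholder, and the assumption that the instance wants a Hadoop Configuration via setConf(...) before the call comes only from the Configurable interface above, not from documented behaviour.

```java
import org.apache.hadoop.conf.Configuration;

public class DedupCallSketch {
  public static void main(String[] args) throws Exception {
    // import of SolrDeleteDuplicates omitted: its package is not shown on this page.
    SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
    dedup.setConf(new Configuration());   // supply a Hadoop configuration (Configurable)
    boolean success = dedup.dedup("http://localhost:8983/solr");  // placeholder URL
    System.out.println("dedup finished, success = " + success);
  }
}
```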
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException
Specified by: run in interface org.apache.hadoop.util.Tool
Throws: IOException, InterruptedException, ClassNotFoundException
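Because the class implements org.apache.hadoop.util.Tool, run(String[]) can also be driven through ToolRunner. A sketch follows; that the single command-line argument is the Solr URL is inferred from dedup(String solrUrl) above and is an assumption.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class DedupToolSketch {
  public static void main(String[] args) throws Exception {
    // ToolRunner parses generic Hadoop options (-D, -conf, ...) and passes
    // the remaining arguments to SolrDeleteDuplicates.run(String[]).
    // import of SolrDeleteDuplicates omitted: its package is not shown on this page.
    int exitCode = ToolRunner.run(new Configuration(),
        new SolrDeleteDuplicates(),
        new String[] { "http://localhost:8983/solr" });  // assumed: Solr URL argument
    System.exit(exitCode);
  }
}
```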
Copyright © 2015 The Apache Software Foundation