问题导读
1.使用CSVRequestHandler过程中,如何避免空行出现的异常?
2.solr是如何导入csv文件的?
3.字段里面本来就有逗号符号等,是如何解决的?
今天想用DIH导入csv文件,于是数据源用FileDataSource+自定义转换器粗略实现了一下
- package com.besttone.transformer;
-
- import java.util.Map;
-
/**
 * Minimal DataImportHandler custom transformer that splits the {@code rawLine}
 * column of a row into the {@code id} and {@code name} columns.
 *
 * <p>Reference: http://wiki.apache.org/solr/DIHCustomTransformer
 *
 * <p>This is a plain comma split: quoting and commas embedded in a field value
 * are not understood, so it is only suitable for trivially simple CSV input.
 */
public class CsvTransformer {

  /**
   * Populates {@code id} and {@code name} from the comma-separated {@code rawLine} value.
   *
   * @param row the row being transformed; modified in place
   * @return the same {@code row} instance; it is returned untouched when it has no
   *         {@code rawLine} entry
   */
  public Object transformRow(Map<String, Object> row) {
    Object rawLine = row.get("rawLine");
    if (rawLine == null) {
      return row;
    }

    // A limit of -1 keeps trailing empty fields, so "1," and "," still yield two
    // elements instead of a shorter array that would make props[1] (or even
    // props[0]) throw ArrayIndexOutOfBoundsException.
    String[] props = rawLine.toString().split(",", -1);

    // split() with a negative limit always returns at least one element.
    row.put("id", props[0]);
    if (props.length > 1) {
      row.put("name", props[1]);
    }
    return row;
  }
}
复制代码
结果发现很多问题,比如字段值里本身就含有逗号等情况,用这个粗略的转换器肯定无法处理。于是继续查文档,发现Solr自带了一个CSVRequestHandler,不过solrconfig.xml里默认没有配置这个requestHandler,所以先配一个:
- <!-- CSV update handler, loaded on demand -->
- <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy">
- </requestHandler>
复制代码
这样在浏览器里输入URL:http://localhost:8088/solr-src/csv-core/update/csv?stream.file=D:/dpimport/test_data2.csv&stream.contentType=text/plain;charset=utf-8&fieldnames=id,name&commit=true
就能将csv文件导入进去了,我的csv文件有两个字段一个id,一个name,做了点测试数据如:
1,aaa
2,bbb
...
连续行导入当然没有问题,当中间有空行时,office 的 csv 文件会变成:
1,aaa
,
2,bbb
也就是说,空行会保留一个逗号。而ID字段在schema中恰好是唯一键且不能为空,导入时创建索引就会抛出异常。于是我把CSVRequestHandler的源码扩展了一下:增加了一个参数emptyLine,并在load方法中加了如下逻辑:
- // When the emptyLine option is enabled, skip rows whose values are all zero-length
- if(emptyLine)
- {
- int totalLength=0;
- for (int i = 0; i < vals.length; i++) {
- totalLength += vals[i].length(); // accumulate the length of every value in this row
- }
- if (totalLength==0)
- {
- continue; // every value is empty: go on to the next row (presumably the row loop in load())
- }
- }
复制代码
修改后的CSVRequestHandler如下:
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- package org.apache.solr.handler;
-
- import org.apache.solr.request.SolrQueryRequest;
- import org.apache.solr.response.SolrQueryResponse;
- import org.apache.solr.common.SolrException;
- import org.apache.solr.common.SolrInputDocument;
- import org.apache.solr.common.params.SolrParams;
- import org.apache.solr.common.params.UpdateParams;
- import org.apache.solr.common.util.StrUtils;
- import org.apache.solr.common.util.ContentStream;
- import org.apache.solr.schema.IndexSchema;
- import org.apache.solr.schema.SchemaField;
- import org.apache.solr.update.*;
- import org.apache.solr.update.processor.UpdateRequestProcessor;
- import org.apache.solr.internal.csv.CSVStrategy;
- import org.apache.solr.internal.csv.CSVParser;
- import org.apache.commons.io.IOUtils;
-
- import java.util.regex.Pattern;
- import java.util.List;
- import java.io.*;
-
- /**
- * @version $Id: CSVRequestHandler.java 1298169 2012-03-07 22:27:54Z uschindler $
- */
-
- public class CSVRequestHandler extends ContentStreamHandlerBase {
-
- @Override
- protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
- return new SingleThreadedCSVLoader(req, processor);
- }
-
- //////////////////////// SolrInfoMBeans methods //////////////////////
- @Override
- public String getDescription() {
- return "Add/Update multiple documents with CSV formatted rows";
- }
-
- @Override
- public String getVersion() {
- return "$Revision: 1298169 $";
- }
-
- @Override
- public String getSourceId() {
- return "$Id: CSVRequestHandler.java 1298169 2012-03-07 22:27:54Z uschindler $";
- }
-
- @Override
- public String getSource() {
- return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java $";
- }
- }
-
-
- abstract class CSVLoader extends ContentStreamLoader {
- public static final String SEPARATOR="separator";
- public static final String FIELDNAMES="fieldnames";
- public static final String HEADER="header";
- public static final String SKIP="skip";
- public static final String SKIPLINES="skipLines";
- public static final String MAP="map";
- public static final String TRIM="trim";
- public static final String EMPTY="keepEmpty";
- public static final String SPLIT="split";
- public static final String ENCAPSULATOR="encapsulator";
- public static final String ESCAPE="escape";
- public static final String OVERWRITE="overwrite";
- <span style="color:#ff6666;"> public static final String EMPTYLINE="emptyLine";//是否支持空数据行
- </span>
- private static Pattern colonSplit = Pattern.compile(":");
- private static Pattern commaSplit = Pattern.compile(",");
-
- final IndexSchema schema;
- final SolrParams params;
- final CSVStrategy strategy;
- final UpdateRequestProcessor processor;
-
-
- String[] fieldnames;
- SchemaField[] fields;
- CSVLoader.FieldAdder[] adders;
-
- int skipLines; // number of lines to skip at start of file
-
- <span style="color:#ff6666;">boolean emptyLine; //是否支持空数据行
- </span>
- final AddUpdateCommand templateAdd;
-
-
-
- /** Add a field to a document unless it's zero length.
- * The FieldAdder hierarchy handles all the complexity of
- * further transforming or splitting field values to keep the
- * main logic loop clean. All implementations of add() must be
- * MT-safe!
- */
- private class FieldAdder {
- void add(SolrInputDocument doc, int line, int column, String val) {
- if (val.length() > 0) {
- doc.addField(fields[column].getName(),val,1.0f);
- }
- }
- }
-
- /** add zero length fields */
- private class FieldAdderEmpty extends CSVLoader.FieldAdder {
- @Override
- void add(SolrInputDocument doc, int line, int column, String val) {
- doc.addField(fields[column].getName(),val,1.0f);
- }
- }
-
- /** trim fields */
- private class FieldTrimmer extends CSVLoader.FieldAdder {
- private final CSVLoader.FieldAdder base;
- FieldTrimmer(CSVLoader.FieldAdder base) { this.base=base; }
- @Override
- void add(SolrInputDocument doc, int line, int column, String val) {
- base.add(doc, line, column, val.trim());
- }
- }
-
- /** map a single value.
- * for just a couple of mappings, this is probably faster than
- * using a HashMap.
- */
- private class FieldMapperSingle extends CSVLoader.FieldAdder {
- private final String from;
- private final String to;
- private final CSVLoader.FieldAdder base;
- FieldMapperSingle(String from, String to, CSVLoader.FieldAdder base) {
- this.from=from;
- this.to=to;
- this.base=base;
- }
- @Override
- void add(SolrInputDocument doc, int line, int column, String val) {
- if (from.equals(val)) val=to;
- base.add(doc,line,column,val);
- }
- }
-
- /** Split a single value into multiple values based on
- * a CSVStrategy.
- */
- private class FieldSplitter extends CSVLoader.FieldAdder {
- private final CSVStrategy strategy;
- private final CSVLoader.FieldAdder base;
- FieldSplitter(CSVStrategy strategy, CSVLoader.FieldAdder base) {
- this.strategy = strategy;
- this.base = base;
- }
-
- @Override
- void add(SolrInputDocument doc, int line, int column, String val) {
- CSVParser parser = new CSVParser(new StringReader(val), strategy);
- try {
- String[] vals = parser.getLine();
- if (vals!=null) {
- for (String v: vals) base.add(doc,line,column,v);
- } else {
- base.add(doc,line,column,val);
- }
- } catch (IOException e) {
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,e);
- }
- }
- }
-
-
- String errHeader="CSVLoader:";
-
- CSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
- this.processor = processor;
- this.params = req.getParams();
- schema = req.getSchema();
-
- templateAdd = new AddUpdateCommand();
- templateAdd.allowDups=false;
- templateAdd.overwriteCommitted=true;
- templateAdd.overwritePending=true;
-
- if (params.getBool(OVERWRITE,true)) {
- templateAdd.allowDups=false;
- templateAdd.overwriteCommitted=true;
- templateAdd.overwritePending=true;
- } else {
- templateAdd.allowDups=true;
- templateAdd.overwriteCommitted=false;
- templateAdd.overwritePending=false;
- }
-
- templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);
-
- strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
- String sep = params.get(SEPARATOR);
- if (sep!=null) {
- if (sep.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
- strategy.setDelimiter(sep.charAt(0));
- }
-
- String encapsulator = params.get(ENCAPSULATOR);
- if (encapsulator!=null) {
- if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
- }
-
- String escape = params.get(ESCAPE);
- if (escape!=null) {
- if (escape.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
- }
-
- // if only encapsulator or escape is set, disable the other escaping mechanism
- if (encapsulator == null && escape != null) {
- strategy.setEncapsulator( CSVStrategy.ENCAPSULATOR_DISABLED);
- strategy.setEscape(escape.charAt(0));
- } else {
- if (encapsulator != null) {
- strategy.setEncapsulator(encapsulator.charAt(0));
- }
- if (escape != null) {
- char ch = escape.charAt(0);
- strategy.setEscape(ch);
- if (ch == '\\') {
- // If the escape is the standard backslash, then also enable
- // unicode escapes (it's harmless since 'u' would not otherwise
- // be escaped.
- strategy.setUnicodeEscapeInterpretation(true);
- }
- }
- }
-
- String fn = params.get(FIELDNAMES);
- fieldnames = fn != null ? commaSplit.split(fn,-1) : null;
-
- Boolean hasHeader = params.getBool(HEADER);
-
- skipLines = params.getInt(SKIPLINES,0);
-
- <span style="color:#ff6666;">emptyLine = params.getBool(EMPTYLINE, false);//扩展
- </span>
- if (fieldnames==null) {
- if (null == hasHeader) {
- // assume the file has the headers if they aren't supplied in the args
- hasHeader=true;
- } else if (!hasHeader) {
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"CSVLoader: must specify fieldnames=<fields>* or header=true");
- }
- } else {
- // if the fieldnames were supplied and the file has a header, we need to
- // skip over that header.
- if (hasHeader!=null && hasHeader) skipLines++;
-
- prepareFields();
- }
- }
-
- /** create the FieldAdders that control how each field is indexed */
- void prepareFields() {
- // Possible future optimization: for really rapid incremental indexing
- // from a POST, one could cache all of this setup info based on the params.
- // The link from FieldAdder to this would need to be severed for that to happen.
-
- fields = new SchemaField[fieldnames.length];
- adders = new CSVLoader.FieldAdder[fieldnames.length];
- String skipStr = params.get(SKIP);
- List<String> skipFields = skipStr==null ? null : StrUtils.splitSmart(skipStr,',');
-
- CSVLoader.FieldAdder adder = new CSVLoader.FieldAdder();
- CSVLoader.FieldAdder adderKeepEmpty = new CSVLoader.FieldAdderEmpty();
-
- for (int i=0; i<fields.length; i++) {
- String fname = fieldnames[i];
- // to skip a field, leave the entries in fields and addrs null
- if (fname.length()==0 || (skipFields!=null && skipFields.contains(fname))) continue;
-
- fields[i] = schema.getField(fname);
- boolean keepEmpty = params.getFieldBool(fname,EMPTY,false);
- adders[i] = keepEmpty ? adderKeepEmpty : adder;
-
- // Order that operations are applied: split -> trim -> map -> add
- // so create in reverse order.
- // Creation of FieldAdders could be optimized and shared among fields
-
- String[] fmap = params.getFieldParams(fname,MAP);
- if (fmap!=null) {
- for (String mapRule : fmap) {
- String[] mapArgs = colonSplit.split(mapRule,-1);
- if (mapArgs.length!=2)
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Map rules must be of the form 'from:to' ,got '"+mapRule+"'");
- adders[i] = new CSVLoader.FieldMapperSingle(mapArgs[0], mapArgs[1], adders[i]);
- }
- }
-
- if (params.getFieldBool(fname,TRIM,false)) {
- adders[i] = new CSVLoader.FieldTrimmer(adders[i]);
- }
-
- if (params.getFieldBool(fname,SPLIT,false)) {
- String sepStr = params.getFieldParam(fname,SEPARATOR);
- char fsep = sepStr==null || sepStr.length()==0 ? ',' : sepStr.charAt(0);
- String encStr = params.getFieldParam(fname,ENCAPSULATOR);
- char fenc = encStr==null || encStr.length()==0 ? (char)-2 : encStr.charAt(0);
- String escStr = params.getFieldParam(fname,ESCAPE);
- char fesc = escStr==null || escStr.length()==0 ? CSVStrategy.ESCAPE_DISABLED : escStr.charAt(0);
-
- CSVStrategy fstrat = new CSVStrategy(fsep,fenc,CSVStrategy.COMMENTS_DISABLED,fesc, false, false, false, false);
- adders[i] = new CSVLoader.FieldSplitter(fstrat, adders[i]);
- }
- }
- }
-
- private void input_err(String msg, String[] line, int lineno) {
- StringBuilder sb = new StringBuilder();
- sb.append(errHeader).append(", line=").append(lineno).append(",").append(msg).append("\n\tvalues={");
- for (String val: line) {
- sb.append("'").append(val).append("',"); }
- sb.append('}');
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,sb.toString());
- }
-
- private void input_err(String msg, String[] lines, int lineNo, Throwable e) {
- StringBuilder sb = new StringBuilder();
- sb.append(errHeader).append(", line=").append(lineNo).append(",").append(msg).append("\n\tvalues={");
- if (lines != null) {
- for (String val : lines) {
- sb.append("'").append(val).append("',");
- }
- } else {
- sb.append("NO LINES AVAILABLE");
- }
- sb.append('}');
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,sb.toString(), e);
- }
-
- /** load the CSV input */
- @Override
- public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws IOException {
- errHeader = "CSVLoader: input=" + stream.getSourceInfo();
- Reader reader = null;
-
- try {
- reader = stream.getReader();
- if (skipLines>0) {
- if (!(reader instanceof BufferedReader)) {
- reader = new BufferedReader(reader);
- }
- BufferedReader r = (BufferedReader)reader;
- for (int i=0; i<skipLines; i++) {
- r.readLine();
- }
- }
-
- CSVParser parser = new CSVParser(reader, strategy);
-
- // parse the fieldnames from the header of the file
- if (fieldnames==null) {
- fieldnames = parser.getLine();
- if (fieldnames==null) {
- throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Expected fieldnames in CSV input");
- }
- prepareFields();
- }
-
- // read the rest of the CSV file
- for(;;) {
- int line = parser.getLineNumber(); // for error reporting in MT mode
- String[] vals = null;
- try {
- vals = parser.getLine();
-
- } catch (IOException e) {
- //Catch the exception and rethrow it with more line information
- input_err("can't read line: " + line, null, line, e);
- }
- if (vals==null) break;
- <span style="color:#ff0000;">//是否支持空数据行
- </span><span style="color:#ff0000;"> if(emptyLine)
- {
- int totalLength=0;
- for (int i = 0; i < vals.length; i++) {
- totalLength += vals[i].length();
- }
- if (totalLength==0)
- {
- continue;
- }
- }
- </span>
- if (vals.length != fields.length) {
- input_err("expected "+fields.length+" values but got "+vals.length, vals, line);
- }
-
- addDoc(line,vals);
- }
- } finally{
- if (reader != null) {
- IOUtils.closeQuietly(reader);
- }
- }
- }
-
- /** called for each line of values (document) */
- abstract void addDoc(int line, String[] vals) throws IOException;
-
- /** this must be MT safe... may be called concurrently from multiple threads. */
- void doAdd(int line, String[] vals, SolrInputDocument doc, AddUpdateCommand template) throws IOException {
- // the line number is passed simply for error reporting in MT mode.
- // first, create the lucene document
- for (int i=0; i<vals.length; i++) {
- if (fields[i]==null) continue; // ignore this field
- String val = vals[i];
- adders[i].add(doc, line, i, val);
- }
-
- template.solrDoc = doc;
- processor.processAdd(template);
- }
-
- }
-
-
- class SingleThreadedCSVLoader extends CSVLoader {
- SingleThreadedCSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
- super(req, processor);
- }
-
- @Override
- void addDoc(int line, String[] vals) throws IOException {
- templateAdd.indexedId = null;
- SolrInputDocument doc = new SolrInputDocument();
- doAdd(line, vals, doc, templateAdd);
- }
- }
-
复制代码
这样在上面那个请求URL的基础上再加上&emptyLine=true,就能避免空行导致的异常了。
以上是针对3.6版本的Solr所做的修改,其他版本不一定适用。
|