需求:

  根据oracle数据库中记录的文件名filename_html(多个文件名以逗号隔开)、文件路径path和备用路径bakpath,获取

主机172.21.0.31上对应的html文件内容并建立索引.这些html文件只能通过sftp访问.

(原文此处为一张示意图,图片数据已丢失.)

问题:

  目前的难点有两个:一是字段filename_html中可以有多个文件名,并且多个文件的内容要抽取到同一个索引字段content下面;二是文件只能通过sftp方式访问,而目前DIH组件中没有相应的SFTP数据源.

解决方法:

  引入jsch组件包.开发相应SFTP组件.

1.编写BinSFTPDataSource数据源,用于生成相应的InputStream流.编写过程中注意流的关闭,否则容易造成Too many open files异常.

package org.apache.solr.handler.dataimport;

import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow; import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
import com.jcraft.jsch.JSchException;
import com.jcraft.jsch.Session; import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import java.util.regex.Pattern; /**
*/
/**
 * DataImportHandler data source that returns the content of a remote file as an
 * {@link InputStream} fetched over SFTP (JSch).
 *
 * <p>One SSH session is opened in {@link #init(Context, Properties)} and reused for every
 * {@link #getData(String)} call; each call opens its own SFTP channel on that session.
 * Only the most recently returned stream is tracked: calling {@link #getData(String)} again
 * releases the previous stream and channel, so a caller must finish reading one file before
 * requesting the next. {@link #close()} releases everything, including the session.
 *
 * <p>Recognised init properties: {@code baseUrl}, {@code username}, {@code password},
 * {@code host}, {@code connectionTimeout} and {@code readTimeout} (both in milliseconds).
 */
public class BinSFTPDataSource extends DataSource<InputStream> {

  static final Logger LOG = LoggerFactory.getLogger(BinSFTPDataSource.class);

  static final Pattern URIMETHOD = Pattern.compile("sftp:/",
      Pattern.CASE_INSENSITIVE);

  public static final String ENCODING = "encoding";

  public static final String BASE_URL = "baseUrl";

  public static final String UTF_8 = "UTF-8";

  public static final String CONNECTION_TIMEOUT_FIELD_NAME = "connectionTimeout";

  public static final String READ_TIMEOUT_FIELD_NAME = "readTimeout";

  public static final int CONNECTION_TIMEOUT = 5000;

  public static final int READ_TIMEOUT = 10000;

  public static final String USERNAME = "username";

  public static final String PASSWORD = "password";

  public static final String HOST = "host";

  public static final int PORT = 22;

  /** SSH session shared by all channels; created in init, released in close. */
  private Session session;

  /** SFTP channel backing the stream most recently handed out by getData. */
  private ChannelSftp channel;

  /** Stream most recently handed out by getData. */
  private InputStream is;

  private String baseUrl;

  private String username;

  private String password;

  private String host;

  private int connectionTimeout = CONNECTION_TIMEOUT;

  private int readTimeout = READ_TIMEOUT;

  private Context context;

  private Properties initProps;

  public BinSFTPDataSource() {
  }

  /**
   * Reads the connection settings from {@code initProps} and opens the SSH session.
   *
   * <p>A connection failure is logged rather than thrown; the following
   * {@link #getData(String)} call then fails with a {@link DataImportHandlerException}.
   *
   * @param context   DIH context, used to resolve {@code ${...}} tokens in property values
   * @param initProps attributes of the {@code <dataSource>} element
   */
  @Override
  public void init(Context context, Properties initProps) {
    this.context = context;
    this.initProps = initProps;

    baseUrl = getInitPropWithReplacements(BASE_URL);
    String cTimeout = getInitPropWithReplacements(CONNECTION_TIMEOUT_FIELD_NAME);
    String rTimeout = getInitPropWithReplacements(READ_TIMEOUT_FIELD_NAME);
    username = getInitPropWithReplacements(USERNAME);
    password = getInitPropWithReplacements(PASSWORD);
    host = getInitPropWithReplacements(HOST);
    connectionTimeout = parseTimeout(cTimeout, connectionTimeout, "connection");
    readTimeout = parseTimeout(rTimeout, readTimeout, "read");

    try {
      JSch jsch = new JSch();
      session = jsch.getSession(username, host, PORT);
      if (password != null) {
        session.setPassword(password);
      }
      Properties config = new Properties();
      // NOTE(review): host key verification is disabled, which allows man-in-the-middle
      // attacks; supply a known_hosts file if the network is not trusted.
      config.put("StrictHostKeyChecking", "no");
      session.setConfig(config);
      session.setTimeout(readTimeout);
      session.connect(connectionTimeout);
    } catch (JSchException e) {
      close();
      LOG.error("Unable to open SFTP session to " + username + "@" + host + ":" + PORT, e);
    }
  }

  /**
   * Opens {@code filename} (prefixed with {@code baseUrl} when one is configured) on the
   * remote host and returns a stream over its content.
   *
   * <p>The stream and channel returned by a previous call are released first; without this
   * every call would leave one more SFTP channel open on the session.
   *
   * @param filename remote path, relative to {@code baseUrl} if that property is set
   * @return stream over the remote file, or {@code null} when {@code filename} is empty
   * @throws DataImportHandlerException (SEVERE) if the session is down or the file
   *         cannot be opened
   */
  @Override
  public InputStream getData(String filename) {
    if (StringUtils.isEmpty(filename)) {
      return null;
    }
    if (StringUtils.isNotEmpty(baseUrl)) {
      filename = baseUrl + filename;
    }
    // Free the resources of the previously served file before opening a new channel.
    releaseChannel();
    try {
      if (session == null) {
        throw new IllegalStateException("SFTP session to " + host + " was never established");
      }
      LOG.info("session isConnect:" + session.isConnected());
      channel = (ChannelSftp) session.openChannel("sftp");
      channel.connect();
      LOG.info("channel isConnect:" + channel.isConnected());
      is = channel.get(filename);
      return is;
    } catch (Exception e) {
      // Drop only the per-file resources; the session stays usable for the next file.
      releaseChannel();
      LOG.error("Exception thrown while getting data", e);
      wrapAndThrow(SEVERE, e, "Exception in invoking url " + filename);
      return null; // unreachable
    }
  }

  /**
   * Releases the current stream and channel and disconnects the SSH session.
   * Safe to call more than once.
   */
  @Override
  public void close() {
    releaseChannel();
    if (session != null) {
      session.disconnect();
      session = null;
    }
  }

  public String getBaseUrl() {
    return baseUrl;
  }

  /** Closes the tracked stream and disconnects its channel, leaving the session open. */
  private void releaseChannel() {
    if (is != null) {
      try {
        is.close();
      } catch (IOException e) {
        LOG.warn("Failed to close SFTP input stream", e);
      } finally {
        is = null;
      }
    }
    if (channel != null) {
      channel.disconnect();
      channel = null;
    }
  }

  /** Parses a timeout property, keeping {@code fallback} when it is absent or not a number. */
  private int parseTimeout(String value, int fallback, String label) {
    if (value == null) {
      return fallback;
    }
    try {
      return Integer.parseInt(value);
    } catch (NumberFormatException e) {
      LOG.warn("Invalid " + label + " timeout: " + value);
      return fallback;
    }
  }

  /** Returns the named init property with {@code ${...}} tokens resolved, or null if unset. */
  private String getInitPropWithReplacements(String propertyName) {
    final String expr = initProps.getProperty(propertyName);
    if (expr == null) {
      return null;
    }
    return context.replaceTokens(expr);
  }
}

2. 编写URLListEntityProcessor.java类,用于循环遍历多url文件.

package org.apache.solr.handler.dataimport;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map; /**
* 主要用于抽取多个文件内容.可以是本地主机也可以是远程主机上的文件
*/
/**
 * Entity processor that turns one delimited list of file names into one row per file.
 *
 * <p>Each emitted row holds a single {@code fileName} column whose value is
 * {@code baseDir + name}. A child entity (for example a TikaEntityProcessor) can then
 * fetch every file in turn through its own data source.
 *
 * <p>Entity attributes: {@code fileNames} (the delimited list), {@code regex} (the delimiter
 * as a regular expression, {@code ","} when omitted or empty), {@code baseDir} (required
 * prefix) and {@code recursive} (parsed but not used by this class).
 */
public class URLListEntityProcessor extends EntityProcessorBase {

  public static final String DIR = "fileDir";

  public static final String ABSOLUTE_FILE = "fileAbsolutePath";

  public static final String FILE_NAME = "fileName";

  public static final String FILE_NAMES = "fileNames";

  public static final String BASE_DIR = "baseDir";

  public static final String REGEX = "regex";

  public static final String RECURSIVE = "recursive";

  /** Delimiter applied when the entity declares no usable {@code regex} attribute. */
  private static final String DEFAULT_REGEX = ",";

  /** Delimited list of file names. */
  protected String fileNames;

  /** Regular expression that separates the entries of {@link #fileNames}. */
  protected String regex;

  /** Base directory given in data-config.xml; prepended to every file name. */
  protected String baseDir;

  /** The recursive flag given in data-config. Default value is false. */
  protected boolean recursive = false;

  /**
   * Reads the entity attributes and resolves {@code ${...}} tokens in them.
   *
   * @throws DataImportHandlerException (SEVERE) when {@code baseDir} is missing
   */
  @Override
  public void init(Context context) {
    super.init(context);
    fileNames = context.getEntityAttribute(FILE_NAMES);
    if (fileNames != null) {
      fileNames = context.replaceTokens(fileNames);
    }
    regex = context.getEntityAttribute(REGEX);
    if (regex != null) {
      regex = context.replaceTokens(regex);
    }
    baseDir = context.getEntityAttribute(BASE_DIR);
    if (baseDir == null) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "'baseDir' is a required attribute");
    }
    baseDir = context.replaceTokens(baseDir);
    String r = context.getEntityAttribute(RECURSIVE);
    if (r != null) {
      recursive = Boolean.parseBoolean(r);
    }
  }

  /**
   * Returns the next file row, building the row list on the first call.
   *
   * @return a map with the single key {@code fileName}, or {@code null} when exhausted
   */
  @Override
  public Map<String, Object> nextRow() {
    if (rowIterator != null) {
      return getNext();
    }
    List<Map<String, Object>> fileDetails = new ArrayList<Map<String, Object>>();
    getUrls(fileDetails);
    rowIterator = fileDetails.iterator();
    return getNext();
  }

  /**
   * Splits {@link #fileNames} and appends one row per non-blank entry to {@code fileDetails}.
   *
   * <p>A missing or empty list yields no rows. Blank entries (from an empty column value,
   * {@code "a,,b"} or a trailing delimiter) are skipped and surrounding whitespace is trimmed,
   * so no row ever points at the bare {@code baseDir}.
   */
  private void getUrls(final List<Map<String, Object>> fileDetails) {
    if (fileNames == null || fileNames.trim().isEmpty()) {
      return;
    }
    // An empty regex would split the list into single characters, so fall back to ",".
    final String separator = (regex == null || regex.isEmpty()) ? DEFAULT_REGEX : regex;
    for (String rawName : fileNames.split(separator)) {
      final String name = rawName.trim();
      if (name.isEmpty()) {
        continue;
      }
      Map<String, Object> details = new HashMap<String, Object>();
      details.put(FILE_NAME, baseDir + name);
      fileDetails.add(details);
    }
  }
}

3.配置data-config.xml文件:

<dataConfig>
<!-- JDBC source for the document metadata rows (entity "province"). -->
<dataSource name="jdbc" driver="oracle.jdbc.driver.OracleDriver"
url="jdbc:oracle:thin:@127.0.0.1:1522:ORCLLI" user="kms_iep" password="kms_iep" batchSize="2000"/>
<!-- SFTP source implemented by BinSFTPDataSource; the Tika entities below read their files through it. -->
<dataSource name="binSftp" type="BinSFTPDataSource"
username="kms" password="kms" host="127.0.0.1"
connectionTimeout="10000" readTimeout="20000" />
<document>
<entity pk="ID" dataSource="jdbc" name="province"
query="select (provincecode || '_' || kng_id) as id,
kng_id,
kng_type as type,
kng_title as title,
provincecode,
opertime,
modify_date,
url,
pack_month_fee,
pack_type,
pack_sen_flow,
filename_html,
('/kmsinterface/jt/province_bak/' || provincecode || '/' ||
to_char(opertime, 'yyyymmdd') || substr(filepath,instr(filepath,'/',2)) || '/'
) as path,
('/kmsinterface/' || provincecode ||filepath) as bakpath
from IEP_UPLOAD_DOCUMENT t
where kng_status = 0 and provincecode='ah' and to_char(opertime,'yyyy-mm-dd')='2014-12-24'
"
deltaQuery="select (provincecode || '_' || kng_id) as id from IEP_UPLOAD_DOCUMENT where kng_status = 0 and opertime &gt; to_date('${dih.last_index_time}','yyyy-mm-dd hh24:mi:ss') order by opertime asc"
deltaImportQuery="select * from (
select (provincecode || '_' || kng_id) as id,
kng_id,
kng_type as type,
kng_title as title,
provincecode,
opertime,
modify_date,
url,
pack_month_fee,
pack_type,
pack_sen_flow,
filename_html,
('/kmsinterface/jt/province_bak/' || provincecode || '/' ||
to_char(opertime, 'yyyymmdd') || substr(filepath,instr(filepath,'/',2)) || '/' ) as path,
('/kmsinterface/' || provincecode ||filepath) as bakpath
from IEP_UPLOAD_DOCUMENT t
where kng_status = 0 )
where id = '${dih.delta.ID}'"
deletePKQuery="select (provincecode || '_' || kng_id) as id from IEP_UPLOAD_DOCUMENT where kng_status = 1 and opertime &gt; to_date('${dih.last_index_time}','yyyy-mm-dd hh24:mi:ss') order by id desc"
transformer="DateFormatTransformer,RegexTransformer"
onError="skip">
<field column="ID" name="id" />
<field column="KNG_ID" name="kng_id" />
<field column="type" name="type" />
<field column="TITLE" name="title" />
<field column="PROVINCECODE" name="provincecode" />
<field column="OPERTIME" name="opertime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
<field column="MODIFY_DATE" name="modify_date" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
<field column="URL" name="url" />
<field column="PACK_MONTH_FEE" name="pack_month_fee" />
<field column="PACK_TYPE" name="pack_type" />
<field column="PACK_SEN_FLOW" name="pack_sen_flow" />
<!-- One row per name in FILENAME_HTML (comma separated), resolved under the primary PATH. -->
<entity name="urllist1" processor="URLListEntityProcessor"
baseDir="/kms/solr${province.PATH}"
fileNames="${province.FILENAME_HTML}" regex=",">
<!-- Parse the attachment. dataSource must name a declared data source: the files are
     fetched over SFTP, so this is "binSftp" (no data source called "url" is declared). -->
<entity name="test1" processor="TikaEntityProcessor" url="${urllist1.fileName}"
dataSource="binSftp" format="text"
transformer="HTMLStripTransformer,RegexTransformer" onError="skip">
<field column="text" name="content" stripHTML="true" regex="\t|\r|\n|\s" replaceWith="" />
</entity>
</entity>
<!-- Same file names again, resolved under the backup BAKPATH. -->
<entity name="urllist2" processor="URLListEntityProcessor"
baseDir="/kms/solr${province.BAKPATH}"
fileNames="${province.FILENAME_HTML}" regex=",">
<entity name="test2" processor="TikaEntityProcessor" url="${urllist2.fileName}"
dataSource="binSftp" format="text"
transformer="HTMLStripTransformer,RegexTransformer" onError="skip">
<field column="text" name="content" stripHTML="true" regex="\t|\r|\n|\s" replaceWith="" />
</entity>
</entity>
</entity>
</document>
</dataConfig>

solr中通过SFTP访问文件建立索引的更多相关文章

  1. 利用Lucene将被索引文件目录中的所有文件建立索引

    1.新建两个文件夹htm和index,其中htm中存放被索引的文件,index文件中存放建立的索引文件. 2.新建解析目录中所有文件的类,用来解析指定目录下的所有文件. import java.io. ...

  2. Solr学习笔记之3、Solr dataimport - 从SQLServer导入数据建立索引

    Solr学习笔记之3.Solr导入SQLServer数据建立索引 一.下载MSSQLServer的JDBC驱动 下载:Microsoft JDBC Driver 4.0 for SQL Server ...

  3. SQL Server 2008中如何为XML字段建立索引

    from:http://blog.csdn.net/tjvictor/article/details/4370771 SQL Server中的XML索引分为两类:主XML 索引和辅助XML索引.其中辅 ...

  4. 如何在Linux中使用sFTP上传或下载文件与文件夹

    如何在Linux中使用sFTP上传或下载文件与文件夹 sFTP(安全文件传输程序)是一种安全的交互式文件传输程序,其工作方式与 FTP(文件传输协议)类似. 然而,sFTP 比 FTP 更安全;它通过 ...

  5. SQL 数据优化之不建立索引的情况

    索引可以提高数据的检索效率,也可以降低数据库的IO成本,并且索引还可以降低数据库的排序成本.排序分组操作主要消耗的就是CPU资源和内存,所以能够在排序分组操作中好好的利用索引将会极大地降低CPU资源的 ...

  6. MYSQL建立索引需要注意几点

    1.建立索引的时机:若表中的某字段出现在select.过滤.排序条件中,为该字段建立索引是值得的.2.对于like '%xxx'的模糊查询,普通的索引是无法满足的,需要建立全文索引.3.对于有多个条件 ...

  7. SQL Server MYSQL 对外键建立索引的必要性

    背景: 大家知道在定义外键时,都会给出on delete .....   on update .....: 这里指定的就是当主表的列发生变化时,从表的列要用怎么样的变化去迎合.对从表中的外键,建立索引 ...

  8. Python第五天 文件访问 for循环访问文件 while循环访问文件 字符串的startswith函数和split函数 linecache模块

    Python第五天   文件访问    for循环访问文件    while循环访问文件   字符串的startswith函数和split函数  linecache模块 目录 Pycharm使用技巧( ...

  9. Java随机访问文件

    使用随机访问文件,我们可以从文件读取以及写入文件.使用文件输入和输出流的读取和写入是顺序过程.使用随机访问文件,可以在文件中的任何位置读取或写入.RandomAccessFile类的一个对象可以进行随 ...

随机推荐

  1. 构建Spark作业

    首先,要清楚,一个Java或Scala或python实现的Spark作业. 1.用sbt构建Spark作业 2.用Maven构建Spark作业 3.用non-maven-aware工具构建Spark作 ...

  2. c#读properties文件

    @(编程) properties文件 MONGO_URL = mongodb://172.16.20.3/srtc_dc CURRENT_VERSION = 2.0 IS_AUTO_UPDATE = ...

  3. HTTP中缓存相关

    1.客户端如何区分缓存命中和未命中 两种情况下,返回的状态码都是200,客户端有一个方法可以判断,就是使用Date首部,将Date首部与当前时间进行比较,如果响应中时间日期值比较早,客户端可以认为这是 ...

  4. UDP广播问题

    http://bbs.csdn.net/topics/390218123 Broadcast Address(广播地址)是专门用于同时向网络中所有工作站进行发送的一个地址.在使用TCP/IP 协议的网 ...

  5. *** WARNING L1: UNRESOLVED EXTERNAL SYMBOL

    kei编译时提示: *** WARNING L1: UNRESOLVED EXTERNAL SYMBOL *** WARNING L1:reference made to unresolved ext ...

  6. HDU 2859 Phalanx (dp)

    题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=2859 给你一个n*n的矩阵,问你最大的对称度是多少(左下右上为对称线) dp[i][j]表示i行j列元 ...

  7. ZOJ1648 Circuit Board(线段相交)

    裸的判断线段相交

  8. 解决sencha touch显示.JSON包含中文数据时显示乱码问题

    按照ST官方示例navigationview做的一个示例.数据源是一个.json文件.但是显示的时候如果.json文件里有中文则乱码.我知道是编码问题,但是不知道怎么改,如何改. 问了N个人最后解决方 ...

  9. android 简易定时器

    定时器 1.在android 应用开发当中,很多时候都要用到定时器,而要实现定时器更多的时候要用到两个类:Timer,和TimerTask 2.API对Timer的解释是:

  10. 以静态变量保存 Spring ApplicationContext

    package com.thinkgem.jeesite.common.utils; import java.net.HttpURLConnection; import java.net.URL; i ...