数据挖掘:提取百度知道QA中的影视信息
1. 背景
网站上爬取了部分关于影视的百度知道QA,为了后续提高影视的搜索效果,需要基于百度知道QA的答案抽取相关的影视信息。
2. 流程
目前已有基础的媒资视频库信息,基于媒资视频库中的视频名称,构建分词字典,结合使用AC双数组,然后针对百度的QA进行分词。针对分词后的结果,可以结合视频热度与评分进行筛选。
3. 代码实现
(1) 基于文本(格式为每行一条视频名称),结合AC双数组,构建分词
package com.test.model.act; import com.google.common.collect.Lists;
import com.test.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import static com.test.model.act.AhoCorasickDoubleArrayTrie.*;
import java.io.*;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap; /**
* @author test
* @date 2018/11/1
*/
public class Act { private static Logger logger = LoggerFactory.getLogger(Act.class); private static Act instance = null;
private static String path = "act";
private AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>(); public static Act getInstance() throws IOException, ClassNotFoundException {
if(null == instance){
instance = new Act();
}
return instance;
} public Act() throws IOException, ClassNotFoundException {
this.initTrie();
} /**
* AC自动机初始化
* @throws IOException
* @throws ClassNotFoundException
*/
private void initTrie() throws IOException, ClassNotFoundException {
if(new File(path).exists()){
FileInputStream fis = new FileInputStream(path);
ObjectInputStream ois = new ObjectInputStream(fis);
long curTime = System.currentTimeMillis()/1000;
act.load(ois);
logger.info("load act cost: " + (System.currentTimeMillis()/1000 - curTime));
}else{
TreeMap<String, Resource> treeMap = new TreeMap<>();
List<String> datas = IOUtil.getPreprocessedData("videoNames.txt");
for(String data : datas){
data = data.trim();
if(!treeMap.containsKey(data)){
Resource resource = new Resource(data);
treeMap.put(data, resource);
}
}
long curTime = System.currentTimeMillis()/1000;
act.build(treeMap);
logger.info("build act cost: " + (System.currentTimeMillis()/1000 - curTime)); curTime = System.currentTimeMillis()/1000;
act.save(new ObjectOutputStream(new FileOutputStream(path)));
logger.info("save act cost: " + (System.currentTimeMillis()/1000 - curTime));
}
} /**
* AC字段树最长匹配分词
* @param queryText
* @return
*/
public List<Term<Resource>> parse(String queryText){
final List<Term<Resource>> terms = Lists.newArrayList();
act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>(){
@Override
public void hit(int begin, int end, Resource value) {
Iterator<Term<Resource>> iterator = terms.iterator();
int length = end - begin;
boolean isSubStr = false;
while (iterator.hasNext()) {
Term<Resource> current = iterator.next();
// 相交且小于当前,移除
if (current.end >= begin && length > current.getLength()) {
iterator.remove();
}
if(current.getValue().getValue().contains(value.getValue())){
isSubStr = true;
}
}
if(!isSubStr){
terms.add(new Term<Resource>(begin, end, value));
}
}
});
return terms;
} public List<String> neatSplitResult(List<Term<Resource>> terms){
List<String> dupResults = Lists.newArrayList();
for(int j = terms.size() - 1; j > 0; j --){
String termJ = terms.get(j).getValue().getValue();
if(!terms.get(j-1).getValue().getValue().endsWith(termJ)){
dupResults.add(termJ);
}
}
dupResults.add(terms.get(0).getValue().getValue()); List<String> results = Lists.newArrayList();
for(int j = dupResults.size() - 1; j >= 0; j--){
results.add(dupResults.get(j));
}
return results;
}
}
(2) 引用的AhoCorasickDoubleArrayTrie
package com.test.model.act; import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.*;
import java.util.concurrent.LinkedBlockingDeque; /**
* An implementation of Aho Corasick algorithm based on Double Array Trie
*
* @author hankcs
*/
public class AhoCorasickDoubleArrayTrie<V> implements Serializable {
/**
* check array of the Double Array Trie structure
*/
protected int check[];
/**
* base array of the Double Array Trie structure
*/
protected int base[];
/**
* fail table of the Aho Corasick automata
*/
protected int fail[];
/**
* output table of the Aho Corasick automata
*/
protected int[][] output;
/**
* outer value array
*/
protected V[] v; /**
* the length of every key
*/
protected int[] l; /**
* the size of base and check array
*/
protected int size; /**
* Parse text
*
* @param text The text
* @return a list of outputs
*/
public List<Hit<V>> parseText(String text) {
int position = 1;
int currentState = 0;
List<Hit<V>> collectedEmits = new LinkedList<Hit<V>>();
for (int i = 0; i < text.length(); ++i) {
currentState = getState(currentState, text.charAt(i));
storeEmits(position, currentState, collectedEmits);
++position;
} return collectedEmits;
} /**
* Parse text
*
* @param text The text
* @param processor A processor which handles the output
*/
public void parseText(String text, IHit<V> processor) {
int position = 1;
int currentState = 0;
for (int i = 0; i < text.length(); ++i) {
currentState = getState(currentState, text.charAt(i));
int[] hitArray = output[currentState];
if (hitArray != null) {
for (int hit : hitArray) {
processor.hit(position - l[hit], position, v[hit]);
}
}
++position;
}
} /**
* Parse text
*
* @param text The text
* @param processor A processor which handles the output
*/
public void parseText(String text, IHitCancellable<V> processor) {
int currentState = 0;
for (int i = 0; i < text.length(); i++) {
final int position = i + 1;
currentState = getState(currentState, text.charAt(i));
int[] hitArray = output[currentState];
if (hitArray != null) {
for (int hit : hitArray) {
boolean proceed = processor.hit(position - l[hit], position, v[hit]);
if (!proceed) {
return;
}
}
}
}
} /**
* Parse text
*
* @param text The text
* @param processor A processor which handles the output
*/
public void parseText(char[] text, IHit<V> processor) {
int position = 1;
int currentState = 0;
for (char c : text) {
currentState = getState(currentState, c);
int[] hitArray = output[currentState];
if (hitArray != null) {
for (int hit : hitArray) {
processor.hit(position - l[hit], position, v[hit]);
}
}
++position;
}
} /**
* Parse text
*
* @param text The text
* @param processor A processor which handles the output
*/
public void parseText(char[] text, IHitFull<V> processor) {
int position = 1;
int currentState = 0;
for (char c : text) {
currentState = getState(currentState, c);
int[] hitArray = output[currentState];
if (hitArray != null) {
for (int hit : hitArray) {
processor.hit(position - l[hit], position, v[hit], hit);
}
}
++position;
}
} /**
* Save
*
* @param out An ObjectOutputStream object
* @throws IOException Some IOException
*/
public void save(ObjectOutputStream out) throws IOException {
out.writeObject(base);
out.writeObject(check);
out.writeObject(fail);
out.writeObject(output);
out.writeObject(l);
out.writeObject(v);
} /**
* Load
*
* @param in An ObjectInputStream object
* @throws IOException
* @throws ClassNotFoundException
*/
public void load(ObjectInputStream in) throws IOException, ClassNotFoundException {
base = (int[]) in.readObject();
check = (int[]) in.readObject();
fail = (int[]) in.readObject();
output = (int[][]) in.readObject();
l = (int[]) in.readObject();
v = (V[]) in.readObject();
} /**
* Get value by a String key, just like a map.get() method
*
* @param key The key
* @return
*/
public V get(String key) {
int index = exactMatchSearch(key);
if (index >= 0) {
return v[index];
} return null;
} /**
* Pick the value by index in value array <br>
* Notice that to be more efficiently, this method DONOT check the parameter
*
* @param index The index
* @return The value
*/
public V get(int index) {
return v[index];
} /**
* Processor handles the output when hit a keyword
*/
public interface IHit<V> {
/**
* Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
*
* @param begin the beginning index, inclusive.
* @param end the ending index, exclusive.
* @param value the value assigned to the keyword
*/
void hit(int begin, int end, V value);
} /**
* Processor handles the output when hit a keyword, with more detail
*/
public interface IHitFull<V> {
/**
* Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
*
* @param begin the beginning index, inclusive.
* @param end the ending index, exclusive.
* @param value the value assigned to the keyword
* @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value
*/
void hit(int begin, int end, V value, int index);
} /**
* Callback that allows to cancel the search process.
*/
public interface IHitCancellable<V> {
/**
* Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
*
* @param begin the beginning index, inclusive.
* @param end the ending index, exclusive.
* @param value the value assigned to the keyword
* @return Return true for continuing the search and false for stopping it.
*/
boolean hit(int begin, int end, V value);
} /**
* A result output
*
* @param <V> the value type
*/
public class Hit<V> {
/**
* the beginning index, inclusive.
*/
public final int begin;
/**
* the ending index, exclusive.
*/
public final int end;
/**
* the value assigned to the keyword
*/
public final V value; public Hit(int begin, int end, V value) {
this.begin = begin;
this.end = end;
this.value = value;
} @Override
public String toString() {
return String.format("[%d:%d]=%s", begin, end, value);
}
} /**
* transmit state, supports failure function
*
* @param currentState
* @param character
* @return
*/
private int getState(int currentState, char character) {
// 先按success跳转
int newCurrentState = transitionWithRoot(currentState, character);
// 跳转失败的话,按failure跳转
while (newCurrentState == -1)
{
currentState = fail[currentState];
newCurrentState = transitionWithRoot(currentState, character);
}
return newCurrentState;
} /**
* store output
*
* @param position
* @param currentState
* @param collectedEmits
*/
private void storeEmits(int position, int currentState, List<Hit<V>> collectedEmits) {
int[] hitArray = output[currentState];
if (hitArray != null) {
for (int hit : hitArray) {
collectedEmits.add(new Hit<V>(position - l[hit], position, v[hit]));
}
}
} /**
* transition of a state
*
* @param current
* @param c
* @return
*/
protected int transition(int current, char c) {
int b = current;
int p; p = b + c + 1;
if (b == check[p]) {
b = base[p]; } else {
return -1;
}
p = b;
return p;
} /**
* transition of a state, if the state is root and it failed, then returns the root
*
* @param nodePos
* @param c
* @return
*/
protected int transitionWithRoot(int nodePos, char c) {
int b = base[nodePos];
int p; p = b + c + 1;
if (b != check[p]) {
if (nodePos == 0) {
return 0;
}
return -1;
} return p;
} /**
* Build a AhoCorasickDoubleArrayTrie from a map
*
* @param map a map containing key-value pairs
*/
public void build(Map<String, V> map) {
new Builder().build(map);
} /**
* match exactly by a key
*
* @param key the key
* @return the index of the key, you can use it as a perfect hash function
*/
public int exactMatchSearch(String key) {
return exactMatchSearch(key, 0, 0, 0);
} /**
* match exactly by a key
*
* @param key
* @param pos
* @param len
* @param nodePos
* @return
*/
private int exactMatchSearch(String key, int pos, int len, int nodePos) {
if (len <= 0){
len = key.length();
}
if (nodePos <= 0){
nodePos = 0;
} int result = -1; char[] keyChars = key.toCharArray(); int b = base[nodePos];
int p; for (int i = pos; i < len; i++) {
p = b + (int) (keyChars[i]) + 1;
if (b == check[p]) {
b = base[p];
} else {
return result;
}
} p = b;
int n = base[p];
if (b == check[p] && n < 0) {
result = -n - 1;
}
return result;
} /**
* match exactly by a key
*
* @param keyChars the char array of the key
* @param pos the begin index of char array
* @param len the length of the key
* @param nodePos the starting position of the node for searching
* @return the value index of the key, minus indicates null
*/
private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos) {
int result = -1; int b = base[nodePos];
int p; for (int i = pos; i < len; i++) {
p = b + (int) (keyChars[i]) + 1;
if (b == check[p]){
b = base[p];
} else {
return result;
}
} p = b;
int n = base[p];
if (b == check[p] && n < 0) {
result = -n - 1;
}
return result;
} /**
* Get the size of the keywords
*
* @return
*/
public int size() {
return v.length;
} /**
* A builder to build the AhoCorasickDoubleArrayTrie
*/
private class Builder {
/**
* the root state of trie
*/
private State rootState = new State();
/**
* whether the position has been used
*/
private boolean used[];
/**
* the allocSize of the dynamic array
*/
private int allocSize;
/**
* a parameter controls the memory growth speed of the dynamic array
*/
private int progress;
/**
* the next position to check unused memory
*/
private int nextCheckPos;
/**
* the size of the key-pair sets
*/
private int keySize; /**
* Build from a map
*
* @param map a map containing key-value pairs
*/
@SuppressWarnings("unchecked")
public void build(Map<String, V> map) {
// 把值保存下来
v = (V[]) map.values().toArray();
l = new int[v.length];
Set<String> keySet = map.keySet();
// 构建二分trie树
addAllKeyword(keySet);
// 在二分trie树的基础上构建双数组trie树
buildDoubleArrayTrie(keySet.size());
used = null;
// 构建failure表并且合并output表
constructFailureStates();
rootState = null;
loseWeight();
} /**
* fetch siblings of a parent node
*
* @param parent parent node
* @param siblings parent node's child nodes, i . e . the siblings
* @return the amount of the siblings
*/
private int fetch(State parent, List<Map.Entry<Integer, State>> siblings) {
if (parent.isAcceptable()) {
State fakeNode = new State(-(parent.getDepth() + 1)); // 此节点是parent的子节点,同时具备parent的输出
fakeNode.addEmit(parent.getLargestValueId());
siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode));
}
for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet()) {
siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue()));
}
return siblings.size();
} /**
* add a keyword
*
* @param keyword a keyword
* @param index the index of the keyword
*/
private void addKeyword(String keyword, int index) {
State currentState = this.rootState;
for (Character character : keyword.toCharArray()) {
currentState = currentState.addState(character);
}
currentState.addEmit(index);
l[index] = keyword.length();
} /**
* add a collection of keywords
*
* @param keywordSet the collection holding keywords
*/
private void addAllKeyword(Collection<String> keywordSet) {
int i = 0;
for (String keyword : keywordSet) {
addKeyword(keyword, i++);
}
} /**
* construct failure table
*/
private void constructFailureStates() {
fail = new int[size + 1];
fail[1] = base[0];
output = new int[size + 1][];
Queue<State> queue = new LinkedBlockingDeque<State>(); // 第一步,将深度为1的节点的failure设为根节点
for (State depthOneState : this.rootState.getStates()) {
depthOneState.setFailure(this.rootState, fail);
queue.add(depthOneState);
constructOutput(depthOneState);
} // 第二步,为深度 > 1 的节点建立failure表,这是一个bfs
while (!queue.isEmpty()) {
State currentState = queue.remove(); for (Character transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState); State traceFailureState = currentState.failure();
while (traceFailureState.nextState(transition) == null) {
traceFailureState = traceFailureState.failure();
}
State newFailureState = traceFailureState.nextState(transition);
targetState.setFailure(newFailureState, fail);
targetState.addEmit(newFailureState.emit());
constructOutput(targetState);
}
}
} /**
* construct output table
*/
private void constructOutput(State targetState) {
Collection<Integer> emit = targetState.emit();
if (emit == null || emit.size() == 0) {
return;
}
int output[] = new int[emit.size()];
Iterator<Integer> it = emit.iterator();
for (int i = 0; i < output.length; ++i) {
output[i] = it.next();
}
AhoCorasickDoubleArrayTrie.this.output[targetState.getIndex()] = output;
} private void buildDoubleArrayTrie(int keySize) {
progress = 0;
this.keySize = keySize;
resize(65536 * 32); // 32个双字节 base[0] = 1;
nextCheckPos = 0; State root_node = this.rootState; List<Map.Entry<Integer, State>> siblings = new ArrayList<Map.Entry<Integer, State>>(root_node.getSuccess().entrySet().size());
fetch(root_node, siblings);
insert(siblings);
} /**
* allocate the memory of the dynamic array
*
* @param newSize
* @return
*/
private int resize(int newSize) {
int[] base2 = new int[newSize];
int[] check2 = new int[newSize];
boolean used2[] = new boolean[newSize];
if (allocSize > 0) {
System.arraycopy(base, 0, base2, 0, allocSize);
System.arraycopy(check, 0, check2, 0, allocSize);
System.arraycopy(used, 0, used2, 0, allocSize);
} base = base2;
check = check2;
used = used2; return allocSize = newSize;
} /**
* insert the siblings to double array trie
*
* @param siblings the siblings being inserted
* @return the position to insert them
*/
private int insert(List<Map.Entry<Integer, State>> siblings) {
int begin = 0;
int pos = Math.max(siblings.get(0).getKey() + 1, nextCheckPos) - 1;
int nonzero_num = 0;
int first = 0; if (allocSize <= pos){
resize(pos + 1);
} outer:
// 此循环体的目标是找出满足base[begin + a1...an] == 0的n个空闲空间,a1...an是siblings中的n个节点
while (true) {
pos++; if (allocSize <= pos){
resize(pos + 1);
} if (check[pos] != 0) {
nonzero_num++;
continue;
} else if (first == 0) {
nextCheckPos = pos;
first = 1;
} // 当前位置离第一个兄弟节点的距离
begin = pos - siblings.get(0).getKey();
if (allocSize <= (begin + siblings.get(siblings.size() - 1).getKey())) {
// progress can be zero // 防止progress产生除零错误
double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1);
resize((int) (allocSize * l));
} if (used[begin]){
continue;
} for (int i = 1; i < siblings.size(); i++){
if (check[begin + siblings.get(i).getKey()] != 0){
continue outer;
}
}
break;
} // -- Simple heuristics --
// if the percentage of non-empty contents in check between the
// index
// 'next_check_pos' and 'check' is greater than some constant value
// (e.g. 0.9),
// new 'next_check_pos' index is written by 'check'.
if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95){
// 从位置 next_check_pos 开始到 pos 间,如果已占用的空间在95%以上,下次插入节点时,直接从 pos 位置处开始查找
nextCheckPos = pos;
}
used[begin] = true; size = (size > begin + siblings.get(siblings.size() - 1).getKey() + 1) ? size : begin + siblings.get(siblings.size() - 1).getKey() + 1; for (Map.Entry<Integer, State> sibling : siblings) {
check[begin + sibling.getKey()] = begin;
} for (Map.Entry<Integer, State> sibling : siblings) {
List<Map.Entry<Integer, State>> new_siblings = new ArrayList<Map.Entry<Integer, State>>(sibling.getValue().getSuccess().entrySet().size() + 1); // 一个词的终止且不为其他词的前缀,其实就是叶子节点
if (fetch(sibling.getValue(), new_siblings) == 0) {
base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1);
progress++;
} else {
// dfs
int h = insert(new_siblings);
base[begin + sibling.getKey()] = h;
}
sibling.getValue().setIndex(begin + sibling.getKey());
}
return begin;
} /**
* free the unnecessary memory
*/
private void loseWeight() {
int nbase[] = new int[size + 65535];
System.arraycopy(base, 0, nbase, 0, size);
base = nbase; int ncheck[] = new int[size + 65535];
System.arraycopy(check, 0, ncheck, 0, size);
check = ncheck;
}
} public static class Term<T> {
public int start;
public int end;
public T value; public Term(int start, int end, T value) {
this.start = start;
this.end = end;
this.value = value;
} public int getLength() {
return end - start;
} public int getStart() {
return start;
} public void setStart(int start) {
this.start = start;
} public int getEnd() {
return end;
} public void setEnd(int end) {
this.end = end;
} public T getValue() {
return value;
} public void setValue(T value) {
this.value = value;
}
} public static class Resource implements Serializable{
String value;
String rawData; public Resource(String value) {
this.value = value;
} public Resource(String value, String rawData) {
this.value = value;
this.rawData = rawData;
} public String getValue() {
return value;
} public void setValue(String value) {
this.value = value;
} public String getRawData() {
return rawData;
} public void setRawData(String rawData) {
this.rawData = rawData;
} @Override
public String toString() {
return "Resource{" +
"value='" + value + '\'' +
", rawData='" + rawData + '\'' +
'}';
}
}
}
(3) 引用的State
package com.test.model.act; import java.util.*; /**
* <p>
* 一个状态有如下几个功能
* </p>
* <p/>
* <ul>
* <li>success; 成功转移到另一个状态</li>
* <li>failure; 不可顺着字符串跳转的话,则跳转到一个浅一点的节点</li>
* <li>emits; 命中一个模式串</li>
* </ul>
* <p/>
* <p>
* 根节点稍有不同,根节点没有 failure 功能,它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。
* </p>
*
* @author Robert Bor
*/
public class State { /**
* 模式串的长度,也是这个状态的深度
*/
protected final int depth; protected Character character; /**
* fail 函数,如果没有匹配到,则跳转到此状态。
*/
private State failure = null; /**
* 只要这个状态可达,则记录模式串
*/
private Set<Integer> emits = null;
/**
* goto 表,也称转移函数。根据字符串的下一个字符转移到下一个状态
*/
private Map<Character, State> success = new TreeMap<Character, State>(); /**
* 在双数组中的对应下标
*/
private int index; /**
* 构造深度为0的节点
*/
public State() {
this(0);
} /**
* 构造深度为depth的节点
*
* @param depth
*/
public State(int depth) {
this.depth = depth;
} /**
* 获取节点深度
*
* @return
*/
public int getDepth() {
return this.depth;
} /**
* 添加一个匹配到的模式串(这个状态对应着这个模式串)
*
* @param keyword
*/
public void addEmit(int keyword) {
if (this.emits == null) {
this.emits = new TreeSet<Integer>(Collections.reverseOrder());
}
this.emits.add(keyword);
} /**
* 获取最大的值
*
* @return
*/
public Integer getLargestValueId() {
if (emits == null || emits.size() == 0) {
return null;
} return emits.iterator().next();
} /**
* 添加一些匹配到的模式串
*
* @param emits
*/
public void addEmit(Collection<Integer> emits) {
for (int emit : emits) {
addEmit(emit);
}
} /**
* 获取这个节点代表的模式串(们)
*
* @return
*/
public Collection<Integer> emit() {
return this.emits == null ? Collections.<Integer>emptyList() : this.emits;
} /**
* 是否是终止状态
*
* @return
*/
public boolean isAcceptable() {
return this.depth > 0 && this.emits != null;
} /**
* 获取failure状态
*
* @return
*/
public State failure() {
return this.failure;
} /**
* 设置failure状态
*
* @param failState
*/
public void setFailure(State failState, int fail[]) {
this.failure = failState;
fail[index] = failState.index;
} /**
* 转移到下一个状态
*
* @param character 希望按此字符转移
* @param ignoreRootState 是否忽略根节点,如果是根节点自己调用则应该是true,否则为false
* @return 转移结果
*/
private State nextState(Character character, boolean ignoreRootState) {
State nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.depth == 0) {
nextState = this;
}
return nextState;
} /**
* 按照character转移,根节点转移失败会返回自己(永远不会返回null)
*
* @param character
* @return
*/
public State nextState(Character character) {
return nextState(character, false);
} /**
* 按照character转移,任何节点转移失败会返回null
*
* @param character
* @return
*/
public State nextStateIgnoreRootState(Character character) {
return nextState(character, true);
} public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character);
if (nextState == null) {
nextState = new State(this.depth + 1);
this.success.put(character, nextState);
}
return nextState;
} public Collection<State> getStates() {
return this.success.values();
} public Collection<Character> getTransitions() {
return this.success.keySet();
} @Override
public String toString() {
final StringBuilder sb = new StringBuilder("State{");
sb.append("depth=").append(depth);
sb.append(", ID=").append(index);
sb.append(", emits=").append(emits);
sb.append(", success=").append(success.keySet());
sb.append(", failureID=").append(failure == null ? "-1" : failure.index);
sb.append(", failure=").append(failure);
sb.append('}');
return sb.toString();
} /**
* 获取goto表
*
* @return
*/
public Map<Character, State> getSuccess() {
return success;
} public int getIndex() {
return index;
} public void setIndex(int index) {
this.index = index;
}
}
使用时只需要调用Act中的parse方法即可得到分词数据
4. 提取效果
(1) 原始百度QA,部分截图如图所示:
(2) 提取的结果
阿飞正传,龙凤智多星,堕落天使,旺角卡门,江湖龙虎斗,2046,恶男,繁花,春光乍泄,花样年华,猛鬼差馆,重庆森林,一代宗师,爱情谜语,君子好逑,我要金龟婿,猛鬼学堂,最后胜利,空心大少爷,再战江湖,爱神,小狐仙,北京之夏,蓝莓之夜,神勇双响炮续集,伊人再见,彩云曲
数据挖掘:提取百度知道QA中的影视信息的更多相关文章
- 使用Java POI来选择提取Word文档中的表格信息
通过使用Java POI来提取Word(1992)文档中的表格信息,其中POI支持不同的ms文档类型,在具体操作中需要注意.本文主要是通过POI来提取微软2003文档中的表格信息,具体code如下(事 ...
- WIRESHARK 提取百度云客户端中的下载地址
序言:近两年感觉云盘是越来越不行了,什么115,360,华为,新浪一些网盘纷纷关门,现在手里能用的就只剩下一个百度云坚挺着.可那速度,简直感人,好吧,没钱冲VIP,就去找了一个破解版,用了一年,爽得飞 ...
- C# 提取Word文档中的图片
C# 提取Word文档中的图片 图片和文字是word文档中两种最常见的对象,在微软word中,如果我们想要提取出一个文档内的图片,只需要右击图片选择另存为然后命名保存就可以了,今天这篇文章主要是实现使 ...
- php提取淘宝URL中ID的代码
一段可以提取淘宝URL中ID的PHP代码. 例如: <?php $taobao = 'taobao.com'; $tmall = 'tmall.com'; $guojitmall = 'tmal ...
- PHP 提取图片img标记中的任意属性
PHP 提取图片img标记中的任意属性的简单实例. 复制代码代码如下: <?php /* PHP正则提取图片img标记中的任意属性 */ $str = '<center><im ...
- SQL Server 2008空间数据应用系列十一:提取MapInfo地图数据中的空间数据解决方案
原文:SQL Server 2008空间数据应用系列十一:提取MapInfo地图数据中的空间数据解决方案 友情提示,您阅读本篇博文的先决条件如下: 1.本文示例基于Microsoft SQL Serv ...
- 百度搜索URL中的参数都是什么
最近,点石排名更新了一个新功能——站内搜索.其实理解起来也很简单,就是通过URL限定搜索结果为某个网站,从而参与点击(例如:https://www.baidu.com/s?wd=SEO&si= ...
- VBA/VBScript提取Word(*.doc)文件中包含的图片(照片)
VBA/VBScript提取Word(*.doc)文件中包含的图片(照片) 要处理的人事简历表是典型的Word文档,其中一人一份doc,里面包含有个人的照片,如果要把里面的照片复制出来就比较麻烦了 ...
- 使用 python 提取照片中的手机信息
使用 python 提取照片中的手机信息 最近在做一个项目,有一个很重要的点是需要获取使用用户的手机信息,这里我选择从照片中获取信息.有人会问为什么不从手机里面直接获取设备信息.由于现在android ...
随机推荐
- XML在C#与Unity3D中的实战运用
一.xml文件是什么? XML是指可扩展标记语言,英文:Extensible Markup Language的缩写. 二.xml文件有什么用? 用来传输和存储数据,比如网络通信,本地配置文件等. 三. ...
- java 中 ==
@Test public void fuu2(){ String a = new String("aw"); String b = new String("aw" ...
- [SinGuLaRiTy] 复习模板-数据结构
[SinGuLaRiTy-1040] Copyright (c) SinGuLaRiTy 2017. All Rights Reserved. 二维线段树 2D_Segment_Tree //示例:单 ...
- loj #2255. 「SNOI2017」炸弹
#2255. 「SNOI2017」炸弹 题目描述 在一条直线上有 NNN 个炸弹,每个炸弹的坐标是 XiX_iXi,爆炸半径是 RiR_iRi,当一个炸弹爆炸时,如果另一个炸弹所在位置 X ...
- Cygwin apt-cyg ”MD5 sum did not match”
直接编辑apt-cyg 文件,找到md5sum,替换修改成sha512sum. # check the md5 digest=`cat "desc" | awk '/^instal ...
- Liunux疑问
Liunux疑问 其中的各种软件的安装有模糊的点,待解决 待解决 待解决 待解决 ... ...
- 推荐分享一个牛X的自定义PHP加密解密类
通俗点说,用它来进行加密,同一个字符串,每次进行加密,得出的结果都是不一样的,大大加强了数据安全性.同时还可设定加密后数据的有效期,简直牛掰了 #食用方法 将下面的第二份模块代码保存为 Mcrypt. ...
- RFC3920
RFC3920 可扩展的消息和出席信息协议 (XMPP): 核心协议 关于本文的说明 本文为互联网社区定义了一个互联网标准跟踪协议,并且申请讨论协议和提出了改进的建议.请参照"互联网官方协议 ...
- [JLOI2015]管道连接(斯坦纳树)
[Luogu3264] 原题解 多个频道,每个频道的关键点要求相互联通 详见代码,非常巧妙 #include<cstdio> #include<iostream> #inclu ...
- Django 中 admin 的执行流程
Django 中 admin 的执行流程 1 循环加载执行所有已经注册的 app 中的 admin.py 文件 def autodiscover(): autodiscover_modules('ad ...