数据挖掘：提取百度知道QA中的影视信息

1. 背景

网站上爬取了部分关于影视的百度知道QA，为了后续提高影视的搜索效果，需要基于百度知道QA的答案抽取相关的影视信息。

2. 流程

目前已有基础的媒资视频库信息，基于媒资视频库中的视频名称，构建分词字典，结合使用AC双数组，然后针对百度的QA进行分词。针对分词后的结果，可以结合视频热度与评分进行筛选。

3. 代码实现

(1) 基于文本(格式为每行一条视频名称)，结合AC双数组，构建分词

package com.test.model.act;

import com.google.common.collect.Lists;

import com.test.util.IOUtil;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import static com.test.model.act.AhoCorasickDoubleArrayTrie.*;

import java.io.*;

import java.util.Iterator;

import java.util.List;

import java.util.TreeMap;

/**

 * @author test

 * @date 2018/11/1

 */

public class Act {

    private static Logger logger = LoggerFactory.getLogger(Act.class);

    private static Act instance = null;

    private static String path = "act";

    private AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>();

    public static Act getInstance() throws IOException, ClassNotFoundException {

        if(null == instance){

            instance = new Act();

        }

        return instance;

    }

    public Act() throws IOException, ClassNotFoundException {

        this.initTrie();

    }

    /**

     * AC自动机初始化

     * @throws IOException

     * @throws ClassNotFoundException

     */

    private void initTrie() throws IOException, ClassNotFoundException {

        if(new File(path).exists()){

            FileInputStream fis = new FileInputStream(path);

            ObjectInputStream ois = new ObjectInputStream(fis);

            long curTime = System.currentTimeMillis()/1000;

            act.load(ois);

            logger.info("load act cost: " + (System.currentTimeMillis()/1000 - curTime));

        }else{

            TreeMap<String, Resource> treeMap = new TreeMap<>();

            List<String> datas = IOUtil.getPreprocessedData("videoNames.txt");

            for(String data : datas){

                data = data.trim();

                if(!treeMap.containsKey(data)){

                    Resource resource = new Resource(data);

                    treeMap.put(data, resource);

                }

            }

            long curTime = System.currentTimeMillis()/1000;

            act.build(treeMap);

            logger.info("build act cost: " + (System.currentTimeMillis()/1000 - curTime));

            curTime = System.currentTimeMillis()/1000;

            act.save(new ObjectOutputStream(new FileOutputStream(path)));

            logger.info("save act cost: " + (System.currentTimeMillis()/1000 - curTime));

        }

    }

    /**

     * AC字段树最长匹配分词

     * @param queryText

     * @return

     */

    public List<Term<Resource>> parse(String queryText){

        final List<Term<Resource>> terms = Lists.newArrayList();

        act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>(){

            @Override

            public void hit(int begin, int end, Resource value) {

                Iterator<Term<Resource>> iterator = terms.iterator();

                int length = end - begin;

                boolean isSubStr = false;

                while (iterator.hasNext()) {

                    Term<Resource> current = iterator.next();

                    // 相交且小于当前，移除

                    if (current.end >= begin && length > current.getLength()) {

                        iterator.remove();

                    }

                    if(current.getValue().getValue().contains(value.getValue())){

                        isSubStr = true;

                    }

                }

                if(!isSubStr){

                    terms.add(new Term<Resource>(begin, end, value));

                }

            }

        });

        return terms;

    }

    public List<String> neatSplitResult(List<Term<Resource>> terms){

        List<String> dupResults = Lists.newArrayList();

        for(int j = terms.size() - 1; j > 0; j --){

            String termJ = terms.get(j).getValue().getValue();

            if(!terms.get(j-1).getValue().getValue().endsWith(termJ)){

                dupResults.add(termJ);

            }

        }

        dupResults.add(terms.get(0).getValue().getValue());

        List<String> results = Lists.newArrayList();

        for(int j = dupResults.size() - 1; j >= 0; j--){

            results.add(dupResults.get(j));

        }

        return results;

    }

}

(2) 引用的AhoCorasickDoubleArrayTrie

package com.test.model.act;

import java.io.IOException;

import java.io.ObjectInputStream;

import java.io.ObjectOutputStream;

import java.io.Serializable;

import java.util.*;

import java.util.concurrent.LinkedBlockingDeque;

/**

 * An implementation of Aho Corasick algorithm based on Double Array Trie

 *

 * @author hankcs

 */

public class AhoCorasickDoubleArrayTrie<V> implements Serializable {

    /**

     * check array of the Double Array Trie structure

     */

    protected int check[];

    /**

     * base array of the Double Array Trie structure

     */

    protected int base[];

    /**

     * fail table of the Aho Corasick automata

     */

    protected int fail[];

    /**

     * output table of the Aho Corasick automata

     */

    protected int[][] output;

    /**

     * outer value array

     */

    protected V[] v;

    /**

     * the length of every key

     */

    protected int[] l;

    /**

     * the size of base and check array

     */

    protected int size;

    /**

     * Parse text

     *

     * @param text The text

     * @return a list of outputs

     */

    public List<Hit<V>> parseText(String text) {

        int position = 1;

        int currentState = 0;

        List<Hit<V>> collectedEmits = new LinkedList<Hit<V>>();

        for (int i = 0; i < text.length(); ++i) {

            currentState = getState(currentState, text.charAt(i));

            storeEmits(position, currentState, collectedEmits);

            ++position;

        }

        return collectedEmits;

    }

    /**

     * Parse text

     *

     * @param text      The text

     * @param processor A processor which handles the output

     */

    public void parseText(String text, IHit<V> processor) {

        int position = 1;

        int currentState = 0;

        for (int i = 0; i < text.length(); ++i) {

            currentState = getState(currentState, text.charAt(i));

            int[] hitArray = output[currentState];

            if (hitArray != null) {

                for (int hit : hitArray) {

                    processor.hit(position - l[hit], position, v[hit]);

                }

            }

            ++position;

        }

    }

    /**

     * Parse text

     *

     * @param text      The text

     * @param processor A processor which handles the output

     */

    public void parseText(String text, IHitCancellable<V> processor) {

        int currentState = 0;

        for (int i = 0; i < text.length(); i++) {

            final int position = i + 1;

            currentState = getState(currentState, text.charAt(i));

            int[] hitArray = output[currentState];

            if (hitArray != null) {

                for (int hit : hitArray) {

                    boolean proceed = processor.hit(position - l[hit], position, v[hit]);

                    if (!proceed) {

                        return;

                    }

                }

            }

        }

    }

    /**

     * Parse text

     *

     * @param text      The text

     * @param processor A processor which handles the output

     */

    public void parseText(char[] text, IHit<V> processor) {

        int position = 1;

        int currentState = 0;

        for (char c : text) {

            currentState = getState(currentState, c);

            int[] hitArray = output[currentState];

            if (hitArray != null) {

                for (int hit : hitArray) {

                    processor.hit(position - l[hit], position, v[hit]);

                }

            }

            ++position;

        }

    }

    /**

     * Parse text

     *

     * @param text      The text

     * @param processor A processor which handles the output

     */

    public void parseText(char[] text, IHitFull<V> processor) {

        int position = 1;

        int currentState = 0;

        for (char c : text) {

            currentState = getState(currentState, c);

            int[] hitArray = output[currentState];

            if (hitArray != null) {

                for (int hit : hitArray) {

                    processor.hit(position - l[hit], position, v[hit], hit);

                }

            }

            ++position;

        }

    }

    /**

     * Save

     *

     * @param out An ObjectOutputStream object

     * @throws IOException Some IOException

     */

    public void save(ObjectOutputStream out) throws IOException {

        out.writeObject(base);

        out.writeObject(check);

        out.writeObject(fail);

        out.writeObject(output);

        out.writeObject(l);

        out.writeObject(v);

    }

    /**

     * Load

     *

     * @param in An ObjectInputStream object

     * @throws IOException

     * @throws ClassNotFoundException

     */

    public void load(ObjectInputStream in) throws IOException, ClassNotFoundException {

        base = (int[]) in.readObject();

        check = (int[]) in.readObject();

        fail = (int[]) in.readObject();

        output = (int[][]) in.readObject();

        l = (int[]) in.readObject();

        v = (V[]) in.readObject();

    }

    /**

     * Get value by a String key, just like a map.get() method

     *

     * @param key The key

     * @return

     */

    public V get(String key) {

        int index = exactMatchSearch(key);

        if (index >= 0) {

            return v[index];

        }

        return null;

    }

    /**

     * Pick the value by index in value array <br>

     * Notice that to be more efficiently, this method DONOT check the parameter

     *

     * @param index The index

     * @return The value

     */

    public V get(int index) {

        return v[index];

    }

    /**

     * Processor handles the output when hit a keyword

     */

    public interface IHit<V> {

        /**

         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword

         *

         * @param begin the beginning index, inclusive.

         * @param end   the ending index, exclusive.

         * @param value the value assigned to the keyword

         */

        void hit(int begin, int end, V value);

    }

    /**

     * Processor handles the output when hit a keyword, with more detail

     */

    public interface IHitFull<V> {

        /**

         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword

         *

         * @param begin the beginning index, inclusive.

         * @param end   the ending index, exclusive.

         * @param value the value assigned to the keyword

         * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value

         */

        void hit(int begin, int end, V value, int index);

    }

    /**

     * Callback that allows to cancel the search process.

     */

    public interface IHitCancellable<V> {

        /**

         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword

         *

         * @param begin the beginning index, inclusive.

         * @param end   the ending index, exclusive.

         * @param value the value assigned to the keyword

         * @return Return true for continuing the search and false for stopping it.

         */

        boolean hit(int begin, int end, V value);

    }

    /**

     * A result output

     *

     * @param <V> the value type

     */

    public class Hit<V> {

        /**

         * the beginning index, inclusive.

         */

        public final int begin;

        /**

         * the ending index, exclusive.

         */

        public final int end;

        /**

         * the value assigned to the keyword

         */

        public final V value;

        public Hit(int begin, int end, V value) {

            this.begin = begin;

            this.end = end;

            this.value = value;

        }

        @Override

        public String toString() {

            return String.format("[%d:%d]=%s", begin, end, value);

        }

    }

    /**

     * transmit state, supports failure function

     *

     * @param currentState

     * @param character

     * @return

     */

    private int getState(int currentState, char character) {

        // 先按success跳转

        int newCurrentState = transitionWithRoot(currentState, character);

        // 跳转失败的话，按failure跳转

        while (newCurrentState == -1)

        {

            currentState = fail[currentState];

            newCurrentState = transitionWithRoot(currentState, character);

        }

        return newCurrentState;

    }

    /**

     * store output

     *

     * @param position

     * @param currentState

     * @param collectedEmits

     */

    private void storeEmits(int position, int currentState, List<Hit<V>> collectedEmits) {

        int[] hitArray = output[currentState];

        if (hitArray != null) {

            for (int hit : hitArray) {

                collectedEmits.add(new Hit<V>(position - l[hit], position, v[hit]));

            }

        }

    }

    /**

     * transition of a state

     *

     * @param current

     * @param c

     * @return

     */

    protected int transition(int current, char c) {

        int b = current;

        int p;

        p = b + c + 1;

        if (b == check[p]) {

            b = base[p];

        } else {

            return -1;

        }

        p = b;

        return p;

    }

    /**

     * transition of a state, if the state is root and it failed, then returns the root

     *

     * @param nodePos

     * @param c

     * @return

     */

    protected int transitionWithRoot(int nodePos, char c) {

        int b = base[nodePos];

        int p;

        p = b + c + 1;

        if (b != check[p]) {

            if (nodePos == 0) {

                return 0;

            }

            return -1;

        }

        return p;

    }

    /**

     * Build a AhoCorasickDoubleArrayTrie from a map

     *

     * @param map a map containing key-value pairs

     */

    public void build(Map<String, V> map) {

        new Builder().build(map);

    }

    /**

     * match exactly by a key

     *

     * @param key the key

     * @return the index of the key, you can use it as a perfect hash function

     */

    public int exactMatchSearch(String key) {

        return exactMatchSearch(key, 0, 0, 0);

    }

    /**

     * match exactly by a key

     *

     * @param key

     * @param pos

     * @param len

     * @param nodePos

     * @return

     */

    private int exactMatchSearch(String key, int pos, int len, int nodePos) {

        if (len <= 0){

            len = key.length();

        }

        if (nodePos <= 0){

            nodePos = 0;

        }

        int result = -1;

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];

        int p;

        for (int i = pos; i < len; i++) {

            p = b + (int) (keyChars[i]) + 1;

            if (b == check[p]) {

                b = base[p];

            } else {

                return result;

            }

        }

        p = b;

        int n = base[p];

        if (b == check[p] && n < 0) {

            result = -n - 1;

        }

        return result;

    }

    /**

     * match exactly by a key

     *

     * @param keyChars the char array of the key

     * @param pos      the begin index of char array

     * @param len      the length of the key

     * @param nodePos  the starting position of the node for searching

     * @return the value index of the key, minus indicates null

     */

    private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos) {

        int result = -1;

        int b = base[nodePos];

        int p;

        for (int i = pos; i < len; i++) {

            p = b + (int) (keyChars[i]) + 1;

            if (b == check[p]){

                b = base[p];

            } else {

                return result;

            }

        }

        p = b;

        int n = base[p];

        if (b == check[p] && n < 0) {

            result = -n - 1;

        }

        return result;

    }

    /**

     * Get the size of the keywords

     *

     * @return

     */

    public int size() {

        return v.length;

    }

    /**

     * A builder to build the AhoCorasickDoubleArrayTrie

     */

    private class Builder {

        /**

         * the root state of trie

         */

        private State rootState = new State();

        /**

         * whether the position has been used

         */

        private boolean used[];

        /**

         * the allocSize of the dynamic array

         */

        private int allocSize;

        /**

         * a parameter controls the memory growth speed of the dynamic array

         */

        private int progress;

        /**

         * the next position to check unused memory

         */

        private int nextCheckPos;

        /**

         * the size of the key-pair sets

         */

        private int keySize;

        /**

         * Build from a map

         *

         * @param map a map containing key-value pairs

         */

        @SuppressWarnings("unchecked")

        public void build(Map<String, V> map) {

            // 把值保存下来

            v = (V[]) map.values().toArray();

            l = new int[v.length];

            Set<String> keySet = map.keySet();

            // 构建二分trie树

            addAllKeyword(keySet);

            // 在二分trie树的基础上构建双数组trie树

            buildDoubleArrayTrie(keySet.size());

            used = null;

            // 构建failure表并且合并output表

            constructFailureStates();

            rootState = null;

            loseWeight();

        }

        /**

         * fetch siblings of a parent node

         *

         * @param parent   parent node

         * @param siblings parent node's child nodes, i . e . the siblings

         * @return the amount of the siblings

         */

        private int fetch(State parent, List<Map.Entry<Integer, State>> siblings) {

            if (parent.isAcceptable()) {

                State fakeNode = new State(-(parent.getDepth() + 1));  // 此节点是parent的子节点，同时具备parent的输出

                fakeNode.addEmit(parent.getLargestValueId());

                siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode));

            }

            for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet()) {

                siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue()));

            }

            return siblings.size();

        }

        /**

         * add a keyword

         *

         * @param keyword a keyword

         * @param index   the index of the keyword

         */

        private void addKeyword(String keyword, int index) {

            State currentState = this.rootState;

            for (Character character : keyword.toCharArray()) {

                currentState = currentState.addState(character);

            }

            currentState.addEmit(index);

            l[index] = keyword.length();

        }

        /**

         * add a collection of keywords

         *

         * @param keywordSet the collection holding keywords

         */

        private void addAllKeyword(Collection<String> keywordSet) {

            int i = 0;

            for (String keyword : keywordSet) {

                addKeyword(keyword, i++);

            }

        }

        /**

         * construct failure table

         */

        private void constructFailureStates() {

            fail = new int[size + 1];

            fail[1] = base[0];

            output = new int[size + 1][];

            Queue<State> queue = new LinkedBlockingDeque<State>();

            // 第一步，将深度为1的节点的failure设为根节点

            for (State depthOneState : this.rootState.getStates()) {

                depthOneState.setFailure(this.rootState, fail);

                queue.add(depthOneState);

                constructOutput(depthOneState);

            }

            // 第二步，为深度 > 1 的节点建立failure表，这是一个bfs

            while (!queue.isEmpty()) {

                State currentState = queue.remove();

                for (Character transition : currentState.getTransitions()) {

                    State targetState = currentState.nextState(transition);

                    queue.add(targetState);

                    State traceFailureState = currentState.failure();

                    while (traceFailureState.nextState(transition) == null) {

                        traceFailureState = traceFailureState.failure();

                    }

                    State newFailureState = traceFailureState.nextState(transition);

                    targetState.setFailure(newFailureState, fail);

                    targetState.addEmit(newFailureState.emit());

                    constructOutput(targetState);

                }

            }

        }

        /**

         * construct output table

         */

        private void constructOutput(State targetState) {

            Collection<Integer> emit = targetState.emit();

            if (emit == null || emit.size() == 0) {

                return;

            }

            int output[] = new int[emit.size()];

            Iterator<Integer> it = emit.iterator();

            for (int i = 0; i < output.length; ++i) {

                output[i] = it.next();

            }

            AhoCorasickDoubleArrayTrie.this.output[targetState.getIndex()] = output;

        }

        private void buildDoubleArrayTrie(int keySize) {

            progress = 0;

            this.keySize = keySize;

            resize(65536 * 32); // 32个双字节

            base[0] = 1;

            nextCheckPos = 0;

            State root_node = this.rootState;

            List<Map.Entry<Integer, State>> siblings = new ArrayList<Map.Entry<Integer, State>>(root_node.getSuccess().entrySet().size());

            fetch(root_node, siblings);

            insert(siblings);

        }

        /**

         * allocate the memory of the dynamic array

         *

         * @param newSize

         * @return

         */

        private int resize(int newSize) {

            int[] base2 = new int[newSize];

            int[] check2 = new int[newSize];

            boolean used2[] = new boolean[newSize];

            if (allocSize > 0) {

                System.arraycopy(base, 0, base2, 0, allocSize);

                System.arraycopy(check, 0, check2, 0, allocSize);

                System.arraycopy(used, 0, used2, 0, allocSize);

            }

            base = base2;

            check = check2;

            used = used2;

            return allocSize = newSize;

        }

        /**

         * insert the siblings to double array trie

         *

         * @param siblings the siblings being inserted

         * @return the position to insert them

         */

        private int insert(List<Map.Entry<Integer, State>> siblings) {

            int begin = 0;

            int pos = Math.max(siblings.get(0).getKey() + 1, nextCheckPos) - 1;

            int nonzero_num = 0;

            int first = 0;

            if (allocSize <= pos){

                resize(pos + 1);

            }

            outer:

            // 此循环体的目标是找出满足base[begin + a1...an]  == 0的n个空闲空间,a1...an是siblings中的n个节点

            while (true) {

                pos++;

                if (allocSize <= pos){

                    resize(pos + 1);

                }

                if (check[pos] != 0) {

                    nonzero_num++;

                    continue;

                } else if (first == 0) {

                    nextCheckPos = pos;

                    first = 1;

                }

                // 当前位置离第一个兄弟节点的距离

                begin = pos - siblings.get(0).getKey();

                if (allocSize <= (begin + siblings.get(siblings.size() - 1).getKey())) {

                    // progress can be zero // 防止progress产生除零错误

                    double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1);

                    resize((int) (allocSize * l));

                }

                if (used[begin]){

                    continue;

                }

                for (int i = 1; i < siblings.size(); i++){

                    if (check[begin + siblings.get(i).getKey()] != 0){

                        continue outer;

                    }

                }

                break;

            }

            // -- Simple heuristics --

            // if the percentage of non-empty contents in check between the

            // index

            // 'next_check_pos' and 'check' is greater than some constant value

            // (e.g. 0.9),

            // new 'next_check_pos' index is written by 'check'.

            if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95){

                // 从位置 next_check_pos 开始到 pos 间，如果已占用的空间在95%以上，下次插入节点时，直接从 pos 位置处开始查找

                nextCheckPos = pos;

            }

            used[begin] = true;

            size = (size > begin + siblings.get(siblings.size() - 1).getKey() + 1) ? size : begin + siblings.get(siblings.size() - 1).getKey() + 1;

            for (Map.Entry<Integer, State> sibling : siblings) {

                check[begin + sibling.getKey()] = begin;

            }

            for (Map.Entry<Integer, State> sibling : siblings) {

                List<Map.Entry<Integer, State>> new_siblings = new ArrayList<Map.Entry<Integer, State>>(sibling.getValue().getSuccess().entrySet().size() + 1);

                // 一个词的终止且不为其他词的前缀，其实就是叶子节点

                if (fetch(sibling.getValue(), new_siblings) == 0) {

                    base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1);

                    progress++;

                } else {

                    // dfs

                    int h = insert(new_siblings);

                    base[begin + sibling.getKey()] = h;

                }

                sibling.getValue().setIndex(begin + sibling.getKey());

            }

            return begin;

        }

        /**

         * free the unnecessary memory

         */

        private void loseWeight() {

            int nbase[] = new int[size + 65535];

            System.arraycopy(base, 0, nbase, 0, size);

            base = nbase;

            int ncheck[] = new int[size + 65535];

            System.arraycopy(check, 0, ncheck, 0, size);

            check = ncheck;

        }

    }

    public static class Term<T> {

        public int start;

        public int end;

        public T value;

        public Term(int start, int end, T value) {

            this.start = start;

            this.end = end;

            this.value = value;

        }

        public int getLength() {

            return end - start;

        }

        public int getStart() {

            return start;

        }

        public void setStart(int start) {

            this.start = start;

        }

        public int getEnd() {

            return end;

        }

        public void setEnd(int end) {

            this.end = end;

        }

        public T getValue() {

            return value;

        }

        public void setValue(T value) {

            this.value = value;

        }

    }

    public static class Resource implements Serializable{

        String value;

        String rawData;

        public Resource(String value) {

            this.value = value;

        }

        public Resource(String value, String rawData) {

            this.value = value;

            this.rawData = rawData;

        }

        public String getValue() {

            return value;

        }

        public void setValue(String value) {

            this.value = value;

        }

        public String getRawData() {

            return rawData;

        }

        public void setRawData(String rawData) {

            this.rawData = rawData;

        }

        @Override

        public String toString() {

            return "Resource{" +

                    "value='" + value + '\'' +

                    ", rawData='" + rawData + '\'' +

                    '}';

        }

    }

}

(3) 引用的State

package com.test.model.act;

import java.util.*;

/**

 * <p>

 * 一个状态有如下几个功能

 * </p>

 * <p/>

 * <ul>

 * <li>success; 成功转移到另一个状态</li>

 * <li>failure; 不可顺着字符串跳转的话，则跳转到一个浅一点的节点</li>

 * <li>emits; 命中一个模式串</li>

 * </ul>

 * <p/>

 * <p>

 * 根节点稍有不同，根节点没有 failure 功能，它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。

 * </p>

 *

 * @author Robert Bor

 */

public class State {

    /**

     * 模式串的长度，也是这个状态的深度

     */

    protected final int depth;

    protected Character character;

    /**

     * fail 函数，如果没有匹配到，则跳转到此状态。

     */

    private State failure = null;

    /**

     * 只要这个状态可达，则记录模式串

     */

    private Set<Integer> emits = null;

    /**

     * goto 表，也称转移函数。根据字符串的下一个字符转移到下一个状态

     */

    private Map<Character, State> success = new TreeMap<Character, State>();

    /**

     * 在双数组中的对应下标

     */

    private int index;

    /**

     * 构造深度为0的节点

     */

    public State() {

        this(0);

    }

    /**

     * 构造深度为depth的节点

     *

     * @param depth

     */

    public State(int depth) {

        this.depth = depth;

    }

    /**

     * 获取节点深度

     *

     * @return

     */

    public int getDepth() {

        return this.depth;

    }

    /**

     * 添加一个匹配到的模式串（这个状态对应着这个模式串)

     *

     * @param keyword

     */

    public void addEmit(int keyword) {

        if (this.emits == null) {

            this.emits = new TreeSet<Integer>(Collections.reverseOrder());

        }

        this.emits.add(keyword);

    }

    /**

     * 获取最大的值

     *

     * @return

     */

    public Integer getLargestValueId() {

        if (emits == null || emits.size() == 0) {

            return null;

        }

        return emits.iterator().next();

    }

    /**

     * 添加一些匹配到的模式串

     *

     * @param emits

     */

    public void addEmit(Collection<Integer> emits) {

        for (int emit : emits) {

            addEmit(emit);

        }

    }

    /**

     * 获取这个节点代表的模式串（们）

     *

     * @return

     */

    public Collection<Integer> emit() {

        return this.emits == null ? Collections.<Integer>emptyList() : this.emits;

    }

    /**

     * 是否是终止状态

     *

     * @return

     */

    public boolean isAcceptable() {

        return this.depth > 0 && this.emits != null;

    }

    /**

     * 获取failure状态

     *

     * @return

     */

    public State failure() {

        return this.failure;

    }

    /**

     * 设置failure状态

     *

     * @param failState

     */

    public void setFailure(State failState, int fail[]) {

        this.failure = failState;

        fail[index] = failState.index;

    }

    /**

     * 转移到下一个状态

     *

     * @param character       希望按此字符转移

     * @param ignoreRootState 是否忽略根节点，如果是根节点自己调用则应该是true，否则为false

     * @return 转移结果

     */

    private State nextState(Character character, boolean ignoreRootState) {

        State nextState = this.success.get(character);

        if (!ignoreRootState && nextState == null && this.depth == 0) {

            nextState = this;

        }

        return nextState;

    }

    /**

     * 按照character转移，根节点转移失败会返回自己（永远不会返回null）

     *

     * @param character

     * @return

     */

    public State nextState(Character character) {

        return nextState(character, false);

    }

    /**

     * 按照character转移，任何节点转移失败会返回null

     *

     * @param character

     * @return

     */

    public State nextStateIgnoreRootState(Character character) {

        return nextState(character, true);

    }

    public State addState(Character character) {

        State nextState = nextStateIgnoreRootState(character);

        if (nextState == null) {

            nextState = new State(this.depth + 1);

            this.success.put(character, nextState);

        }

        return nextState;

    }

    public Collection<State> getStates() {

        return this.success.values();

    }

    public Collection<Character> getTransitions() {

        return this.success.keySet();

    }

    @Override

    public String toString() {

        final StringBuilder sb = new StringBuilder("State{");

        sb.append("depth=").append(depth);

        sb.append(", ID=").append(index);

        sb.append(", emits=").append(emits);

        sb.append(", success=").append(success.keySet());

        sb.append(", failureID=").append(failure == null ? "-1" : failure.index);

        sb.append(", failure=").append(failure);

        sb.append('}');

        return sb.toString();

    }

    /**

     * 获取goto表

     *

     * @return

     */

    public Map<Character, State> getSuccess() {

        return success;

    }

    public int getIndex() {

        return index;

    }

    public void setIndex(int index) {

        this.index = index;

    }

}

使用时只需要调用Act中的parse方法即可得到分词数据

4. 提取效果

(1) 原始百度QA，部分截图如图所示：

(2) 提取的结果

阿飞正传,龙凤智多星,堕落天使,旺角卡门,江湖龙虎斗,2046,恶男,繁花,春光乍泄,花样年华,猛鬼差馆,重庆森林,一代宗师,爱情谜语,君子好逑,我要金龟婿,猛鬼学堂,最后胜利,空心大少爷,再战江湖,爱神,小狐仙,北京之夏,蓝莓之夜,神勇双响炮续集,伊人再见,彩云曲

数据挖掘：提取百度知道QA中的影视信息的更多相关文章

使用Java POI来选择提取Word文档中的表格信息
通过使用Java POI来提取Word(1992)文档中的表格信息,其中POI支持不同的ms文档类型,在具体操作中需要注意.本文主要是通过POI来提取微软2003文档中的表格信息,具体code如下(事 ...
WIRESHARK 提取百度云客户端中的下载地址
序言:近两年感觉云盘是越来越不行了,什么115,360,华为,新浪一些网盘纷纷关门,现在手里能用的就只剩下一个百度云坚挺着.可那速度,简直感人,好吧,没钱冲VIP,就去找了一个破解版,用了一年,爽得飞 ...
C# 提取Word文档中的图片
C# 提取Word文档中的图片图片和文字是word文档中两种最常见的对象,在微软word中,如果我们想要提取出一个文档内的图片,只需要右击图片选择另存为然后命名保存就可以了,今天这篇文章主要是实现使 ...
php提取淘宝URL中ID的代码
一段可以提取淘宝URL中ID的PHP代码. 例如: <?php $taobao = 'taobao.com'; $tmall = 'tmall.com'; $guojitmall = 'tmal ...
PHP 提取图片img标记中的任意属性
PHP 提取图片img标记中的任意属性的简单实例. 复制代码代码如下: <?php /* PHP正则提取图片img标记中的任意属性 */ $str = '<center><im ...
SQL Server 2008空间数据应用系列十一：提取MapInfo地图数据中的空间数据解决方案
原文:SQL Server 2008空间数据应用系列十一:提取MapInfo地图数据中的空间数据解决方案友情提示,您阅读本篇博文的先决条件如下: 1.本文示例基于Microsoft SQL Serv ...
百度搜索URL中的参数都是什么
最近,点石排名更新了一个新功能——站内搜索.其实理解起来也很简单,就是通过URL限定搜索结果为某个网站,从而参与点击(例如:https://www.baidu.com/s?wd=SEO&si= ...
VBA/VBScript提取Word(*.doc)文件中包含的图片(照片)
VBA/VBScript提取Word(*.doc)文件中包含的图片(照片) 要处理的人事简历表是典型的Word文档,其中一人一份doc,里面包含有个人的照片,如果要把里面的照片复制出来就比较麻烦了 ...
使用 python 提取照片中的手机信息
使用 python 提取照片中的手机信息最近在做一个项目,有一个很重要的点是需要获取使用用户的手机信息,这里我选择从照片中获取信息.有人会问为什么不从手机里面直接获取设备信息.由于现在android ...

随机推荐

SQL server T-SQL存储过程
简介存储过程是由一些SQL语句和控制语句组成的被封装起来的过程,它驻留在数据库中,可以被客户应用程序调用,也可以从另一个过程或触发器调用.它的参数可以被传递和返回.与应用程序中的函数过程类似,存储过 ...
go的Type switch是一个switch语句么?
相信这样的语句在go中大家见的很多 switch t := arg.(type) { default: fmt.Printf("unexpected type %T\n", t) ...
在成员函数中调用虚函数（关于多态的注意事项）------新标准c++程序设计
类的成员函数之间可以互相调用.在成员函数(静态成员函数.构造函数和析构函数除外)中调用其他虚成员函数的语句是多态的.例如: #include<iostream> using namespa ...
PLSQL Developer连接远程Oracle
注:内容来网络 (一)不安装客户端的解决办法. 第一种方法: 1.在安装ORACLE服务器的机器上搜索下列文件, oci.dll ocijdbc10.dll ociw32.dll orannzsbb1 ...
golang并发练习代码笔记
golang语言的精髓就是它的并发机制,十分简单,并且极少数在语言层面实现并发机制的语言,golang被成为网络时代的c语言,golang的缔造者也有c语言的缔造者,Go语言是google 推出的一门 ...
公司拷贝回家的工程用sts导入clean package报错java.lang.NoClassDefFoundError
从公司拷贝工程回家加班,用相同版本的sts和jdk但是run as maven build clean package 总是报错java.lang.NoClassDefFoundError: ...
Python——requests的安装及入门-贴吧爬虫
一.windows平台下requests的安装 1.win+R,输入cmd,打开命令行窗口,输入命令:pip install requests ,即可自动安装库成功 2.输入命令:pip list,即 ...
c语言数据结构学习心得——排序
排序:将无序的序列重新排列为有序的序列. 插入类排序插入类排序:在一个有序的序列中,插入一个新的关键字,知道所有的关键字都插入形成一个有序的序列. 直接插入排序:首先以一个元素为有序的序列,然后将后 ...
自定义内核启动后的Logo
1.使用图像GIMP工具 2.详细步骤如下: A.将800x480的图片导入到GIMP工具. B.选中GIMP菜单栏进行以下操作图像 -->模式 ...
Qt 学习之路 2（54）：剪贴板
Qt 学习之路 2(54):剪贴板豆子 2013年6月8日 Qt 学习之路 2 2条评论剪贴板的操作经常和前面所说的拖放技术在一起使用.大家对剪贴板都很熟悉.我们可以简单地把它理解成一个数据存储池 ...