flink流处理从0到1

一、DataStream API之Data Sources（消费者之数据源）

介绍：

source是程序的数据源输入，你可以通过StreamExecutionEnvironment.addSource(sourceFunction)来为你的程序添加一个source。Flink提供了大量已经实现好的source方法，你也可以自定义source：通过实现SourceFunction接口来自定义无并行度（并行度为1）的source，或者通过实现ParallelSourceFunction接口或继承RichParallelSourceFunction来自定义有并行度的source。

类型：

基于文件

readTextFile(path) 读取文本文件，文件遵循TextInputFormat 读取规则，逐行读取并返回。

基于socket

socketTextStream 从socket中读取数据，元素可以通过一个分隔符切开。

基于集合

fromCollection(Collection) 通过java 的collection集合创建一个数据流，集合中的所有元素必须是相同类型的。

自定义输入

addSource 可以实现读取第三方数据源的数据。系统内置提供了一批connectors，连接器会提供对应的source支持【kafka】

代码实现：

1、fromCollection

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.ArrayList;

/**
 * Demo job that uses a Java collection as the stream source.
 *
 * <p>Every element of the collection is incremented by one and the result is
 * printed with a print-sink parallelism of 1.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingFromCollection {

    /** Adds one to each incoming element. */
    private static final class PlusOne implements MapFunction<Integer, Integer> {

        @Override
        public Integer map(Integer value) throws Exception {
            return value + 1;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Source data: 10, 15, 20.
        ArrayList<Integer> numbers = new ArrayList<>();
        for (int n = 10; n <= 20; n += 5) {
            numbers.add(n);
        }

        // Use the collection as the data source.
        DataStreamSource<Integer> source = env.fromCollection(numbers);

        // Transform every element with map.
        DataStream<Integer> incremented = source.map(new PlusOne());

        // Print the results directly.
        incremented.print().setParallelism(1);

        env.execute("StreamingFromCollection");
    }
}

2、创建自定义单并行度为1的SourceFunction（addSource）

① 创建自定义单并行度为1的SourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.streaming.api.functions.source.SourceFunction;

/**
 * Custom source with a parallelism of 1.
 *
 * <p>Simulates a data feed by emitting increasing numbers starting at 1, one per second.
 *
 * <p>Note: both SourceFunction and SourceContext must be given an explicit element type.
 * Without it the job fails at runtime with:
 * <pre>
 * Caused by: org.apache.flink.api.common.functions.InvalidTypesException:
 * The types of the interface org.apache.flink.streaming.api.functions.source.SourceFunction could not be inferred.
 * Support for synthetic interfaces, lambdas, and generic or raw types is limited at this point
 * </pre>
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class MyNoParalleSource implements SourceFunction<Long>{

    /** Next value to emit. */
    private long count = 1L;

    /**
     * Loop guard for {@link #run}. Declared volatile because {@link #cancel()} may be
     * invoked from a different thread than the one executing run(); without it the
     * emitting loop is not guaranteed to observe the update and could spin forever.
     */
    private volatile boolean isRunning = true;

    /**
     * Main method of the source: emits data in a loop until the source is cancelled.
     *
     * @param ctx context used to emit elements
     * @throws Exception if the emitting thread is interrupted while sleeping
     */
    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while(isRunning){
            ctx.collect(count);
            count++;
            // Produce one record per second.
            Thread.sleep(1000);
        }
    }

    /**
     * Invoked when the source is cancelled; clears the flag so the loop in
     * {@link #run} terminates.
     */
    @Override
    public void cancel() {
        isRunning = false;
    }
}

②实现自定义单并行度为1的SourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Demo job that consumes the parallelism-1 custom source {@code MyNoParalleSource}.
 *
 * <p>Each record is logged, then summed over 2-second windows and printed.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoWithMyNoPralalleSource {

    /** Logs every record it sees and forwards it unchanged. */
    private static final class LoggingMapper implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("接收到数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the source. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        DataStream<Long> logged = source.map(new LoggingMapper());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = logged.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoWithMyNoPralalleSource.class.getSimpleName());
    }
}

3、创建自定义多并行度的ParallelSourceFunction （addSource）

① 创建自定义多并行度的ParallelSourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;

/**
 * Custom source that supports parallel execution.
 *
 * <p>Emits increasing numbers starting at 1, one per second.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class MyParalleSource implements ParallelSourceFunction<Long> {

    /** Next value to emit. */
    private long count = 1L;

    /**
     * Loop guard for {@link #run}. Declared volatile because {@link #cancel()} may be
     * invoked from a different thread than the one executing run(); without it the
     * emitting loop is not guaranteed to observe the update and could spin forever.
     */
    private volatile boolean isRunning = true;

    /**
     * Main method of the source: emits data in a loop until the source is cancelled.
     *
     * @param ctx context used to emit elements
     * @throws Exception if the emitting thread is interrupted while sleeping
     */
    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while(isRunning){
            ctx.collect(count);
            count++;
            // Produce one record per second.
            Thread.sleep(1000);
        }
    }

    /**
     * Invoked when the source is cancelled; clears the flag so the loop in
     * {@link #run} terminates.
     */
    @Override
    public void cancel() {
        isRunning = false;
    }
}

②使用自定义多并行度的ParallelSourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Demo job that consumes the parallel custom source {@code MyParalleSource}
 * with a source parallelism of 2.
 *
 * <p>Each record is logged, then summed over 2-second windows and printed.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoWithMyPralalleSource {

    /** Logs every record it sees and forwards it unchanged. */
    private static final class LoggingMapper implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("接收到数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the source with two parallel instances.
        DataStreamSource<Long> source = env.addSource(new MyParalleSource()).setParallelism(2);

        DataStream<Long> logged = source.map(new LoggingMapper());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = logged.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoWithMyPralalleSource.class.getSimpleName());
    }
}

4、创建自定义多并行度的RichParallelSourceFunction（addSource）

① 创建自定义多并行度的RichParallelSourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.configuration.Configuration;

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

/**
 * Custom source that supports parallel execution.
 *
 * <p>RichParallelSourceFunction additionally provides the open and close hooks.
 * If the source needs an external connection, acquire it in open and release it in close.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class MyRichParalleSource extends RichParallelSourceFunction<Long> {

    /** Next value to emit. */
    private long count = 1L;

    /**
     * Loop guard for {@link #run}. Declared volatile because {@link #cancel()} may be
     * invoked from a different thread than the one executing run(); without it the
     * emitting loop is not guaranteed to observe the update and could spin forever.
     */
    private volatile boolean isRunning = true;

    /**
     * Main method of the source: emits data in a loop until the source is cancelled.
     *
     * @param ctx context used to emit elements
     * @throws Exception if the emitting thread is interrupted while sleeping
     */
    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while(isRunning){
            ctx.collect(count);
            count++;
            // Produce one record per second.
            Thread.sleep(1000);
        }
    }

    /**
     * Invoked when the source is cancelled; clears the flag so the loop in
     * {@link #run} terminates.
     */
    @Override
    public void cancel() {
        isRunning = false;
    }

    /**
     * Called only once, at the very beginning. This is the place to put
     * connection-acquisition code.
     *
     * @param parameters configuration handed to the function
     * @throws Exception propagated from the superclass implementation
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        System.out.println("open.............");
        super.open(parameters);
    }

    /**
     * The place to put connection-release code.
     *
     * @throws Exception propagated from the superclass implementation
     */
    @Override
    public void close() throws Exception {
        super.close();
    }
}

②使用自定义多并行度的RichParallelSourceFunction

package xuwei.tech.streaming.custormSource;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Demo job that consumes the rich parallel custom source {@code MyRichParalleSource}
 * with a source parallelism of 2.
 *
 * <p>Each record is logged, then summed over 2-second windows and printed.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoWithMyRichPralalleSource {

    /** Logs every record it sees and forwards it unchanged. */
    private static final class LoggingMapper implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("接收到数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the source with two parallel instances.
        DataStreamSource<Long> source = env.addSource(new MyRichParalleSource()).setParallelism(2);

        DataStream<Long> logged = source.map(new LoggingMapper());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = logged.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoWithMyRichPralalleSource.class.getSimpleName());
    }
}

二、DataStream API之Transformations

介绍：

map：输入一个元素，然后返回一个元素，中间可以做一些清洗转换等操作
flatmap：输入一个元素，可以返回零个，一个或者多个元素
filter：过滤函数，对传入的数据进行判断，符合条件的数据会被留下
keyBy：根据指定的key进行分组，相同key的数据会进入同一个分区【典型用法见备注】
reduce：对数据进行聚合操作，结合当前元素和上一次reduce返回的值进行聚合操作，然后返回一个新的值
aggregations：sum(),min(),max()等
window：在后面单独详解
Union：合并多个流，新的流会包含所有流中的数据，但是union有一个限制，就是所有合并的流类型必须是一致的。
Connect：和union类似，但是只能连接两个流，两个流的数据类型可以不同，会对两个流中的数据应用不同的处理方法。
CoMap, CoFlatMap：在ConnectedStreams中需要使用这种函数，类似于map和flatmap
Split：根据规则把一个数据流切分为多个流
Select：和split配合使用，选择切分后的流

代码实现：

1、filter

package xuwei.tech.streaming.streamAPI;

import org.apache.flink.api.common.functions.FilterFunction;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

/**
 * Filter demo.
 *
 * <p>Logs every record from the source, keeps only the even numbers, logs the
 * survivors, then sums them over 2-second windows and prints the sums.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoFilter {

    /** Logs the raw record and forwards it unchanged. */
    private static final class RawLogger implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("原始接收到数据：" + value);
            return value;
        }
    }

    /** Keeps even values; all odd values are filtered out. */
    private static final class EvenOnly implements FilterFunction<Long> {

        @Override
        public boolean filter(Long value) throws Exception {
            long remainder = value % 2;
            return remainder == 0;
        }
    }

    /** Logs the record that survived the filter and forwards it unchanged. */
    private static final class FilteredLogger implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("过滤之后的数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the source. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Records that satisfy the filter condition are kept.
        DataStream<Long> survivors = source
                .map(new RawLogger())
                .filter(new EvenOnly())
                .map(new FilteredLogger());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = survivors.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoFilter.class.getSimpleName());
    }
}

2、Split

package xuwei.tech.streaming.streamAPI;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.collector.selector.OutputSelector;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.datastream.SplitStream;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

import java.util.ArrayList;

/**
 * Split demo: divides one stream into several streams according to a rule.
 *
 * <p>Use case: in practice a source stream may mix several kinds of data that each
 * need different handling. Splitting the stream by rule lets every resulting stream
 * apply its own processing logic.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoSplit {

    /** Tags each value as "even" or "odd" according to its parity. */
    private static final class ParitySelector implements OutputSelector<Long> {

        @Override
        public Iterable<String> select(Long value) {
            ArrayList<String> tags = new ArrayList<>();
            // "even" for even numbers, "odd" for odd numbers.
            tags.add(value % 2 == 0 ? "even" : "odd");
            return tags;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the source. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Split the stream by the parity of each value.
        SplitStream<Long> byParity = source.split(new ParitySelector());

        // Select one or more of the split streams.
        DataStream<Long> evenStream = byParity.select("even");
        DataStream<Long> oddStream = byParity.select("odd");
        DataStream<Long> moreStream = byParity.select("odd","even");

        // Print the result.
        moreStream.print().setParallelism(1);

        env.execute(StreamingDemoSplit.class.getSimpleName());
    }
}

3、union（注意两个数据源类型必须相同）

package xuwei.tech.streaming.streamAPI;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

/**
 * Union demo.
 *
 * <p>Union merges several streams into one that contains the data of all of them.
 * Its one restriction is that all merged streams must have the same element type.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoUnion {

    /** Logs the raw record and forwards it unchanged. */
    private static final class RawLogger implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            System.out.println("原始接收到数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach two sources. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> first = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> second = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Combine both streams into one and log every record.
        DataStream<Long> logged = first.union(second).map(new RawLogger());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = logged.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoUnion.class.getSimpleName());
    }
}

4、Connect（可以合并两种类型不一样的数据流）

package xuwei.tech.streaming.streamAPI;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.ConnectedStreams;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.functions.co.CoMapFunction;

import org.apache.flink.streaming.api.windowing.time.Time;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

/**
 * Connect demo.
 *
 * <p>Similar to union, but connects exactly two streams. The two streams may have
 * different element types, and a separate handler is applied to each of them.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoConnect {

    /** Turns a number into a string prefixed with "str_". */
    private static final class StringPrefixer implements MapFunction<Long, String> {

        @Override
        public String map(Long value) throws Exception {
            return "str_" + value;
        }
    }

    /** Forwards elements of either input unchanged, widened to Object. */
    private static final class PassThrough implements CoMapFunction<Long, String, Object> {

        @Override
        public Object map1(Long value) throws Exception {
            return value;
        }

        @Override
        public Object map2(String value) throws Exception {
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach two sources. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> numbers = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> rawSecond = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Give the second stream a different element type.
        SingleOutputStreamOperator<String> labels = rawSecond.map(new StringPrefixer());

        ConnectedStreams<Long, String> connected = numbers.connect(labels);

        SingleOutputStreamOperator<Object> merged = connected.map(new PassThrough());

        // Print the result.
        merged.print().setParallelism(1);

        env.execute(StreamingDemoConnect.class.getSimpleName());
    }
}

5、broadcast（broadcast分区规则）

package xuwei.tech.streaming.streamAPI;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

/**
 * Demo of the broadcast partitioning rule.
 *
 * <p>The source stream is broadcast before a map that logs the id of the thread
 * handling each record; the job's default parallelism is set to 4.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoWithMyNoPralalleSourceBroadcast {

    /** Logs the handling thread's id together with the record, then forwards the record. */
    private static final class ThreadLoggingMapper implements MapFunction<Long, Long> {

        @Override
        public Long map(Long value) throws Exception {
            long id = Thread.currentThread().getId();
            System.out.println("线程id："+id+",接收到数据：" + value);
            return value;
        }
    }

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment and set the default parallelism.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        // Attach the source. Note: this source only supports a parallelism of 1.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        DataStream<Long> logged = source.broadcast().map(new ThreadLoggingMapper());

        // Aggregate the data once every 2 seconds.
        DataStream<Long> windowSum = logged.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the result.
        windowSum.print().setParallelism(1);

        env.execute(StreamingDemoWithMyNoPralalleSourceBroadcast.class.getSimpleName());
    }
}

总结：

Flink 状态(State)管理与恢复

一. 状态(State)

介绍：

我们前面写的word count的例子，没有包含状态管理。如果一个task在处理过程中挂掉了，那么它在内存中的状态都会丢失，所有的数据都需要重新计算。从容错和消息处理的语义上(at least once, exactly once)，Flink引入了state和checkpoint。
首先区分一下两个概念
state一般指一个具体的task/operator的状态【state数据默认保存在java的堆内存中】
而checkpoint【可以理解为checkpoint是把state数据持久化存储了】，则表示了一个Flink Job在一个特定时刻的一份全局状态快照，即包含了所有task/operator的状态
注意：task是Flink中执行的基本单位。operator指算子(transformation)。
State可以被记录，在失败的情况下数据还可以恢复
Flink中有两种基本类型的State
Keyed State
Operator State
Keyed State和Operator State，可以以两种形式存在：
原始状态(raw state)
托管状态(managed state)
托管状态是由Flink框架管理的状态
而原始状态，由用户自行管理状态具体的数据结构，框架在做checkpoint的时候，使用byte[]来读写状态内容，对其内部数据结构一无所知。
通常在DataStream上的状态推荐使用托管的状态，当实现一个用户自定义的operator时，会使用到原始状态。

1. State-Keyed State

顾名思义，就是基于KeyedStream上的状态。这个状态是跟特定的key绑定的，对KeyedStream流上的每一个key，都对应一个state。
stream.keyBy(…)
保存state的数据结构
ValueState<T>:即类型为T的单值状态。这个状态与对应的key绑定，是最简单的状态了。它可以通过update方法更新状态值，通过value()方法获取状态值
ListState<T>:即key上的状态值为一个列表。可以通过add方法往列表中附加值；也可以通过get()方法返回一个Iterable<T>来遍历状态值
ReducingState<T>:这种状态通过用户传入的reduceFunction，每次调用add方法添加值的时候，会调用reduceFunction，最后合并到一个单一的状态值
MapState<UK, UV>:即状态值为一个map。用户通过put或putAll方法添加元素
需要注意的是，以上所述的State对象，仅仅用于与状态进行交互（更新、删除、清空等），而真正的状态值，有可能是存在内存、磁盘、或者其他分布式存储系统中。相当于我们只是持有了这个状态的句柄

2. State-Operator State

与Key无关的State，与Operator绑定的state，整个operator只对应一个state
保存state的数据结构
ListState<T>
举例来说，Flink中的Kafka Connector，就使用了operator state。它会在每个connector实例中，保存该实例中消费topic的所有(partition, offset)映射

二、状态容错

1、checkpoint

依靠checkPoint机制
保证exactly-once
只能保证Flink系统内的exactly-once
对于source和sink需要依赖外部的组件一同保证

checkPoint介绍：

为了保证state的容错性，Flink需要对state进行checkpoint。
Checkpoint是Flink实现容错机制最核心的功能，它能够根据配置周期性地基于Stream中各个Operator/task的状态来生成快照，从而将这些状态数据定期持久化存储下来，当Flink程序一旦意外崩溃时，重新运行程序时可以有选择地从这些快照进行恢复，从而修正因为故障带来的程序数据异常
Flink的checkpoint机制可以与(stream和state)的持久化存储交互的前提：
持久化的source，它需要支持在一定时间内重放事件。这种sources的典型例子是持久化的消息队列（比如Apache Kafka，RabbitMQ等）或文件系统（比如HDFS，S3，GFS等）
用于state的持久化存储，例如分布式文件系统（比如HDFS，S3，GFS等）

checkPoint配置：

默认checkpoint功能是disabled的，想要使用的时候需要先启用
checkpoint开启之后，默认的checkPointMode是Exactly-once
checkpoint的checkPointMode有两种，Exactly-once和At-least-once
Exactly-once对于大多数应用来说是最合适的。At-least-once可能用在某些延迟超低的应用程序（始终延迟为几毫秒）

checkpoint配置conf

默认checkpoint功能是disabled的，想要使用的时候需要先启用
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Start a checkpoint every 1000 ms (sets the checkpoint interval)
env.enableCheckpointing(1000);
// Advanced options:
// Set the mode to exactly-once (this is the default)
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// Make sure at least 500 ms pass between checkpoints (minimum pause between checkpoints)
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
// A checkpoint must complete within one minute or it is discarded (checkpoint timeout)
env.getCheckpointConfig().setCheckpointTimeout(60000);
// Allow only one checkpoint to be in progress at a time
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
// Keep the checkpoint data after the Flink job is cancelled, so that the job can later be restored from a chosen checkpoint
env.getCheckpointConfig().enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

2、State Backend(状态的后端存储)

介绍：

默认情况下，state会保存在taskmanager的内存中，checkpoint会存储在JobManager的内存中。
state 的store和checkpoint的位置取决于State Backend的配置
env.setStateBackend(…)
一共有三种State Backend
MemoryStateBackend
FsStateBackend
RocksDBStateBackend

state配置：

修改State Backend的两种方式

第一种：单任务调整
修改当前任务代码
env.setStateBackend(new FsStateBackend("hdfs://namenode:9000/flink/checkpoints"));
或者new MemoryStateBackend()
或者new RocksDBStateBackend(filebackend, true);【需要添加第三方依赖】
第二种：全局调整
修改flink-conf.yaml
state.backend: filesystem
state.checkpoints.dir: hdfs://namenode:9000/flink/checkpoints
注意：state.backend的值可以是下面几种：jobmanager(MemoryStateBackend), filesystem(FsStateBackend), rocksdb(RocksDBStateBackend)

三、Restart Strategies(重启策略)

介绍：

Flink支持不同的重启策略，以在故障发生时控制作业如何重启
集群在启动时会伴随一个默认的重启策略，在没有定义具体重启策略时会使用该默认策略。如果在作业提交时指定了一个重启策略，该策略会覆盖集群的默认策略
默认的重启策略可以通过 Flink 的配置文件 flink-conf.yaml 指定。配置参数 restart-strategy 定义了哪个策略被使用。
常用的重启策略
固定间隔 (Fixed delay)
失败率 (Failure rate)
无重启 (No restart)
如果没有启用 checkpointing，则使用无重启 (no restart) 策略。
如果启用了 checkpointing，但没有配置重启策略，则使用固定间隔 (fixed-delay) 策略，其中 Integer.MAX_VALUE 参数是尝试重启次数
重启策略可以在flink-conf.yaml中配置，表示全局的配置。也可以在应用代码中动态指定，会覆盖全局配置

Savepoint介绍：

Flink通过Savepoint功能可以做到程序升级后，继续从升级前的那个点开始执行计算，保证数据不中断
全局，一致性快照。可以保存数据源offset，operator操作状态等信息
可以从应用在过去任意做了savepoint的时刻开始继续消费

配置使用：

在flink-conf.yaml中配置Savepoint存储位置

不是必须设置，但是设置后，后面创建指定Job的Savepoint时，可以不用在手动执行命令时指定Savepoint的位置
state.savepoints.dir: hdfs://namenode:9000/flink/savepoints

触发一个savepoint【直接触发或者在cancel的时候触发】

bin/flink savepoint jobId [targetDirectory] [-yid yarnAppId]【针对on yarn模式需要指定-yid参数】
bin/flink cancel -s [targetDirectory] jobId [-yid yarnAppId]【针对on yarn模式需要指定-yid参数】

从指定的savepoint启动job bin/flink run -s savepointPath [runArgs]

总结：checkPoint vs savePoint

checkPoint

应用定时触发，用于保存状态，会过期
内部应用失败重启的时候使用

savePoint

用户手动执行，是指向Checkpoint的指针，不会过期
在升级的情况下使用
注意：为了能够在作业的不同版本之间以及 Flink 的不同版本之间顺利升级，强烈推荐程序员通过 uid(String) 方法手动的给算子赋予 ID，这些 ID 将用于确定每一个算子的状态范围。如果不手动给各算子指定 ID，则会由 Flink 自动给每个算子生成一个 ID。只要这些 ID 没有改变就能从保存点（savepoint）将程序恢复回来。而这些自动生成的 ID 依赖于程序的结构，并且对代码的更改是很敏感的。因此，强烈建议用户手动的设置 ID。

9、checkpoint（检查点）

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.FlatMapFunction;

import org.apache.flink.api.java.utils.ParameterTool;

import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;

import org.apache.flink.runtime.state.filesystem.FsStateBackend;

import org.apache.flink.runtime.state.memory.MemoryStateBackend;

import org.apache.flink.streaming.api.CheckpointingMode;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.CheckpointConfig;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.util.Collector;

/**

 * checkpoint

 *

 * Created by xuwei.tech on 2018/10/8.

 */

public class SocketWindowWordCountJavaCheckPoint {

    public static void main(String[] args) throws Exception{

        //获取需要的端口号

        int port;

        try {

            ParameterTool parameterTool = ParameterTool.fromArgs(args);

            port = parameterTool.getInt("port");

        }catch (Exception e){

            System.err.println("No port set. use default port 9000--java");

            port = 9000;

        }

        //获取flink的运行环境

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 每隔1000 ms进行启动一个检查点【设置checkpoint的周期】

        env.enableCheckpointing(1000);

        // 高级选项：

        // 设置模式为exactly-once （这是默认值）

        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);

        // 确保检查点之间有至少500 ms的间隔【checkpoint最小间隔】

        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);

        // 检查点必须在一分钟内完成，或者被丢弃【checkpoint的超时时间】

        env.getCheckpointConfig().setCheckpointTimeout(60000);

        // 同一时间只允许进行一个检查点

        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);

        // 表示一旦Flink处理程序被cancel后，会保留Checkpoint数据，以便根据实际需要恢复到指定的Checkpoint【详细解释见备注】

        //ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION:表示一旦Flink处理程序被cancel后，会保留Checkpoint数据，以便根据实际需要恢复到指定的Checkpoint

        //ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION: 表示一旦Flink处理程序被cancel后，会删除Checkpoint数据，只有job执行失败的时候才会保存checkpoint

        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        //设置statebackend

        //env.setStateBackend(new MemoryStateBackend());

        //env.setStateBackend(new FsStateBackend("hdfs://hadoop100:9000/flink/checkpoints"));

        //env.setStateBackend(new RocksDBStateBackend("hdfs://hadoop100:9000/flink/checkpoints",true));

        String hostname = "hadoop100";

        String delimiter = "\n";

        //连接socket获取输入的数据

        DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);

        // a a c

        // a 1

        // a 1

        // c 1

        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {

            public void flatMap(String value, Collector<WordWithCount> out) throws Exception {

                String[] splits = value.split("\\s");

                for (String word : splits) {

                    out.collect(new WordWithCount(word, 1L));

                }

            }

        }).keyBy("word")

                .timeWindow(Time.seconds(2), Time.seconds(1))//指定时间窗口大小为2秒，指定时间间隔为1秒

                .sum("count");//在这里使用sum或者reduce都可以

                /*.reduce(new ReduceFunction<WordWithCount>() {

                                    public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {

                                        return new WordWithCount(a.word,a.count+b.count);

                                    }

                                })*/

        //把数据打印到控制台并且设置并行度

        windowCounts.print().setParallelism(1);

        //这一行代码一定要实现，否则程序不执行

        env.execute("Socket window count");

    }

    public static class WordWithCount{

        public String word;

        public long count;

        public  WordWithCount(){}

        public WordWithCount(String word,long count){

            this.word = word;

            this.count = count;

        }

        @Override

        public String toString() {

            return "WordWithCount{" +

                    "word='" + word + '\'' +

                    ", count=" + count +

                    '}';

        }

    }

}

三、DataStream API之partition

介绍：

Random partitioning：随机分区
dataStream.shuffle()
Rebalancing：对数据集进行再平衡，重分区，消除数据倾斜
dataStream.rebalance()
Rescaling：解释见备注
dataStream.rescale()
Custom partitioning：自定义分区
自定义分区需要实现Partitioner接口
dataStream.partitionCustom(partitioner, "someKey")
或者dataStream.partitionCustom(partitioner, 0);
Broadcasting：在后面单独详解

代码实现：

1、Partitioner

①创建分区类

package xuwei.tech.streaming.custormPartition;

import org.apache.flink.api.common.functions.Partitioner;

/**
 * Custom partitioner that routes records by the parity of their key:
 * even keys go to partition 0, odd keys go to partition 1.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class MyPartition implements Partitioner<Long> {

    /**
     * Chooses the target partition for a key.
     *
     * @param key the key to partition by
     * @param numPartitions total number of partitions available
     * @return 0 for even keys; 1 for odd keys when at least two partitions exist,
     *         otherwise 0 so the returned index always stays below {@code numPartitions}
     */
    @Override
    public int partition(Long key, int numPartitions) {
        System.out.println("分区总数："+numPartitions);
        // With fewer than two partitions index 1 would be out of range, so everything
        // is routed to partition 0 in that case.
        if(key % 2 == 0 || numPartitions < 2){
            return 0;
        }
        return 1;
    }
}

②实现分区类的对象化

package xuwei.tech.streaming.custormPartition;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.java.tuple.Tuple1;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import xuwei.tech.streaming.custormSource.MyNoParalleSource;

/**
 * Demo job that uses the custom partitioner {@code MyPartition},
 * which partitions numbers by their parity.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class SteamingDemoWithMyParitition {

    /** Wraps a Long into a Tuple1 so that it can be partitioned by field position. */
    private static final class Wrap implements MapFunction<Long, Tuple1<Long>> {

        @Override
        public Tuple1<Long> map(Long value) throws Exception {
            return new Tuple1<>(value);
        }
    }

    /** Logs the handling thread's id with the tuple, then unwraps the tuple's only field. */
    private static final class LogAndUnwrap implements MapFunction<Tuple1<Long>, Long> {

        @Override
        public Long map(Tuple1<Long> value) throws Exception {
            System.out.println("当前线程id：" + Thread.currentThread().getId() + ",value: " + value);
            return value.f0;
        }
    }

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource());

        // Convert the data from Long to Tuple1.
        DataStream<Tuple1<Long>> wrapped = source.map(new Wrap());

        // Data after custom partitioning on field 0.
        DataStream<Tuple1<Long>> partitioned = wrapped.partitionCustom(new MyPartition(), 0);

        DataStream<Long> unwrapped = partitioned.map(new LogAndUnwrap());

        unwrapped.print().setParallelism(1);

        env.execute("SteamingDemoWithMyParitition");
    }
}

四、DataStream API之Data Sink（数据落地）

介绍：

writeAsText()：将元素以字符串形式逐行写入，这些字符串通过调用每个元素的toString()方法来获取
print() / printToErr()：打印每个元素的toString()方法的值到标准输出或者标准错误输出流中
自定义输出addSink【kafka、redis】

1、内置Connectors

Apache Kafka (source/sink)
Apache Cassandra (sink)
Elasticsearch (sink)
Hadoop FileSystem (sink)
RabbitMQ (source/sink)
Apache ActiveMQ (source/sink)
Redis (sink)

2. 自定义sink

实现自定义的sink
实现SinkFunction接口
或者继承RichSinkFunction
参考org.apache.flink.streaming.connectors.redis.RedisSink

代码

1、落地到Redis

package xuwei.tech.streaming.sink;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.java.tuple.Tuple2;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.connectors.redis.RedisSink;

import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;

import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;

import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;

import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;

/**
 * Reads lines from a socket and stores them in Redis.
 *
 * <p>Every received line is pushed onto a Redis list, i.e. the equivalent of
 * {@code lpush l_words <line>}.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingDemoToRedis {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> lines = environment.socketTextStream("hadoop100", 9000, "\n");

        // Pair every line with the list key so the mapper below can read key and value from the tuple.
        DataStream<Tuple2<String, String>> listEntries = lines.map(new ToListEntry());

        // Redis connection settings.
        FlinkJedisPoolConfig jedisConfig = new FlinkJedisPoolConfig.Builder()
                .setHost("hadoop110")
                .setPort(6379)
                .build();

        // Sink that issues the command described by MyRedisMapper for each tuple.
        RedisSink<Tuple2<String, String>> sink = new RedisSink<>(jedisConfig, new MyRedisMapper());
        listEntries.addSink(sink);

        environment.execute("StreamingDemoToRedis");
    }

    /** Turns a raw line into a (list key, line) tuple. */
    private static class ToListEntry implements MapFunction<String, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> map(String value) throws Exception {
            return new Tuple2<>("l_words", value);
        }
    }

    /** Maps a (key, value) tuple onto a Redis LPUSH command. */
    public static class MyRedisMapper implements RedisMapper<Tuple2<String, String>> {

        /** Selects LPUSH as the Redis command to execute. */
        @Override
        public RedisCommandDescription getCommandDescription() {
            return new RedisCommandDescription(RedisCommand.LPUSH);
        }

        /** Returns the Redis key to operate on: the first tuple field. */
        @Override
        public String getKeyFromData(Tuple2<String, String> data) {
            return data.f0;
        }

        /** Returns the Redis value to write: the second tuple field. */
        @Override
        public String getValueFromData(Tuple2<String, String> data) {
            return data.f1;
        }
    }
}

2、落地到kafka（生产者）

package xuwei.tech.streaming;

import org.apache.flink.api.common.serialization.SimpleStringSchema;

import org.apache.flink.streaming.api.CheckpointingMode;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.CheckpointConfig;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;

import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;

import java.util.Properties;

/**
 * Kafka sink demo: forwards lines read from a socket to a Kafka topic using an
 * exactly-once producer.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingKafkaSink {

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint configuration.
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        // Optional state backend (left disabled, as in the original):
        //env.setStateBackend(new RocksDBStateBackend("hdfs://hadoop100:9000/flink/checkpoints",true));

        DataStreamSource<String> text = env.socketTextStream("hadoop100", 9001, "\n");

        String brokerList = "hadoop110:9092";
        String topic = "t1";

        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", brokerList);

        // The exactly-once producer uses Kafka transactions. Its transaction timeout must not
        // exceed the broker's transaction.max.timeout.ms, otherwise the producer cannot start.
        // Option 1 (applied here): cap the producer's transaction timeout at 15 minutes.
        // Option 2: raise transaction.max.timeout.ms on the Kafka brokers instead.
        prop.setProperty("transaction.timeout.ms", String.valueOf(15 * 60 * 1000));

        // Producer with exactly-once semantics.
        FlinkKafkaProducer011<String> myProducer = new FlinkKafkaProducer011<>(
                topic,
                new KeyedSerializationSchemaWrapper<String>(new SimpleStringSchema()),
                prop,
                FlinkKafkaProducer011.Semantic.EXACTLY_ONCE);

        text.addSink(myProducer);

        // Job name now matches this class (was copy-pasted from another example).
        env.execute("StreamingKafkaSink");
    }
}

3、消费kafka数据

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.common.serialization.SimpleStringSchema;

import org.apache.flink.streaming.api.CheckpointingMode;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.CheckpointConfig;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.ArrayList;

import java.util.Properties;

/**
 * Kafka source demo: consumes string records from a Kafka topic and prints them.
 *
 * Created by xuwei.tech on 2018/10/23.
 */
public class StreamingKafkaSource {

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint configuration.
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        // Optional state backend (left disabled, as in the original):
        //env.setStateBackend(new RocksDBStateBackend("hdfs://hadoop100:9000/flink/checkpoints",true));

        String topic = "t1";

        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "hadoop110:9092");
        prop.setProperty("group.id", "con1");

        FlinkKafkaConsumer011<String> myConsumer = new FlinkKafkaConsumer011<>(topic, new SimpleStringSchema(), prop);

        // Start from the consumer group's committed offsets (noted as the default strategy by the author).
        myConsumer.setStartFromGroupOffsets();

        DataStreamSource<String> text = env.addSource(myConsumer);

        text.print().setParallelism(1);

        // Job name now matches this class (was copy-pasted from another example).
        env.execute("StreamingKafkaSource");
    }
}

五、DataStream API之watermark

介绍：

在使用eventTime的时候如何处理乱序数据？
我们知道，流处理从事件产生，到流经source，再到operator，中间是有一个过程和时间的。虽然大部分情况下，流到operator的数据都是按照事件产生的时间顺序来的，但是也不排除由于网络延迟等原因导致乱序的产生，特别是使用kafka的话，多个分区之间的数据无法保证有序。所以在进行window计算的时候，我们不能无限期地等下去，必须要有个机制来保证在一个特定的时间之后，一定会触发window去进行计算。这个特别的机制就是watermark，watermark是用于处理乱序事件的。
watermark可以翻译为水位线

代码：

1、Watermark案例1

package xuwei.tech.streaming.watermark;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.java.tuple.Tuple;

import org.apache.flink.api.java.tuple.Tuple2;

import org.apache.flink.streaming.api.TimeCharacteristic;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;

import org.apache.flink.streaming.api.functions.windowing.WindowFunction;

import org.apache.flink.streaming.api.watermark.Watermark;

import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

import org.apache.flink.util.Collector;

import javax.annotation.Nullable;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Iterator;

import java.util.List;

/**
 * Watermark example.
 *
 * <p>Reads {@code key,timestampMillis} lines from a socket, assigns event-time
 * timestamps with a periodic watermark that trails the largest timestamp seen
 * by 10 seconds, and prints one summary line per key and 3-second tumbling window.
 *
 * Created by xuwei.tech.
 */
public class StreamingWindowWatermark {

    public static void main(String[] args) throws Exception {
        // Port of the socket to read from.
        int port = 9000;

        // Obtain the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Switch to event time (the original note says processing time is the default).
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Use parallelism 1 (the original note says the default is the machine's CPU count).
        env.setParallelism(1);

        // Connect to the socket to receive input lines.
        DataStream<String> text = env.socketTextStream("hadoop100", port, "\n");

        // Parse each line into (key, timestamp).
        DataStream<Tuple2<String, Long>> inputMap = text.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] arr = value.split(",");
                return new Tuple2<>(arr[0], Long.parseLong(arr[1]));
            }
        });

        // Extract timestamps and generate watermarks.
        DataStream<Tuple2<String, Long>> waterMarkStream = inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple2<String, Long>>() {

            // Primitive longs: these are touched for every record, so avoid boxing.
            private long currentMaxTimestamp = 0L;

            // Maximum tolerated out-of-orderness: 10 s.
            private final long maxOutOfOrderness = 10000L;

            private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

            /**
             * Produces the current watermark: the largest timestamp seen so far minus
             * the allowed out-of-orderness.
             *
             * <p>Invoked periodically by Flink (the original note says every 100 ms by
             * default; verify against the configured auto-watermark interval).
             */
            @Nullable
            @Override
            public Watermark getCurrentWatermark() {
                return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
            }

            /** Uses the tuple's second field as the event timestamp and tracks the maximum seen. */
            @Override
            public long extractTimestamp(Tuple2<String, Long> element, long previousElementTimestamp) {
                long timestamp = element.f1;
                currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
                long id = Thread.currentThread().getId();
                System.out.println("currentThreadId:"+id+",key:"+element.f0+",eventtime:["+element.f1+"|"+sdf.format(element.f1)+"],currentMaxTimestamp:["+currentMaxTimestamp+"|"+
                        sdf.format(currentMaxTimestamp)+"],watermark:["+getCurrentWatermark().getTimestamp()+"|"+sdf.format(getCurrentWatermark().getTimestamp())+"]");
                return timestamp;
            }
        });

        DataStream<String> window = waterMarkStream.keyBy(0)
                // Assign windows by event time.
                .window(TumblingEventTimeWindows.of(Time.seconds(3)))
                .apply(new WindowFunction<Tuple2<String, Long>, String, Tuple, TimeWindow>() {
                    /**
                     * Sorts the timestamps of the window's elements and emits
                     * {@code key,count,firstTimestamp,lastTimestamp,windowStart,windowEnd}.
                     *
                     * @param tuple  the window key
                     * @param window the window being evaluated
                     * @param input  the elements assigned to the window
                     * @param out    collector for the summary line
                     * @throws Exception declared by the interface
                     */
                    @Override
                    public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple2<String, Long>> input, Collector<String> out) throws Exception {
                        String key = tuple.toString();

                        List<Long> timestamps = new ArrayList<>();
                        for (Tuple2<String, Long> element : input) {
                            timestamps.add(element.f1);
                        }
                        Collections.sort(timestamps);

                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
                        String result = key + "," + timestamps.size() + "," + sdf.format(timestamps.get(0)) + "," + sdf.format(timestamps.get(timestamps.size() - 1))
                                + "," + sdf.format(window.getStart()) + "," + sdf.format(window.getEnd());
                        out.collect(result);
                    }
                });

        // For testing, simply print the results to the console.
        window.print();

        // Flink builds the job lazily; nothing above runs until execute is called.
        env.execute("eventtime-watermark");
    }
}

2、Watermark案例2

package xuwei.tech.streaming.watermark;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.java.tuple.Tuple;

import org.apache.flink.api.java.tuple.Tuple2;

import org.apache.flink.streaming.api.TimeCharacteristic;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;

import org.apache.flink.streaming.api.functions.windowing.WindowFunction;

import org.apache.flink.streaming.api.watermark.Watermark;

import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

import org.apache.flink.util.Collector;

import org.apache.flink.util.OutputTag;

import javax.annotation.Nullable;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Iterator;

import java.util.List;

/**
 * Watermark example with late-data collection.
 *
 * <p>Same pipeline as the basic watermark example, but records that arrive after
 * their window has been closed are routed to a side output via
 * {@code sideOutputLateData} instead of being dropped.
 *
 * Created by xuwei.tech.
 */
public class StreamingWindowWatermark2 {

    public static void main(String[] args) throws Exception {
        // Port of the socket to read from.
        int port = 9000;

        // Obtain the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Switch to event time (the original note says processing time is the default).
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Use parallelism 1 (the original note says the default is the machine's CPU count).
        env.setParallelism(1);

        // Connect to the socket to receive input lines.
        DataStream<String> text = env.socketTextStream("hadoop100", port, "\n");

        // Parse each line into (key, timestamp).
        DataStream<Tuple2<String, Long>> inputMap = text.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] arr = value.split(",");
                return new Tuple2<>(arr[0], Long.parseLong(arr[1]));
            }
        });

        // Extract timestamps and generate watermarks.
        DataStream<Tuple2<String, Long>> waterMarkStream = inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple2<String, Long>>() {

            // Primitive longs: these are touched for every record, so avoid boxing.
            private long currentMaxTimestamp = 0L;

            // Maximum tolerated out-of-orderness: 10 s.
            private final long maxOutOfOrderness = 10000L;

            private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

            /**
             * Produces the current watermark: the largest timestamp seen so far minus
             * the allowed out-of-orderness.
             *
             * <p>Invoked periodically by Flink (the original note says every 100 ms by
             * default; verify against the configured auto-watermark interval).
             */
            @Nullable
            @Override
            public Watermark getCurrentWatermark() {
                return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
            }

            /** Uses the tuple's second field as the event timestamp and tracks the maximum seen. */
            @Override
            public long extractTimestamp(Tuple2<String, Long> element, long previousElementTimestamp) {
                long timestamp = element.f1;
                currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
                System.out.println("key:"+element.f0+",eventtime:["+element.f1+"|"+sdf.format(element.f1)+"],currentMaxTimestamp:["+currentMaxTimestamp+"|"+
                        sdf.format(currentMaxTimestamp)+"],watermark:["+getCurrentWatermark().getTimestamp()+"|"+sdf.format(getCurrentWatermark().getTimestamp())+"]");
                return timestamp;
            }
        });

        // Tag under which late (otherwise discarded) records are kept.
        OutputTag<Tuple2<String, Long>> outputTag = new OutputTag<Tuple2<String, Long>>("late-data"){};

        // getSideOutput is only available on SingleOutputStreamOperator, so the variable
        // must not be declared as the parent type DataStream.
        SingleOutputStreamOperator<String> window = waterMarkStream.keyBy(0)
                // Assign windows by event time.
                .window(TumblingEventTimeWindows.of(Time.seconds(3)))
                //.allowedLateness(Time.seconds(2)) // would allow records to be up to 2 s late
                .sideOutputLateData(outputTag)
                .apply(new WindowFunction<Tuple2<String, Long>, String, Tuple, TimeWindow>() {
                    /**
                     * Sorts the timestamps of the window's elements and emits
                     * {@code key,count,firstTimestamp,lastTimestamp,windowStart,windowEnd}.
                     *
                     * @param tuple  the window key
                     * @param window the window being evaluated
                     * @param input  the elements assigned to the window
                     * @param out    collector for the summary line
                     * @throws Exception declared by the interface
                     */
                    @Override
                    public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple2<String, Long>> input, Collector<String> out) throws Exception {
                        String key = tuple.toString();

                        List<Long> timestamps = new ArrayList<>();
                        for (Tuple2<String, Long> element : input) {
                            timestamps.add(element.f1);
                        }
                        Collections.sort(timestamps);

                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
                        String result = key + "," + timestamps.size() + "," + sdf.format(timestamps.get(0)) + "," + sdf.format(timestamps.get(timestamps.size() - 1))
                                + "," + sdf.format(window.getStart()) + "," + sdf.format(window.getEnd());
                        out.collect(result);
                    }
                });

        // Print late records to the console for now; in practice they could be persisted elsewhere.
        DataStream<Tuple2<String, Long>> sideOutput = window.getSideOutput(outputTag);
        sideOutput.print();

        // For testing, simply print the results to the console.
        window.print();

        // Flink builds the job lazily; nothing above runs until execute is called.
        env.execute("eventtime-watermark");
    }
}

六、Flink Window和Time详解

①window操作与介绍

介绍：

window介绍：

聚合事件（比如计数、求和）在流上的工作方式与批处理不同。
比如，对流中的所有元素进行计数是不可能的，因为通常流是无限的（无界的）。所以，流上的聚合需要由 window 来划定范围，比如 “计算过去的5分钟” ，或者 “最后100个元素的和” 。
window是一种可以把无限数据切割为有限数据块的手段
窗口可以是时间驱动的【Time Window】（比如：每30秒）或者数据驱动的【Count Window】（比如：每100个元素）。

window类型

窗口通常被区分为不同的类型:
tumbling windows：滚动窗口【没有重叠】
sliding windows：滑动窗口【有重叠】
session windows：会话窗口

代码：

1、window 全量聚合

介绍：

全量聚合
等属于窗口的数据到齐，才开始进行聚合计算【可以实现对窗口内的数据进行排序等需求】
apply(windowFunction)
process(processWindowFunction)
processWindowFunction比windowFunction提供了更多的上下文信息。

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.common.functions.ReduceFunction;

import org.apache.flink.api.java.tuple.Tuple;

import org.apache.flink.api.java.tuple.Tuple2;

import org.apache.flink.api.java.utils.ParameterTool;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

import org.apache.flink.util.Collector;

/**
 * Full-window aggregation demo: the window function runs once per window, after
 * all of the window's elements are available, and emits the element count.
 */
public class SocketDemoFullCount {

    public static void main(String[] args) throws Exception {
        int port = resolvePort(args);

        // Obtain the Flink execution environment.
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        // Connect to the socket to receive input lines.
        DataStreamSource<String> lines = environment.socketTextStream("hadoop100", port, "\n");

        // Every number gets the same constant key 1, so all records land in one keyed window.
        DataStream<Tuple2<Integer, Integer>> numbers = lines.map(new MapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(String line) throws Exception {
                int parsed = Integer.parseInt(line);
                return new Tuple2<>(1, parsed);
            }
        });

        numbers.keyBy(0)
                .timeWindow(Time.seconds(5))
                .process(new CountWindowElements())
                .print();

        // Without this call the job is never submitted.
        environment.execute("Socket window count");
    }

    /** Reads {@code --port} from the arguments; falls back to 9000 (with a notice on stderr) on any failure. */
    private static int resolvePort(String[] args) {
        try {
            return ParameterTool.fromArgs(args).getInt("port");
        } catch (Exception e) {
            System.err.println("No port set. use default port 9000--java");
            return 9000;
        }
    }

    /** Counts the elements of a window and emits the window together with that count. */
    private static class CountWindowElements
            extends ProcessWindowFunction<Tuple2<Integer, Integer>, String, Tuple, TimeWindow> {
        @Override
        public void process(Tuple key, Context context, Iterable<Tuple2<Integer, Integer>> elements, Collector<String> out)
                throws Exception {
            System.out.println("执行process。。。");
            long total = 0;
            for (Tuple2<Integer, Integer> ignored : elements) {
                total += 1;
            }
            out.collect("window:" + context.window() + ",count:" + total);
        }
    }
}

2、window增量聚合

介绍：

增量聚合
窗口中每进入一条数据，就进行一次计算
reduce(reduceFunction)
aggregate(aggregateFunction)
sum(),min(),max()

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.MapFunction;

import org.apache.flink.api.common.functions.ReduceFunction;

import org.apache.flink.api.java.tuple.Tuple2;

import org.apache.flink.api.java.utils.ParameterTool;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Incremental window aggregation demo: the reduce function is applied as
 * elements enter the window, keeping a running sum.
 */
public class SocketDemoIncrAgg {

    public static void main(String[] args) throws Exception {
        int port = resolvePort(args);

        // Obtain the Flink execution environment.
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        // Connect to the socket to receive input lines.
        DataStreamSource<String> lines = environment.socketTextStream("hadoop100", port, "\n");

        // Every number gets the same constant key 1, so all records land in one keyed window.
        DataStream<Tuple2<Integer, Integer>> numbers = lines.map(new MapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(String line) throws Exception {
                int parsed = Integer.parseInt(line);
                return new Tuple2<>(1, parsed);
            }
        });

        numbers.keyBy(0)
                .timeWindow(Time.seconds(5))
                .reduce(new SumSecondField())
                .print();

        // Without this call the job is never submitted.
        environment.execute("Socket window count");
    }

    /** Reads {@code --port} from the arguments; falls back to 9000 (with a notice on stderr) on any failure. */
    private static int resolvePort(String[] args) {
        try {
            return ParameterTool.fromArgs(args).getInt("port");
        } catch (Exception e) {
            System.err.println("No port set. use default port 9000--java");
            return 9000;
        }
    }

    /** Logs both operands, then keeps the first tuple's key and adds the second fields. */
    private static class SumSecondField implements ReduceFunction<Tuple2<Integer, Integer>> {
        @Override
        public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
            System.out.println("执行reduce操作："+value1+","+value2);
            int sum = value1.f1 + value2.f1;
            return new Tuple2<>(value1.f0, sum);
        }
    }
}

3、窗口滑动计算

package xuwei.tech.streaming;

import org.apache.flink.api.common.functions.FlatMapFunction;

import org.apache.flink.api.java.utils.ParameterTool;

import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;

import org.apache.flink.runtime.state.filesystem.FsStateBackend;

import org.apache.flink.runtime.state.memory.MemoryStateBackend;

import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.api.datastream.DataStreamSource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.util.Collector;

/**

 * 滑动窗口计算

 *

 * 通过socket模拟产生单词数据

 * flink对数据进行统计计算

 *

 * 需要实现每隔1秒对最近2秒内的数据进行汇总计算

 *

 *

 * Created by xuwei.tech on 2018/10/8.

 */

public class SocketWindowWordCountJava {

    public static void main(String[] args) throws Exception{

        //获取需要的端口号

        int port;

        try {

            ParameterTool parameterTool = ParameterTool.fromArgs(args);

            port = parameterTool.getInt("port");

        }catch (Exception e){

            System.err.println("No port set. use default port 9000--java");

            port = 9000;

        }

        //获取flink的运行环境

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        String hostname = "hadoop100";

        String delimiter = "\n";

        //连接socket获取输入的数据

        DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);

        // a a c

        // a 1

        // a 1

        // c 1

        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {

            public void flatMap(String value, Collector<WordWithCount> out) throws Exception {

                String[] splits = value.split("\\s");

                for (String word : splits) {

                    out.collect(new WordWithCount(word, 1L));

                }

            }

        }).keyBy("word")

                .timeWindow(Time.seconds(2), Time.seconds(1))//指定时间窗口大小为2秒，指定时间间隔为1秒

                .sum("count");//在这里使用sum或者reduce都可以

                /*.reduce(new ReduceFunction<WordWithCount>() {

                                    public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {

                                        return new WordWithCount(a.word,a.count+b.count);

                                    }

                                })*/

        //把数据打印到控制台并且设置并行度

        windowCounts.print().setParallelism(1);

        //这一行代码一定要实现，否则程序不执行

        env.execute("Socket window count");

    }

    public static class WordWithCount{

        public String word;

        public long count;

        public  WordWithCount(){}

        public WordWithCount(String word,long count){

            this.word = word;

            this.count = count;

        }

        @Override

        public String toString() {

            return "WordWithCount{" +

                    "word='" + word + '\'' +

                    ", count=" + count +

                    '}';

        }

    }

}

window总结与感悟：

②time操作与介绍

介绍：

针对stream数据中的时间，可以分为以下三种
Event Time：事件产生的时间，它通常由事件中的时间戳描述。
Ingestion time：事件进入Flink的时间
Processing Time：事件被处理时当前系统的时间

代码：

1、Processing Time

package myflink.job;

import com.alibaba.fastjson.JSON;

import myflink.model.UrlInfo;

import org.apache.commons.codec.digest.DigestUtils;

import org.apache.commons.lang3.time.DateFormatUtils;

import org.apache.flink.api.common.functions.ReduceFunction;

import org.apache.flink.api.common.serialization.SimpleStringSchema;

import org.apache.flink.api.java.functions.KeySelector;

import org.apache.flink.streaming.api.TimeCharacteristic;

import org.apache.flink.streaming.api.datastream.KeyedStream;

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;

import org.apache.flink.streaming.api.windowing.time.Time;

import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;

import java.util.Date;

import java.util.Properties;

/**
 * Processing-time window demo: reads JSON strings from Kafka, converts them to
 * {@code UrlInfo}, keys them by domain and reduces each 30-second window.
 */
public class WindowTest {

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Kafka source producing raw strings.
        FlinkKafkaConsumer010<String> kafkaConsumer = new FlinkKafkaConsumer010<String>(
                "testjin", // topic
                new SimpleStringSchema(),
                kafkaProperties());
        SingleOutputStreamOperator<String> rawJson = env.addSource(kafkaConsumer).setParallelism(1);

        // Transform the string stream into a UrlInfo stream and fill in the domain.
        SingleOutputStreamOperator<UrlInfo> dataStreamSource = rawJson.map(json -> {
            UrlInfo parsed = JSON.parseObject(json, UrlInfo.class);
            parsed.setDomain(parsed.generateDomain());
            return parsed;
        });

        // Key the stream by domain.
        KeySelector<UrlInfo, String> byDomain = new KeySelector<UrlInfo, String>() {
            @Override
            public String getKey(UrlInfo urlInfo) throws Exception {
                return urlInfo.getDomain();
            }
        };
        KeyedStream<UrlInfo, String> keyedStream = dataStreamSource.keyBy(byDomain);

        // Use processing time for the window below.
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

        // 30-second time window, reduced pairwise.
        SingleOutputStreamOperator<UrlInfo> windowReduceStream = keyedStream
                .timeWindow(Time.seconds(30))
                .reduce((ReduceFunction<UrlInfo>) (t1, t2) -> {
                    UrlInfo merged = new UrlInfo();

                    // Both operands share the key, so either one's domain can be used.
                    merged.setDomain(t1.getDomain());

                    String domain = merged.getDomain();
                    String stamp = DateFormatUtils.format(new Date(), "yyyy-MM-dd'T'HH:mm:ss");
                    merged.setUrl(domain + "/reduce/" + stamp);
                    merged.setHash(DigestUtils.md5Hex(merged.getUrl()));

                    // Running count kept in the reduce.
                    // NOTE(review): adds 1 to t1's count and ignores t2's count; whether this
                    // yields the window size depends on the initial count of UrlInfo - confirm.
                    merged.setCount(t1.getCount() + 1);
                    return merged;
                })
                .returns(UrlInfo.class);

        windowReduceStream.addSink(new PrintSinkFunction<>());

        env.execute("execute window reduce info");
    }

    /** Builds the Kafka consumer configuration used by this demo. */
    private static Properties kafkaProperties() {
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "localhost:9092");
        properties.put("zookeeper.connect", "localhost:2181");
        properties.put("group.id", "metric-group");
        properties.put("auto.offset.reset", "latest");
        properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        return properties;
    }
}