JCConf Dataflow Workshop Labs {Simon Su / 20161015}
@SuppressWarnings("serial") public class TestMain { private static final Logger LOG = LoggerFactory.getLogger(TestMain.class);
public static void main(String[] args) { Pipeline p = Pipeline.create( PipelineOptionsFactory.fromArgs(args).withValidation().create()); p.apply(TextIO.Read.named("sample-book").from("gs://jcconf2016-dataflow-workshop/sample/book-sample.txt")) .apply(ParDo.of(new DoFn<String, String>() { @Override public void processElement(ProcessContext c) { c.output(c.element().toUpperCase()); } })) .apply(ParDo.of(new DoFn<String, Void>() { @Override public void processElement(ProcessContext c) { LOG.info(c.element()); } })); p.run(); } }
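To run this first lab on the Dataflow service instead of the local DirectPipelineRunner, the starter javadoc further below lists the required flags (--project, --stagingLocation, --runner=BlockingDataflowPipelineRunner). As a rough sketch, those same options can also be set in code; the project id and staging bucket below are placeholders, not values from the labs:

// Hypothetical programmatic equivalent of the command-line flags; values are placeholders.
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;

public class ServiceRunExample {
  public static void main(String[] args) {
    DataflowPipelineOptions options =
        PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
    options.setProject("your-project-id");                          // placeholder
    options.setStagingLocation("gs://your-staging-bucket/staging"); // placeholder
    options.setRunner(BlockingDataflowPipelineRunner.class);        // blocks until the job finishes

    Pipeline p = Pipeline.create(options);
    // ... apply the same transforms as above, then:
    p.run();
  }
}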
Next, modify the program so that the output is written to Google Cloud Storage…
@SuppressWarnings("serial") public class TestMain { private static final Logger LOG = LoggerFactory.getLogger(TestMain.class); public static void main(String[] args) { Pipeline p = Pipeline.create( PipelineOptionsFactory.fromArgs(args).withValidation().create()); p.apply(TextIO.Read.named("sample-book").from("gs://jcconf2016-dataflow-workshop/sample/book-sample.txt")) .apply(ParDo.of(new DoFn<String, String>() { @Override public void processElement(ProcessContext c) { c.output(c.element().toUpperCase()); } })) .apply(TextIO.Write.named("output-book").to("gs://jcconf2016-dataflow-workshop/result/book-sample.txt")); p.run(); } }
Then add a transform function that splits the paragraph text into individual words.
@SuppressWarnings("serial") public class TestMain { private static final Logger LOG = LoggerFactory.getLogger(TestMain.class); public static void main(String[] args) { Pipeline p = Pipeline.create( PipelineOptionsFactory.fromArgs(args).withValidation().create()); p.apply(TextIO.Read.named("sample-book").from("gs://jcconf2016-dataflow-workshop/sample/book-sample.txt")) .apply(ParDo.of(new DoFn<String, String>() { private final Aggregator<Long, Long> emptyLines = createAggregator("emptyLines", new Sum.SumLongFn()); @Override public void processElement(ProcessContext c) { if (c.element().trim().isEmpty()) {
emptyLines.addValue(1L); }
// Split the line into words. String[] words = c.element().split("[^a-zA-Z']+");
// Output each word encountered into the output PCollection. for (String word : words) { if (!word.isEmpty()) { c.output(word); } }
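The splitting DoFn above is the classic WordCount extract step. As a rough sketch of where this lab could go next (not part of the original lab text; the class name, output name, and output path below are illustrative only), the emitted words can be counted with Count.perElement and the totals written back to Cloud Storage:

// Illustrative continuation only: count each word and write "word: n" lines to GCS.
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Count;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;

@SuppressWarnings("serial")
public class WordCountMain {  // hypothetical class name, not from the labs

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(
        PipelineOptionsFactory.fromArgs(args).withValidation().create());

    p.apply(TextIO.Read.named("sample-book")
            .from("gs://jcconf2016-dataflow-workshop/sample/book-sample.txt"))
     // Split each line into words, as in the lab above.
     .apply(ParDo.of(new DoFn<String, String>() {
       @Override
       public void processElement(ProcessContext c) {
         for (String word : c.element().split("[^a-zA-Z']+")) {
           if (!word.isEmpty()) {
             c.output(word);
           }
         }
       }
     }))
     // Count occurrences of each distinct word.
     .apply(Count.<String>perElement())
     // Format each KV<word, count> as a plain-text line.
     .apply(ParDo.of(new DoFn<KV<String, Long>, String>() {
       @Override
       public void processElement(ProcessContext c) {
         c.output(c.element().getKey() + ": " + c.element().getValue());
       }
     }))
     .apply(TextIO.Write.named("word-count")  // illustrative output name and path
            .to("gs://jcconf2016-dataflow-workshop/result/word-count"));

    p.run();
  }
}

The next lab moves from batch to streaming and starts from the Dataflow starter project skeleton below.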
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.jcconf2016.demo;

import java.util.ArrayList;
import java.util.List;

import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.options.StreamingOptions;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

/**
 * A starter example for writing Google Cloud Dataflow programs.
 *
 * <p>
 * The example takes two strings, converts them to their upper-case
 * representation and logs them.
 *
 * <p>
 * To run this starter example locally using DirectPipelineRunner, just execute
 * it without any additional parameters from your favorite development
 * environment. In Eclipse, this corresponds to the existing 'LOCAL' run
 * configuration.
 *
 * <p>
 * To run this starter example using managed resources in Google Cloud Platform,
 * you should specify the following command-line options:
 *   --project=<YOUR_PROJECT_ID>
 *   --stagingLocation=<STAGING_LOCATION_IN_CLOUD_STORAGE>
 *   --runner=BlockingDataflowPipelineRunner
 * In Eclipse, you can just modify the existing 'SERVICE' run configuration.
 */
@SuppressWarnings("serial")
public class StreamingPipeline {
static final int WINDOW_SIZE = 1; // Default window duration in minutes
  public static interface Options extends StreamingOptions {

    @Description("Fixed window duration, in minutes")
    @Default.Integer(WINDOW_SIZE)
    Integer getWindowSize();

    void setWindowSize(Integer value);

    @Description("Whether to run the pipeline with unbounded input")
    boolean isUnbounded();