###############################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

# This image contains a Python SDK build and dependencies.
# By default it runs wordcount against a locally accessible HDFS service.
# See hdfs_integration_test.sh for example usage.

# NOTE(review): python:2 is EOL; kept because this image builds the Python 2
# Beam SDK — confirm compatibility before upgrading the base image.
FROM python:2
WORKDIR /app
# Point hdfscli at the test-cluster config shipped inside the SDK sources
# (the sources themselves are copied in below; ENV only records the path).
# key=value form — the legacy space-separated ENV form is deprecated.
ENV HDFSCLI_CONFIG=/app/sdks/python/apache_beam/io/hdfs_integration_test/hdfscli.cfg
# holdup: waits for the HDFS HTTP endpoints; gsutil: fetches the test input.
RUN pip install --no-cache-dir holdup gsutil
# Download the wordcount input once at build time so CMD needs no network
# access to GCS when the container runs.
RUN gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt .

# Install Beam and dependencies.
# COPY, not ADD — these are plain local directories, so ADD's archive/URL
# magic is unnecessary (hadolint DL3020).
COPY sdks/python /app/sdks/python
COPY model /app/model
# Build the SDK sdist and install it with the [gcp] extras. WORKDIR replaces
# `RUN cd …` (hadolint DL3003); reset to /app afterwards so the CMD below
# still finds kinglear.txt via its relative path.
WORKDIR /app/sdks/python
RUN python setup.py sdist && \
    pip install --no-cache-dir $(ls dist/apache-beam-*.tar.gz | tail -n1)[gcp]
WORKDIR /app

# Run wordcount, and write results to HDFS.
# Steps: (1) holdup blocks (up to 45s) until the namenode/datanode HTTP ports
# answer; (2) a fixed 45s sleep gives HDFS time to leave safe mode —
# presumably tuned for CI timing, confirm before changing; (3) upload the
# input file fetched at build time; (4) run the Beam wordcount example
# against the cluster using the hdfs_host/port/user flags below.
# NOTE(review): shell-form CMD is deliberate here — the && chain and the
# hdfs://kinglear* glob both require a shell.
CMD holdup -t 45 http://namenode:50070 http://datanode:50075 && \
    echo "Waiting for safe mode to end." && \
    sleep 45 && \
    hdfscli -v -v -v upload -f kinglear.txt / && \
    python -m apache_beam.examples.wordcount \
        --input hdfs://kinglear* \
        --output hdfs://py-wordcount-integration \
        --hdfs_port 50070 --hdfs_host namenode --hdfs_user root
