-
Notifications
You must be signed in to change notification settings - Fork 2
/
Dockerfile
103 lines (91 loc) · 4.77 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Containerized Amazon Recommender System (CARS) Project
#
# Authors: Brianna Blain-Castelli, Nikkolas Irwin, Adam Cassell, and Andrew Munoz
# Date: 04/01/2020
# Purpose: Build a Big Data application using a Conda environment and Docker.
# Course: CS 636 Big Data Systems
# Project: CARS is an application that builds a recommender system from datasets provided by
# UCSD (see citation below).
#
# Dataset URL: https://nijianmo.github.io/amazon/index.html
#
# ***IMPORTANT*** You must download the dataset files for a particular category to your local machine yourself due
# to their size. As long as your dataset files are in the same directory as the Dockerfile, then
# they will be added to the volume and usable by the container as expected.
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Citation: Justifying recommendations using distantly-labeled reviews and fine-grained aspects
# Jianmo Ni, Jiacheng Li, Julian McAuley
# Empirical Methods in Natural Language Processing (EMNLP), 2019
# PDF: http://cseweb.ucsd.edu/~jmcauley/pdfs/emnlp19a.pdf
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Example Usage: The commands listed below use an example image called cars (with the optional tag latest)
# and an example container called cars_container.
#
# 1. Command to Build Image:
#
# docker image build -t cars:latest .
#
# 2. Command to Run Container and Build Volume:
#
# docker container run -d -p 8888:8888 -it --name cars_container --mount source=cars_local_volume,target=/home/jovyan/work cars
#
# 3. Command to View Currently Running Jupyter Notebook Server (Outside of Container):
#
# docker container exec -it cars_container jupyter notebook list
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Base Image Attribution (jupyter/minimal-notebook):
#
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
# OS/ARCH: linux/amd64
# GitHub: https://github.com/jupyter/docker-stacks/tree/master/minimal-notebook
# DockerHub: https://hub.docker.com/layers/jupyter/minimal-notebook/latest/images/sha256-0dc8e7bd46d7dbf27c255178ef2d92bb8ca888d32776204d19b5c23f741c1414?context=explore
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Base image: jupyter/minimal-notebook from DockerHub, pinned by both tag and sha256 digest.
# BASE_IMAGE can be overridden at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=jupyter/minimal-notebook:dc9744740e12@sha256:0dc8e7bd46d7dbf27c255178ef2d92bb8ca888d32776204d19b5c23f741c1414
# ROOT_IMAGE defaults to the value of BASE_IMAGE; no instruction below the FROM line references it.
ARG ROOT_IMAGE=$BASE_IMAGE
FROM $BASE_IMAGE
# Record the CARS image maintainers as image metadata (single comma-separated label value)
LABEL maintainer="Nikkolas Irwin <[email protected]>, Brianna Blain-Castelli <[email protected]>, Andrew Munoz <[email protected]>, Adam Cassell <[email protected]>"
# Switch to root for the dependency installation steps that follow (apt-get and conda).
# A later USER instruction switches back to the notebook user before the dataset files are copied.
USER root
# Set the default Conda environment to the base environment.
# Uses the key=value form; the whitespace-separated "ENV key value" form is a legacy syntax
# that BuildKit build checks flag (LegacyKeyValueFormat).
ENV CONDA_DEFAULT_ENV=base
# Refresh the apt package index and install the OS-level packages in a single layer:
#   - ca-certificates-java, openjdk-8-jre-headless: headless OpenJDK 8 runtime and its CA certificates
#     (presumably required by the PySpark installation below)
#   - locate: file-search utility
# All packages go through one install call with --no-install-recommends so that no optional
# packages are pulled in, and the apt lists are removed in the same layer so they never
# persist in the image.
RUN apt-get update && \
    apt-get install --no-install-recommends -y \
        ca-certificates-java \
        locate \
        openjdk-8-jre-headless && \
    rm -rf /var/lib/apt/lists/*
# Install the explicit Python dependencies into the base Conda environment, then clean the
# Conda package cache (tarballs and package directories) in the same layer.
#   - pyspark is pinned to 2.4.5 so that it matches the APACHE_SPARK_VERSION value declared in
#     this file; keep the two in sync when upgrading.
#   - This step runs as root, so fix-permissions (a helper script shipped with the Jupyter Docker
#     Stacks base images) is run afterwards to keep the Conda directory and the notebook user's
#     home directory writable by the notebook user.
#     NOTE(review): CONDA_DIR and NB_USER are expected to be exported by the base image - confirm.
RUN conda install --yes --name base \
        matplotlib \
        plotly \
        pyspark=2.4.5 && \
    conda clean --all --force-pkgs-dirs --yes && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"
# Runtime environment for Spark/PySpark and Jupyter Notebook, declared in one ENV instruction:
#   APACHE_SPARK_VERSION, PYSPARK_SUBMIT_ARGS, PYSPARK_PYTHON        -> Spark/PySpark settings
#   PYSPARK_DRIVER_PYTHON, PYSPARK_DRIVER_PYTHON_OPTS                -> Jupyter Notebook settings
ENV APACHE_SPARK_VERSION=2.4.5 \
    PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell" \
    PYSPARK_PYTHON=python3 \
    PYSPARK_DRIVER_PYTHON="jupyter" \
    PYSPARK_DRIVER_PYTHON_OPTS="notebook"
# Change the user back to the Jupyter Notebook user provided by jupyter/minimal-notebook
USER ${NB_UID}
# Copy the dataset files from the build context into the notebook user's work directory.
# COPY does not honor the USER instruction: without --chown the files are owned by root and the
# notebook user cannot modify them.
# NOTE(review): NB_GID is expected to be exported by the base image alongside NB_UID - confirm.
# Consider a .dockerignore file so that only the dataset files are sent to the build context.
COPY --chown=${NB_UID}:${NB_GID} . /home/jovyan/work
# Declare the volume mount point only after the directory has been populated: the legacy builder
# can discard changes made to a volume path after its VOLUME declaration.
VOLUME /home/jovyan/work
# Document TCP port 4040, used by the PySpark Application UI, as a port of this image
EXPOSE 4040/tcp