diff --git a/README.md b/README.md index 38d1cec..a99e8ff 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,11 @@ There is a minimal version, which contains only Apache Tika and it's core depend * Italian * Spanish. -To install more languages simply update the apt-get command to include the package containing the language you required, or include your own custom packs using an ADD command. +To install more languages, simply use `docker-tool.sh`, or build manually using [docker --build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) + +Obtain a list of official Tesseract packages by executing (on a Debian-based Linux system): + + apt-cache search --names-only '^tesseract-ocr-[a-z]{3}$' ## Available Tags diff --git a/docker-tool.sh b/docker-tool.sh index 998312a..44199a7 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -21,11 +21,14 @@ while getopts ":h" opt; do case ${opt} in h ) echo "Usage:" - echo " docker-tool.sh -h Display this help message." - echo " docker-tool.sh build Builds images for ." - echo " docker-tool.sh test Tests images for ." - echo " docker-tool.sh publish Publishes images for to Docker Hub." - echo " docker-tool.sh latest Tags images for as latest on Docker Hub." + echo " docker-tool.sh -h Display this help message." + echo " docker-tool.sh build [] [] Builds images for , apply [], via special []." + echo " docker-tool.sh test Tests images for ." + echo " docker-tool.sh publish Publishes images for to Docker Hub." + echo " docker-tool.sh latest Tags images for as latest on Docker Hub." + echo "" + echo "Note: [] is optional for full image," + echo " to customize various tesseract-ocr packages. Otherwise the default packages are installed." exit 0 ;; \? 
) @@ -59,19 +62,24 @@ shift $((OPTIND -1)) subcommand=$1; shift version=$1; shift jar=$1; shift +tesseract_languages="$*" if [ -z "$jar" ] then jar="tika-server" fi - case "$subcommand" in build) + # Keep the docker arguments in an array so multi-word values stay intact without eval. + build_args=(--build-arg "TIKA_VERSION=${version}" --build-arg "TIKA_JAR_NAME=${jar}") + if [[ -n "$tesseract_languages" ]]; then + build_args+=(--build-arg "TESSERACT_LANGUAGES=${tesseract_languages}") + fi # Build slim version with minimal dependencies - docker build -t apache/tika:${version} --build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar} - < minimal/Dockerfile --no-cache + docker build -t "apache/tika:${version}" "${build_args[@]}" - < minimal/Dockerfile --no-cache # Build full version with OCR, Fonts and GDAL - docker build -t apache/tika:${version}-full --build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar} - < full/Dockerfile --no-cache + docker build -t "apache/tika:${version}-full" "${build_args[@]}" - < full/Dockerfile --no-cache ;; test) diff --git a/full/Dockerfile b/full/Dockerfile index 38b85f1..fb60c70 100644 --- a/full/Dockerfile +++ b/full/Dockerfile @@ -14,9 +14,9 @@ RUN apt-get update FROM base as dependencies ARG JRE='openjdk-14-jre-headless' +ARG TESSERACT_LANGUAGES='tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu' -RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr \ - tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu +RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr $TESSERACT_LANGUAGES RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ && DEBIAN_FRONTEND=noninteractive apt-get install -y xfonts-utils fonts-freefont-ttf fonts-liberation ttf-mscorefonts-installer wget cabextract