From 02ea0148b20b19dc48d574d6cfbb7fc8dc165b16 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Tue, 15 Dec 2020 12:03:32 +0330 Subject: [PATCH 01/10] feat: set tesseract ocr languages as docker build args --- full/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/full/Dockerfile b/full/Dockerfile index f3ccd83..24038a3 100644 --- a/full/Dockerfile +++ b/full/Dockerfile @@ -14,9 +14,9 @@ RUN apt-get update FROM base as dependencies ARG JRE='openjdk-14-jre-headless' +ARG TESSERACT_LANGUAGES='tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu' -RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr \ - tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu +RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr $TESSERACT_LANGUAGES RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ && DEBIAN_FRONTEND=noninteractive apt-get install -y xfonts-utils fonts-freefont-ttf fonts-liberation ttf-mscorefonts-installer wget cabextract From 106b0e3d00e623791d5fb865bcfd5d5ff105483e Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Tue, 15 Dec 2020 12:35:54 +0330 Subject: [PATCH 02/10] fix: docker-tool.sh --- docker-tool.sh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docker-tool.sh b/docker-tool.sh index 99a13a2..825802e 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -21,11 +21,13 @@ while getopts ":h" opt; do case ${opt} in h ) echo "Usage:" - echo " docker-tool.sh -h Display this help message." - echo " docker-tool.sh build Builds images for ." - echo " docker-tool.sh test Tests images for ." - echo " docker-tool.sh publish Publishes images for to Docker Hub." - echo " docker-tool.sh latest Tags images for as latest on Docker Hub." 
+ echo " docker-tool.sh -h Display this help message." + echo " docker-tool.sh build [''] Builds images for via special []." + echo " docker-tool.sh test Tests images for ." + echo " docker-tool.sh publish Publishes images for to Docker Hub." + echo " docker-tool.sh latest Tags images for as latest on Docker Hub." + echo "" + ecgi "Note: [] is optional for full image, if you want to change default `tesseract-ocr` installation languages." exit 0 ;; \? ) @@ -58,13 +60,18 @@ test_docker_image() { shift $((OPTIND -1)) subcommand=$1; shift version=$1; shift +tesseract_languages=$1; shift case "$subcommand" in build) + build_args="--build-arg TIKA_VERSION=${version}" + if [[ ! -z "$tesseract_languages" ]]; then + build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'" + fi # Build slim version with minimal dependencies docker build -t apache/tika:${version} --build-arg TIKA_VERSION=${version} - < minimal/Dockerfile --no-cache # Build full version with OCR, Fonts and GDAL - docker build -t apache/tika:${version}-full --build-arg TIKA_VERSION=${version} - < full/Dockerfile --no-cache + docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache ;; test) From 57818249fed1d107ddc7d30971773f416f368e60 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Fri, 1 Jan 2021 20:43:35 +0330 Subject: [PATCH 03/10] fix: pR --- README.md | 70 ++++++++++++++++++++++++++------------------------ docker-tool.sh | 7 ++--- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 9888c8a..154cd49 100644 --- a/README.md +++ b/README.md @@ -4,29 +4,34 @@ This repo is used to create convenience Docker images for Apache Tika Server pub The images create a functional Apache Tika Server instance that contains the latest Ubuntu running the appropriate version's server on Port 9998 using Java 8 (until version 1.20), Java 11 (1.21 and 1.24.1) and Java 14 for newer versions. 
-There is a minimal version, which contains only Apache Tika and it's core dependencies, and a full version, which also includes dependencies for the GDAL and Tesseract OCR parsers. To balance showing functionality versus the size of the full image, this file currently installs the language packs for the following languages: +There is a minimal version, which contains only Apache Tika and it's core dependencies, and a full version, which also includes dependencies for the GDAL and [Tesseract OCR parsers](#default-tesseract). To balance showing functionality versus the size of the full image, this file currently installs the language packs for the following languages: + * English * French * German * Italian * Spanish. -To install more languages simply update the apt-get command to include the package containing the language you required, or include your own custom packs using an ADD command. +To install more languages simply use `docker-build.sh` or manually using [docker --build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) + +For see with version is supported by tesseract on official package: + + apt-cache search --names-only '^tesseract-ocr-[a-z]{3}$' ## Available Tags Below are the most recent tags: -- `latest`, `1.25`: Apache Tika Server 1.25 (Minimal) -- `latest-full`, `1.25-full`: Apache Tika Server 1.25 (Full) -- `1.25`: Apache Tika Server 1.25 (Minimal) -- `1.25-full`: Apache Tika Server 1.25 (Full) -- `1.24.1`: Apache Tika Server 1.24.1 (Minimal) -- `1.24.1-full`: Apache Tika Server 1.24.1 (Full) -- `1.24`: Apache Tika Server 1.24 (Minimal) -- `1.24-full`: Apache Tika Server 1.24 (Full) -- `1.23`: Apache Tika Server 1.23 (Minimal) -- `1.23-full`: Apache Tika Server 1.23 (Full) +* `latest`, `1.25`: Apache Tika Server 1.25 (Minimal) +* `latest-full`, `1.25-full`: Apache Tika Server 1.25 (Full) +* `1.25`: Apache Tika Server 1.25 (Minimal) +* `1.25-full`: Apache Tika Server 1.25 (Full) +* `1.24.1`: 
Apache Tika Server 1.24.1 (Minimal) +* `1.24.1-full`: Apache Tika Server 1.24.1 (Full) +* `1.24`: Apache Tika Server 1.24 (Minimal) +* `1.24-full`: Apache Tika Server 1.24 (Full) +* `1.23`: Apache Tika Server 1.23 (Minimal) +* `1.23-full`: Apache Tika Server 1.23 (Full) You can see a full set of tags for historical versions [here](https://hub.docker.com/r/apache/tika/tags?page=1&ordering=last_updated). @@ -42,7 +47,7 @@ Then to run the container, execute the following command: docker run -d -p 9998:9998 apache/tika: -Where is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. +Where `` is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. NOTE: The latest and latest-full tags are explicitly set to the latest released version when they are published. @@ -52,18 +57,17 @@ From version 1.25 and 1.25-full of the image it is now easier to override the de So for example if you wish to disable the OCR parser in the full image you could write a custom configuration: -``` -cat <> tika-config.xml - - - - - - - - -EOT -``` + cat <> tika-config.xml + + + + + + + + + EOT + Then by mounting this custom configuration as a volume, you could pass the command line parameter to load it docker run -d -p 9998:9998 -v `pwd`/tika-config.xml:/tika-config.xml apache/tika:1.25-full --config /tika-config.xml @@ -90,11 +94,11 @@ You can install docker-compose from [here](https://docs.docker.com/compose/insta To build the image from scratch, simply invoke: docker build -t 'apache/tika' github.com/apache/tika-docker - + You can then use the following command (using the name you allocated in the build command as part of -t option): docker run -d -p 9998:9998 apache/tika - + ## More Information For more infomation on Apache Tika Server, go to the [Apache Tika Server documentation](http://wiki.apache.org/tika/TikaJAXRS). 
@@ -106,15 +110,15 @@ For more information on the Apache Software Foundation, go to the [Apache Softwa ## Authors Apache Tika Dev Team (dev@tika.apache.org) - + ## Contributors There have been a range of [contributors](https://github.com/apache/tika-docker/graphs/contributors) on GitHub and via suggestions, including: -- [@grossws](https://github.com/grossws) -- [@arjunyel](https://github.com/arjunyel) -- [@mpdude](https://github.com/mpdude) -- [@laszlocsontosuw](https://github.com/laszlocsontosuw) +* [@grossws](https://github.com/grossws) +* [@arjunyel](https://github.com/arjunyel) +* [@mpdude](https://github.com/mpdude) +* [@laszlocsontosuw](https://github.com/laszlocsontosuw) ## Licence @@ -129,7 +133,7 @@ There have been a range of [contributors](https://github.com/apache/tika-docker/ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + ## Disclaimer It is worth noting that whilst these Docker images download the binary JARs published by the Apache Tika Team on the Apache Software Foundation distribution sites, only the source release of an Apache Software Foundation project is an official release artefact. See [Release Distribution Policy](https://www.apache.org/dev/release-distribution.html) for more details. diff --git a/docker-tool.sh b/docker-tool.sh index 825802e..587503a 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -22,12 +22,13 @@ while getopts ":h" opt; do h ) echo "Usage:" echo " docker-tool.sh -h Display this help message." - echo " docker-tool.sh build [''] Builds images for via special []." + echo " docker-tool.sh build [] Builds images for via special []." echo " docker-tool.sh test Tests images for ." echo " docker-tool.sh publish Publishes images for to Docker Hub." echo " docker-tool.sh latest Tags images for as latest on Docker Hub." 
echo "" - ecgi "Note: [] is optional for full image, if you want to change default `tesseract-ocr` installation languages." + echo "Note: [] is optional for full image," + echo " for change default tesseract-ocr packages." exit 0 ;; \? ) @@ -60,7 +61,7 @@ test_docker_image() { shift $((OPTIND -1)) subcommand=$1; shift version=$1; shift -tesseract_languages=$1; shift +tesseract_languages=$@ case "$subcommand" in build) From 6a902d6c2193a7f1eb9017aec79e9e91815a5e82 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Mon, 24 May 2021 14:00:59 +0430 Subject: [PATCH 04/10] Update README.md --- README.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index edd845d..efe55c1 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ This repo is used to create convenience Docker images for Apache Tika Server pub The images create a functional Apache Tika Server instance that contains the latest Ubuntu running the appropriate version's server on Port 9998 using Java 8 (until version 1.20), Java 11 (1.21 and 1.24.1) and Java 14 for newer versions. -There is a minimal version, which contains only Apache Tika and it's core dependencies, and a full version, which also includes dependencies for the GDAL and [Tesseract OCR parsers](#default-tesseract). To balance showing functionality versus the size of the full image, this file currently installs the language packs for the following languages: - +There is a minimal version, which contains only Apache Tika and it's core dependencies, and a full version, which also includes dependencies for the GDAL and Tesseract OCR parsers. 
To balance showing functionality versus the size of the full image, this file currently installs the language packs for the following languages: * English * French * German @@ -45,7 +44,7 @@ Then to run the container, execute the following command: docker run -d -p 9998:9998 apache/tika: -Where `` is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. +Where is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. NOTE: The latest and latest-full tags are explicitly set to the latest released version when they are published. @@ -55,17 +54,18 @@ From version 1.25 and 1.25-full of the image it is now easier to override the de So for example if you wish to disable the OCR parser in the full image you could write a custom configuration: - cat <> tika-config.xml - - - - - - - - - EOT - +``` +cat <> tika-config.xml + + + + + + + + +EOT +``` Then by mounting this custom configuration as a volume, you could pass the command line parameter to load it docker run -d -p 9998:9998 -v `pwd`/tika-config.xml:/tika-config.xml apache/tika:1.25-full --config /tika-config.xml @@ -92,11 +92,11 @@ You can install docker-compose from [here](https://docs.docker.com/compose/insta To build the image from scratch, simply invoke: docker build -t 'apache/tika' github.com/apache/tika-docker - + You can then use the following command (using the name you allocated in the build command as part of -t option): docker run -d -p 9998:9998 apache/tika - + ## More Information For more infomation on Apache Tika Server, go to the [Apache Tika Server documentation](https://cwiki.apache.org/confluence/display/TIKA/TikaServer). 
@@ -108,15 +108,15 @@ For more information on the Apache Software Foundation, go to the [Apache Softwa ## Authors Apache Tika Dev Team (dev@tika.apache.org) - + ## Contributors There have been a range of [contributors](https://github.com/apache/tika-docker/graphs/contributors) on GitHub and via suggestions, including: -* [@grossws](https://github.com/grossws) -* [@arjunyel](https://github.com/arjunyel) -* [@mpdude](https://github.com/mpdude) -* [@laszlocsontosuw](https://github.com/laszlocsontosuw) +- [@grossws](https://github.com/grossws) +- [@arjunyel](https://github.com/arjunyel) +- [@mpdude](https://github.com/mpdude) +- [@laszlocsontosuw](https://github.com/laszlocsontosuw) ## Licence @@ -131,7 +131,7 @@ There have been a range of [contributors](https://github.com/apache/tika-docker/ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + ## Disclaimer It is worth noting that whilst these Docker images download the binary JARs published by the Apache Tika Team on the Apache Software Foundation distribution sites, only the source release of an Apache Software Foundation project is an official release artefact. See [Release Distribution Policy](https://www.apache.org/dev/release-distribution.html) for more details. 
From 12cf44e33bdafe42b05646f23e06a22899f1c55e Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Wed, 26 May 2021 22:49:34 +0430 Subject: [PATCH 05/10] change for solve PR --- README.md | 2 +- docker-tool.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index efe55c1..d75a3ec 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ There is a minimal version, which contains only Apache Tika and it's core depend To install more languages simply use `docker-build.sh` or manually using [docker --build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) -For see with version is supported by tesseract on official package: +Obtain a list of official Tesseract packages by executing (on Linux): apt-cache search --names-only '^tesseract-ocr-[a-z]{3}$' diff --git a/docker-tool.sh b/docker-tool.sh index 587503a..92ecd20 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -28,7 +28,7 @@ while getopts ":h" opt; do echo " docker-tool.sh latest Tags images for as latest on Docker Hub." echo "" echo "Note: [] is optional for full image," - echo " for change default tesseract-ocr packages." + echo " to customize various tesseract-ocr packages. Otherwise the default packages are installed." exit 0 ;; \? 
) From 93ca3e9f21a680b715edc8dd3ab691d2e1ee0443 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Wed, 26 May 2021 22:52:01 +0430 Subject: [PATCH 06/10] fix conflict --- docker-tool.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-tool.sh b/docker-tool.sh index 92ecd20..09bb9dd 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -61,6 +61,7 @@ test_docker_image() { shift $((OPTIND -1)) subcommand=$1; shift version=$1; shift +jar=$1; shift tesseract_languages=$@ case "$subcommand" in From a18d572eb6a2aa323ff180b538c9a6eb697d483d Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Wed, 26 May 2021 23:01:34 +0430 Subject: [PATCH 07/10] Update docker-tool.sh --- docker-tool.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/docker-tool.sh b/docker-tool.sh index 09bb9dd..ab4e74c 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -21,11 +21,11 @@ while getopts ":h" opt; do case ${opt} in h ) echo "Usage:" - echo " docker-tool.sh -h Display this help message." - echo " docker-tool.sh build [] Builds images for via special []." - echo " docker-tool.sh test Tests images for ." - echo " docker-tool.sh publish Publishes images for to Docker Hub." - echo " docker-tool.sh latest Tags images for as latest on Docker Hub." + echo " docker-tool.sh -h Display this help message." + echo " docker-tool.sh build [] Builds images for via special []." + echo " docker-tool.sh test Tests images for ." + echo " docker-tool.sh publish Publishes images for to Docker Hub." + echo " docker-tool.sh latest Tags images for as latest on Docker Hub." echo "" echo "Note: [] is optional for full image," echo " to customize various tesseract-ocr packages. Otherwise the default packages are installed." 
@@ -64,14 +64,19 @@ version=$1; shift jar=$1; shift tesseract_languages=$@ +if [ -z "$jar" ] +then + jar="tika-server" +fi + case "$subcommand" in build) - build_args="--build-arg TIKA_VERSION=${version}" + build_args="--build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar}" if [[ ! -z "$tesseract_languages" ]]; then build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'" fi # Build slim version with minimal dependencies - docker build -t apache/tika:${version} --build-arg TIKA_VERSION=${version} - < minimal/Dockerfile --no-cache + docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache # Build full version with OCR, Fonts and GDAL docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache ;; From 5ffc62ffefea88fbb810ba9a7c1df9277bd47456 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Mon, 7 Jun 2021 20:39:06 +0430 Subject: [PATCH 08/10] fix: bash args --- docker-tool.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-tool.sh b/docker-tool.sh index f93f7e1..10de1d5 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -76,9 +76,9 @@ case "$subcommand" in build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'" fi # Build slim version with minimal dependencies - docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache + # eval "docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache" # Build full version with OCR, Fonts and GDAL - docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache + eval "docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile" ;; test) From bcff5c99a04c6cb264adc49fa102e2fb5227aa70 Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Tue, 8 Jun 2021 08:38:45 +0430 Subject: [PATCH 09/10] fix: test --- docker-tool.sh | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/docker-tool.sh b/docker-tool.sh index 10de1d5..4c63230 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -76,7 +76,7 @@ case "$subcommand" in build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'" fi # Build slim version with minimal dependencies - # eval "docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache" + eval "docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache" # Build full version with OCR, Fonts and GDAL eval "docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile" ;; From b0003dffc6b62ab6ffcdac5f21c772dbf3dbc4dd Mon Sep 17 00:00:00 2001 From: Muhammad Hussein Fattahizadeh Date: Tue, 8 Jun 2021 08:44:36 +0430 Subject: [PATCH 10/10] fix: no-cache --- docker-tool.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-tool.sh b/docker-tool.sh index 4c63230..44199a7 100755 --- a/docker-tool.sh +++ b/docker-tool.sh @@ -78,7 +78,7 @@ case "$subcommand" in # Build slim version with minimal dependencies eval "docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache" # Build full version with OCR, Fonts and GDAL - eval "docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile" + eval "docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache" ;; test)