From 2889f89fd83acefa359ee0f0ddc485537dae1b26 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Wed, 8 Feb 2023 13:07:59 +0000 Subject: [PATCH 01/17] get sql works --- .dockerignore | 1 + Dockerfile.2 | 348 ++++++++++++++++++++++++++++++++++++++ Dockerfile.local | 6 + Makefile | 4 +- configs/serve.json | 2 +- seq2seq/run_seq2seq.py | 3 + seq2seq/utils/pipeline.py | 13 +- 7 files changed, 371 insertions(+), 6 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile.2 create mode 100644 Dockerfile.local diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..9eededb9 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +transformers_cache \ No newline at end of file diff --git a/Dockerfile.2 b/Dockerfile.2 new file mode 100644 index 00000000..d01ed71b --- /dev/null +++ b/Dockerfile.2 @@ -0,0 +1,348 @@ +ARG BASE_IMAGE + +# ------------------------ +# Target: dev +# ------------------------ +FROM $BASE_IMAGE as dev + +ARG TOOLKIT_USER_ID=13011 +ARG TOOLKIT_GROUP_ID=13011 + +RUN apt-get update \ + # Required to save git hashes + && apt-get install -y -q git curl unzip make gettext \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +ENV XDG_DATA_HOME=/app/.local/share \ + XDG_CACHE_HOME=/app/.cache \ + XDG_BIN_HOME=/app/.local/bin \ + XDG_CONFIG_HOME=/app/.config +RUN mkdir -p $XDG_DATA_HOME \ + && mkdir -p $XDG_CACHE_HOME \ + && mkdir -p $XDG_BIN_HOME \ + && mkdir -p $XDG_CONFIG_HOME \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app + +# Install C++ toolchain, Facebook thrift, and dependencies +RUN curl https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - \ + && apt-get update \ + && apt-get install -y --no-install-recommends software-properties-common \ + && apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main" \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + binfmt-support libllvm9 llvm-9 llvm-9-dev llvm-9-runtime llvm-9-tools python-chardet python-pygments python-yaml \ + g++ \ + cmake \ + libboost-all-dev \ + libevent-dev \ + libdouble-conversion-dev \ + libgoogle-glog-dev \ + libgflags-dev \ + libiberty-dev \ + liblz4-dev \ + liblzma-dev \ + libsnappy-dev \ + make \ + zlib1g-dev \ + binutils-dev \ + libjemalloc-dev \ + libssl-dev \ + pkg-config \ + libunwind-dev \ + libmysqlclient-dev \ + bison \ + flex \ + libsodium-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/zstd /app/third_party/zstd/ +RUN cd /app/third_party/zstd \ + && make -j4 \ + && make install \ + && make clean +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/fmt /app/third_party/fmt/ +RUN cd /app/third_party/fmt/ \ + && mkdir _build \ + && cd _build \ + && cmake -DBUILD_SHARED_LIBS=ON -DBUILD_EXAMPLES=off -DBUILD_TESTS=off ../. \ + && make -j4 \ + && make install \ + && cd .. \ + && rm -rf _build +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/folly /app/third_party/folly/ +RUN pip install cython \ + && cd /app/third_party/folly \ + && mkdir _build \ + && cd _build \ + && cmake -DBUILD_SHARED_LIBS=ON -DPYTHON_EXTENSIONS=ON -DBUILD_EXAMPLES=off -DBUILD_TESTS=off ../. \ + && make -j4 \ + && make install \ + && cp folly/cybld/dist/folly-0.0.1-cp37-cp37m-linux_x86_64.whl /app/ \ + && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/folly-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && pip install /app/folly-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && cd .. 
\ + && rm -rf _build +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/rsocket-cpp /app/third_party/rsocket-cpp/ +RUN cd /app/third_party/rsocket-cpp \ + && mkdir _build \ + && cd _build \ + && cmake -DBUILD_SHARED_LIBS=ON -DBUILD_EXAMPLES=off -DBUILD_TESTS=off ../. \ + && make -j4 \ + && make install \ + && cd .. \ + && rm -rf _build +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/fizz /app/third_party/fizz/ +RUN cd /app/third_party/fizz \ + && mkdir _build \ + && cd _build \ + && cmake -DBUILD_SHARED_LIBS=ON -DBUILD_EXAMPLES=off -DBUILD_TESTS=off ../fizz \ + && make -j4 \ + && make install \ + && cd .. \ + && rm -rf _build +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/wangle /app/third_party/wangle/ +RUN cd /app/third_party/wangle \ + && mkdir _build \ + && cd _build \ + && cmake -DBUILD_SHARED_LIBS=ON -DBUILD_EXAMPLES=off -DBUILD_TESTS=off ../wangle \ + && make -j4 \ + && make install \ + && cd .. \ + && rm -rf _build +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/fbthrift /app/third_party/fbthrift/ +RUN cd /app/third_party/fbthrift \ + && mkdir _build \ + && cd _build \ + && cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DPYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") \ + -DPYTHON_LIBRARY=$(python -c "import distutils.sysconfig as sysconfig; import os; print(os.path.join(sysconfig.get_config_var('LIBDIR'), sysconfig.get_config_var('LDLIBRARY')))") \ + -Dthriftpy3=ON \ + ../. \ + && make -j4 \ + && DESTDIR=/ make install \ + && cp thrift/lib/py3/cybld/dist/thrift-0.0.1-cp37-cp37m-linux_x86_64.whl /app/ \ + && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/thrift-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && pip install /app/thrift-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && cd .. 
\ + && rm -rf _build + +# Install Rust toolchain +ENV RUSTUP_HOME=/app/.local/rustup \ + CARGO_HOME=/app/.local/cargo \ + PATH=/app/.local/cargo/bin:$PATH +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + gcc \ + libc6-dev \ + wget \ + ; \ + \ + url="https://static.rust-lang.org/rustup/dist/x86_64-unknown-linux-gnu/rustup-init"; \ + wget "$url"; \ + chmod +x rustup-init; \ + ./rustup-init -y --no-modify-path --default-toolchain nightly-2021-06-01; \ + rm rustup-init; \ + chmod -R a+w $RUSTUP_HOME $CARGO_HOME; \ + rustup --version; \ + cargo --version; \ + rustc --version; \ + rm -rf /var/lib/apt/lists/*; \ + chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.local/cargo; \ + chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.local/rustup; + +# Install Haskell toolchain +ENV BOOTSTRAP_HASKELL_NONINTERACTIVE=yes \ + BOOTSTRAP_HASKELL_NO_UPGRADE=yes \ + GHCUP_USE_XDG_DIRS=yes \ + GHCUP_INSTALL_BASE_PREFIX=/app \ + CABAL_DIR=/app/.cabal \ + PATH=/app/.cabal/bin:/app/.local/bin:$PATH +RUN buildDeps=" \ + curl \ + "; \ + deps=" \ + libtinfo-dev \ + libgmp3-dev \ + "; \ + apt-get update \ + && apt-get install -y --no-install-recommends $buildDeps $deps \ + && curl --proto '=https' --tlsv1.2 -sSf https://get-ghcup.haskell.org | sh \ + && ghcup install ghc \ + && ghcup install cabal \ + && cabal update \ + && apt-get install -y --no-install-recommends git \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && git clone https://github.com/haskell/cabal.git \ + && cd cabal \ + && git checkout f5f8d933db229d30e6fc558f5335f0a4e85d7d44 \ + && sed -i 's/3.5.0.0/3.6.0.0/' */*.cabal \ + && cabal install cabal-install/ \ + --allow-newer=Cabal-QuickCheck:Cabal \ + --allow-newer=Cabal-described:Cabal \ + --allow-newer=Cabal-tree-diff:Cabal \ + --allow-newer=cabal-install:Cabal \ + --allow-newer=cabal-install-solver:Cabal \ + && cd .. \ + && rm -rf cabal/ \ + && rm -rf /app/.cabal/packages/* \ + && rm -rf /app/.cabal/logs/* \ + && rm -rf /app/.cache/ghcup \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cabal \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.local/bin \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.local/share/ghcup + +# Build Facebook hsthrift +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/hsthrift /app/third_party/hsthrift/ +RUN cd /app/third_party/hsthrift \ + && make thrift-cpp \ + && cabal update \ + && cabal build exe:thrift-compiler \ + && make thrift-hs \ + && cabal install exe:thrift-compiler \ + && cabal clean \ + && rm -rf /app/.cabal/packages/* \ + && rm -rf /app/.cabal/logs/* \ + && chown -h $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cabal/bin/thrift-compiler \ + && find /app/.cabal/store/ghc-8.10.*/ -maxdepth 2 -type d -group root -exec chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID {} \; \ + && find . 
-group root -exec chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID {} \; + +# Install misc utilities and add toolkit user +ENV LANG=en_US.UTF-8 +RUN apt update && \ + apt install -y \ + zsh fish gnupg lsb-release \ + ca-certificates supervisor openssh-server bash ssh tmux jq \ + curl wget vim procps htop locales nano man net-tools iputils-ping \ + openssl libicu[0-9][0-9] libkrb5-3 zlib1g gnome-keyring libsecret-1-0 desktop-file-utils x11-utils && \ + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg && \ + echo \ + "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + apt-get update && \ + apt-get install -y docker-ce docker-ce-cli containerd.io && \ + sed -i "s/# en_US.UTF-8/en_US.UTF-8/" /etc/locale.gen && \ + locale-gen && \ + useradd -m -u $TOOLKIT_USER_ID -s /bin/bash --non-unique toolkit && \ + passwd -d toolkit && \ + useradd -m -u $TOOLKIT_USER_ID -s /bin/bash --non-unique console && \ + passwd -d console && \ + useradd -m -u $TOOLKIT_USER_ID -s /bin/bash --non-unique _toolchain && \ + passwd -d _toolchain && \ + useradd -m -u $TOOLKIT_USER_ID -s /bin/bash --non-unique coder && \ + passwd -d coder && \ + chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /run /etc/shadow /etc/profile && \ + apt autoremove --purge && apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + echo ssh >> /etc/securetty && \ + rm -f /etc/legal /etc/motd + +# Build Huggingface tokenizers Rust libraries +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID third_party/tokenizers /app/third_party/tokenizers/ +RUN cd /app/third_party/tokenizers \ + && rustup --version \ + && cargo --version \ + && rustc --version \ + && cargo build --release \ + && cp target/release/libtokenizers_haskell.so /usr/lib/ \ + && rm -rf target \ + && find /app/.local/cargo -group root -exec chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID {} \; +ENV TOKENIZERS_PARALLELISM=false + +# Install Python toolchain +ENV PYTHONPATH=/app +RUN pip install --no-cache-dir pre-commit "poetry==1.1.7" +# Disable virtualenv creation to install our dependencies system-wide. +RUN poetry config virtualenvs.create false +# Config file is not readable by other users by default, which prevents +# it from being read on Drone, therefore make it readable. 
+RUN chmod go+r $XDG_CONFIG_HOME/pypoetry/config.toml +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID pyproject.toml poetry.lock /app/ +RUN poetry install --extras "deepspeed" \ + && pip install /app/folly-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && pip install /app/thrift-0.0.1-cp37-cp37m-linux_x86_64.whl \ + && rm -rf $XDG_CACHE_HOME/pip \ + && rm -rf $XDG_CACHE_HOME/pypoetry \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID $XDG_CONFIG_HOME/pypoetry + +# Unfortunately, nltk doesn't look in XDG_DATA_HOME, so therefore /usr/local/share +RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt stopwords + +# ------------------------ +# Target: train +# ------------------------ +FROM dev as train + +ARG TOOLKIT_USER_ID=13011 +ARG TOOLKIT_GROUP_ID=13011 + +# Misc environment variables +ENV HF_HOME=/transformers_cache + +# Copy Seq-to-seq code +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./tests /app/tests/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/spider /app/third_party/spider/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/test_suite /app/third_party/test_suite/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./configs /app/configs/ + +# ------------------------ +# Target: eval +# ------------------------ +FROM dev as eval + +ARG TOOLKIT_USER_ID=13011 +ARG TOOLKIT_GROUP_ID=13011 + +# Add thrift file +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID picard.thrift /app/ + +# Build Cython code +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID gen-cpp2 /app/gen-cpp2/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID gen-py3 /app/gen-py3/ +RUN thrift1 --gen mstch_cpp2 picard.thrift \ + && thrift1 --gen mstch_py3 picard.thrift \ + && cd gen-py3 && python setup.py build_ext --inplace \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/gen-py3 /app/gen-cpp2 +ENV PYTHONPATH=$PYTHONPATH:/app/gen-py3 \ + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/app/gen-py3/picard + +# Build and install Picard +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID cabal.project fb-util-cabal.patch /app/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID gen-hs /app/gen-hs/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID picard /app/picard/ +RUN cabal update \ + && cd third_party/hsthrift \ + && make THRIFT_COMPILE=thrift-compiler thrift-cpp thrift-hs \ + && cd ../.. \ + && thrift-compiler --hs --use-hash-map --use-hash-set --gen-prefix gen-hs -o . 
picard.thrift \ + && patch -p 1 -d third_party/hsthrift < ./fb-util-cabal.patch \ + && cabal install --overwrite-policy=always --install-method=copy exe:picard \ + && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cabal/bin/picard \ + && cabal clean \ + && rm -rf /app/third_party/hsthrift/compiler/tests \ + && rm -rf /app/.cabal/packages/* \ + && rm -rf /app/.cabal/logs/* \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/picard/ \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/gen-hs/ \ + && find /app/.cabal/store/ghc-8.10.*/ -maxdepth 2 -type d -group root -exec chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID {} \; \ + && find /app/.cabal/store/ghc-8.10.*/ -maxdepth 2 -type f -group root -exec chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID {} \; + +# Misc environment variables +ENV HF_HOME=/transformers_cache + +# Copy Seq-to-seq code +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./tests /app/tests/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/spider /app/third_party/spider/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/test_suite /app/third_party/test_suite/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./configs /app/configs/ + +# Test Picard +RUN python /app/tests/test_picard_client.py \ + && rm -rf /app/seq2seq/__pycache__ \ + && rm -rf /app/gen-py3/picard/__pycache__ diff --git a/Dockerfile.local b/Dockerfile.local new file mode 100644 index 00000000..f4c5c0d8 --- /dev/null +++ b/Dockerfile.local @@ -0,0 +1,6 @@ +FROM tscholak/text-to-sql-eval:6a252386bed6d4233f0f13f4562d8ae8608e7445 + +ARG TOOLKIT_USER_ID=13011 +ARG TOOLKIT_GROUP_ID=13011 + +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ \ No newline at end of file diff --git a/Makefile b/Makefile index 0857a93f..75fd0803 100644 --- a/Makefile +++ b/Makefile @@ -170,6 +170,7 @@ eval_cosql: pull-eval-image serve: pull-eval-image mkdir -p -m 777 database mkdir -p -m 777 transformers_cache + docker build . 
-t picard -f Dockerfile.local docker run \ -it \ --rm \ @@ -178,7 +179,8 @@ serve: pull-eval-image --mount type=bind,source=$(BASE_DIR)/database,target=/database \ --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ - tscholak/$(EVAL_IMAGE_NAME):$(GIT_HEAD_REF) \ + --name picard \ + picard \ /bin/bash -c "python seq2seq/serve_seq2seq.py configs/serve.json" .PHONY: prediction_output diff --git a/configs/serve.json b/configs/serve.json index 3a929227..18cbb5c4 100644 --- a/configs/serve.json +++ b/configs/serve.json @@ -16,5 +16,5 @@ "picard_mode": "parse_with_guards", "picard_schedule": "incremental", "picard_max_tokens_to_check": 2, - "device": 0 + "device": -1 } diff --git a/seq2seq/run_seq2seq.py b/seq2seq/run_seq2seq.py index d55186d3..19833fc1 100644 --- a/seq2seq/run_seq2seq.py +++ b/seq2seq/run_seq2seq.py @@ -1,5 +1,6 @@ # Set up logging import sys +sys.path.append('.') import logging logging.basicConfig( @@ -140,6 +141,7 @@ def main() -> None: tokenizer.add_tokens([AddedToken(" <="), AddedToken(" <")]) # Load dataset + logger.info('loading dataset...') metric, dataset_splits = load_dataset( data_args=data_args, model_args=model_args, @@ -147,6 +149,7 @@ def main() -> None: training_args=training_args, tokenizer=tokenizer, ) + logger.info('loading dataset complete') # Initialize Picard if necessary with PicardLauncher() if picard_args.launch_picard and training_args.local_rank <= 0 else nullcontext(None): diff --git a/seq2seq/utils/pipeline.py b/seq2seq/utils/pipeline.py index 6f5c51cb..dcb1818d 100644 --- a/seq2seq/utils/pipeline.py +++ b/seq2seq/utils/pipeline.py @@ -116,11 +116,14 @@ def _parse_and_tokenize( del encodings["token_type_ids"] return encodings + def get_schema(self, db_id): + if db_id not in self.schema_cache: + self.schema_cache[db_id] = get_schema(db_path=self.db_path, db_id=db_id) + return self.schema_cache[db_id] + def _pre_process(self, input: Text2SQLInput) -> str: prefix = self.prefix if self.prefix is not None else "" - if input.db_id not in self.schema_cache: - self.schema_cache[input.db_id] = get_schema(db_path=self.db_path, db_id=input.db_id) - schema = self.schema_cache[input.db_id] + schema = self.get_schema(input.db_id) if hasattr(self.model, "add_schema"): self.model.add_schema(db_id=input.db_id, db_info=schema) serialized_schema = serialize_schema( @@ -304,7 +307,9 @@ def postprocess(self, model_outputs: dict, return_type=ReturnType.TEXT, clean_up def get_schema(db_path: str, db_id: str) -> dict: - schema = dump_db_json_schema(db_path + "/" + db_id + "/" + db_id + ".sqlite", db_id) + db_file_path = db_path + "/" + db_id + "/" + db_id + ".sqlite" + print(f'reading schema from {db_file_path}') + schema = dump_db_json_schema(db_file_path, db_id) return { "db_table_names": schema["table_names_original"], "db_column_names": { From 396ca338aa31be98faa2f75d3f757644b4af1990 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Wed, 8 Feb 2023 16:06:30 +0000 Subject: [PATCH 02/17] listing dbs works --- Makefile | 2 +- seq2seq/serve_seq2seq.py | 30 ++++++++++++++++++++++++++++-- seq2seq/utils/pipeline.py | 17 +++++++++++++---- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 75fd0803..2fdee796 100644 --- a/Makefile +++ b/Makefile @@ -100,7 +100,7 @@ build-eval-image: .PHONY: pull-eval-image pull-eval-image: - docker pull tscholak/$(EVAL_IMAGE_NAME):$(GIT_HEAD_REF) + docker pull 
tscholak/text-to-sql-eval:6a252386bed6d4233f0f13f4562d8ae8608e7445 .PHONY: train train: pull-train-image diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index 9cd5875a..ba7eed58 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -20,9 +20,13 @@ from fastapi import FastAPI, HTTPException from uvicorn import run from sqlite3 import Connection, connect, OperationalError -from seq2seq.utils.pipeline import Text2SQLGenerationPipeline, Text2SQLInput, get_schema +from seq2seq.utils.pipeline import (Text2SQLGenerationPipeline, Text2SQLInput, get_schema, get_schema_for_display, + get_db_file_path) from seq2seq.utils.picard_model_wrapper import PicardArguments, PicardLauncher, with_picard from seq2seq.utils.dataset import DataTrainingArguments +import sqlite3 +from pathlib import Path +from typing import List @dataclass @@ -128,7 +132,7 @@ def response(query: str, conn: Connection) -> AskResponse: status_code=500, detail=f'while executing "{query}", the following error occurred: {e.args[0]}' ) - @app.get("/ask/{db_id}/{question}") + @app.get("/ask/") def ask(db_id: str, question: str): try: outputs = pipe( @@ -143,9 +147,31 @@ def ask(db_id: str, question: str): finally: conn.close() + @app.get("/schema/") + def schema_list_get(): + db_dir = Path(pipe.db_path) + print(f'db_path - {db_dir}') + db_files = db_dir.glob("*.sqlite") + return [db_file.stem for db_file in db_files if db_file.stem == db_file.parent.stem] + + @app.get("/schema/{db_id}") + def schema_get(db_id): + return get_schema(pipe.db_path, db_id) + + + @app.post("/schema/{db_id}") + def schema_post(db_id, queries: List[str]): + db_file_path = get_db_file_path(pipe.db_path, db_id) + if os.path.exists(db_file_path): + raise HTTPException(status_code=409, detail="database already exists") + + con = sqlite3.connect(db_file_path) + + # Run app run(app=app, host=backend_args.host, port=backend_args.port) if __name__ == "__main__": + print('serving....') main() diff --git a/seq2seq/utils/pipeline.py b/seq2seq/utils/pipeline.py index dcb1818d..6e76664e 100644 --- a/seq2seq/utils/pipeline.py +++ b/seq2seq/utils/pipeline.py @@ -116,14 +116,15 @@ def _parse_and_tokenize( del encodings["token_type_ids"] return encodings - def get_schema(self, db_id): + + def get_schema_from_cache(self, db_id): if db_id not in self.schema_cache: self.schema_cache[db_id] = get_schema(db_path=self.db_path, db_id=db_id) return self.schema_cache[db_id] def _pre_process(self, input: Text2SQLInput) -> str: prefix = self.prefix if self.prefix is not None else "" - schema = self.get_schema(input.db_id) + schema = self.get_schema_from_cache(input.db_id) if hasattr(self.model, "add_schema"): self.model.add_schema(db_id=input.db_id, db_info=schema) serialized_schema = serialize_schema( @@ -268,8 +269,9 @@ def _parse_and_tokenize( def _pre_process(self, input: ConversationalText2SQLInput) -> str: prefix = self.prefix if self.prefix is not None else "" - if input.db_id not in self.schema_cache: - self.schema_cache[input.db_id] = get_schema(db_path=self.db_path, db_id=input.db_id) + # if input.db_id not in self.schema_cache: + # self.schema_cache[input.db_id] = get_schema(db_path=self.db_path, db_id=input.db_id) + schema = self.get_schema_from_cache(input.db_id) schema = self.schema_cache[input.db_id] if hasattr(self.model, "add_schema"): self.model.add_schema(db_id=input.db_id, db_info=schema) @@ -305,6 +307,8 @@ def postprocess(self, model_outputs: dict, return_type=ReturnType.TEXT, clean_up records.append(record) return records +def 
get_db_file_path(db_path: str, db_id: str) -> str: + return db_path + "/" + db_id + "/" + db_id + ".sqlite" def get_schema(db_path: str, db_id: str) -> dict: db_file_path = db_path + "/" + db_id + "/" + db_id + ".sqlite" @@ -323,3 +327,8 @@ def get_schema(db_path: str, db_id: str) -> dict: "other_column_id": [other_column_id for _, other_column_id in schema["foreign_keys"]], }, } + +def get_schema_for_display(db_path: str, db_id: str) -> dict: + db_file_path = db_path + "/" + db_id + "/" + db_id + ".sqlite" + schema = dump_db_json_schema(db_file_path, db_id) + return schema \ No newline at end of file From 18b5ab29ef4e0c56bd6fdc35c6514a1734cea410 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Thu, 9 Feb 2023 13:17:11 +0000 Subject: [PATCH 03/17] create databases --- seq2seq/serve_seq2seq.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index ba7eed58..a7ef8384 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -147,11 +147,11 @@ def ask(db_id: str, question: str): finally: conn.close() - @app.get("/schema/") - def schema_list_get(): + @app.get("/database/") + def database_list_get(): db_dir = Path(pipe.db_path) print(f'db_path - {db_dir}') - db_files = db_dir.glob("*.sqlite") + db_files = db_dir.rglob("*.sqlite") return [db_file.stem for db_file in db_files if db_file.stem == db_file.parent.stem] @app.get("/schema/{db_id}") @@ -161,11 +161,29 @@ def schema_get(db_id): @app.post("/schema/{db_id}") def schema_post(db_id, queries: List[str]): - db_file_path = get_db_file_path(pipe.db_path, db_id) - if os.path.exists(db_file_path): + db_file_path = Path(get_db_file_path(pipe.db_path, db_id)) + + if db_file_path.exists(): raise HTTPException(status_code=409, detail="database already exists") - con = sqlite3.connect(db_file_path) + # create parent directory if it doesn't exist + db_file_path.parent.mkdir(parents=True, exist_ok=True) + + print(f'creating database {db_file_path.as_posix()}...') + + con = sqlite3.connect(db_file_path.as_posix()) + cur = con.cursor() + try: + for query in queries: + cur.execute(query) + con.commit() + except OperationalError as e: + raise HTTPException(status_code=400, detail=e.args[0]) + finally: + con.close() + + return get_schema(pipe.db_path, db_id) + # Run app From 26b019158e304f2ac77dc9681fa9ff277be923dc Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Thu, 9 Feb 2023 13:39:46 +0000 Subject: [PATCH 04/17] update schema --- nlp_picard.postman_collection.json | 156 +++++++++++++++++++++++++++++ seq2seq/serve_seq2seq.py | 28 +++++- 2 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 nlp_picard.postman_collection.json diff --git a/nlp_picard.postman_collection.json b/nlp_picard.postman_collection.json new file mode 100644 index 00000000..2ccb4da9 --- /dev/null +++ b/nlp_picard.postman_collection.json @@ -0,0 +1,156 @@ +{ + "info": { + "_postman_id": "c47177e2-5637-48ed-9671-27acbb36fb85", + "name": "nlp_picard", + "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", + "_exporter_id": "17757684" + }, + "item": [ + { + "name": "get query", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "http://54.90.26.47:8000/ask/?db_id=well&question=what is the average length of the well bore", + "protocol": "http", + "host": [ + "54", + "90", + "26", + "47" + ], + "port": "8000", + "path": [ + "ask", + "" + ], + "query": [ + { + "key": "db_id", + "value": "well" + }, + { + "key": 
"question", + "value": "what is the average length of the well bore" + } + ] + } + }, + "response": [] + }, + { + "name": "get databases", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "http://54.90.26.47:8000/database/", + "protocol": "http", + "host": [ + "54", + "90", + "26", + "47" + ], + "port": "8000", + "path": [ + "database", + "" + ] + } + }, + "response": [] + }, + { + "name": "get schema for database", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "http://54.90.26.47:8000/schema/well/", + "protocol": "http", + "host": [ + "54", + "90", + "26", + "47" + ], + "port": "8000", + "path": [ + "schema", + "well", + "" + ] + } + }, + "response": [] + }, + { + "name": "create schema", + "request": { + "method": "POST", + "header": [], + "body": { + "mode": "raw", + "raw": "[\n \"CREATE TABLE well(id, name)\",\n \"CREATE TABLE wellbore(id, well_id, bore_length)\"\n]", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "http://54.90.26.47:8000/schema/well2/", + "protocol": "http", + "host": [ + "54", + "90", + "26", + "47" + ], + "port": "8000", + "path": [ + "schema", + "well2", + "" + ] + } + }, + "response": [] + }, + { + "name": "update schema", + "request": { + "method": "PATCH", + "header": [], + "body": { + "mode": "raw", + "raw": "[\n \"CREATE TABLE well_2(id, name)\",\n \"CREATE TABLE wellbore_2(id, well_id, bore_length)\"\n]", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "http://54.90.26.47:8000/schema/well2/", + "protocol": "http", + "host": [ + "54", + "90", + "26", + "47" + ], + "port": "8000", + "path": [ + "schema", + "well2", + "" + ] + } + }, + "response": [] + } + ] +} \ No newline at end of file diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index a7ef8384..99f19e2c 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -148,19 +148,19 @@ def ask(db_id: str, question: str): conn.close() @app.get("/database/") - def database_list_get(): + def get_database_list(): db_dir = Path(pipe.db_path) print(f'db_path - {db_dir}') db_files = db_dir.rglob("*.sqlite") return [db_file.stem for db_file in db_files if db_file.stem == db_file.parent.stem] @app.get("/schema/{db_id}") - def schema_get(db_id): + def get_schema_for_database(db_id): return get_schema(pipe.db_path, db_id) @app.post("/schema/{db_id}") - def schema_post(db_id, queries: List[str]): + def create_schema(db_id, queries: List[str]): db_file_path = Path(get_db_file_path(pipe.db_path, db_id)) if db_file_path.exists(): @@ -184,6 +184,28 @@ def schema_post(db_id, queries: List[str]): return get_schema(pipe.db_path, db_id) + @app.patch("/schema/{db_id}") + def update_schema(db_id, queries: List[str]): + db_file_path = Path(get_db_file_path(pipe.db_path, db_id)) + + if not db_file_path.exists(): + raise HTTPException(status_code=404, detail="database not found") + + print(f'updating database {db_file_path.as_posix()}...') + + con = sqlite3.connect(db_file_path.as_posix()) + cur = con.cursor() + try: + for query in queries: + cur.execute(query) + con.commit() + except OperationalError as e: + raise HTTPException(status_code=400, detail=e.args[0]) + finally: + con.close() + + return get_schema(pipe.db_path, db_id) + # Run app From 0fde355e22ae2eff32ec45e190db32665b1f326a Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Fri, 10 Feb 2023 16:04:26 +0000 Subject: [PATCH 05/17] queries without schema --- configs/serve.json | 4 +- seq2seq/serve_seq2seq.py | 61 +++++++++++++++++++++--- 
seq2seq/utils/pipeline.py | 98 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 152 insertions(+), 11 deletions(-) diff --git a/configs/serve.json b/configs/serve.json index 18cbb5c4..ead51e0c 100644 --- a/configs/serve.json +++ b/configs/serve.json @@ -11,8 +11,8 @@ "cache_dir": "/transformers_cache", "num_beams": 4, "num_return_sequences": 1, - "use_picard": true, - "launch_picard": true, + "use_picard": false, + "launch_picard": false, "picard_mode": "parse_with_guards", "picard_schedule": "incremental", "picard_max_tokens_to_check": 2, diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index 99f19e2c..fccd7c23 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -20,10 +20,13 @@ from fastapi import FastAPI, HTTPException from uvicorn import run from sqlite3 import Connection, connect, OperationalError -from seq2seq.utils.pipeline import (Text2SQLGenerationPipeline, Text2SQLInput, get_schema, get_schema_for_display, +from seq2seq.utils.pipeline import (Text2SQLGenerationPipeline, Text2SQLGenPipelineWithSchema, + Text2SQLInput, QuestionWithSchemaInput, get_schema, get_schema_for_display, get_db_file_path) from seq2seq.utils.picard_model_wrapper import PicardArguments, PicardLauncher, with_picard +from seq2seq.utils.dataset import serialize_schema from seq2seq.utils.dataset import DataTrainingArguments +from seq2seq.utils.spider import spider_get_input import sqlite3 from pathlib import Path from typing import List @@ -117,9 +120,20 @@ def main(): device=backend_args.device, ) + pipe_with_schema = Text2SQLGenPipelineWithSchema( + model = model, + tokenizer = tokenizer, + normalize_query = data_training_args.normalize_query, + device = backend_args.device) + + # Initialize REST API app = FastAPI() + class Query(BaseModel): + question: str + db_schema: str + class AskResponse(BaseModel): query: str execution_results: list @@ -147,21 +161,54 @@ def ask(db_id: str, question: str): finally: conn.close() + + @app.post("/ask-with-schema/") + def ask_with_schema(query: Query): + try: + outputs = pipe_with_schema( + inputs = QuestionWithSchemaInput(utterance=query.question, schema=query.db_schema), + num_return_sequences=data_training_args.num_return_sequences + ) + except OperationalError as e: + raise HTTPException(status_code=404, detail=e.args[0]) + + return [output["generated_text"] for output in outputs] + + @app.get("/database/") def get_database_list(): - db_dir = Path(pipe.db_path) + db_dir = Path(backend_args.db_path) print(f'db_path - {db_dir}') db_files = db_dir.rglob("*.sqlite") return [db_file.stem for db_file in db_files if db_file.stem == db_file.parent.stem] @app.get("/schema/{db_id}") def get_schema_for_database(db_id): - return get_schema(pipe.db_path, db_id) + return get_schema(backend_args.db_path, db_id) + + @app.get("/schema/{db_id}/spider-input") + def get_spider_input(db_id, schema_serialization_type = "peteshaw", + schema_serialization_randomized = False, + schema_serialization_with_db_id = True, + schema_serialization_with_db_content = False + ): + schema = pipe_with_schema.get_schema_from_cache(db_id) + serialized_schema = serialize_schema(question='question', + db_path = backend_args.db_path, + db_id = db_id, + db_column_names = schema['db_column_names'], + db_table_names = schema['db_table_names'], + schema_serialization_type = schema_serialization_type, + schema_serialization_randomized = schema_serialization_randomized, + schema_serialization_with_db_id = schema_serialization_with_db_id, + schema_serialization_with_db_content 
= schema_serialization_with_db_content, + ) + return spider_get_input('question', serialized_schema, prefix='') @app.post("/schema/{db_id}") def create_schema(db_id, queries: List[str]): - db_file_path = Path(get_db_file_path(pipe.db_path, db_id)) + db_file_path = Path(get_db_file_path(backend_args.db_path, db_id)) if db_file_path.exists(): raise HTTPException(status_code=409, detail="database already exists") @@ -182,11 +229,11 @@ def create_schema(db_id, queries: List[str]): finally: con.close() - return get_schema(pipe.db_path, db_id) + return get_schema(backend_args.db_path, db_id) @app.patch("/schema/{db_id}") def update_schema(db_id, queries: List[str]): - db_file_path = Path(get_db_file_path(pipe.db_path, db_id)) + db_file_path = Path(get_db_file_path(backend_args.db_path, db_id)) if not db_file_path.exists(): raise HTTPException(status_code=404, detail="database not found") @@ -204,7 +251,7 @@ def update_schema(db_id, queries: List[str]): finally: con.close() - return get_schema(pipe.db_path, db_id) + return get_schema(backend_args.db_path, db_id) diff --git a/seq2seq/utils/pipeline.py b/seq2seq/utils/pipeline.py index 6e76664e..812995af 100644 --- a/seq2seq/utils/pipeline.py +++ b/seq2seq/utils/pipeline.py @@ -14,6 +14,11 @@ class Text2SQLInput(object): utterance: str db_id: str +@dataclass +class QuestionWithSchemaInput(object): + utterance: str + schema: str + class Text2SQLGenerationPipeline(Text2TextGenerationPipeline): """ @@ -35,7 +40,7 @@ class Text2SQLGenerationPipeline(Text2TextGenerationPipeline): """ def __init__(self, *args, **kwargs): - self.db_path: str = kwargs.pop("db_path") + self.db_path: str = kwargs.pop("db_path", None) self.prefix: Optional[str] = kwargs.pop("prefix", None) self.normalize_query: bool = kwargs.pop("normalize_query", True) self.schema_serialization_type: str = kwargs.pop("schema_serialization_type", "peteshaw") @@ -74,6 +79,7 @@ def __call__(self, inputs: Union[Text2SQLInput, List[Text2SQLInput]], *args, **k -- The token ids of the generated SQL. """ result = super().__call__(inputs, *args, **kwargs) + print(f'with db output is :{result}') if ( isinstance(inputs, list) and all(isinstance(el, Text2SQLInput) for el in inputs) @@ -139,7 +145,9 @@ def _pre_process(self, input: Text2SQLInput) -> str: schema_serialization_with_db_content=self.schema_serialization_with_db_content, normalize_query=self.normalize_query, ) - return spider_get_input(question=input.utterance, serialized_schema=serialized_schema, prefix=prefix) + spider_input = spider_get_input(question=input.utterance, serialized_schema=serialized_schema, prefix=prefix) + print(f'spider input is:{spider_input}') + return spider_input def postprocess(self, model_outputs: dict, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False): records = [] @@ -159,6 +167,92 @@ def postprocess(self, model_outputs: dict, return_type=ReturnType.TEXT, clean_up records.append(record) return records +class Text2SQLGenPipelineWithSchema(Text2SQLGenerationPipeline): + """ + Pipeline for text-to-SQL generation using seq2seq models. Here Database schema is passed along with query + + model = AutoModelForSeq2SeqLM.from_pretrained(...) + tokenizer = AutoTokenizer.from_pretrained(...) + db_path = ... 
path to "concert_singer" parent folder + text2sql_generator = Text2SQLGenerationPipeline( + model=model, + tokenizer=tokenizer, + ) + text2sql_generator(inputs=Text2SQLInput(utterance="How many singers do we have?", db_id="concert_singer")) + """ + def __init__(self, *args, **kwargs): + self.normalize_query: bool = kwargs.pop("normalize_query", True) + super().__init__(*args, **kwargs) + + def _pre_process(self, input: QuestionWithSchemaInput) -> str: + # prefix = self.prefix if self.prefix is not None else "" + spider_input = spider_get_input(question=input.utterance, serialized_schema=input.schema, prefix='') + print(f'spider input is :{spider_input}') + return spider_input + + def __call__(self, inputs: Union[QuestionWithSchemaInput, List[QuestionWithSchemaInput]], *args, **kwargs): + r""" + Generate the output SQL expression(s) using text(s) given as inputs. + + Args: + inputs (:obj:`Text2SQLInput` or :obj:`List[Text2SQLInput]`): + Input text(s) for the encoder. + return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + return_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to include the decoded texts in the outputs. + clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to clean up the potential extra spaces in the text output. + truncation (:obj:`TruncationStrategy`, `optional`, defaults to :obj:`TruncationStrategy.DO_NOT_TRUNCATE`): + The truncation strategy for the tokenization within the pipeline. + :obj:`TruncationStrategy.DO_NOT_TRUNCATE` (default) will never truncate, but it is sometimes desirable + to truncate the input to fit the model's max_length instead of throwing an error down the line. + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework `here <./model.html#generative-models>`__). + + Return: + A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys: + + - **generated_sql** (:obj:`str`, present when ``return_text=True``) -- The generated SQL. + - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) + -- The token ids of the generated SQL. + """ + result = super().__call__(inputs, *args, **kwargs) + print(f'with schema output is :{result}') + if ( + isinstance(inputs, list) + and all(isinstance(el, QuestionWithSchemaInput) for el in inputs) + and all(len(res) == 1 for res in result) + ): + return [res[0] for res in result] + return result + + # no changes from parent class other than input type + def _parse_and_tokenize( + self, + inputs: Union[QuestionWithSchemaInput, List[QuestionWithSchemaInput]], + *args, + truncation: TruncationStrategy + ) -> BatchEncoding: + if isinstance(inputs, list): + if self.tokenizer.pad_token_id is None: + raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input") + inputs = [self._pre_process(input=input) for input in inputs] + padding = True + elif isinstance(inputs, QuestionWithSchemaInput): + inputs = self._pre_process(input=inputs) + padding = False + else: + raise ValueError( + f" `inputs`: {inputs} have the wrong format. 
The should be either of type `Text2SQLInput` or type `List[Text2SQLInput]`" + ) + encodings = self.tokenizer(inputs, padding=padding, truncation=truncation, return_tensors=self.framework) + # This is produced by tokenizers but is an invalid generate kwargs + if "token_type_ids" in encodings: + del encodings["token_type_ids"] + return encodings + @dataclass class ConversationalText2SQLInput(object): From 85b553f9d1bdcb630a88b5f053140ddf7ab7da88 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Tue, 14 Feb 2023 14:59:09 +0000 Subject: [PATCH 06/17] custom train --- Dockerfile.local => Dockerfile.eval | 0 Dockerfile.train | 6 ++++++ Makefile | 20 ++++++++------------ 3 files changed, 14 insertions(+), 12 deletions(-) rename Dockerfile.local => Dockerfile.eval (100%) create mode 100644 Dockerfile.train diff --git a/Dockerfile.local b/Dockerfile.eval similarity index 100% rename from Dockerfile.local rename to Dockerfile.eval diff --git a/Dockerfile.train b/Dockerfile.train new file mode 100644 index 00000000..91878354 --- /dev/null +++ b/Dockerfile.train @@ -0,0 +1,6 @@ +FROM tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 + +ARG TOOLKIT_USER_ID=13011 +ARG TOOLKIT_GROUP_ID=13011 + +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ \ No newline at end of file diff --git a/Makefile b/Makefile index 2fdee796..9061b57e 100644 --- a/Makefile +++ b/Makefile @@ -64,26 +64,22 @@ pull-dev-image: .PHONY: build-train-image build-train-image: - ssh-add - docker buildx build \ + docker buildx build --builder $(BUILDKIT_BUILDER) \ --ssh default=$(SSH_AUTH_SOCK) \ -f Dockerfile \ - --tag tscholak/$(TRAIN_IMAGE_NAME):$(GIT_HEAD_REF) \ - --tag tscholak/$(TRAIN_IMAGE_NAME):cache \ + --tag picard \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --target train \ --cache-from type=registry,ref=tscholak/$(TRAIN_IMAGE_NAME):cache \ - --cache-to type=inline \ - --push \ - git@github.com:ElementAI/picard#$(GIT_HEAD_REF) + --cache-to type=inline .PHONY: pull-train-image pull-train-image: - docker pull tscholak/$(TRAIN_IMAGE_NAME):$(GIT_HEAD_REF) + docker pull tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 .PHONY: build-eval-image -build-eval-image: +build-eval-image: ssh-add docker buildx build \ --builder $(BUILDKIT_BUILDER) \ @@ -115,7 +111,7 @@ train: pull-train-image --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ - tscholak/$(TRAIN_IMAGE_NAME):$(GIT_HEAD_REF) \ + picard \ /bin/bash -c "python seq2seq/run_seq2seq.py configs/train.json" .PHONY: train_cosql @@ -131,7 +127,7 @@ train_cosql: pull-train-image --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ - tscholak/$(TRAIN_IMAGE_NAME):$(GIT_HEAD_REF) \ + picard \ /bin/bash -c "python seq2seq/run_seq2seq.py configs/train_cosql.json" .PHONY: eval @@ -170,7 +166,7 @@ eval_cosql: pull-eval-image serve: pull-eval-image mkdir -p -m 777 database mkdir -p -m 777 transformers_cache - docker build . -t picard -f Dockerfile.local + docker build . 
-t picard -f Dockerfile.eval docker run \ -it \ --rm \ From d038e0cc58b17915ca910712bb3fbf5d83ff8c84 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Thu, 16 Feb 2023 05:41:16 +0000 Subject: [PATCH 07/17] build before train --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9061b57e..8c7c0874 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ build-train-image: docker buildx build --builder $(BUILDKIT_BUILDER) \ --ssh default=$(SSH_AUTH_SOCK) \ - -f Dockerfile \ + -f Dockerfile.train \ --tag picard \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --target train \ @@ -77,6 +77,7 @@ build-train-image: .PHONY: pull-train-image pull-train-image: docker pull tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 + docker build . -t picard -f Dockerfile.train .PHONY: build-eval-image build-eval-image: From 58d790629738a27099e3b3ca07ceffcfa2070395 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Thu, 16 Feb 2023 05:44:05 +0000 Subject: [PATCH 08/17] pass wandb env var --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 8c7c0874..3cf00067 100644 --- a/Makefile +++ b/Makefile @@ -112,6 +112,7 @@ train: pull-train-image --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ + --env WANDB_API_KEY \ picard \ /bin/bash -c "python seq2seq/run_seq2seq.py configs/train.json" From e0d61e165b9125a7f9f4024ee5ab3b2ae82b636b Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Thu, 16 Feb 2023 15:00:03 +0000 Subject: [PATCH 09/17] num_procs for dataset.map --- Makefile | 2 ++ configs/train.json | 2 +- seq2seq/run_seq2seq.py | 9 ++++++--- seq2seq/utils/dataset.py | 7 ++++++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 3cf00067..2d6f53a3 100644 --- a/Makefile +++ b/Makefile @@ -107,6 +107,8 @@ train: pull-train-image docker run \ -it \ --rm \ + --name picard \ + --gpus all \ --user 13011:13011 \ --mount type=bind,source=$(BASE_DIR)/train,target=/train \ --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ diff --git a/configs/train.json b/configs/train.json index 5e2891e8..09e3f384 100644 --- a/configs/train.json +++ b/configs/train.json @@ -26,7 +26,7 @@ "warmup_ratio": 0.0, "warmup_steps": 0, "seed": 1, - "report_to": ["wandb"], + "report_to": [], "logging_strategy": "steps", "logging_first_step": true, "logging_steps": 4, diff --git a/seq2seq/run_seq2seq.py b/seq2seq/run_seq2seq.py index 19833fc1..01d24520 100644 --- a/seq2seq/run_seq2seq.py +++ b/seq2seq/run_seq2seq.py @@ -128,6 +128,7 @@ def main() -> None: ) # Initialize tokenizer + logger.warning('loading tokenizer...') tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, @@ -141,7 +142,7 @@ def main() -> None: tokenizer.add_tokens([AddedToken(" <="), AddedToken(" <")]) # Load dataset - logger.info('loading dataset...') + logger.warning('loading dataset...') metric, dataset_splits = load_dataset( data_args=data_args, model_args=model_args, @@ -149,7 +150,7 @@ def main() -> None: training_args=training_args, tokenizer=tokenizer, ) - logger.info('loading dataset complete') + logger.warning('loading dataset complete') # Initialize Picard if necessary with PicardLauncher() if picard_args.launch_picard and 
training_args.local_rank <= 0 else nullcontext(None): @@ -162,6 +163,7 @@ def main() -> None: model_cls_wrapper = lambda model_cls: model_cls # Initialize model + logger.warning('loading model...') model = model_cls_wrapper(AutoModelForSeq2SeqLM).from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), @@ -198,6 +200,7 @@ def main() -> None: "target_with_db_id": data_training_args.target_with_db_id, } #using spidertrainer as it is. + logger.warning(f'initializing trainer...') if data_args.dataset in ["spider", "spider_realistic", "spider_syn", "spider_dk"]: trainer = SpiderTrainer(**trainer_kwargs) elif data_args.dataset in ["cosql", "cosql+spider"]: @@ -207,7 +210,7 @@ def main() -> None: # Training if training_args.do_train: - logger.info("*** Train ***") + logger.warning("*** Train ***") checkpoint = None diff --git a/seq2seq/utils/dataset.py b/seq2seq/utils/dataset.py index d92bf0d4..fe9d037b 100644 --- a/seq2seq/utils/dataset.py +++ b/seq2seq/utils/dataset.py @@ -6,6 +6,11 @@ from seq2seq.utils.bridge_content_encoder import get_database_matches import re import random +import os +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) @dataclass @@ -19,7 +24,7 @@ class DataTrainingArguments: metadata={"help": "Overwrite the cached training and evaluation sets"}, ) preprocessing_num_workers: Optional[int] = field( - default=None, + default=int(os.cpu_count() * 0.75) if os.cpu_count() is not None else 4, # set to half of the number of CPUs metadata={"help": "The number of processes to use for the preprocessing."}, ) max_source_length: Optional[int] = field( From 2fb9396f401a465dc1b5bcd4b07b691aeb734331 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Fri, 17 Feb 2023 16:41:49 +0000 Subject: [PATCH 10/17] gpu utilization --- Dockerfile.train | 2 ++ configs/train.json | 12 +++++++----- seq2seq/run_seq2seq.py | 9 ++++++--- seq2seq/utils/trainer.py | 13 +++++++++++++ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/Dockerfile.train b/Dockerfile.train index 91878354..0241639f 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -3,4 +3,6 @@ FROM tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 ARG TOOLKIT_USER_ID=13011 ARG TOOLKIT_GROUP_ID=13011 +RUN pip install pynvml + COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ \ No newline at end of file diff --git a/configs/train.json b/configs/train.json index 09e3f384..2c999549 100644 --- a/configs/train.json +++ b/configs/train.json @@ -13,11 +13,12 @@ "cache_dir": "/transformers_cache", "do_train": true, "do_eval": true, - "fp16": false, + "fp16": true, "num_train_epochs": 3072, - "per_device_train_batch_size": 5, - "per_device_eval_batch_size": 5, - "gradient_accumulation_steps": 410, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1 , + "gradient_accumulation_steps": 4, + "gradient_checkpointing": true, "label_smoothing_factor": 0.0, "learning_rate": 1e-4, "adafactor": true, @@ -40,5 +41,6 @@ "predict_with_generate": true, "num_beams": 1, "num_beam_groups": 1, - "use_picard": false + "use_picard": false, + "optim": "adafactor" } diff --git a/seq2seq/run_seq2seq.py b/seq2seq/run_seq2seq.py index 01d24520..e667fb49 100644 --- a/seq2seq/run_seq2seq.py +++ b/seq2seq/run_seq2seq.py @@ -31,10 +31,11 @@ from seq2seq.utils.dataset_loader import load_dataset from seq2seq.utils.spider import SpiderTrainer from seq2seq.utils.cosql import CoSQLTrainer +from seq2seq.utils.trainer import 
print_gpu_utilization def main() -> None: - # See all possible arguments by passing the --help flag to this script. + # See all possible arguments by passing the --help flag to this script. parser = HfArgumentParser( (PicardArguments, ModelArguments, DataArguments, DataTrainingArguments, Seq2SeqTrainingArguments) ) @@ -172,6 +173,9 @@ def main() -> None: revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + print_gpu_utilization() + + if isinstance(model, T5ForConditionalGeneration): model.resize_token_embeddings(len(tokenizer)) @@ -200,7 +204,6 @@ def main() -> None: "target_with_db_id": data_training_args.target_with_db_id, } #using spidertrainer as it is. - logger.warning(f'initializing trainer...') if data_args.dataset in ["spider", "spider_realistic", "spider_syn", "spider_dk"]: trainer = SpiderTrainer(**trainer_kwargs) elif data_args.dataset in ["cosql", "cosql+spider"]: @@ -218,7 +221,7 @@ def main() -> None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint - + train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/seq2seq/utils/trainer.py b/seq2seq/utils/trainer.py index 2d0f3253..0bfbe649 100644 --- a/seq2seq/utils/trainer.py +++ b/seq2seq/utils/trainer.py @@ -6,6 +6,19 @@ from datasets.metric import Metric import numpy as np import time +from pynvml import * + +def print_gpu_utilization(): + nvmlInit() + handle = nvmlDeviceGetHandleByIndex(0) + info = nvmlDeviceGetMemoryInfo(handle) + print(f"GPU memory occupied: {info.used//1024**2} MB.") + + +def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + print_gpu_utilization() class EvalPrediction(NamedTuple): From 0846ac052ed6ea92116d6bab02581c683aba545f Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Mon, 20 Feb 2023 08:09:21 +0000 Subject: [PATCH 11/17] training fails with docker --- Dockerfile.train | 60 +++++++++++++++++++++++++-- Makefile | 23 +++++----- configs/ds_config_zero2.json | 54 ++++++++++++++++++++++++ configs/train.json | 11 +++-- seq2seq/utils/picard_model_wrapper.py | 9 ++-- 5 files changed, 134 insertions(+), 23 deletions(-) create mode 100644 configs/ds_config_zero2.json diff --git a/Dockerfile.train b/Dockerfile.train index 0241639f..e2e112fb 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -1,8 +1,62 @@ -FROM tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 +# FROM tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 +# with this Dockerfile, train started fine, but unkown error due to deepspeed cpu offload + +ARG BASE_IMAGE + +# ------------------------ +# Target: dev +# ------------------------ +FROM $BASE_IMAGE as dev ARG TOOLKIT_USER_ID=13011 ARG TOOLKIT_GROUP_ID=13011 -RUN pip install pynvml +RUN apt-get update \ + # Required to save git hashes + && apt-get install -y -q git curl unzip make gettext \ + && rm -rf /var/lib/apt/lists/* + +ENV XDG_DATA_HOME=/app/.local/share \ + XDG_CACHE_HOME=/app/.cache \ + XDG_BIN_HOME=/app/.local/bin \ + XDG_CONFIG_HOME=/app/.config +RUN mkdir -p $XDG_DATA_HOME \ + && mkdir -p $XDG_CACHE_HOME \ + && mkdir -p $XDG_BIN_HOME \ + && mkdir -p $XDG_CONFIG_HOME \ + && chown -R $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app + +WORKDIR /app + +# Misc environment variables +ENV HF_HOME=/transformers_cache + + +# RUN pip install transformers==4.17.0 
datasets pynvml deepspeed tenacity rapidfuzz==2.0.5 nltk==3.7 \ +# sqlparse==0.4.2 pyarrow==7.0.0 + +# datasets==1.18.4 +# copy poetry files +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID pyproject.toml poetry.lock /app/ + +RUN pip install poetry && poetry config virtualenvs.create false +RUN poetry update +# RUN poetry install + + +RUN git clone https://github.com/microsoft/DeepSpeed/ && \ + cd DeepSpeed && git checkout v0.8.1 && \ + rm -rf build && \ + TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \ + --global-option="build_ext" --global-option="-j8" --no-cache -v \ + --disable-pip-version-check + +# Copy Seq-to-seq code +# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ +# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./tests /app/tests/ +# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/spider /app/third_party/spider/ +# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/test_suite /app/third_party/test_suite/ +# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./configs /app/configs/ -COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ \ No newline at end of file +# change permission for /app/.cache +# RUN mkdir -p /app/.cache && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cache diff --git a/Makefile b/Makefile index 2d6f53a3..ad6974d8 100644 --- a/Makefile +++ b/Makefile @@ -64,15 +64,16 @@ pull-dev-image: .PHONY: build-train-image build-train-image: - docker buildx build - --builder $(BUILDKIT_BUILDER) \ - --ssh default=$(SSH_AUTH_SOCK) \ - -f Dockerfile.train \ - --tag picard \ - --build-arg BASE_IMAGE=$(BASE_IMAGE) \ - --target train \ - --cache-from type=registry,ref=tscholak/$(TRAIN_IMAGE_NAME):cache \ - --cache-to type=inline + docker build . 
-f Dockerfile.train -t picard --build-arg BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel + # docker buildx build + # --builder $(BUILDKIT_BUILDER) \ + # --ssh default=$(SSH_AUTH_SOCK) \ + # -f Dockerfile.train \ + # --tag picard \ + # --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + # --target train \ + # --cache-from type=registry,ref=tscholak/$(TRAIN_IMAGE_NAME):cache \ + # --cache-to type=inline .PHONY: pull-train-image pull-train-image: @@ -100,7 +101,7 @@ pull-eval-image: docker pull tscholak/text-to-sql-eval:6a252386bed6d4233f0f13f4562d8ae8608e7445 .PHONY: train -train: pull-train-image +train: build-train-image mkdir -p -m 777 train mkdir -p -m 777 transformers_cache mkdir -p -m 777 wandb @@ -116,7 +117,7 @@ train: pull-train-image --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ --env WANDB_API_KEY \ picard \ - /bin/bash -c "python seq2seq/run_seq2seq.py configs/train.json" + /bin/bash -c "deepspeed --num_gpus=2 seq2seq/run_seq2seq.py configs/train.json" .PHONY: train_cosql train_cosql: pull-train-image diff --git a/configs/ds_config_zero2.json b/configs/ds_config_zero2.json new file mode 100644 index 00000000..02c1551b --- /dev/null +++ b/configs/ds_config_zero2.json @@ -0,0 +1,54 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "bf16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/configs/train.json b/configs/train.json index 2c999549..dd1e5cae 100644 --- a/configs/train.json +++ b/configs/train.json @@ -9,11 +9,11 @@ "schema_serialization_with_db_content": true, "normalize_query": true, "target_with_db_id": true, - "output_dir": "/train", - "cache_dir": "/transformers_cache", + "output_dir": "train", + "cache_dir": "transformers_cache", "do_train": true, "do_eval": true, - "fp16": true, + "fp16": false, "num_train_epochs": 3072, "per_device_train_batch_size": 1, "per_device_eval_batch_size": 1 , @@ -21,8 +21,6 @@ "gradient_checkpointing": true, "label_smoothing_factor": 0.0, "learning_rate": 1e-4, - "adafactor": true, - "adam_eps": 1e-6, "lr_scheduler_type": "constant", "warmup_ratio": 0.0, "warmup_steps": 0, @@ -42,5 +40,6 @@ "num_beams": 1, "num_beam_groups": 1, "use_picard": false, - "optim": "adafactor" + "overwrite_output_dir": true, + "deepspeed": "configs/ds_config_zero2.json" } diff --git a/seq2seq/utils/picard_model_wrapper.py b/seq2seq/utils/picard_model_wrapper.py index 1d771573..59befcb1 100644 --- a/seq2seq/utils/picard_model_wrapper.py +++ b/seq2seq/utils/picard_model_wrapper.py @@ -11,8 +11,10 @@ import torch from transformers import LogitsProcessorList from transformers.configuration_utils import PretrainedConfig -from 
transformers.generation_utils import GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput -from transformers.generation_logits_process import LogitsProcessor +# from transformers.generation_utils import GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput +from transformers.generation import GreedySearchEncoderDecoderOutput, SampleEncoderDecoderOutput, BeamSearchEncoderDecoderOutput, BeamSampleEncoderDecoderOutput +# from transformers.generation_logits_process import LogitsProcessor +from transformers import LogitsProcessor from transformers.file_utils import copy_func from transformers.models.auto.auto_factory import _get_model_class from transformers.models.auto.configuration_auto import AutoConfig @@ -158,7 +160,8 @@ def _generate( logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(), eos_token_id: Optional[int] = None, **kwargs, - ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]: + ) -> Union[GreedySearchEncoderDecoderOutput, SampleEncoderDecoderOutput, BeamSearchEncoderDecoderOutput, + BeamSampleEncoderDecoderOutput, torch.LongTensor]: eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id logits_processor.append( From 8de2376a45067d6ec806fc26edb67d783ab6c5d6 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Mon, 20 Feb 2023 13:20:57 +0000 Subject: [PATCH 12/17] deepspeed works --- README.md | 14 ++++++++++++++ configs/train.json | 10 +++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ccf9504a..df88f9b8 100644 --- a/README.md +++ b/README.md @@ -376,3 +376,17 @@ There are three docker images that can be used to run the code: * **[tscholak/text-to-sql-eval](https://hub.docker.com/repository/docker/tscholak/text-to-sql-eval):** Training/evaluation image with all dependencies. Use this for evaluating a fine-tuned model with Picard. This image can also be used for training if you want to run evaluation during training with Picard. Pull it with `make pull-eval-image` from the docker hub. Rebuild the image with `make build-eval-image`. All images are tagged with the current commit hash. The images are built with the buildx tool which is available in the latest docker-ce. Use `make init-buildkit` to initialize the buildx tool on your machine. You can then use `make build-dev-image`, `make build-train-image`, etc. to rebuild the images. Local changes to the code will not be reflected in the docker images unless they are committed to git. + +### Using DeepSpeed +Training on a single 24 GB GPU was not possible even with a batch size of 1, so we use DeepSpeed. DeepSpeed failed silently inside Docker, so we run it directly on the host.
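As a quick sanity check before building DeepSpeed's CPU Adam extension on the host, it can help to confirm that the CUDA toolkit on `PATH` matches the CUDA build of the installed PyTorch (the exports in the shell block below assume CUDA 11.7). This is a minimal sketch, not part of the original patches; the actual install and launch commands follow it.

```python
# Sketch: verify the host CUDA toolkit matches PyTorch's CUDA build before
# compiling DeepSpeed ops (DS_BUILD_CPU_ADAM=1). Assumes torch is installed.
import shutil
import subprocess

import torch

print("torch built with CUDA:", torch.version.cuda)   # e.g. '11.7'
print("GPU visible to torch:", torch.cuda.is_available())

nvcc = shutil.which("nvcc")  # provided by the CUDA toolkit on PATH
print("nvcc on PATH:", nvcc)
if nvcc is not None:
    # The release line printed here should agree with torch.version.cuda.
    print(subprocess.run([nvcc, "--version"], capture_output=True, text=True).stdout)
```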
+ +```shell +# install deepspeed +export PATH="/usr/local/cuda-11.7/bin:$PATH" +export LD_LIBRARY_PATH="/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH" +DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=0 pip install deepspeed --global-option="build_ext" --global-option="-j8" + +# start training +deepspeed seq2seq/run_seq2seq.py configs/train.json +``` + diff --git a/configs/train.json b/configs/train.json index dd1e5cae..a86d98e7 100644 --- a/configs/train.json +++ b/configs/train.json @@ -10,14 +10,14 @@ "normalize_query": true, "target_with_db_id": true, "output_dir": "train", - "cache_dir": "transformers_cache", + "cache_dir": "~/trans_cache", "do_train": true, "do_eval": true, "fp16": false, - "num_train_epochs": 3072, - "per_device_train_batch_size": 1, - "per_device_eval_batch_size": 1 , - "gradient_accumulation_steps": 4, + "num_train_epochs": 3072, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4 , + "gradient_accumulation_steps": 32, "gradient_checkpointing": true, "label_smoothing_factor": 0.0, "learning_rate": 1e-4, From c5764d4289c234e77cb5ae49326a105d9798136b Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Fri, 24 Feb 2023 04:51:39 +0000 Subject: [PATCH 13/17] add foreign keys in training --- configs/train.json | 17 +++++++++-------- seq2seq/serve_seq2seq.py | 1 + seq2seq/utils/cosql.py | 1 + seq2seq/utils/dataset.py | 29 +++++++++++++++++++++++++---- seq2seq/utils/pipeline.py | 12 ++++++++++++ seq2seq/utils/spider.py | 2 ++ 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/configs/train.json b/configs/train.json index a86d98e7..559d34f0 100644 --- a/configs/train.json +++ b/configs/train.json @@ -1,6 +1,6 @@ { - "run_name": "t5-spider", - "model_name_or_path": "t5-3b", + "run_name": "picard-001-fk", + "model_name_or_path": "tscholak/cxmefzzi", "dataset": "spider", "source_prefix": "", "schema_serialization_type": "peteshaw", @@ -14,10 +14,10 @@ "do_train": true, "do_eval": true, "fp16": false, - "num_train_epochs": 3072, + "num_train_epochs": 10, "per_device_train_batch_size": 4, - "per_device_eval_batch_size": 4 , - "gradient_accumulation_steps": 32, + "per_device_eval_batch_size": 4, + "gradient_accumulation_steps": 64, "gradient_checkpointing": true, "label_smoothing_factor": 0.0, "learning_rate": 1e-4, @@ -25,7 +25,7 @@ "warmup_ratio": 0.0, "warmup_steps": 0, "seed": 1, - "report_to": [], + "report_to": ["wandb"], "logging_strategy": "steps", "logging_first_step": true, "logging_steps": 4, @@ -33,7 +33,7 @@ "metric_for_best_model": "exact_match", "greater_is_better": true, "save_total_limit": 128, - "save_steps": 64, + "save_steps": 5, "evaluation_strategy": "steps", "eval_steps": 64, "predict_with_generate": true, @@ -41,5 +41,6 @@ "num_beam_groups": 1, "use_picard": false, "overwrite_output_dir": true, - "deepspeed": "configs/ds_config_zero2.json" + "deepspeed": "configs/ds_config_zero2.json", + "overwrite_cache": true } diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index fccd7c23..06750465 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -202,6 +202,7 @@ def get_spider_input(db_id, schema_serialization_type = "peteshaw", schema_serialization_randomized = schema_serialization_randomized, schema_serialization_with_db_id = schema_serialization_with_db_id, schema_serialization_with_db_content = schema_serialization_with_db_content, + foreign_keys=schema['db_foreign_keys'] ) return spider_get_input('question', serialized_schema, prefix='') diff --git a/seq2seq/utils/cosql.py b/seq2seq/utils/cosql.py index 
28121aa7..d7baea49 100644 --- a/seq2seq/utils/cosql.py +++ b/seq2seq/utils/cosql.py @@ -47,6 +47,7 @@ def cosql_add_serialized_schema( schema_serialization_with_db_id=data_training_args.schema_serialization_with_db_id, schema_serialization_with_db_content=data_training_args.schema_serialization_with_db_content, normalize_query=data_training_args.normalize_query, + foreign_keys=ex["db_foreign_keys"], ) return {"serialized_schema": serialized_schema} diff --git a/seq2seq/utils/dataset.py b/seq2seq/utils/dataset.py index fe9d037b..d84461dc 100644 --- a/seq2seq/utils/dataset.py +++ b/seq2seq/utils/dataset.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict, Callable +from typing import Optional, List, Dict, Callable, Tuple from dataclasses import dataclass, field from datasets.dataset_dict import DatasetDict from datasets.arrow_dataset import Dataset @@ -130,7 +130,10 @@ class DataTrainingArguments: default=True, metadata={"help": "Whether or not to add the database id to the target. Needed for Picard."}, ) - + include_foreign_keys_in_schema: bool = field( + default=True, + metadata={"help": "Whether or not to include foreign keys in the schema."}, + ) def __post_init__(self): if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length @@ -359,7 +362,10 @@ def serialize_schema( schema_serialization_with_db_id: bool = True, schema_serialization_with_db_content: bool = False, normalize_query: bool = True, + include_foreign_keys: bool = False, + foreign_keys: Optional[List[Tuple[str, str]]] = None ) -> str: + # logger.warning(f'foreign keys for {db_id} is {foreign_keys}. tb_tables - {db_table_names}') if schema_serialization_type == "verbose": db_id_str = "Database: {db_id}. " table_sep = ". " @@ -380,8 +386,21 @@ def serialize_schema( else: raise NotImplementedError - def get_column_str(table_name: str, column_name: str) -> str: + def get_column_str(table_id: int, table_name: str, column_name: str, include_foreign_keys:bool) -> str: column_name_str = column_name.lower() if normalize_query else column_name + if include_foreign_keys: + # get location of fk in foreign_keys list + column_id = db_column_names['column_name'].index(column_name) if column_name in db_column_names['column_name'] else None + fk_idx = foreign_keys['column_id'].index(column_id) if column_id in foreign_keys['column_id'] else None + if fk_idx is not None: + other_column_id = foreign_keys['other_column_id'][fk_idx] + other_table_id = db_column_names['table_id'][other_column_id] + other_table_name = db_table_names[other_table_id] + other_column_name = db_column_names['column_name'][other_column_id] + fk_str = f'__fk__{other_table_name}.{other_column_name}' + column_name_str = column_name_str + fk_str + + if schema_serialization_with_db_content: matches = get_database_matches( question=question, @@ -401,7 +420,8 @@ def get_column_str(table_name: str, column_name: str) -> str: table=table_name.lower() if normalize_query else table_name, columns=column_sep.join( map( - lambda y: get_column_str(table_name=table_name, column_name=y[1]), + lambda y: get_column_str(table_id=table_id, table_name=table_name, column_name=y[1], + include_foreign_keys=include_foreign_keys), filter( lambda y: y[0] == table_id, zip( @@ -420,4 +440,5 @@ def get_column_str(table_name: str, column_name: str) -> str: serialized_schema = db_id_str.format(db_id=db_id) + table_sep.join(tables) else: serialized_schema = table_sep.join(tables) + # logger.warning(f'serialized schema for {db_id} is {serialized_schema}.') return 
serialized_schema diff --git a/seq2seq/utils/pipeline.py b/seq2seq/utils/pipeline.py index 812995af..d10bea79 100644 --- a/seq2seq/utils/pipeline.py +++ b/seq2seq/utils/pipeline.py @@ -8,6 +8,9 @@ from seq2seq.utils.spider import spider_get_input from seq2seq.utils.cosql import cosql_get_input +import logging +logger = logging.getLogger(__name__) + @dataclass class Text2SQLInput(object): @@ -48,6 +51,8 @@ def __init__(self, *args, **kwargs): self.schema_serialization_with_db_id: bool = kwargs.pop("schema_serialization_with_db_id", True) self.schema_serialization_with_db_content: bool = kwargs.pop("schema_serialization_with_db_content", True) self.schema_cache: Dict[str, dict] = dict() + self.include_foreign_keys = kwargs.pop("include_foreign_keys_in_schema", True) + logger.warning(f'include_foreign_keys 2 is {self.include_foreign_keys}') super().__init__(*args, **kwargs) def __call__(self, inputs: Union[Text2SQLInput, List[Text2SQLInput]], *args, **kwargs): @@ -144,6 +149,8 @@ def _pre_process(self, input: Text2SQLInput) -> str: schema_serialization_with_db_id=self.schema_serialization_with_db_id, schema_serialization_with_db_content=self.schema_serialization_with_db_content, normalize_query=self.normalize_query, + include_foreign_keys=self.include_foreign_keys, + foreign_keys=schema["db_foreign_keys"], ) spider_input = spider_get_input(question=input.utterance, serialized_schema=serialized_schema, prefix=prefix) print(f'spider input is:{spider_input}') @@ -280,6 +287,7 @@ class ConversationalText2SQLGenerationPipeline(Text2TextGenerationPipeline): """ def __init__(self, *args, **kwargs): + logger.warning(f'kwargs is :{kwargs}') self.db_path: str = kwargs.pop("db_path") self.prefix: Optional[str] = kwargs.pop("prefix", None) self.normalize_query: bool = kwargs.pop("normalize_query", True) @@ -288,6 +296,8 @@ def __init__(self, *args, **kwargs): self.schema_serialization_with_db_id: bool = kwargs.pop("schema_serialization_with_db_id", True) self.schema_serialization_with_db_content: bool = kwargs.pop("schema_serialization_with_db_content", True) self.schema_cache: Dict[str, dict] = dict() + self.include_foreign_keys = kwargs.pop("include_foreign_keys", False) + logger.warning(f'include foreign keys is :{self.include_foreign_keys}') super().__init__(*args, **kwargs) def __call__(self, inputs: Union[ConversationalText2SQLInput, List[ConversationalText2SQLInput]], *args, **kwargs): @@ -380,6 +390,8 @@ def _pre_process(self, input: ConversationalText2SQLInput) -> str: schema_serialization_with_db_id=self.schema_serialization_with_db_id, schema_serialization_with_db_content=self.schema_serialization_with_db_content, normalize_query=self.normalize_query, + include_foreign_keys=self.include_foreign_keys_in_schema, + foreign_keys = schema["db_foreign_keys"] ) return cosql_get_input(utterances=input.utterances, serialized_schema=serialized_schema, prefix=prefix) diff --git a/seq2seq/utils/spider.py b/seq2seq/utils/spider.py index 9bf5607e..14ed2fbb 100644 --- a/seq2seq/utils/spider.py +++ b/seq2seq/utils/spider.py @@ -37,6 +37,8 @@ def spider_add_serialized_schema(ex: dict, data_training_args: DataTrainingArgum schema_serialization_with_db_id=data_training_args.schema_serialization_with_db_id, schema_serialization_with_db_content=data_training_args.schema_serialization_with_db_content, normalize_query=data_training_args.normalize_query, + include_foreign_keys=data_training_args.include_foreign_keys_in_schema, + foreign_keys=ex["db_foreign_keys"] ) return {"serialized_schema": serialized_schema} From 
2819bbd6bb21188cd70a9bc548aed848ef72ab02 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Mon, 27 Feb 2023 05:56:37 +0000 Subject: [PATCH 14/17] fix eval failing --- seq2seq/prediction_output.py | 1 + seq2seq/utils/trainer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/seq2seq/prediction_output.py b/seq2seq/prediction_output.py index 6ae07947..e060a740 100644 --- a/seq2seq/prediction_output.py +++ b/seq2seq/prediction_output.py @@ -122,6 +122,7 @@ def get_pipeline_kwargs( "schema_serialization_type": data_training_args.schema_serialization_type, "schema_serialization_with_db_id": data_training_args.schema_serialization_with_db_id, "schema_serialization_with_db_content": data_training_args.schema_serialization_with_db_content, + "include_foreign_keys": data_training_args.include_foreign_keys, "device": prediction_output_args.device, } diff --git a/seq2seq/utils/trainer.py b/seq2seq/utils/trainer.py index 0bfbe649..6fffab6f 100644 --- a/seq2seq/utils/trainer.py +++ b/seq2seq/utils/trainer.py @@ -7,12 +7,13 @@ import numpy as np import time from pynvml import * +from loguru import logger def print_gpu_utilization(): nvmlInit() handle = nvmlDeviceGetHandleByIndex(0) info = nvmlDeviceGetMemoryInfo(handle) - print(f"GPU memory occupied: {info.used//1024**2} MB.") + logger.info(f"GPU memory occupied: {info.used//1024**2} MB.") def print_summary(result): @@ -61,10 +62,12 @@ def evaluate( max_length: Optional[int] = None, max_time: Optional[int] = None, num_beams: Optional[int] = None, + **gen_kwargs ) -> Dict[str, float]: self._max_length = max_length self._max_time = max_time self._num_beams = num_beams + self._gen_kwargs = gen_kwargs # memory metrics - must set up as early as possible self._memory_tracker.start() From c72387f91ae34c11582f80cd334df5abfae75020 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Tue, 28 Feb 2023 09:48:59 +0000 Subject: [PATCH 15/17] include foreign keys in schema --- configs/serve.json | 7 ++++--- seq2seq/serve_seq2seq.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/configs/serve.json b/configs/serve.json index ead51e0c..0fd31c81 100644 --- a/configs/serve.json +++ b/configs/serve.json @@ -1,14 +1,15 @@ { - "model_path": "tscholak/3vnuv1vf", + "model_path": "/data/checkpoint-90", "source_prefix": "", "schema_serialization_type": "peteshaw", "schema_serialization_randomized": false, "schema_serialization_with_db_id": true, "schema_serialization_with_db_content": true, + "include_foreign_keys_in_schema": true, "normalize_query": true, "target_with_db_id": true, - "db_path": "/database", - "cache_dir": "/transformers_cache", + "db_path": "/home/ubuntu/trans_cache/downloads/extracted/d712e0d61bf3021b084b5268e3189f7f8882e4938131c9c749b9e008c833cef3/spider/database/", + "cache_dir": "~/trans_cache", "num_beams": 4, "num_return_sequences": 1, "use_picard": false, diff --git a/seq2seq/serve_seq2seq.py b/seq2seq/serve_seq2seq.py index 06750465..58f7e1fc 100644 --- a/seq2seq/serve_seq2seq.py +++ b/seq2seq/serve_seq2seq.py @@ -1,5 +1,6 @@ # Set up logging import sys +sys.path.append('.') import logging logging.basicConfig( @@ -9,6 +10,7 @@ level=logging.WARNING, ) logger = logging.getLogger(__name__) +from loguru import logger from typing import Optional, Dict from dataclasses import dataclass, field @@ -74,6 +76,7 @@ def main(): picard_args, backend_args, data_training_args = parser.parse_args_into_dataclasses() # Initialize config + logger.info(f'loading model...') config = AutoConfig.from_pretrained( 
backend_args.model_path, cache_dir=backend_args.cache_dir, @@ -123,6 +126,7 @@ def main(): pipe_with_schema = Text2SQLGenPipelineWithSchema( model = model, tokenizer = tokenizer, + db_path = backend_args.db_path, normalize_query = data_training_args.normalize_query, device = backend_args.device) @@ -178,6 +182,7 @@ def ask_with_schema(query: Query): @app.get("/database/") def get_database_list(): db_dir = Path(backend_args.db_path) + print(f'db_path - {db_dir}') db_files = db_dir.rglob("*.sqlite") return [db_file.stem for db_file in db_files if db_file.stem == db_file.parent.stem] @@ -186,8 +191,8 @@ def get_database_list(): def get_schema_for_database(db_id): return get_schema(backend_args.db_path, db_id) - @app.get("/schema/{db_id}/spider-input") - def get_spider_input(db_id, schema_serialization_type = "peteshaw", + @app.get("/serialized-schema/{db_id}/") + def get_serialized_schema(db_id, schema_serialization_type = "peteshaw", schema_serialization_randomized = False, schema_serialization_with_db_id = True, schema_serialization_with_db_content = False @@ -202,6 +207,7 @@ def get_spider_input(db_id, schema_serialization_type = "peteshaw", schema_serialization_randomized = schema_serialization_randomized, schema_serialization_with_db_id = schema_serialization_with_db_id, schema_serialization_with_db_content = schema_serialization_with_db_content, + include_foreign_keys=data_training_args.include_foreign_keys_in_schema, foreign_keys=schema['db_foreign_keys'] ) return spider_get_input('question', serialized_schema, prefix='') From addcd6565f4a38597fa8e0cbc103a28ad81beac4 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Fri, 7 Apr 2023 14:12:15 +0000 Subject: [PATCH 16/17] eval works --- Dockerfile.eval | 13 +++++++- Dockerfile.train | 23 ++++++-------- Makefile | 60 ++++++++++++++++++++---------------- configs/eval.json | 8 ++--- configs/train.json | 25 +++++++-------- seq2seq/prediction_output.py | 3 +- 6 files changed, 74 insertions(+), 58 deletions(-) diff --git a/Dockerfile.eval b/Dockerfile.eval index f4c5c0d8..e5712aa5 100644 --- a/Dockerfile.eval +++ b/Dockerfile.eval @@ -1,6 +1,17 @@ -FROM tscholak/text-to-sql-eval:6a252386bed6d4233f0f13f4562d8ae8608e7445 +ARG BASE_IMAGE +FROM $BASE_IMAGE as dev ARG TOOLKIT_USER_ID=13011 ARG TOOLKIT_GROUP_ID=13011 +RUN pip install poetry && poetry config virtualenvs.create false +RUN poetry update +RUN pip install transformers datasets pynvml deepspeed tenacity rapidfuzz==2.0.5 nltk==3.7 \ + sqlparse==0.4.2 pyarrow==7.0.0 loguru accelerate + +ENV XDG_DATA_HOME=/app/.local/share \ + XDG_CACHE_HOME=/app/.cache \ + XDG_BIN_HOME=/app/.local/bin \ + XDG_CONFIG_HOME=/app/.config + COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ \ No newline at end of file diff --git a/Dockerfile.train b/Dockerfile.train index e2e112fb..1c645c37 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -11,10 +11,9 @@ FROM $BASE_IMAGE as dev ARG TOOLKIT_USER_ID=13011 ARG TOOLKIT_GROUP_ID=13011 -RUN apt-get update \ +RUN \ # Required to save git hashes - && apt-get install -y -q git curl unzip make gettext \ - && rm -rf /var/lib/apt/lists/* + apt-get install -y -q git curl unzip make gettext ENV XDG_DATA_HOME=/app/.local/share \ XDG_CACHE_HOME=/app/.cache \ @@ -31,16 +30,14 @@ WORKDIR /app # Misc environment variables ENV HF_HOME=/transformers_cache - -# RUN pip install transformers==4.17.0 datasets pynvml deepspeed tenacity rapidfuzz==2.0.5 nltk==3.7 \ -# sqlparse==0.4.2 pyarrow==7.0.0 - # datasets==1.18.4 # copy poetry files COPY 
--chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID pyproject.toml poetry.lock /app/ RUN pip install poetry && poetry config virtualenvs.create false RUN poetry update +RUN pip install transformers datasets pynvml deepspeed tenacity rapidfuzz==2.0.5 nltk==3.7 \ + sqlparse==0.4.2 pyarrow==7.0.0 loguru accelerate # RUN poetry install @@ -52,11 +49,11 @@ RUN git clone https://github.com/microsoft/DeepSpeed/ && \ --disable-pip-version-check # Copy Seq-to-seq code -# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ -# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./tests /app/tests/ -# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/spider /app/third_party/spider/ -# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/test_suite /app/third_party/test_suite/ -# COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./configs /app/configs/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./seq2seq /app/seq2seq/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./tests /app/tests/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/spider /app/third_party/spider/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./third_party/test_suite /app/third_party/test_suite/ +COPY --chown=$TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID ./configs /app/configs/ # change permission for /app/.cache -# RUN mkdir -p /app/.cache && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cache +RUN mkdir -p /app/.cache && chown $TOOLKIT_USER_ID:$TOOLKIT_GROUP_ID /app/.cache diff --git a/Makefile b/Makefile index ad6974d8..8188e4af 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,8 @@ pull-dev-image: .PHONY: build-train-image build-train-image: - docker build . -f Dockerfile.train -t picard --build-arg BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel + docker build . -f Dockerfile.train -t picard --build-arg BASE_IMAGE=tscholak/text-to-sql-train:6a252386bed6d4233f0f13f4562d8ae8608e7445 + # docker build . -f Dockerfile.train -t picard --build-arg BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel # docker buildx build # --builder $(BUILDKIT_BUILDER) \ # --ssh default=$(SSH_AUTH_SOCK) \ @@ -82,19 +83,20 @@ pull-train-image: .PHONY: build-eval-image build-eval-image: - ssh-add - docker buildx build \ - --builder $(BUILDKIT_BUILDER) \ - --ssh default=$(SSH_AUTH_SOCK) \ - -f Dockerfile \ - --tag tscholak/$(EVAL_IMAGE_NAME):$(GIT_HEAD_REF) \ - --tag tscholak/$(EVAL_IMAGE_NAME):cache \ - --build-arg BASE_IMAGE=$(BASE_IMAGE) \ - --target eval \ - --cache-from type=registry,ref=tscholak/$(EVAL_IMAGE_NAME):cache \ - --cache-to type=inline \ - --push \ - git@github.com:ElementAI/picard#$(GIT_HEAD_REF) + docker build . 
-f Dockerfile.eval -t picard-eval --build-arg BASE_IMAGE=tscholak/text-to-sql-eval:6a252386bed6d4233f0f13f4562d8ae8608e7445 + # ssh-add + # docker buildx build \ + # --builder $(BUILDKIT_BUILDER) \ + # --ssh default=$(SSH_AUTH_SOCK) \ + # -f Dockerfile \ + # --tag tscholak/$(EVAL_IMAGE_NAME):$(GIT_HEAD_REF) \ + # --tag tscholak/$(EVAL_IMAGE_NAME):cache \ + # --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + # --target eval \ + # --cache-from type=registry,ref=tscholak/$(EVAL_IMAGE_NAME):cache \ + # --cache-to type=inline \ + # --push \ + # git@github.com:ElementAI/picard#$(GIT_HEAD_REF) .PHONY: pull-eval-image pull-eval-image: @@ -110,14 +112,17 @@ train: build-train-image --rm \ --name picard \ --gpus all \ - --user 13011:13011 \ - --mount type=bind,source=$(BASE_DIR)/train,target=/train \ - --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ - --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ - --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ + --ulimit memlock=-1:-1 \ + --ipc host \ + -v $(BASE_DIR)/train_output:/train_output \ + -v $(BASE_DIR)/transformers_cache:/transformers_cache \ + -v $(BASE_DIR)/configs:/app/configs \ + -v $(BASE_DIR)/wandb:/app/wandb \ + -v $(BASE_DIR)/data:/app/data \ --env WANDB_API_KEY \ + -e TRANSFORMERS_CACHE=/transformers_cache \ picard \ - /bin/bash -c "deepspeed --num_gpus=2 seq2seq/run_seq2seq.py configs/train.json" + /bin/bash -c "deepspeed --num_gpus=4 seq2seq/run_seq2seq.py configs/train.json" .PHONY: train_cosql train_cosql: pull-train-image @@ -136,19 +141,20 @@ train_cosql: pull-train-image /bin/bash -c "python seq2seq/run_seq2seq.py configs/train_cosql.json" .PHONY: eval -eval: pull-eval-image +eval: build-eval-image mkdir -p -m 777 eval mkdir -p -m 777 transformers_cache mkdir -p -m 777 wandb docker run \ -it \ --rm \ - --user 13011:13011 \ - --mount type=bind,source=$(BASE_DIR)/eval,target=/eval \ - --mount type=bind,source=$(BASE_DIR)/transformers_cache,target=/transformers_cache \ - --mount type=bind,source=$(BASE_DIR)/configs,target=/app/configs \ - --mount type=bind,source=$(BASE_DIR)/wandb,target=/app/wandb \ - tscholak/$(EVAL_IMAGE_NAME):$(GIT_HEAD_REF) \ + --gpus all \ + -v $(BASE_DIR)/eval_output:/eval_output \ + -v $(BASE_DIR)/transformers_cache:/transformers_cache \ + -v $(BASE_DIR)/configs:/app/configs \ + -v /xdata/train_output:/train_output \ + -e TRANSFORMERS_CACHE=/transformers_cache \ + picard-eval \ /bin/bash -c "python seq2seq/run_seq2seq.py configs/eval.json" .PHONY: eval_cosql diff --git a/configs/eval.json b/configs/eval.json index cce1861c..a8a329cc 100644 --- a/configs/eval.json +++ b/configs/eval.json @@ -1,6 +1,6 @@ { "run_name": "t5+picard-spider-eval", - "model_name_or_path": "tscholak/cxmefzzi", + "model_name_or_path": "/train_output/checkpoint-20/", "dataset": "spider", "source_prefix": "", "schema_serialization_type": "peteshaw", @@ -9,19 +9,19 @@ "schema_serialization_with_db_content": true, "normalize_query": true, "target_with_db_id": true, - "output_dir": "/eval", + "output_dir": "/eval_output", "cache_dir": "/transformers_cache", "do_train": false, "do_eval": true, "fp16": false, "per_device_eval_batch_size": 1, "seed": 1, - "report_to": ["wandb"], + "report_to": [], "predict_with_generate": true, "num_beams": 4, "num_beam_groups": 1, "diversity_penalty": 0.0, - "max_val_samples": 1034, + "max_val_samples": 16, "use_picard": true, "launch_picard": true, "picard_mode": "parse_with_guards", diff --git a/configs/train.json b/configs/train.json index 
559d34f0..cbf920b3 100644 --- a/configs/train.json +++ b/configs/train.json @@ -9,15 +9,15 @@ "schema_serialization_with_db_content": true, "normalize_query": true, "target_with_db_id": true, - "output_dir": "train", - "cache_dir": "~/trans_cache", + "output_dir": "/train_output", + "cache_dir": "/transformers_cache", "do_train": true, - "do_eval": true, + "do_eval": false, "fp16": false, - "num_train_epochs": 10, + "num_train_epochs": 6, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 4, - "gradient_accumulation_steps": 64, + "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "label_smoothing_factor": 0.0, "learning_rate": 1e-4, @@ -25,22 +25,23 @@ "warmup_ratio": 0.0, "warmup_steps": 0, "seed": 1, - "report_to": ["wandb"], + "report_to": [], "logging_strategy": "steps", "logging_first_step": true, "logging_steps": 4, - "load_best_model_at_end": true, + "load_best_model_at_end": false, "metric_for_best_model": "exact_match", "greater_is_better": true, "save_total_limit": 128, - "save_steps": 5, - "evaluation_strategy": "steps", - "eval_steps": 64, + "save_strategy": "steps", + "save_steps": 10, + "evaluation_strategy": "no", + "eval_steps": 1, "predict_with_generate": true, "num_beams": 1, "num_beam_groups": 1, "use_picard": false, - "overwrite_output_dir": true, + "overwrite_output_dir": false, "deepspeed": "configs/ds_config_zero2.json", - "overwrite_cache": true + "overwrite_cache": false } diff --git a/seq2seq/prediction_output.py b/seq2seq/prediction_output.py index e060a740..2df42d9b 100644 --- a/seq2seq/prediction_output.py +++ b/seq2seq/prediction_output.py @@ -1,5 +1,6 @@ # Set up logging import sys +sys.path.append('.') import logging logging.basicConfig( @@ -122,7 +123,7 @@ def get_pipeline_kwargs( "schema_serialization_type": data_training_args.schema_serialization_type, "schema_serialization_with_db_id": data_training_args.schema_serialization_with_db_id, "schema_serialization_with_db_content": data_training_args.schema_serialization_with_db_content, - "include_foreign_keys": data_training_args.include_foreign_keys, + "include_foreign_keys": data_training_args.include_foreign_keys_in_schema, "device": prediction_output_args.device, } From a6b901497df50a638f598efd7dd031cc9cca61f2 Mon Sep 17 00:00:00 2001 From: Ajay Chinta Date: Tue, 11 Apr 2023 08:22:50 +0000 Subject: [PATCH 17/17] update config --- configs/eval.json | 6 +++--- configs/train.json | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/eval.json b/configs/eval.json index a8a329cc..6434af97 100644 --- a/configs/eval.json +++ b/configs/eval.json @@ -1,6 +1,6 @@ { "run_name": "t5+picard-spider-eval", - "model_name_or_path": "/train_output/checkpoint-20/", + "model_name_or_path": "tscholak/3vnuv1vf", "dataset": "spider", "source_prefix": "", "schema_serialization_type": "peteshaw", @@ -14,14 +14,14 @@ "do_train": false, "do_eval": true, "fp16": false, - "per_device_eval_batch_size": 1, + "per_device_eval_batch_size": 4, "seed": 1, "report_to": [], "predict_with_generate": true, "num_beams": 4, "num_beam_groups": 1, "diversity_penalty": 0.0, - "max_val_samples": 16, + "max_val_samples": 1600, "use_picard": true, "launch_picard": true, "picard_mode": "parse_with_guards", diff --git a/configs/train.json b/configs/train.json index cbf920b3..d9f3b8e2 100644 --- a/configs/train.json +++ b/configs/train.json @@ -14,7 +14,7 @@ "do_train": true, "do_eval": false, "fp16": false, - "num_train_epochs": 6, + "num_train_epochs": 32, "per_device_train_batch_size": 4, 
"per_device_eval_batch_size": 4, "gradient_accumulation_steps": 8, @@ -34,14 +34,14 @@ "greater_is_better": true, "save_total_limit": 128, "save_strategy": "steps", - "save_steps": 10, + "save_steps": 64, "evaluation_strategy": "no", "eval_steps": 1, "predict_with_generate": true, "num_beams": 1, "num_beam_groups": 1, - "use_picard": false, - "overwrite_output_dir": false, + "use_picard": true, + "overwrite_output_dir": true, "deepspeed": "configs/ds_config_zero2.json", "overwrite_cache": false }