From dd617fa806cf9a26489ae4d5457fcb5023bc75e1 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 8 Sep 2020 14:11:54 +0100
Subject: [PATCH 01/12] Added Minecraft launcher

---
 .gitignore                      |  10 ++-
 MalmoEnv/utils/launcher.py      | 147 ++++++++++++++++++++++++++++++++
 Minecraft/launchClient_quiet.sh |   2 +
 3 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 MalmoEnv/utils/launcher.py
 create mode 100755 Minecraft/launchClient_quiet.sh

diff --git a/.gitignore b/.gitignore
index 13c0b2fbe..19aa37977 100755
--- a/.gitignore
+++ b/.gitignore
@@ -210,8 +210,14 @@ ModelManifest.xml
 # Python
 *.pyc
 
-# Thing specific to Project Malmo
+# Things specific to Project Malmo
+.vscode
 Schemas/xs3p.xsl
 Minecraft/run/config/malmomodCLIENT.cfg
 Minecraft/Minecraft_Client.launch
-
+Minecraft/run
+.minecraft
+.minecraftserver
+# Specific files generated by Malmo installation
+/Minecraft/src/main/resources/schemas.index
+/Minecraft/src/main/resources/version.properties
diff --git a/MalmoEnv/utils/launcher.py b/MalmoEnv/utils/launcher.py
new file mode 100644
index 000000000..65d165492
--- /dev/null
+++ b/MalmoEnv/utils/launcher.py
@@ -0,0 +1,147 @@
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) 2020 Microsoft Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+# associated documentation files (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge, publish, distribute,
+# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or
+# substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# ------------------------------------------------------------------------------------------------
+from collections.abc import Iterable
+import socket
+import subprocess
+import sys
+import time
+
+# This script captures the Minecraft output into a file called out.txt
+DEFAULT_SCRIPT = "./launchClient_quiet.sh"
+
+def launch_minecraft(ports, launch_script=DEFAULT_SCRIPT, keep_alive=False):
+    """
+    Launch Minecraft instances in the background.
+    Function will block until all instances are ready to receive commands.
+    ports - List of ports you want the instances to listen on for commands
+    launch_script - Script to launch Minecraft. Default is ./launchClient_quiet.sh
+    keep_alive - Automatically restart Minecraft instances if they exit
+    """
+    ports_collection = ports
+    if not isinstance(ports_collection, Iterable):
+        ports_collection = [ports_collection]
+
+    minecraft_instances = []
+    for port in ports_collection:
+        args = [
+            sys.executable, __file__, 
+            "--script", launch_script,
+            "--port", str(port)
+        ]
+        if keep_alive:
+            args.append("--keepalive")
+
+        proc = subprocess.Popen(args,
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        minecraft_instances.append(proc)
+
+    await_instances([
+        ("127.0.0.1", int(port))
+        for port in ports_collection
+    ])
+
+    # Determine if we need to return a collection or a single item based on the type passed for 
+    # ports initially
+    if isinstance(ports, Iterable):
+        return minecraft_instances
+    return minecraft_instances[0]
+
+def await_instances(end_points):
+    """
+    Wait until the specified endpoints are all actively listening for connections.
+    end_points - List of addresses made up of tuples of the form (HOST, PORT)
+    """
+    print(f"Waiting for {len(end_points)} instances...")
+
+    while True:
+        try:
+            for end_point in end_points:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.settimeout(10)
+                    s.connect(end_point)
+                    s.close()
+
+            print("Finished waiting for instances")
+            return
+
+        except (ConnectionError, socket.timeout):
+            # If we fail to connect, most likely the instance isn't running yet
+            time.sleep(5)
+
+
+###################################################################################################
+# The remainder of this file contains code for when this script is invoked directly rather than
+# imported into another script.
+# This is used to directly set up and launch the Minecraft process
+###################################################################################################
+
+def _parse_args():
+    # Import locally so that we're not paying the import cost when they're not used
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Malmo Launcher")
+    parser.add_argument("--script", type=str, default=DEFAULT_SCRIPT, help="Script to launch Minecraft")
+    parser.add_argument("--port", type=int, nargs="+", help="Command ports for Minecraft instances", required=True)
+    parser.add_argument("--keepalive", action="store_true", default=False, help="Relaunch the Minecraft instance if it exits")
+
+    return parser.parse_args()
+
+def _exec(*args):
+    proc = subprocess.Popen(args)
+    proc.communicate()
+    return proc.returncode
+
+def _launch_minecraft_direct(launch_script, port, keep_alive):
+    # Import locally so that we're not paying the import cost when this script is imported as a module
+    import os
+    import pathlib
+    import shutil
+    import tempfile
+
+    # Make a copy of Minecraft into a unique temp directory as it's not possible to run multiple
+    # instances from a single Minecraft directory
+    target_dir = tempfile.mkdtemp(prefix="malmo_") + "/malmo"
+    source_dir = str(pathlib.Path(__file__).parent.absolute()) + "/../.."
+    print(f"Cloning {source_dir} into {target_dir}...")
+    shutil.copytree(source_dir, target_dir)
+
+    # Launch Minecraft using the specified script
+    print(f"Launching Minecraft using {launch_script} with command port {port}...")
+    os.chdir(target_dir + "/Minecraft")
+
+    spawn = True
+    while spawn:
+        rc = _exec(launch_script, str(port))
+        spawn = keep_alive
+
+    print(f"Exit code: {rc}")
+
+if __name__ == '__main__':
+    args = _parse_args()
+
+    if len(args.port) == 1:
+        # If only a single port is specified, launch Minecraft directly
+        _launch_minecraft_direct(args.script, args.port[0], args.keepalive)
+    else:
+        # If multiple ports are specified, launch each Minecraft instance in a new child process
+        instances = launch_minecraft(args.port, launch_script=args.script, keep_alive=args.keepalive)
+        print("Waiting for all instances to exit...")
+        for instance in instances:
+            instance.communicate()
\ No newline at end of file
diff --git a/Minecraft/launchClient_quiet.sh b/Minecraft/launchClient_quiet.sh
new file mode 100755
index 000000000..2930d00c7
--- /dev/null
+++ b/Minecraft/launchClient_quiet.sh
@@ -0,0 +1,2 @@
+#! /bin/bash
+./launchClient.sh -port $1 -env > ../out.txt 2>&1

From e8d3c84acc90964dd8ae83ae5093347b3dd7a452 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 8 Sep 2020 14:16:02 +0100
Subject: [PATCH 02/12] Added initial RLlib training script

---
 MalmoEnv/malmoenv/core.py              |   1 +
 MalmoEnv/missions/rllib_multiagent.xml |  82 +++++++++++++++++
 MalmoEnv/rllib_train.py                | 119 +++++++++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 MalmoEnv/missions/rllib_multiagent.xml
 create mode 100644 MalmoEnv/rllib_train.py

diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index cc1899ff5..89836d975 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -85,6 +85,7 @@ def __init__(self, reshape=False):
         self.action_space = None
         self.observation_space = None
         self.metadata = {'render.modes': ['rgb_array']}
+        self.reward_range = (-float('inf'), float('inf'))
         self.xml = None
         self.integratedServerPort = 0
         self.role = 0
diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
new file mode 100644
index 000000000..e6d1f757a
--- /dev/null
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <About>
+        <Summary>Multi-agent Test Goal</Summary>
+  </About>
+
+  <ModSettings>
+        <MsPerTick>1</MsPerTick>
+  </ModSettings>
+
+  <ServerSection>
+    <ServerInitialConditions>
+      <Time>
+        <StartTime>6000</StartTime>
+        <AllowPassageOfTime>false</AllowPassageOfTime>
+      </Time>
+      <Weather>clear</Weather>
+      <AllowSpawning>false</AllowSpawning>
+    </ServerInitialConditions>
+    <ServerHandlers>
+      <FlatWorldGenerator generatorString="3;minecraft:bedrock,2*minecraft:dirt,minecraft:grass;1;village"/>
+
+      <DrawingDecorator>
+        <DrawCuboid x1="-2" y1="3" z1="-3" x2="8" y2="3" z2="7" type="lava" />
+
+        <DrawCuboid x1="-1" y1="3" z1="1" x2="-1" y2="3" z2="3" type="cobblestone" />
+        <DrawCuboid x1="0" y1="3" z1="2" x2="6" y2="3" z2="2" type="cobblestone" />
+        <DrawCuboid x1="7" y1="3" z1="1" x2="7" y2="3" z2="3" type="cobblestone" />
+
+        <DrawBlock  x="7"  y="3"  z="0"  type="lapis_block" />
+        <DrawBlock  x="7"  y="3"  z="4"  type="lapis_block" />
+
+        <DrawBlock  x="-1"  y="3"  z="0"  type="diamond_block" />
+        <DrawBlock  x="-1"  y="3"  z="4"  type="diamond_block" />
+      </DrawingDecorator>
+
+      <!--ServerQuitFromTimeUp timeLimitMs="300000" description="out_of_time"/>
+      <ServerQuitWhenAnyAgentFinishes/-->
+    </ServerHandlers>
+  </ServerSection>
+
+  <AgentSection mode="Survival">
+    <Name>Agent1</Name>
+
+    <AgentStart>
+      <Placement pitch="60" x="-0.5" y="4" yaw="270" z="0.5"/>
+    </AgentStart>
+
+    <AgentHandlers>
+
+      <VideoProducer want_depth="false">
+        <Width>84</Width>
+        <Height>84</Height>
+      </VideoProducer>
+
+      <DiscreteMovementCommands>
+          <ModifierList type="deny-list">
+            <command>attack</command>
+          </ModifierList>
+      </DiscreteMovementCommands>
+
+      <RewardForMissionEnd>
+        <Reward description="out_of_time" reward="-1" />
+      </RewardForMissionEnd>
+
+      <RewardForTouchingBlockType>
+        <Block reward="-1.0" type="lava" behaviour="onceOnly"/>
+        <Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
+      </RewardForTouchingBlockType>
+
+      <RewardForSendingCommand reward="-0.01"/>
+
+      <AgentQuitFromTouchingBlockType>
+        <Block type="lava" />
+        <Block type="lapis_block" />
+      </AgentQuitFromTouchingBlockType>
+      <AgentQuitFromReachingCommandQuota total="100"/>
+    </AgentHandlers>
+  </AgentSection>
+
+</Mission>
\ No newline at end of file
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
new file mode 100644
index 000000000..d2628dd7b
--- /dev/null
+++ b/MalmoEnv/rllib_train.py
@@ -0,0 +1,119 @@
+import gym
+import ray
+from ray.tune import register_env, run_experiments
+from pathlib import Path
+import malmoenv
+
+ENV_NAME = "malmo"
+MISSION_XML = "missions/rllib_multiagent.xml"
+COMMAND_PORT = 8999
+NUM_MINECRAFT_INSTANCES = 4
+
+xml = Path(MISSION_XML).read_text()
+
+class TrackingEnv(gym.Wrapper):
+    def __init__(self, env):
+        super().__init__(env)
+        self._actions = [
+            self._forward,
+            self._back,
+            self._turn_right,
+            self._turn_left,
+            self._idle
+        ]
+
+    def _reset_state(self):
+        self._facing = (1, 0)
+        self._position = (0, 0)
+        self._visited = {}
+        self._update_visited()
+
+    def _forward(self):
+        self._position = (
+            self._position[0] + self._facing[0],
+            self._position[1] + self._facing[1]
+        )
+
+    def _back(self):
+        self._position = (
+            self._position[0] - self._facing[0],
+            self._position[1] - self._facing[1]
+        )
+
+    def _turn_left(self):
+        self._facing = (self._facing[1], -self._facing[0])
+
+    def _turn_right(self):
+        self._facing = (-self._facing[1], self._facing[0])
+
+    def _idle(self):
+        pass
+
+    def _encode_state(self):
+        return self._position
+
+    def _update_visited(self):
+        state = self._encode_state()
+        value = self._visited.get(state, 0)
+        self._visited[state] = value + 1
+        return value
+
+    def reset(self):
+        self._reset_state()
+        return super().reset()
+
+    def step(self, action):
+        o, r, d, i = super().step(action)
+        self._actions[action]()
+        revisit_count = self._update_visited()
+        if revisit_count == 0:
+            r += 0.02
+        if action == 4:
+            r += -0.5
+
+        return o, r, d, i
+
+
+class MalmoSyncEnv(gym.Wrapper):
+    def __init__(self, env, idle_action=4):
+        super().__init__(env)
+        self._idle_action = idle_action
+
+    def reset(self):
+        return super().reset()
+
+    def step(self, action):
+        o, r, d, i = super().step(action)
+        if d:
+            return o, r, d, i
+        return super().step(self._idle_action)
+
+
+def create_env(config):
+    env = malmoenv.make()
+    env.init(xml, COMMAND_PORT + config.worker_index, reshape=True)
+    env = MalmoSyncEnv(env)
+    env = TrackingEnv(env)
+    return env
+
+register_env(ENV_NAME, create_env)
+
+run_experiments({
+    "malmo": {
+        "run": "IMPALA",
+        "env": ENV_NAME,
+        "config": {
+            "model": {
+                "dim": 42
+            },
+            "num_workers": NUM_MINECRAFT_INSTANCES,
+            "rollout_fragment_length": 50,
+            "train_batch_size": 1024,
+            "replay_buffer_num_slots": 4000,
+            "replay_proportion": 10,
+            "learner_queue_timeout": 900,
+            "num_sgd_iter": 2,
+            "num_data_loader_buffers": 2,
+        }
+    }
+})
\ No newline at end of file

From a7719a1cb24b90c2ce4266801636eb83ce7d8d93 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 8 Sep 2020 14:24:12 +0100
Subject: [PATCH 03/12] Fixed info to be returned as a dict

---
 MalmoEnv/malmoenv/core.py |  2 ++
 MalmoEnv/rllib_train.py   | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index 89836d975..ae32ffc86 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -349,6 +349,8 @@ def step(self, action):
                 obs = obs.reshape((self.height, self.width, self.depth)).astype(np.uint8)
         self.last_obs = obs
 
+        # RLlib requires info be returned as a dict rather than a string
+        info = { "raw_info": info }
         return obs, reward, self.done, info
 
     def close(self):
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index d2628dd7b..a4d4abed3 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -1,3 +1,21 @@
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) 2020 Microsoft Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+# associated documentation files (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge, publish, distribute,
+# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or
+# substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# ------------------------------------------------------------------------------------------------
 import gym
 import ray
 from ray.tune import register_env, run_experiments

From bb8643d36a6a5cdf723d8b06c70f2a0f26291f66 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 8 Sep 2020 14:29:40 +0100
Subject: [PATCH 04/12] Added RLlib compatible multi-agent env

---
 MalmoEnv/malmoenv/multiagentenv.py | 287 +++++++++++++++++++++++++++++
 1 file changed, 287 insertions(+)
 create mode 100644 MalmoEnv/malmoenv/multiagentenv.py

diff --git a/MalmoEnv/malmoenv/multiagentenv.py b/MalmoEnv/malmoenv/multiagentenv.py
new file mode 100644
index 000000000..510a6a7b5
--- /dev/null
+++ b/MalmoEnv/malmoenv/multiagentenv.py
@@ -0,0 +1,287 @@
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) 2020 Microsoft Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+# associated documentation files (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge, publish, distribute,
+# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or
+# substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# ------------------------------------------------------------------------------------------------
+import time
+from threading import Thread
+from lxml import etree
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
+import malmoenv
+from malmoenv.core import EnvException
+
+def _validate_config(xml, agent_configs):
+    """
+    Verify that the supplied agent config is compatible with the mission XML.
+    """
+    assert len(agent_configs) >= 2
+    xml = etree.fromstring(xml)
+    xml_agent_count = len(xml.findall("{http://ProjectMalmo.microsoft.com}AgentSection"))
+    assert len(agent_configs) == xml_agent_count
+
+
+def _parse_address(address):
+    """
+    Take addresses of various forms and convert them to a tuple of the form (HOST, PORT).
+    """
+
+    if isinstance(address, int):
+        # Only a port number provided
+        return ("127.0.0.1", address)
+
+    if isinstance(address, str):
+        parts = address.split(":")
+        if len(parts) == 1:
+            # Port number as a string
+            return ("127.0.0.1", int(parts[0]))
+        if len(parts) == 2:
+            # String in the form "HOST:PORT"
+            return (parts[0], int(parts[1]))
+
+    if len(address) == 2 and isinstance(address[0], str) and isinstance(address[1], int):
+        # An already parsed address
+        return address
+
+    raise EnvException(f"{address} is not a valid address")
+
+def _await_results(results):
+    """
+    Receives a dictionary of result tasks and repopulates it with the final results after the tasks
+    complete.
+    """
+    for agent_id, task in results.items():
+        results[agent_id] = task.wait()
+
+def _default_env_factory(agent_id, xml, role, host_address, host_port, command_address, command_port):
+    """
+    Default environment factory that fills out just enough settings to connect multiple game
+    instances into a single game session.
+    agent_id - The agent we're constructing the environment connection for.
+    xml - The mission XML.
+    role - The agent's role number. 0 == host agent.
+    host_address, host_port - Connection details for the game session host.
+    command_address, command_port - Connection details for the game instance the agent is controlling.
+    """
+    env = malmoenv.make()
+    env.init(xml, host_port,
+        server=host_address,
+        server2=command_address,
+        port2=command_port,
+        role=role,
+        exp_uid="default_experiment_id"
+    )
+    return env
+
+def _default_all_done_checker(env, obs, rewards, dones, infos):
+    """
+    Returns True if any agent is reported as done.
+    """
+    for done in dones.values():
+        if done:
+            return True
+    return False
+
+# Wraps a MalmoEnv instance and provides async reset and step operations
+# Reset operations need to be executed async as none of the connected environments will complete
+# their reset operations until all environments have at least issued a reset request.
+class _ConnectionContext:
+    def __init__(self, id, address, env):
+        """
+        Wrapper around a connection to a game instance.
+        id - The agent id that is in control of the game instance.
+        address - (server, port) tuple for the command connection.
+        env - The MalmoEnv instance that is connected to the game instance.
+        """
+        self.id = id
+        self.address = address
+        self.env = env
+
+        # Async task status tracking
+        self._task_thread = None
+        self._task_result = None
+
+    def wait(self):
+        """
+        Wait for the current async task to complete and return the result.
+        """
+        assert self._task_thread is not None
+        self._task_thread.join()
+        self._task_thread = None
+
+        # We want to re-raise the exception if the task raised an error
+        if isinstance(self._task_result, Exception):
+            raise self._task_result
+
+        return self._task_result
+
+    def reset(self):
+        """
+        Issue a reset request and return the async task immediately.
+        """
+        assert self._task_thread is None
+        self._task_thread = Thread(target=self._reset_task, name=f"Agent '{self.id}' reset")
+        self._task_thread.start()
+        return self
+
+    def step(self, action):
+        """
+        Issue a step request and return the async task immediately.
+        """
+        assert self._task_thread is None
+        self._task_thread = Thread(target=self._step_task, args=(action,), name=f"Agent '{self.id}' step")
+        self._task_thread.start()
+        return self
+
+    def close(self):
+        """
+        Shut down the Minecraft instance.
+        """
+        assert self._task_thread is None
+        self.env.close()
+        self.env.exit()
+
+    def _reset_task(self):
+        try:
+            self._task_result = self.env.reset()
+        except Exception as e:
+            self._task_result = e
+
+    def _step_task(self, action):
+        try:
+            self._task_result = self.env.step(action)
+        except Exception as e:
+            self._task_result = e
+
+# Config for a single agent that will be present within the environment
+class AgentConfig:
+    def __init__(self, id, address):
+        """
+        Configuration details for an agent acting within the environment.
+        id - The agent's id as used by RLlib.
+        address - The address for the game instance for the agent to connect to.
+        """
+        self.id = id
+        self.address = _parse_address(address)
+
+# RLlib compatible multi-agent environment.
+# This wraps multiple instances of MalmoEnv environments that are connected to their own Minecraft
+# instances.
+# The first agent defined in the agent_configs is treated as the primary Minecraft instance that
+# will act as the game server.
+class RllibMultiAgentEnv(MultiAgentEnv):
+    def __init__(self, xml, agent_configs, env_factory=None, all_done_checker=None):
+        """
+        An RLlib compatible multi-agent environment.
+        NOTE: Will not work with turn based actions as all agents act together.
+        xml - The mission XML
+        agent_configs - A list of AgentConfigs to describe the agents within the environment.
+        env_factory - Function to allow custom construction of the MalmoEnv instances.
+                      This can be used to override the default init parameters for the environment.
+        all_done_checker - Function to check if the "__all__" key should be set in the step done
+                           dictionary. The default check returns True if any agent reports that
+                           they're done.
+        """
+        _validate_config(xml, agent_configs)
+
+        self._all_done_checker = all_done_checker or _default_all_done_checker
+        env_factory = env_factory or _default_env_factory
+
+        # The first agent is treated as the game session host
+        host_address = agent_configs[0].address
+        self._connections = {}
+        self._reset_request_time = 0
+
+        role = 0
+        for agent_config in agent_configs:
+            env = env_factory(
+                agent_id=agent_config.id,
+                xml=xml,
+                role=role,
+                host_address=host_address[0],
+                host_port=host_address[1],
+                command_address=agent_config.address[0],
+                command_port=agent_config.address[1]
+            )
+            context = _ConnectionContext(
+                agent_config.id,
+                agent_config.address,
+                env
+            )
+            self._connections[agent_config.id] = context
+            role += 1
+
+
+    def get_observation_space(self, agent_id):
+        return self._connections[agent_id].env.observation_space
+
+    def get_action_space(self, agent_id):
+        return self._connections[agent_id].env.action_space
+
+    def reset(self):
+        obs = {}
+        request_time = time.perf_counter()
+        for agent_id, connection in self._connections.items():
+            obs[agent_id] = connection.reset()
+
+        # All reset operations must be issued asynchronously as none of the Minecraft instances
+        # will complete their reset requests until all agents have issued a reset request
+        _await_results(obs)
+        self._reset_request_time = time.perf_counter() - request_time
+
+        return obs
+
+    def step(self, actions):
+        results = {}
+        request_time = time.perf_counter()
+        for agent_id, action in actions.items():
+            results[agent_id] = self._connections[agent_id].step(action)
+
+        _await_results(results)
+        request_time = time.perf_counter() - request_time
+
+        # We need to repack the individual step results into dictionaries per data type to conform
+        # with RLlib's requirements
+        obs = {
+            agent_id: result[0]
+            for agent_id, result in results.items()
+        }
+        rewards = {
+            agent_id: result[1]
+            for agent_id, result in results.items()
+        }
+        dones = {
+            agent_id: result[2]
+            for agent_id, result in results.items()
+        }
+        infos = {
+            agent_id: result[3]
+            for agent_id, result in results.items()
+        }
+
+        # Pass the results to the done checker to set the required __all__ value
+        dones["__all__"] = self._all_done_checker(self, obs, rewards, dones, infos)
+        infos["step_request_time"] = request_time
+        infos["reset_request_time"] = self._reset_request_time
+
+        return obs, rewards, dones, infos
+
+    def close(self):
+        for connection in self._connections.values():
+            try:
+                connection.close()
+            except Exception as e:
+                message = getattr(e, "message", e)
+                print(f"Error closing environment: {message}")

From 8fdb57720b47e4a45ccf32ea4d9ce2db4b156f49 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 8 Sep 2020 17:03:22 +0100
Subject: [PATCH 05/12] Moved the SyncEnv into core.py

---
 MalmoEnv/malmoenv/__init__.py          |  4 ++--
 MalmoEnv/malmoenv/core.py              | 18 ++++++++++++++++++
 MalmoEnv/missions/rllib_multiagent.xml |  2 +-
 MalmoEnv/rllib_train.py                | 18 +-----------------
 4 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/MalmoEnv/malmoenv/__init__.py b/MalmoEnv/malmoenv/__init__.py
index d24708985..4ce582ad2 100644
--- a/MalmoEnv/malmoenv/__init__.py
+++ b/MalmoEnv/malmoenv/__init__.py
@@ -17,6 +17,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 # ------------------------------------------------------------------------------------------------
 
-from malmoenv.core import ActionSpace, StringActionSpace, VisualObservationSpace, Env, make
+from malmoenv.core import ActionSpace, StringActionSpace, VisualObservationSpace, Env, SyncEnv, make
 
-__all__ = ['ActionSpace', 'StringActionSpace', 'VisualObservationSpace', 'Env', 'make']
+__all__ = ['ActionSpace', 'StringActionSpace', 'VisualObservationSpace', 'Env', 'SyncEnv', 'make']
diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index ae32ffc86..d209e1060 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -495,5 +495,23 @@ def _get_token(self):
         return self.exp_uid + ":" + str(self.role) + ":" + str(self.resets)
 
 
+class SyncEnv(gym.Wrapper):
+    def __init__(self, env, idle_action=4, idle_delay=0):
+        super().__init__(env)
+        self._idle_action = idle_action
+        self._idle_delay = idle_delay
+
+    def reset(self):
+        return super().reset()
+
+    def step(self, action):
+        o, r, d, i = super().step(action)
+        if d:
+            return o, r, d, i
+
+        time.sleep(self._idle_delay)
+
+        return super().step(self._idle_action)
+
 def make():
     return Env()
diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
index e6d1f757a..0c2b5e910 100644
--- a/MalmoEnv/missions/rllib_multiagent.xml
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -79,4 +79,4 @@
     </AgentHandlers>
   </AgentSection>
 
-</Mission>
\ No newline at end of file
+</Mission>
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index a4d4abed3..608702f92 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -91,26 +91,10 @@ def step(self, action):
 
         return o, r, d, i
 
-
-class MalmoSyncEnv(gym.Wrapper):
-    def __init__(self, env, idle_action=4):
-        super().__init__(env)
-        self._idle_action = idle_action
-
-    def reset(self):
-        return super().reset()
-
-    def step(self, action):
-        o, r, d, i = super().step(action)
-        if d:
-            return o, r, d, i
-        return super().step(self._idle_action)
-
-
 def create_env(config):
     env = malmoenv.make()
     env.init(xml, COMMAND_PORT + config.worker_index, reshape=True)
-    env = MalmoSyncEnv(env)
+    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.01)
     env = TrackingEnv(env)
     return env
 

From c09a3865f01730276d7e58e5e01ee9f5a95e9d1d Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Wed, 9 Sep 2020 16:41:25 +0100
Subject: [PATCH 06/12] Added multi-agent training

---
 MalmoEnv/malmoenv/multiagentenv.py     | 12 +++--
 MalmoEnv/missions/rllib_multiagent.xml | 44 +++++++++++++++--
 MalmoEnv/rllib_train.py                | 65 ++++++++++++++++++++++----
 3 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/MalmoEnv/malmoenv/multiagentenv.py b/MalmoEnv/malmoenv/multiagentenv.py
index 510a6a7b5..2e10629d2 100644
--- a/MalmoEnv/malmoenv/multiagentenv.py
+++ b/MalmoEnv/malmoenv/multiagentenv.py
@@ -151,7 +151,7 @@ def close(self):
         """
         assert self._task_thread is None
         self.env.close()
-        self.env.exit()
+        #self.env.exit()
 
     def _reset_task(self):
         try:
@@ -201,6 +201,7 @@ def __init__(self, xml, agent_configs, env_factory=None, all_done_checker=None):
 
         # The first agent is treated as the game session host
         host_address = agent_configs[0].address
+        self._id = host_address
         self._connections = {}
         self._reset_request_time = 0
 
@@ -231,6 +232,7 @@ def get_action_space(self, agent_id):
         return self._connections[agent_id].env.action_space
 
     def reset(self):
+        print(f"Resetting {self._id}...")
         obs = {}
         request_time = time.perf_counter()
         for agent_id, connection in self._connections.items():
@@ -240,10 +242,12 @@ def reset(self):
         # will complete their reset requests until all agents have issued a reset request
         _await_results(obs)
         self._reset_request_time = time.perf_counter() - request_time
+        print(f"Reset {self._id} complete")
 
         return obs
 
     def step(self, actions):
+        print(f"Stepping {self._id} - Actions: {actions}...")
         results = {}
         request_time = time.perf_counter()
         for agent_id, action in actions.items():
@@ -273,8 +277,10 @@ def step(self, actions):
 
         # Pass the results to the done checker to set the required __all__ value
         dones["__all__"] = self._all_done_checker(self, obs, rewards, dones, infos)
-        infos["step_request_time"] = request_time
-        infos["reset_request_time"] = self._reset_request_time
+#        infos["step_request_time"] = request_time
+#        infos["reset_request_time"] = self._reset_request_time
+
+        print(f"Step of {self._id} complete - {dones}")
 
         return obs, rewards, dones, infos
 
diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
index 0c2b5e910..797a30e8c 100644
--- a/MalmoEnv/missions/rllib_multiagent.xml
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -1,4 +1,3 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
 
   <About>
@@ -6,7 +5,7 @@
   </About>
 
   <ModSettings>
-        <MsPerTick>1</MsPerTick>
+        <MsPerTick>50</MsPerTick>
   </ModSettings>
 
   <ServerSection>
@@ -41,7 +40,7 @@
   </ServerSection>
 
   <AgentSection mode="Survival">
-    <Name>Agent1</Name>
+    <Name>Agent0</Name>
 
     <AgentStart>
       <Placement pitch="60" x="-0.5" y="4" yaw="270" z="0.5"/>
@@ -79,4 +78,43 @@
     </AgentHandlers>
   </AgentSection>
 
+  <AgentSection mode="Survival">
+    <Name>Agent1</Name>
+
+    <AgentStart>
+      <Placement pitch="60" x="-0.5" y="4" yaw="90" z="4.5"/>
+    </AgentStart>
+
+    <AgentHandlers>
+
+      <VideoProducer want_depth="false">
+        <Width>84</Width>
+        <Height>84</Height>
+      </VideoProducer>
+
+      <DiscreteMovementCommands>
+          <ModifierList type="deny-list">
+            <command>attack</command>
+          </ModifierList>
+      </DiscreteMovementCommands>
+
+      <RewardForMissionEnd>
+        <Reward description="out_of_time" reward="-1" />
+      </RewardForMissionEnd>
+
+      <RewardForTouchingBlockType>
+        <Block reward="-1.0" type="lava" behaviour="onceOnly"/>
+        <Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
+      </RewardForTouchingBlockType>
+
+      <RewardForSendingCommand reward="-0.01"/>
+
+      <AgentQuitFromTouchingBlockType>
+        <Block type="lava" />
+        <Block type="lapis_block" />
+      </AgentQuitFromTouchingBlockType>
+      <AgentQuitFromReachingCommandQuota total="100"/>
+    </AgentHandlers>
+  </AgentSection>
+
 </Mission>
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index 608702f92..e031f57d4 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -21,11 +21,13 @@
 from ray.tune import register_env, run_experiments
 from pathlib import Path
 import malmoenv
+from malmoenv.multiagentenv import RllibMultiAgentEnv, AgentConfig
 
-ENV_NAME = "malmo"
+SINGLE_AGENT_ENV = "malmo_single_agent"
+MULTI_AGENT_ENV = "malmo_multi_agent"
 MISSION_XML = "missions/rllib_multiagent.xml"
 COMMAND_PORT = 8999
-NUM_MINECRAFT_INSTANCES = 4
+NUM_ENVIRONMENT_INSTANCES = 2
 
 xml = Path(MISSION_XML).read_text()
 
@@ -91,31 +93,76 @@ def step(self, action):
 
         return o, r, d, i
 
-def create_env(config):
+
+def env_factory(agent_id, xml, role, host_address, host_port, command_address, command_port):
     env = malmoenv.make()
-    env.init(xml, COMMAND_PORT + config.worker_index, reshape=True)
-    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.01)
+    env.init(xml, host_port,
+        server=host_address,
+        server2=command_address,
+        port2=command_port,
+        role=role,
+        exp_uid="multiagent",
+        reshape=True
+    )
+    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.02)
     env = TrackingEnv(env)
     return env
 
-register_env(ENV_NAME, create_env)
+def all_done_checker(env, obs, rewards, dones, infos):
+    """
+    Returns True when all agents are reported as done.
+    """
+    for done in dones.values():
+        if not done:
+            return False
+    return True
+
+def create_single_agent_env(config):
+    port = COMMAND_PORT + config.worker_index
+    return env_factory("agent0", xml, 0, "127.0.0.1", port, "127.0.0.1", port)
+
+def create_multi_agent_env(config):
+    port = COMMAND_PORT + (config.worker_index * 2)
+    agent_config = [
+        AgentConfig(id=f"agent1", address=port-1),
+        AgentConfig(id=f"agent2", address=port),
+    ]
+    env = RllibMultiAgentEnv(xml, agent_config,
+        env_factory=env_factory,
+#        all_done_checker=all_done_checker
+    )
+    return env
+
+
+register_env(SINGLE_AGENT_ENV, create_single_agent_env)
+register_env(MULTI_AGENT_ENV, create_multi_agent_env)
 
 run_experiments({
     "malmo": {
         "run": "IMPALA",
-        "env": ENV_NAME,
+        "env": MULTI_AGENT_ENV,
         "config": {
             "model": {
                 "dim": 42
             },
-            "num_workers": NUM_MINECRAFT_INSTANCES,
+            "num_workers": NUM_ENVIRONMENT_INSTANCES,
             "rollout_fragment_length": 50,
             "train_batch_size": 1024,
             "replay_buffer_num_slots": 4000,
             "replay_proportion": 10,
             "learner_queue_timeout": 900,
             "num_sgd_iter": 2,
-            "num_data_loader_buffers": 2,
+            "num_data_loader_buffers": 1,
+
+            "multiagent": {
+                "policies": { "shared_policy": (
+                    None,
+                    gym.spaces.Box(0, 255, shape=(84, 84, 3)),
+                    gym.spaces.Discrete(5),
+                    {}
+                )},
+                "policy_mapping_fn": (lambda agent_id: "shared_policy")
+            }
         }
     }
 })
\ No newline at end of file

From aab83971858c92a11c093472bbf484d6c1048030 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Thu, 10 Sep 2020 11:37:46 +0100
Subject: [PATCH 07/12] Stable RLlib training

---
 MalmoEnv/missions/rllib_multiagent.xml | 2 +-
 MalmoEnv/rllib_train.py                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
index 797a30e8c..d89c4be8f 100644
--- a/MalmoEnv/missions/rllib_multiagent.xml
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -82,7 +82,7 @@
     <Name>Agent1</Name>
 
     <AgentStart>
-      <Placement pitch="60" x="-0.5" y="4" yaw="90" z="4.5"/>
+      <Placement pitch="60" x="-0.5" y="4" yaw="270" z="4.5"/>
     </AgentStart>
 
     <AgentHandlers>
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index e031f57d4..f9ae6ef5c 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -147,7 +147,7 @@ def create_multi_agent_env(config):
             },
             "num_workers": NUM_ENVIRONMENT_INSTANCES,
             "rollout_fragment_length": 50,
-            "train_batch_size": 1024,
+            "train_batch_size": 512,
             "replay_buffer_num_slots": 4000,
             "replay_proportion": 10,
             "learner_queue_timeout": 900,

From f3d5174a4f31cee6426933045eab08efc14186ea Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Tue, 15 Sep 2020 16:30:49 +0100
Subject: [PATCH 08/12] Switching back to turn based actions

---
 MalmoEnv/malmoenv/core.py                   |  21 +-
 MalmoEnv/malmoenv/multiagentenv.py          |   8 +-
 MalmoEnv/malmoenv/turnbasedmultiagentenv.py | 297 ++++++++++++++++++++
 MalmoEnv/missions/rllib_multiagent.xml      |  36 ++-
 MalmoEnv/rllib_train.py                     |  24 +-
 5 files changed, 363 insertions(+), 23 deletions(-)
 create mode 100644 MalmoEnv/malmoenv/turnbasedmultiagentenv.py

diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index d209e1060..c95cc3bc7 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -311,9 +311,19 @@ def step(self, action):
         withturnkey = self.step_options < 2
         withinfo = self.step_options == 0 or self.step_options == 2
 
+        warning_displayed = False
+        start_time = time.time()
+
         while not self.done and \
                 ((obs is None or len(obs) == 0) or
                  (withinfo and info is None) or turn):
+
+            if (time.time() - start_time > 10) and not warning_displayed:
+                warning_displayed = True
+                print(f"WARNING! Long step time for {self.server2}:{self.port2}")
+                o = len(obs) if obs is not None else "None"
+                print(f"Done={self.done}, len(obs)={o}, turn={turn}")
+
             step_message = "<Step" + str(self.step_options) + ">" + \
                            self.action_space[action] + \
                            "</Step" + str(self.step_options) + " >"
@@ -502,16 +512,17 @@ def __init__(self, env, idle_action=4, idle_delay=0):
         self._idle_delay = idle_delay
 
     def reset(self):
+        time.sleep(self._idle_delay)
         return super().reset()
 
     def step(self, action):
-        o, r, d, i = super().step(action)
-        if d:
-            return o, r, d, i
+#        time.sleep(self._idle_delay)
+#        o, r, d, i = super().step(action)
+#        if d:
+#            return o, r, d, i
 
         time.sleep(self._idle_delay)
-
-        return super().step(self._idle_action)
+        return super().step(action)
 
 def make():
     return Env()
diff --git a/MalmoEnv/malmoenv/multiagentenv.py b/MalmoEnv/malmoenv/multiagentenv.py
index 2e10629d2..d73fa5a37 100644
--- a/MalmoEnv/malmoenv/multiagentenv.py
+++ b/MalmoEnv/malmoenv/multiagentenv.py
@@ -161,8 +161,11 @@ def _reset_task(self):
 
     def _step_task(self, action):
         try:
+            print(f"Stepping agent {self.id}, {self.address}...")
             self._task_result = self.env.step(action)
+            print(f"Step agent {self.id}, {self.address} complete, done={self._task_result[2]}")
         except Exception as e:
+            print(f"Exception with agent {self.id}, {self.address}")
             self._task_result = e
 
 # Config for a single agent that will be present within the environment
@@ -204,6 +207,7 @@ def __init__(self, xml, agent_configs, env_factory=None, all_done_checker=None):
         self._id = host_address
         self._connections = {}
         self._reset_request_time = 0
+        self._step = 0
 
         role = 0
         for agent_config in agent_configs:
@@ -233,6 +237,7 @@ def get_action_space(self, agent_id):
 
     def reset(self):
         print(f"Resetting {self._id}...")
+        self._step = 0
         obs = {}
         request_time = time.perf_counter()
         for agent_id, connection in self._connections.items():
@@ -247,7 +252,8 @@ def reset(self):
         return obs
 
     def step(self, actions):
-        print(f"Stepping {self._id} - Actions: {actions}...")
+        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
+        self._step += 1
         results = {}
         request_time = time.perf_counter()
         for agent_id, action in actions.items():
diff --git a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
new file mode 100644
index 000000000..b3c0dbd41
--- /dev/null
+++ b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
@@ -0,0 +1,297 @@
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) 2020 Microsoft Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+# associated documentation files (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge, publish, distribute,
+# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or
+# substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# ------------------------------------------------------------------------------------------------
+import time
+from threading import Thread
+from lxml import etree
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
+import malmoenv
+from malmoenv.core import EnvException
+
+def _validate_config(xml, agent_configs):
+    """
+    Verify that the supplied agent config is compatible with the mission XML.
+    """
+    assert len(agent_configs) >= 2
+    xml = etree.fromstring(xml)
+    xml_agent_count = len(xml.findall("{http://ProjectMalmo.microsoft.com}AgentSection"))
+    assert len(agent_configs) == xml_agent_count
+
+
+def _parse_address(address):
+    """
+    Take addresses of various forms and convert them to a tuple of the form (HOST, PORT).
+    """
+
+    if isinstance(address, int):
+        # Only a port number provided
+        return ("127.0.0.1", address)
+
+    if isinstance(address, str):
+        parts = address.split(":")
+        if len(parts) == 1:
+            # Port number as a string
+            return ("127.0.0.1", int(parts[0]))
+        if len(parts) == 2:
+            # String in the form "HOST:PORT"
+            return (parts[0], int(parts[1]))
+
+    if len(address) == 2 and isinstance(address[0], str) and isinstance(address[1], int):
+        # An already parsed address
+        return address
+
+    raise EnvException(f"{address} is not a valid address")
+
+def _await_results(results):
+    """
+    Receives a dictionary of result tasks and repopulates it with the final results after the tasks
+    complete.
+    """
+    for agent_id, task in results.items():
+        results[agent_id] = task.wait()
+
+def _default_env_factory(agent_id, xml, role, host_address, host_port, command_address, command_port):
+    """
+    Default environment factory that fills out just enough settings to connect multiple game
+    instances into a single game session.
+    agent_id - The agent we're constructing the environment connection for.
+    xml - The mission XML.
+    role - The agent's role number. 0 == host agent.
+    host_address, host_port - Connection details for the game session host.
+    command_address, command_port - Connection details for the game instance the agent is controlling.
+    """
+    env = malmoenv.make()
+    env.init(xml, host_port,
+        server=host_address,
+        server2=command_address,
+        port2=command_port,
+        role=role,
+        exp_uid="default_experiment_id"
+    )
+    return env
+
+def _default_all_done_checker(env, obs, rewards, dones, infos):
+    """
+    Returns True if any agent is reported as done.
+    """
+    for done in dones.values():
+        if done:
+            return True
+    return False
+
+# Wraps a MalmoEnv instance and provides an async reset and a synchronous step operation
+# Reset operations need to be executed async as none of the connected environments will complete
+# their reset operations until all environments have at least issued a reset request.
+class _ConnectionContext:
+    def __init__(self, id, address, env):
+        """
+        Wrapper around a connection to a game instance.
+        id - The agent id that is in control of the game instance.
+        address - (server, port) tuple for the command connection.
+        env - The MalmoEnv instance that is connected to the game instance.
+        """
+        self.id = id
+        self.address = address
+        self.env = env
+        self.last_observation = None
+
+        # Async task status tracking
+        self._task_thread = None
+        self._task_result = None
+
+    def wait(self):
+        """
+        Wait for the current async task to complete and return the result.
+        """
+        assert self._task_thread is not None
+        self._task_thread.join()
+        self._task_thread = None
+
+        # We want to re-raise the exception if the task raised an error
+        if isinstance(self._task_result, Exception):
+            raise self._task_result
+
+        return self._task_result
+
+    def reset(self):
+        """
+        Issue a reset request and return the async task immediately.
+        """
+        assert self._task_thread is None
+        self._task_thread = Thread(target=self._reset_task, name=f"Agent '{self.id}' reset")
+        self._task_thread.start()
+        return self
+
+    def _reset_task(self):
+        try:
+            self._task_result = self.last_observation = self.env.reset()
+        except Exception as e:
+            self._task_result = e
+
+    def step(self, action):
+        """
+        Step the environment synchronously and return the (obs, reward, done, info) result.
+        """
+        self.last_observation, r, d, i = self.env.step(action)
+        return self.last_observation, r, d, i
+
+    def close(self):
+        """
+        Shut down the Minecraft instance.
+        """
+        self.env.close()
+
+# Config for a single agent that will be present within the environment
+class AgentConfig:
+    def __init__(self, id, address):
+        """
+        Configuration details for an agent acting within the environment.
+        id - The agent's id as used by RLlib.
+        address - The address for the game instance for the agent to connect to.
+        """
+        self.id = id
+        self.address = _parse_address(address)
+
+# RLlib compatible multi-agent environment.
+# This wraps multiple instances of MalmoEnv environments that are connected to their own Minecraft
+# instances.
+# The first agent defined in the agent_configs is treated as the primary Minecraft instance that
+# will act as the game server.
+class TurnBasedRllibMultiAgentEnv(MultiAgentEnv):
+    def __init__(self, xml, agent_configs, env_factory=None, all_done_checker=None):
+        """
+        An RLlib compatible multi-agent environment.
+        NOTE: Agents are stepped one at a time, in the order their actions are supplied.
+        xml - The mission XML
+        agent_configs - A list of AgentConfigs to describe the agents within the environment.
+        env_factory - Function to allow custom construction of the MalmoEnv instances.
+                      This can be used to override the default init parameters for the environment.
+        all_done_checker - Function to check if the "__all__" key should be set in the step done
+                           dictionary. The default check returns True if any agent reports that
+                           they're done.
+        """
+        _validate_config(xml, agent_configs)
+
+        self._all_done_checker = all_done_checker or _default_all_done_checker
+        env_factory = env_factory or _default_env_factory
+
+        # The first agent is treated as the game session host
+        host_address = agent_configs[0].address
+        self._id = host_address
+        self._connections = {}
+        self._reset_request_time = 0
+        self._step = 0
+
+        role = 0
+        for agent_config in agent_configs:
+            env = env_factory(
+                agent_id=agent_config.id,
+                xml=xml,
+                role=role,
+                host_address=host_address[0],
+                host_port=host_address[1],
+                command_address=agent_config.address[0],
+                command_port=agent_config.address[1]
+            )
+            context = _ConnectionContext(
+                agent_config.id,
+                agent_config.address,
+                env
+            )
+            self._connections[agent_config.id] = context
+            role += 1
+
+
+    def get_observation_space(self, agent_id):
+        return self._connections[agent_id].env.observation_space
+
+    def get_action_space(self, agent_id):
+        return self._connections[agent_id].env.action_space
+
+    def reset(self):
+        print(f"Resetting {self._id}...")
+        self._step = 0
+        obs = {}
+        request_time = time.perf_counter()
+        for agent_id, connection in self._connections.items():
+            obs[agent_id] = connection.reset()
+
+        # All reset operations must be issued asynchronously as none of the Minecraft instances
+        # will complete their reset requests until all agents have issued a reset request
+        _await_results(obs)
+        self._reset_request_time = time.perf_counter() - request_time
+        print(f"Reset {self._id} complete")
+
+        return obs
+
+    def step(self, actions):
+        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
+        self._step += 1
+        results = {}
+        request_time = time.perf_counter()
+        done = False
+
+        for agent_id, action in actions.items():
+            if not done:
+                time.sleep(0.5)
+                o, r, done, i = self._connections[agent_id].step(action)
+            else:
+                o = self._connections[agent_id].last_observation
+                r = 0.0
+                i = {}
+
+            assert self._connections[agent_id].env.observation_space.contains(o), f"Shape={o.shape}"
+            results[agent_id] = (o, r, done, i)
+
+        request_time = time.perf_counter() - request_time
+
+        # We need to repack the individual step results into dictionaries per data type to conform
+        # with RLlib's requirements
+        obs = {
+            agent_id: result[0]
+            for agent_id, result in results.items()
+        }
+        rewards = {
+            agent_id: result[1]
+            for agent_id, result in results.items()
+        }
+        dones = {
+            agent_id: result[2]
+            for agent_id, result in results.items()
+        }
+        infos = {
+            agent_id: result[3]
+            for agent_id, result in results.items()
+        }
+
+        # Pass the results to the done checker to set the required __all__ value
+        dones["__all__"] = self._all_done_checker(self, obs, rewards, dones, infos)
+#        infos["step_request_time"] = request_time
+#        infos["reset_request_time"] = self._reset_request_time
+
+        print(f"Step of {self._id} complete - {dones}")
+
+        return obs, rewards, dones, infos
+
+    def close(self):
+        for connection in self._connections.values():
+            try:
+                connection.close()
+            except Exception as e:
+                message = getattr(e, "message", e)
+                print(f"Error closing environment: {message}")
diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
index d89c4be8f..0d17a5faf 100644
--- a/MalmoEnv/missions/rllib_multiagent.xml
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -21,7 +21,7 @@
       <FlatWorldGenerator generatorString="3;minecraft:bedrock,2*minecraft:dirt,minecraft:grass;1;village"/>
 
       <DrawingDecorator>
-        <DrawCuboid x1="-2" y1="3" z1="-3" x2="8" y2="3" z2="7" type="lava" />
+        <DrawCuboid x1="-62" y1="3" z1="-63" x2="68" y2="3" z2="67" type="redstone_block" />
 
         <DrawCuboid x1="-1" y1="3" z1="1" x2="-1" y2="3" z2="3" type="cobblestone" />
         <DrawCuboid x1="0" y1="3" z1="2" x2="6" y2="3" z2="2" type="cobblestone" />
@@ -40,7 +40,7 @@
   </ServerSection>
 
   <AgentSection mode="Survival">
-    <Name>Agent0</Name>
+    <Name>Agent1</Name>
 
     <AgentStart>
       <Placement pitch="60" x="-0.5" y="4" yaw="270" z="0.5"/>
@@ -53,11 +53,13 @@
         <Height>84</Height>
       </VideoProducer>
 
-      <DiscreteMovementCommands>
-          <ModifierList type="deny-list">
-            <command>attack</command>
-          </ModifierList>
-      </DiscreteMovementCommands>
+      <TurnBasedCommands requestedPosition="1">
+        <DiscreteMovementCommands>
+            <ModifierList type="deny-list">
+              <command>attack</command>
+            </ModifierList>
+        </DiscreteMovementCommands>
+      </TurnBasedCommands>
 
       <RewardForMissionEnd>
         <Reward description="out_of_time" reward="-1" />
@@ -65,6 +67,7 @@
 
       <RewardForTouchingBlockType>
         <Block reward="-1.0" type="lava" behaviour="onceOnly"/>
+        <Block reward="-1.0" type="redstone_block" behaviour="onceOnly"/>
         <Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
       </RewardForTouchingBlockType>
 
@@ -74,12 +77,12 @@
         <Block type="lava" />
         <Block type="lapis_block" />
       </AgentQuitFromTouchingBlockType>
-      <AgentQuitFromReachingCommandQuota total="100"/>
+      <AgentQuitFromReachingCommandQuota total="300"/>
     </AgentHandlers>
   </AgentSection>
 
   <AgentSection mode="Survival">
-    <Name>Agent1</Name>
+    <Name>Agent2</Name>
 
     <AgentStart>
       <Placement pitch="60" x="-0.5" y="4" yaw="270" z="4.5"/>
@@ -92,11 +95,13 @@
         <Height>84</Height>
       </VideoProducer>
 
-      <DiscreteMovementCommands>
-          <ModifierList type="deny-list">
-            <command>attack</command>
-          </ModifierList>
-      </DiscreteMovementCommands>
+      <TurnBasedCommands requestedPosition="2">
+        <DiscreteMovementCommands>
+            <ModifierList type="deny-list">
+              <command>attack</command>
+            </ModifierList>
+        </DiscreteMovementCommands>
+      </TurnBasedCommands>
 
       <RewardForMissionEnd>
         <Reward description="out_of_time" reward="-1" />
@@ -104,6 +109,7 @@
 
       <RewardForTouchingBlockType>
         <Block reward="-1.0" type="lava" behaviour="onceOnly"/>
+        <Block reward="-1.0" type="redstone_block" behaviour="onceOnly"/>
         <Block reward="1.0" type="lapis_block" behaviour="onceOnly"/>
       </RewardForTouchingBlockType>
 
@@ -113,7 +119,7 @@
         <Block type="lava" />
         <Block type="lapis_block" />
       </AgentQuitFromTouchingBlockType>
-      <AgentQuitFromReachingCommandQuota total="100"/>
+      <AgentQuitFromReachingCommandQuota total="300"/>
     </AgentHandlers>
   </AgentSection>
 
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index f9ae6ef5c..ad2c4899a 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -18,16 +18,19 @@
 # ------------------------------------------------------------------------------------------------
 import gym
 import ray
+from ray.rllib.env.atari_wrappers import FrameStack
 from ray.tune import register_env, run_experiments
 from pathlib import Path
 import malmoenv
 from malmoenv.multiagentenv import RllibMultiAgentEnv, AgentConfig
+from malmoenv.turnbasedmultiagentenv import TurnBasedRllibMultiAgentEnv
 
 SINGLE_AGENT_ENV = "malmo_single_agent"
 MULTI_AGENT_ENV = "malmo_multi_agent"
 MISSION_XML = "missions/rllib_multiagent.xml"
 COMMAND_PORT = 8999
 NUM_ENVIRONMENT_INSTANCES = 2
+FRAME_STACK = 2
 
 xml = Path(MISSION_XML).read_text()
 
@@ -41,12 +44,14 @@ def __init__(self, env):
             self._turn_left,
             self._idle
         ]
+        self._step_count = 0
 
     def _reset_state(self):
         self._facing = (1, 0)
         self._position = (0, 0)
         self._visited = {}
         self._update_visited()
+        self._step_count = 0
 
     def _forward(self):
         self._position = (
@@ -91,6 +96,12 @@ def step(self, action):
         if action == 4:
             r += -0.5
 
+        self._step_count += 1
+        if self._step_count == 50:
+            d = True
+        elif r < -0.9:
+            d = True
+
         return o, r, d, i
 
 
@@ -104,6 +115,7 @@ def env_factory(agent_id, xml, role, host_address, host_port, command_address, c
         exp_uid="multiagent",
         reshape=True
     )
+    env = FrameStack(env, FRAME_STACK)
     env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.02)
     env = TrackingEnv(env)
     return env
@@ -127,7 +139,7 @@ def create_multi_agent_env(config):
         AgentConfig(id=f"agent1", address=port-1),
         AgentConfig(id=f"agent2", address=port),
     ]
-    env = RllibMultiAgentEnv(xml, agent_config,
+    env = TurnBasedRllibMultiAgentEnv(xml, agent_config,
         env_factory=env_factory,
 #        all_done_checker=all_done_checker
     )
@@ -146,6 +158,7 @@ def create_multi_agent_env(config):
                 "dim": 42
             },
             "num_workers": NUM_ENVIRONMENT_INSTANCES,
+            "num_gpus": 0,
             "rollout_fragment_length": 50,
             "train_batch_size": 512,
             "replay_buffer_num_slots": 4000,
@@ -154,10 +167,17 @@ def create_multi_agent_env(config):
             "num_sgd_iter": 2,
             "num_data_loader_buffers": 1,
 
+            "exploration_config": {
+                "type": "EpsilonGreedy",
+                "initial_epsilon": 1.0,
+                "final_epsilon": 0.02,
+                "epsilon_timesteps": 10000
+            },
+
             "multiagent": {
                 "policies": { "shared_policy": (
                     None,
-                    gym.spaces.Box(0, 255, shape=(84, 84, 3)),
+                    gym.spaces.Box(0, 255, shape=(84, 84, 3 * FRAME_STACK)),
                     gym.spaces.Discrete(5),
                     {}
                 )},

From 8871515fbb6dcb37351c611025b423311a811363 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Wed, 16 Sep 2020 14:01:11 +0100
Subject: [PATCH 09/12] Synchronised observations for turn based experiments

---
 MalmoEnv/malmoenv/turnbasedmultiagentenv.py | 31 +++++++++++++++++++--
 MalmoEnv/missions/rllib_multiagent.xml      |  2 +-
 MalmoEnv/rllib_train.py                     | 23 +++++----------
 3 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
index b3c0dbd41..d24337a62 100644
--- a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
+++ b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
@@ -23,6 +23,8 @@
 import malmoenv
 from malmoenv.core import EnvException
 
+STEP_DELAY_TIME = 0.15
+
 def _validate_config(xml, agent_configs):
     """
     Verify that the supplied agent config is compatible with the mission XML.
@@ -240,7 +242,7 @@ def reset(self):
         return obs
 
     def step(self, actions):
-        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
+#        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
         self._step += 1
         results = {}
         request_time = time.perf_counter()
@@ -248,7 +250,7 @@ def step(self, actions):
 
         for agent_id, action in actions.items():
             if not done:
-                time.sleep(0.5)
+                time.sleep(STEP_DELAY_TIME)
                 o, r, done, i = self._connections[agent_id].step(action)
             else:
                 o = self._connections[agent_id].last_observation
@@ -284,7 +286,7 @@ def step(self, actions):
 #        infos["step_request_time"] = request_time
 #        infos["reset_request_time"] = self._reset_request_time
 
-        print(f"Step of {self._id} complete - {dones}")
+#        print(f"Step of {self._id} complete - {dones}")
 
         return obs, rewards, dones, infos
 
@@ -295,3 +297,26 @@ def close(self):
             except Exception as e:
                 message = getattr(e, "message", e)
                 print(f"Error closing environment: {message}")
+
+
+class SyncRllibMultiAgentEnv(MultiAgentEnv):
+    def __init__(self, env, idle_action):
+        self.env = env
+        self.idle_action = idle_action
+
+    def reset(self):
+        return self.env.reset()
+
+    def step(self, actions):
+        o, r, d, i = self.env.step(actions)
+        for done in d.values():
+            if done:
+                return o, r, d, i
+
+        return self.env.step({
+            key: self.idle_action
+            for key in actions
+        })
+
+    def close(self):
+        return self.env.close()
diff --git a/MalmoEnv/missions/rllib_multiagent.xml b/MalmoEnv/missions/rllib_multiagent.xml
index 0d17a5faf..5f8f68865 100644
--- a/MalmoEnv/missions/rllib_multiagent.xml
+++ b/MalmoEnv/missions/rllib_multiagent.xml
@@ -5,7 +5,7 @@
   </About>
 
   <ModSettings>
-        <MsPerTick>50</MsPerTick>
+        <MsPerTick>1</MsPerTick>
   </ModSettings>
 
   <ServerSection>
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index ad2c4899a..7d43dae7e 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -23,14 +23,14 @@
 from pathlib import Path
 import malmoenv
 from malmoenv.multiagentenv import RllibMultiAgentEnv, AgentConfig
-from malmoenv.turnbasedmultiagentenv import TurnBasedRllibMultiAgentEnv
+from malmoenv.turnbasedmultiagentenv import TurnBasedRllibMultiAgentEnv, SyncRllibMultiAgentEnv
 
 SINGLE_AGENT_ENV = "malmo_single_agent"
 MULTI_AGENT_ENV = "malmo_multi_agent"
 MISSION_XML = "missions/rllib_multiagent.xml"
 COMMAND_PORT = 8999
 NUM_ENVIRONMENT_INSTANCES = 2
-FRAME_STACK = 2
+FRAME_STACK = 1
 
 xml = Path(MISSION_XML).read_text()
 
@@ -115,20 +115,11 @@ def env_factory(agent_id, xml, role, host_address, host_port, command_address, c
         exp_uid="multiagent",
         reshape=True
     )
-    env = FrameStack(env, FRAME_STACK)
-    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.02)
+    if FRAME_STACK > 1:
+        env = FrameStack(env, FRAME_STACK)
     env = TrackingEnv(env)
     return env
 
-def all_done_checker(env, obs, rewards, dones, infos):
-    """
-    Returns True when all agents are reported as done.
-    """
-    for done in dones.values():
-        if not done:
-            return False
-    return True
-
 def create_single_agent_env(config):
     port = COMMAND_PORT + config.worker_index
     return env_factory("agent0", xml, 0, "127.0.0.1", port, "127.0.0.1", port)
@@ -141,8 +132,8 @@ def create_multi_agent_env(config):
     ]
     env = TurnBasedRllibMultiAgentEnv(xml, agent_config,
         env_factory=env_factory,
-#        all_done_checker=all_done_checker
     )
+    env = SyncRllibMultiAgentEnv(env, idle_action=4)
     return env
 
 
@@ -160,7 +151,7 @@ def create_multi_agent_env(config):
             "num_workers": NUM_ENVIRONMENT_INSTANCES,
             "num_gpus": 0,
             "rollout_fragment_length": 50,
-            "train_batch_size": 512,
+            "train_batch_size": 1024,
             "replay_buffer_num_slots": 4000,
             "replay_proportion": 10,
             "learner_queue_timeout": 900,
@@ -171,7 +162,7 @@ def create_multi_agent_env(config):
                 "type": "EpsilonGreedy",
                 "initial_epsilon": 1.0,
                 "final_epsilon": 0.02,
-                "epsilon_timesteps": 10000
+                "epsilon_timesteps": 7000
             },
 
             "multiagent": {

From 590af3d4e3e4afb672e7a4fd6c522f1b91234060 Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Wed, 16 Sep 2020 14:22:49 +0100
Subject: [PATCH 10/12] Removed debug info

---
 MalmoEnv/malmoenv/core.py                   |  28 --
 MalmoEnv/malmoenv/multiagentenv.py          | 299 --------------------
 MalmoEnv/malmoenv/turnbasedmultiagentenv.py |  21 +-
 MalmoEnv/rllib_train.py                     |   4 +-
 4 files changed, 16 insertions(+), 336 deletions(-)
 delete mode 100644 MalmoEnv/malmoenv/multiagentenv.py

diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index c95cc3bc7..041e1836e 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -311,19 +311,10 @@ def step(self, action):
         withturnkey = self.step_options < 2
         withinfo = self.step_options == 0 or self.step_options == 2
 
-        warning_displayed = False
-        start_time = time.time()
-
         while not self.done and \
                 ((obs is None or len(obs) == 0) or
                  (withinfo and info is None) or turn):
 
-            if (time.time() - start_time > 10) and not warning_displayed:
-                warning_displayed = True
-                print(f"WARNING! Long step time for {self.server2}:{self.port2}")
-                o = len(obs) if obs is not None else "None"
-                print(f"Done={self.done}, len(obs)={o}, turn={turn}")
-
             step_message = "<Step" + str(self.step_options) + ">" + \
                            self.action_space[action] + \
                            "</Step" + str(self.step_options) + " >"
@@ -505,24 +496,5 @@ def _get_token(self):
         return self.exp_uid + ":" + str(self.role) + ":" + str(self.resets)
 
 
-class SyncEnv(gym.Wrapper):
-    def __init__(self, env, idle_action=4, idle_delay=0):
-        super().__init__(env)
-        self._idle_action = idle_action
-        self._idle_delay = idle_delay
-
-    def reset(self):
-        time.sleep(self._idle_delay)
-        return super().reset()
-
-    def step(self, action):
-#        time.sleep(self._idle_delay)
-#        o, r, d, i = super().step(action)
-#        if d:
-#            return o, r, d, i
-
-        time.sleep(self._idle_delay)
-        return super().step(action)
-
 def make():
     return Env()
diff --git a/MalmoEnv/malmoenv/multiagentenv.py b/MalmoEnv/malmoenv/multiagentenv.py
deleted file mode 100644
index d73fa5a37..000000000
--- a/MalmoEnv/malmoenv/multiagentenv.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# ------------------------------------------------------------------------------------------------
-# Copyright (c) 2020 Microsoft Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-# associated documentation files (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge, publish, distribute,
-# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all copies or
-# substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
-# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-# ------------------------------------------------------------------------------------------------
-import time
-from threading import Thread
-from lxml import etree
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-import malmoenv
-from malmoenv.core import EnvException
-
-def _validate_config(xml, agent_configs):
-    """
-    Verify that the supplied agent config is compatible with the mission XML.
-    """
-    assert len(agent_configs) >= 2
-    xml = etree.fromstring(xml)
-    xml_agent_count = len(xml.findall("{http://ProjectMalmo.microsoft.com}AgentSection"))
-    assert len(agent_configs) == xml_agent_count
-
-
-def _parse_address(address):
-    """
-    Take addresses of various forms and convert them to a tuple of the form (HOST, PORT).
-    """
-
-    if isinstance(address, int):
-        # Only a port number provided
-        return ("127.0.0.1", address)
-
-    if isinstance(address, str):
-        parts = address.split(":")
-        if len(parts) == 1:
-            # Port number as a string
-            return ("127.0.0.1", int(parts[0]))
-        if len(parts) == 2:
-            # String in the form "HOST:PORT"
-            return (parts[0], int(parts[1]))
-
-    if len(address) == 2 and isinstance(address[0], str) and isinstance(address[1], int):
-        # An already parsed address
-        return address
-
-    raise EnvException(f"{address} is not a valid address")
-
-def _await_results(results):
-    """
-    Receives a dictionary of result tasks and repopulates it with the final results after the tasks
-    complete.
-    """
-    for agent_id, task in results.items():
-        results[agent_id] = task.wait()
-
-def _default_env_factory(agent_id, xml, role, host_address, host_port, command_address, command_port):
-    """
-    Default environment factory that fills out just enough settings to connect multiple game
-    instances into a single game session.
-    agent_id - The agent we're constructing the environment connection for.
-    xml - The mission XML.
-    role - The agent's role number. 0 == host agent.
-    host_address, host_port - Connection details for the game session host.
-    command_address, command_port - Connection details for the game instance the agent is controlling.
-    """
-    env = malmoenv.make()
-    env.init(xml, host_port,
-        server=host_address,
-        server2=command_address,
-        port2=command_port,
-        role=role,
-        exp_uid="default_experiment_id"
-    )
-    return env
-
-def _default_all_done_checker(env, obs, rewards, dones, infos):
-    """
-    Returns True if any agent is reported as done.
-    """
-    for done in dones.values():
-        if done:
-            return True
-    return False
-
-# Wraps a MalmoEnv instance and provides async reset and step operations
-# Reset operations need to be executed async as none of the connected environments will complete
-# their reset operations until all environments have at least issued a reset request.
-class _ConnectionContext:
-    def __init__(self, id, address, env):
-        """
-        Wrapper around a connection to a game instance.
-        id - The agent id that is in control of the game instance.
-        address - (server, port) tuple for the command connection.
-        env - The MalmoEnv instance that is connected to the game instance.
-        """
-        self.id = id
-        self.address = address
-        self.env = env
-
-        # Async task status tracking
-        self._task_thread = None
-        self._task_result = None
-
-    def wait(self):
-        """
-        Wait for the current async task to complete and return the result.
-        """
-        assert self._task_thread is not None
-        self._task_thread.join()
-        self._task_thread = None
-
-        # We want to re-trow the exception if the task raised an error
-        if isinstance(self._task_result, Exception):
-            raise self._task_result
-
-        return self._task_result
-
-    def reset(self):
-        """
-        Issue a reset request and return the async task immediately.
-        """
-        assert self._task_thread is None
-        self._task_thread = Thread(target=self._reset_task, name=f"Agent '{self.id}' reset")
-        self._task_thread.start()
-        return self
-
-    def step(self, action):
-        """
-        Issue a step request and return the async task immediately.
-        """
-        assert self._task_thread is None
-        self._task_thread = Thread(target=self._step_task, args=(action,), name=f"Agent '{self.id}' step")
-        self._task_thread.start()
-        return self
-
-    def close(self):
-        """
-        Shut down the Minecraft instance.
-        """
-        assert self._task_thread is None
-        self.env.close()
-        #self.env.exit()
-
-    def _reset_task(self):
-        try:
-            self._task_result = self.env.reset()
-        except Exception as e:
-            self._task_result = e
-
-    def _step_task(self, action):
-        try:
-            print(f"Stepping agent {self.id}, {self.address}...")
-            self._task_result = self.env.step(action)
-            print(f"Step agent {self.id}, {self.address} complete, done={self._task_result[2]}")
-        except Exception as e:
-            print(f"Exception with agent {self.id}, {self.address}")
-            self._task_result = e
-
-# Config for a single agent that will be present within the environment
-class AgentConfig:
-    def __init__(self, id, address):
-        """
-        Configuration details for an agent acting within the environment.
-        id - The agent's id as used by RLlib.
-        address - The address for the game instance for the agent to connect to.
-        """
-        self.id = id
-        self.address = _parse_address(address)
-
-# RLlib compatible multi-agent environment.
-# This wraps multiple instances of MalmoEnv environments that are connected to their own Minecraft
-# instances.
-# The first agent defined in the agent_configs is treated as the primary Minecraft instance that
-# will act as the game server.
-class RllibMultiAgentEnv(MultiAgentEnv):
-    def __init__(self, xml, agent_configs, env_factory=None, all_done_checker=None):
-        """
-        An RLlib compatible multi-agent environment.
-        NOTE: Will not work with turn based actions as all agent act together.
-        xml - The mission XML
-        agent_configs - A list of AgentConfigs to decribe the agents within the environment.
-        env_factory - Function to allow custom construction of the MalmoEnv instances.
-                      This can be used to override the default inti parameter for the environment.
-        all_done_checker - Function to check if the "__all__" key should be set in the step done
-                           dictionary. The default check returns True if any agent reports that
-                           they're done.
-        """
-        _validate_config(xml, agent_configs)
-
-        self._all_done_checker = all_done_checker or _default_all_done_checker
-        env_factory = env_factory or _default_env_factory
-
-        # The first agent is treated as the game session host
-        host_address = agent_configs[0].address
-        self._id = host_address
-        self._connections = {}
-        self._reset_request_time = 0
-        self._step = 0
-
-        role = 0
-        for agent_config in agent_configs:
-            env = env_factory(
-                agent_id=agent_config.id,
-                xml=xml,
-                role=role,
-                host_address=host_address[0],
-                host_port=host_address[1],
-                command_address=agent_config.address[0],
-                command_port=agent_config.address[1]
-            )
-            context = _ConnectionContext(
-                agent_config.id,
-                agent_config.address,
-                env
-            )
-            self._connections[agent_config.id] = context
-            role += 1
-
-
-    def get_observation_space(self, agent_id):
-        return self._connections[agent_id].env.observation_space
-
-    def get_action_space(self, agent_id):
-        return self._connections[agent_id].env.action_space
-
-    def reset(self):
-        print(f"Resetting {self._id}...")
-        self._step = 0
-        obs = {}
-        request_time = time.perf_counter()
-        for agent_id, connection in self._connections.items():
-            obs[agent_id] = connection.reset()
-
-        # All reset operations must be issued asynchronously as none of the Minecraft instances
-        # will complete their reset requests until all agents have issued a reset request
-        _await_results(obs)
-        self._reset_request_time = time.perf_counter() - request_time
-        print(f"Reset {self._id} complete")
-
-        return obs
-
-    def step(self, actions):
-        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
-        self._step += 1
-        results = {}
-        request_time = time.perf_counter()
-        for agent_id, action in actions.items():
-            results[agent_id] = self._connections[agent_id].step(action)
-
-        _await_results(results)
-        request_time = time.perf_counter() - request_time
-
-        # We need to repack the individual step results into dictionaries per data type to conform
-        # with RLlib's requirements
-        obs = {
-            agent_id: result[0]
-            for agent_id, result in results.items()
-        }
-        rewards = {
-            agent_id: result[1]
-            for agent_id, result in results.items()
-        }
-        dones = {
-            agent_id: result[2]
-            for agent_id, result in results.items()
-        }
-        infos = {
-            agent_id: result[3]
-            for agent_id, result in results.items()
-        }
-
-        # Pass the results to the done checker to set the required __all__ value
-        dones["__all__"] = self._all_done_checker(self, obs, rewards, dones, infos)
-#        infos["step_request_time"] = request_time
-#        infos["reset_request_time"] = self._reset_request_time
-
-        print(f"Step of {self._id} complete - {dones}")
-
-        return obs, rewards, dones, infos
-
-    def close(self):
-        for connection in self._connections.values():
-            try:
-                connection.close()
-            except Exception as e:
-                message = getattr(e, "message", e)
-                print(f"Error closing environment: {message}")
diff --git a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
index d24337a62..defbfeb43 100644
--- a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
+++ b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
@@ -34,7 +34,6 @@ def _validate_config(xml, agent_configs):
     xml_agent_count = len(xml.findall("{http://ProjectMalmo.microsoft.com}AgentSection"))
     assert len(agent_configs) == xml_agent_count
 
-
 def _parse_address(address):
     """
     Take addresses of various forms and convert them to a tuple of the form (HOST, PORT).
@@ -96,7 +95,7 @@ def _default_all_done_checker(env, obs, rewards, dones, infos):
             return True
     return False
 
-# Wraps a MalmoEnv instance and provides async reset and step operations
+# Wraps a MalmoEnv instance and provides async reset and sync step operations
 # Reset operations need to be executed async as none of the connected environments will complete
 # their reset operations until all environments have at least issued a reset request.
 class _ConnectionContext:
@@ -242,7 +241,6 @@ def reset(self):
         return obs
 
     def step(self, actions):
-#        print(f"Step {self._step} for agent {self._id} - Actions: {actions}...")
         self._step += 1
         results = {}
         request_time = time.perf_counter()
@@ -250,9 +248,14 @@ def step(self, actions):
 
         for agent_id, action in actions.items():
             if not done:
+                # We need to wait a small amount of time between each agent's step request to give
+                # the Minecraft instances time to sync up and agree on whose turn it is to act
                 time.sleep(STEP_DELAY_TIME)
                 o, r, done, i = self._connections[agent_id].step(action)
             else:
+                # If any of the agents report themselves as "done", then we should stop taking turns
+                # so generate a dummy step result based on the last observation so that training
+                # receives valid looking data
                 o = self._connections[agent_id].last_observation
                 r = 0.0
                 i = {}
@@ -283,10 +286,6 @@ def step(self, actions):
 
         # Pass the results to the done checker to set the required __all__ value
         dones["__all__"] = self._all_done_checker(self, obs, rewards, dones, infos)
-#        infos["step_request_time"] = request_time
-#        infos["reset_request_time"] = self._reset_request_time
-
-#        print(f"Step of {self._id} complete - {dones}")
 
         return obs, rewards, dones, infos
 
@@ -299,6 +298,9 @@ def close(self):
                 print(f"Error closing environment: {message}")
 
 
+# Malmo returns stale observations in response to actions, so this wrapper syncs observations
+# with actions by issuing an idle action after each policy-generated action to query the
+# resulting state of the environment
 class SyncRllibMultiAgentEnv(MultiAgentEnv):
     def __init__(self, env, idle_action):
         self.env = env
@@ -308,11 +310,16 @@ def reset(self):
         return self.env.reset()
 
     def step(self, actions):
+        # The first step request to the environment returns stale data, so we want to ignore it
+        # unless Malmo reports one of the instances as "done"
         o, r, d, i = self.env.step(actions)
         for done in d.values():
             if done:
                 return o, r, d, i
 
+        # The second step request is really just a query for the environment state. When used with
+        # the turn based environment, there is a delay injected before the requests which allows
+        # the environment to settle into the new state
         return self.env.step({
             key: self.idle_action
             for key in actions
diff --git a/MalmoEnv/rllib_train.py b/MalmoEnv/rllib_train.py
index 7d43dae7e..a8f20e342 100644
--- a/MalmoEnv/rllib_train.py
+++ b/MalmoEnv/rllib_train.py
@@ -22,8 +22,7 @@
 from ray.tune import register_env, run_experiments
 from pathlib import Path
 import malmoenv
-from malmoenv.multiagentenv import RllibMultiAgentEnv, AgentConfig
-from malmoenv.turnbasedmultiagentenv import TurnBasedRllibMultiAgentEnv, SyncRllibMultiAgentEnv
+from malmoenv.turnbasedmultiagentenv import AgentConfig, TurnBasedRllibMultiAgentEnv, SyncRllibMultiAgentEnv
 
 SINGLE_AGENT_ENV = "malmo_single_agent"
 MULTI_AGENT_ENV = "malmo_multi_agent"
@@ -34,6 +33,7 @@
 
 xml = Path(MISSION_XML).read_text()
 
+# An environment wrapper to shape rewards and determine episode terminality independently of Malmo
 class TrackingEnv(gym.Wrapper):
     def __init__(self, env):
         super().__init__(env)

From d8ccfdbc04436bf652647b32fc48a853dea7b0ed Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Wed, 16 Sep 2020 14:24:43 +0100
Subject: [PATCH 11/12] Removed some redundant changes

---
 MalmoEnv/malmoenv/__init__.py | 4 ++--
 MalmoEnv/malmoenv/core.py     | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/MalmoEnv/malmoenv/__init__.py b/MalmoEnv/malmoenv/__init__.py
index 4ce582ad2..d24708985 100644
--- a/MalmoEnv/malmoenv/__init__.py
+++ b/MalmoEnv/malmoenv/__init__.py
@@ -17,6 +17,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 # ------------------------------------------------------------------------------------------------
 
-from malmoenv.core import ActionSpace, StringActionSpace, VisualObservationSpace, Env, SyncEnv, make
+from malmoenv.core import ActionSpace, StringActionSpace, VisualObservationSpace, Env, make
 
-__all__ = ['ActionSpace', 'StringActionSpace', 'VisualObservationSpace', 'Env', 'SyncEnv', 'make']
+__all__ = ['ActionSpace', 'StringActionSpace', 'VisualObservationSpace', 'Env', 'make']
diff --git a/MalmoEnv/malmoenv/core.py b/MalmoEnv/malmoenv/core.py
index 041e1836e..ae32ffc86 100644
--- a/MalmoEnv/malmoenv/core.py
+++ b/MalmoEnv/malmoenv/core.py
@@ -314,7 +314,6 @@ def step(self, action):
         while not self.done and \
                 ((obs is None or len(obs) == 0) or
                  (withinfo and info is None) or turn):
-
             step_message = "<Step" + str(self.step_options) + ">" + \
                            self.action_space[action] + \
                            "</Step" + str(self.step_options) + " >"

From 25b7cdc12da03e0a65513e26b4b4f576cc87f69c Mon Sep 17 00:00:00 2001
From: Adrian O'Grady <adriano@microsoft.com>
Date: Fri, 18 Sep 2020 11:50:23 +0100
Subject: [PATCH 12/12] Added automatic generation of version.properties

---
 MalmoEnv/malmoenv/turnbasedmultiagentenv.py | 2 --
 Minecraft/build.gradle                      | 6 +++++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
index defbfeb43..208076084 100644
--- a/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
+++ b/MalmoEnv/malmoenv/turnbasedmultiagentenv.py
@@ -225,7 +225,6 @@ def get_action_space(self, agent_id):
         return self._connections[agent_id].env.action_space
 
     def reset(self):
-        print(f"Resetting {self._id}...")
         self._step = 0
         obs = {}
         request_time = time.perf_counter()
@@ -236,7 +235,6 @@ def reset(self):
         # will complete their reset requests until all agents have issued a reset request
         _await_results(obs)
         self._reset_request_time = time.perf_counter() - request_time
-        print(f"Reset {self._id} complete")
 
         return obs
 
diff --git a/Minecraft/build.gradle b/Minecraft/build.gradle
index 131f046da..d1006835f 100755
--- a/Minecraft/build.gradle
+++ b/Minecraft/build.gradle
@@ -15,7 +15,11 @@ apply plugin: 'net.minecraftforge.gradle.forge'
 
 // Read the version number from the Mod's version properties file.
 if (!file('src/main/resources/version.properties').exists()) {
-    ant.fail("version.properties file is missing - this is created automatically by CMake. If you are building from source, make sure you have built the full source tree, not just the Minecraft folder.")
+    // If version.properties doesn't exist, create it from VERSION
+    File vers_file = new File("../VERSION")
+    version = vers_file.getText()
+    File out_file = new File("src/main/resources/version.properties")
+    out_file.write "malmomod.version=" + version
 }
 def propFile = file('src/main/resources/version.properties')
 def versionProp = new Properties()