class Actions(Enum):
    """Quarter-turn face moves available on the cube.

    Every member's value is a dict with four keys:

    - ``'name'``: display name of the move (a trailing ``'`` marks the
      inverse turn),
    - ``'f'``: letter of the face being turned,
    - ``'d'``: number of quarter turns, ``1`` or ``-1`` for the inverse move,
    - ``'opposite'``: member name of the move that undoes this one.
    """
    U = dict(name='U', f='U', d=1, opposite='U_1')
    U_1 = dict(name="U'", f='U', d=-1, opposite='U')
    D = dict(name='D', f='D', d=1, opposite='D_1')
    D_1 = dict(name="D'", f='D', d=-1, opposite='D')
    F = dict(name='F', f='F', d=1, opposite='F_1')
    F_1 = dict(name="F'", f='F', d=-1, opposite='F')
    B = dict(name='B', f='B', d=1, opposite='B_1')
    B_1 = dict(name="B'", f='B', d=-1, opposite='B')
    R = dict(name='R', f='R', d=1, opposite='R_1')
    R_1 = dict(name="R'", f='R', d=-1, opposite='R')
    L = dict(name='L', f='L', d=1, opposite='L_1')
    L_1 = dict(name="L'", f='L', d=-1, opposite='L')
Layer `0` is + the face itself, and higher `l` values are for layers deeper + into the cube. Use `d=3` or `d=-1` for counter-clockwise + moves, and `d=2` for a 180-degree move.. + """ + i = self.facedict[f] + l2 = self.n - 1 - l + assert l < self.n + ds = range((d + 4) % 4) + if f == 'U': + f2 = 'D' + i2 = self.facedict[f2] + for _ in ds: + self._rotate([(self.facedict['F'], range(self.n), l2), + (self.facedict['R'], range(self.n), l2), + (self.facedict['B'], range(self.n), l2), + (self.facedict['L'], range(self.n), l2)]) + if f == 'D': + return self.move('U', l2, -d) + if f == 'F': + f2 = 'B' + i2 = self.facedict[f2] + for _ in ds: + self._rotate([(self.facedict['U'], range(self.n), l), + (self.facedict['L'], l2, range(self.n)), + (self.facedict['D'], range(self.n)[::-1], l2), + (self.facedict['R'], l, range(self.n)[::-1])]) + if f == 'B': + return self.move('F', l2, -d) + if f == 'R': + f2 = 'L' + i2 = self.facedict[f2] + for _ in ds: + self._rotate([(self.facedict['U'], l2, range(self.n)), + (self.facedict['F'], l2, range(self.n)), + (self.facedict['D'], l2, range(self.n)), + (self.facedict['B'], l, range(self.n)[::-1])]) + if f == 'L': + return self.move('R', l2, -d) + for _ in ds: + if l == 0: + self.stickers[i] = np.rot90(self.stickers[i], 3) + if l == self.n - 1: + self.stickers[i2] = np.rot90(self.stickers[i2], 1) + # print('moved', f, l, len(ds)) + return None + + def _rotate(self, args): + """ + Internal function for the `move()` function. + """ + a0 = args[0] + foo = self.stickers[a0] + a = a0 + for b in args[1:]: + self.stickers[a] = self.stickers[b] + a = b + self.stickers[a] = foo + + def randomize(self, number): + """ + Make `number` randomly chosen moves to scramble the cube. + """ + for _ in range(number): + f = self.dictface[np.random.randint(6)] + l = np.random.randint(self.n) + d = 1 + np.random.randint(3) + self.move(f, l, d) + + def _render_points(self, points, viewpoint): + """ + Internal function for the `render()` function. 
Clunky + projection from 3-d to 2-d, but also return a zorder variable. + """ + v2 = np.dot(viewpoint, viewpoint) + zdir = viewpoint / np.sqrt(v2) + xdir = np.cross(np.array([0., 1., 0.]), zdir) + xdir /= np.sqrt(np.dot(xdir, xdir)) + ydir = np.cross(zdir, xdir) + result = [] + for p in points: + dpoint = p - viewpoint + dproj = 0.5 * dpoint * v2 / np.dot(dpoint, -1. * viewpoint) + result += [np.array([np.dot(xdir, dproj), + np.dot(ydir, dproj), + np.dot(zdir, dpoint / np.sqrt(v2))])] + return result + + def render_views(self, ax): + """ + Make three projected 3-dimensional views of the cube for the + `render()` function. Because of zorder / occulting issues, + this code is very brittle; it will not work for all viewpoints + (the `np.dot(zdir, viewpoint)` test is not general; the corect + test involves the 'handedness' of the projected polygon). + """ + csz = 2. / self.n + x2 = 8. + x1 = 0.5 * x2 + for viewpoint, shift in [ + (np.array([-x1, -x1, x2]), np.array([-1.5, 3.])), + (np.array([x1, x1, x2]), np.array([0.5, 3.])), + (np.array([x2, x1, -x1]), np.array([2.5, 3.]))]: + for f, i in self.facedict.items(): + zdir = self.normals[i] + if np.dot(zdir, viewpoint) < 0: + continue + xdir = self.xdirs[i] + ydir = np.cross(zdir, xdir) # insanity: left-handed! + psc = 1. - 2. 
* self.stickerthickness + corners = [psc * zdir - psc * xdir - psc * ydir, + psc * zdir + psc * xdir - psc * ydir, + psc * zdir + psc * xdir + psc * ydir, + psc * zdir - psc * xdir + psc * ydir] + projects = self._render_points(corners, viewpoint) + xys = [p[0:2] + shift for p in projects] + zorder = np.mean([p[2] for p in projects]) + ax.add_artist(Polygon(xys, ec='none', fc=self.plasticcolor)) + for j in range(self.n): + for k in range(self.n): + corners = self._stickerpolygon(xdir, ydir, zdir, csz, j, + k) + projects = self._render_points(corners, viewpoint) + xys = [p[0:2] + shift for p in projects] + ax.add_artist(Polygon(xys, ec='none', + fc=self.stickercolors[ + self.stickers[i, j, k]])) + x0, y0, zorder = \ + self._render_points([1.5 * self.normals[i], ], viewpoint)[0] + ax.text(x0 + shift[0], y0 + shift[1], f, color=self.labelcolor, + ha='center', va='center', rotation=20, + fontsize=self.fontsize / (-zorder)) + + def _stickerpolygon(self, xdir, ydir, zdir, csz, j, k): + small = 0.5 * (1. - self.stickerwidth) + large = 1. - small + return [zdir - xdir + (j + small) * csz * xdir - ydir + ( + k + small + small) * csz * ydir, + zdir - xdir + (j + small + small) * csz * xdir - ydir + ( + k + small) * csz * ydir, + zdir - xdir + (j + large - small) * csz * xdir - ydir + ( + k + small) * csz * ydir, + zdir - xdir + (j + large) * csz * xdir - ydir + ( + k + small + small) * csz * ydir, + zdir - xdir + (j + large) * csz * xdir - ydir + ( + k + large - small) * csz * ydir, + zdir - xdir + (j + large - small) * csz * xdir - ydir + ( + k + large) * csz * ydir, + zdir - xdir + (j + small + small) * csz * xdir - ydir + ( + k + large) * csz * ydir, + zdir - xdir + (j + small) * csz * xdir - ydir + ( + k + large - small) * csz * ydir] + + def render_flat(self, ax): + """ + Make an unwrapped, flat view of the cube for the `render()` + function. This is a map, not a view really. It does not + properly render the plastic and stickers. 
def score(self):
    """
    Calculate a similarity score between the cube and a solved cube.

    For every face, count the stickers whose colour equals the face's
    reference sticker (the one at row/column ``n // 2``, i.e. the centre
    for odd `n`), then multiply the six counts together. A solved cube
    therefore scores ``(n * n) ** 6``.

    The whole `n` x `n` face is inspected, so the method works for any
    cube size rather than only for `n == 3`.

    Returns:
        int: product of the per-face matching-sticker counts.
    """
    centre = self.n // 2
    total = 1
    for side in self.stickers:
        reference = side[centre][centre]
        total *= int(np.count_nonzero(side == reference))
    return total
class CubeletSet:
    """
    Helper structure for Rubik's observations type converter.
    See CubeConverter() for more info.

    A set groups cubelets of one kind (all having `dim` stickers each).

    colours_list:
        One sorted colour tuple per cubelet; the index in the list is the
        cubelet id.
    assign_table:
        Integer array indexed like the sticker array; each entry holds the
        id of the place that sticker position belongs to (other values
        mark positions outside this set).
    is_even:
        Optional per-place/per-cubelet flags used by decode() to pick the
        direction in which colours are cycled. Defaults to all False.
    """

    # Width of the one-hot position encoding produced by encode().
    N_POSITIONS = 24

    def __init__(self, colours_list, assign_table, is_even=None):
        self.count = len(colours_list)
        self.colours = colours_list
        self.dim = len(colours_list[0])
        self.assign_table = assign_table
        self.is_even = [False] * self.count if is_even is None else is_even

        self.ids = None
        self.position_table = None

        self.make_ids()
        self.make_position_table()

    def make_ids(self):
        """Builds the colour-tuple -> cubelet-id lookup."""
        self.ids = {colours: i for i, colours in enumerate(self.colours)}

    def make_position_table(self):
        """
        Builds, for every place, the sorted list of sticker indices
        (as tuples) that assign_table maps to that place.
        """
        self.position_table = [
            sorted(tuple(place) for place in
                   np.transpose(np.where(self.assign_table == i)))
            for i in range(self.count)
        ]

    def encode(self, observation):
        """
        Encodes positions of cubelets in the set.
        Returns as one-hot over possible positions

        The result has shape (count, 24); row `c` has a single 1 at column
        `dim * place + rotation`, where `rotation` is the index (within the
        place's sorted sticker positions) of the cubelet's smallest colour.
        """
        # np.float was only an alias of the builtin float and has been
        # removed from NumPy; float64 is the dtype it used to produce.
        res = np.zeros((self.count, self.N_POSITIONS), dtype=np.float64)

        for i in range(self.count):
            position = self.position_table[i]
            colours = [observation[place] for place in position]
            colours_sorted = tuple(sorted(colours))
            cubelet_id = self.ids[colours_sorted]
            res[cubelet_id, self.dim * i + np.argmin(colours)] = 1.

        return res

    def decode(self, observation, result):
        """
        Transforms positions of cubelets to sticker colours.
        Places proper stickers in the result array.

        `observation` is the (count, 24) one-hot array for this set;
        `result` is modified in place and nothing is returned.
        """
        for i in range(self.count):
            idx = np.where(observation[i] == 1)[0][0]
            place = idx // self.dim
            rotation = idx % self.dim

            colours = self.colours[i]
            colours_rotated = [0] * self.dim
            # Colours are cycled backwards when cubelet and place differ
            # in their is_even flag, forwards otherwise.
            step_direction = -1 if self.is_even[i] ^ self.is_even[place] else 1

            for j in range(self.dim):
                colours_rotated[(rotation + j * step_direction) % self.dim] = \
                    colours[j]

            for k, pos in enumerate(self.position_table[place]):
                result[pos] = colours_rotated[k]
class Rubik(base.ModelEnv):
    """
    Rubik's Cube as RL environment

    step_limit:
        Number of actions until episode termination
    shuffles:
        Number of moves taken to initially shuffle the cube
    obs_type: ['sticker', 'cubelet']
        State encoding, see CubeConverter for more info. The plural
        spellings 'stickers' and 'cubelets' are accepted as aliases.
    """
    metadata = {'render.modes': ['human']}

    # Accepted obs_type spellings mapped to the canonical name that the
    # rest of the class compares against.
    _OBS_TYPE_ALIASES = {
        'sticker': 'sticker',
        'stickers': 'sticker',
        'cubelet': 'cubelet',
        'cubelets': 'cubelet',
    }

    def __init__(self, step_limit=100, shuffles=50, obs_type='stickers'):
        if obs_type not in self._OBS_TYPE_ALIASES:
            raise ValueError(
                'Unknown obs_type {!r}; expected one of {}'.format(
                    obs_type, sorted(self._OBS_TYPE_ALIASES)))

        self.cube = Cube(3, whiteplastic=False)
        self.action_space = spaces.Discrete(len(ACTION_LOOKUP))
        self.fig = None
        # Copy: get_state() hands out the cube's own sticker array, which
        # moves modify in place.
        self.solved_state = self.cube.get_state().copy()

        self.observation_space = None
        # Normalise so that the == 'sticker' checks below (and in
        # subclasses) also match the plural default.
        self.obs_type = self._OBS_TYPE_ALIASES[obs_type]
        self.converter = None
        self.create_observation_space()

        self.scramble = []

        self.debug_level = DebugLevel.WARNING
        self.render_views = True
        self.render_flat = True
        self.render_cube = False
        self.scramble_size = shuffles

        self.num_steps = 0
        self.step_limit = step_limit

        self.config()

    def config(self, debug_level=DebugLevel.WARNING, render_cube=False,
               scramble_size=None, render_views=True,
               render_flat=True, step_limit=None):
        """
        Configures the cube with given parameters.

        scramble_size and step_limit are only overwritten when not None.
        When render_cube is set, matplotlib interactive mode is turned on.
        """
        self.debug_level = debug_level
        self.render_cube = render_cube
        if scramble_size is not None:
            self.scramble_size = scramble_size
        if step_limit is not None:
            self.step_limit = step_limit

        self.render_views = render_views
        self.render_flat = render_flat

        if self.render_cube:
            plt.ion()
            plt.show()

    def create_observation_space(self):
        """Sets observation_space (and the converter for cubelets)."""
        if self.obs_type == 'sticker':
            # One-hot colour (6) for each of the 6*3*3 stickers.
            self.observation_space = spaces.Box(low=0, high=1,
                                                shape=(6 * 3 * 3 * 6,),
                                                dtype=np.float32)
        else:  # self.obs_type == 'cubelet'
            # 20 cubelets, one-hot over 24 positions each.
            self.observation_space = spaces.Box(low=0, high=1, shape=(20 * 24,),
                                                dtype=np.float32)
            self.converter = CubeConverter()

    def step(self, action):
        """
        Applies `action` and returns (flat observation, reward, done, info).

        Reward is 0 when the cube is solved after the move, -1 otherwise.
        The episode ends when the cube is solved or the step limit has been
        reached.
        """
        self._take_action(action)
        self.num_steps += 1

        observation = self._get_state()
        solved = np.array_equal(self.cube.get_state(), self.solved_state)
        reward = 0 if solved else -1

        # >= rather than ==, so an episode whose counter has already
        # passed the limit still terminates.
        episode_over = solved or (self.num_steps >= self.step_limit)

        return observation.flatten(), reward, episode_over, {'solved': solved}

    def reset(self):
        """Creates a fresh cube, scrambles it and returns the flat state."""
        self.cube = Cube(3, whiteplastic=False)
        self.scramble = []
        if self.scramble_size > 0:
            if self.debug_level == DebugLevel.INFO:
                print('scramble ' + str(self.scramble_size) + ' moves')
            self.randomize(self.scramble_size)

        self.num_steps = 0
        return self._get_state().flatten()

    def render(self, mode='human'):
        """Draws the cube with matplotlib if render_cube is enabled."""
        if self.render_cube:
            if self.fig:
                plt.clf()
            self.fig = self.cube.render(self.fig, views=self.render_views,
                                        flat=self.render_flat)
            plt.pause(0.001)

    def _take_action(self, action):
        self.cube.move_by_action(ACTION_LOOKUP[action])

    @staticmethod
    def action_name(action):
        """Returns the enum member name of the action with given index."""
        return ACTION_LOOKUP[action].name

    def get_scramble(self):
        """Returns the list of action names used in the last scramble."""
        return self.scramble

    def valid_scramble_action(self, action, previous_actions):
        """
        Determines whether given action can be used during scrambling.
        Action is considered invalid in case it undos the previous one or equals
        previous two (which would give three equal actions in a row, so
        essentially a single one).

        `previous_actions` is a list of action (enum member) names.
        """
        # Two equal previous moves are already enough for a third equal one
        # to make three in a row.
        if len(previous_actions) >= 2 \
                and previous_actions[-1] == previous_actions[-2] \
                and action.name == previous_actions[-1]:
            return False
        # A single previous move is already enough to be undone.
        if previous_actions \
                and self.cube.opposite_actions(previous_actions[-1], action):
            return False
        return True

    def randomize(self, number):
        """Applies `number` random valid scramble moves to the cube."""
        t = 0
        while t < number:
            action = ACTION_LOOKUP[np.random.randint(len(ACTION_LOOKUP))]
            if self.valid_scramble_action(action, self.scramble):
                self.scramble.append(action.name)
                self.cube.move_by_action(action)
                t += 1

    def _get_state(self):
        """Returns the (unflattened) observation in the chosen encoding."""
        raw_state = self.cube.get_state()
        if self.obs_type == 'sticker':
            # One-hot encode each sticker colour along a new last axis.
            state = (np.arange(6) == raw_state[..., np.newaxis]).astype(int)
        else:
            state = self.converter.convert_sticker_to_cubelet(raw_state)
        return state

    def clone_state(self):
        """Returns a tuple snapshot of the environment state."""
        return (
            copy.deepcopy(self.cube),
            self.fig,
            self.debug_level,
            self.render_views,
            self.render_flat,
            self.render_cube,
            self.scramble_size,
            self.num_steps,
            self.step_limit,
        )

    def restore_state(self, state):
        """Restores a snapshot made by clone_state(), returns the state."""
        (
            cube,
            self.fig,
            self.debug_level,
            self.render_views,
            self.render_flat,
            self.render_cube,
            self.scramble_size,
            self.num_steps,
            self.step_limit,
        ) = state
        # Deep copy so the snapshot can be restored more than once.
        self.cube = copy.deepcopy(cube)

        # NOTE(review): unlike step()/reset() this observation is not
        # flattened; kept as is so the return shape does not change.
        return self._get_state()


# Maps discrete action indices to cube moves.
ACTION_LOOKUP = {
    0: Actions.U,
    1: Actions.U_1,
    2: Actions.D,
    3: Actions.D_1,
    4: Actions.F,
    5: Actions.F_1,
    6: Actions.B,
    7: Actions.B_1,
    8: Actions.R,
    9: Actions.R_1,
    10: Actions.L,
    11: Actions.L_1
}


class GoalRubik(Rubik):
    """
    Goal-oriented interface for Rubik environment.

    Observations are dicts with 'observation', 'achieved_goal' and
    'desired_goal' entries; the reward is 0 when the achieved goal equals
    the desired goal and -1 otherwise.
    """

    def __init__(self, step_limit=100, shuffles=50, obs_type='sticker'):
        super(GoalRubik, self).__init__(step_limit, shuffles, obs_type)
        # Flattened like the observations from step()/reset(); comparing
        # arrays of different shapes with np.array_equal is always False,
        # which would make the reward permanently -1.
        self.goal_obs = self._get_state().flatten()

    def create_observation_space(self):
        """Sets a Box twice the size of the parent's observation space."""
        # NOTE(review): step() and reset() return dicts, not flat vectors
        # of this size - confirm how consumers use observation_space.
        if self.obs_type == 'sticker':
            self.observation_space = spaces.Box(low=0, high=1,
                                                shape=(6 * 3 * 3 * 12,),
                                                dtype=np.float32)
        else:
            self.observation_space = spaces.Box(low=0, high=1, shape=(20 * 48,),
                                                dtype=np.float32)
            self.converter = CubeConverter()

    def step(self, action):
        """Steps the parent env and recomputes reward against the goal."""
        obs, reward, done, info = super(GoalRubik, self).step(action)

        obs = self._get_goal_observation(obs)
        reward = self._calculate_reward(obs['observation'],
                                        obs['achieved_goal'],
                                        obs['desired_goal'])

        return obs, reward, done, info

    def reset(self):
        """Resets the parent env and returns a goal-style observation."""
        obs = super(GoalRubik, self).reset()
        return self._get_goal_observation(obs)

    def _get_goal_observation(self, obs):
        return self._convert_observation(obs, obs, self.goal_obs)

    def _convert_observation(self, obs, state, goal):
        return {'observation': obs, 'achieved_goal': state,
                'desired_goal': goal}

    def _calculate_reward(self, _, state, goal):
        return 0 if np.array_equal(state, goal) else -1

    def set_goal(self, goal_obs):
        """
        Sets the desired goal. It is compared with flattened observations
        via np.array_equal, so it should have the same (flat) shape.
        """
        self.goal_obs = goal_obs
Parameters for KerasNetwork: +# ============================================================================== +KerasNetwork.loss = ('mean_squared_error') +KerasNetwork.metrics = ['mse'] +KerasNetwork.model_fn = @alpacka.networks.keras.mlp +KerasNetwork.optimizer = 'adam' +KerasNetwork.weight_decay = 0.0 +KerasNetwork.train_callbacks = None + +# Parameters for mlp: +# ============================================================================== +mlp.activation = 'relu' +mlp.hidden_sizes = (256,) + +# Parameters for LocalBatchStepper: +# ============================================================================== +# None. + +# Parameters for Runner: +# ============================================================================== +Runner.agent_class = @alpacka.agents.DeterministicMCTSAgent +Runner.batch_stepper_class = @alpacka.batch_steppers.LocalBatchStepper +Runner.env_class = @alpacka.envs.Rubik +Runner.episode_time_limit = 100 +Runner.n_envs = 10 +Runner.n_epochs = 1500 +Runner.n_precollect_epochs = 15 +Runner.network_class = @alpacka.networks.KerasNetwork +Runner.trainer_class = @alpacka.trainers.SupervisedTrainer + +# Parameters for ScalarValueAccumulator: +# ============================================================================== +# None. + +# Parameters for ScalarValueTraits: +# ============================================================================== +# None. 
+ +# Parameters for Rubik: +# ============================================================================== +Rubik.step_limit = 12 +Rubik.shuffles = 4 +Rubik.obs_type = 'sticker' + +# Parameters for SupervisedTrainer: +# ============================================================================== +SupervisedTrainer.batch_size = 32 +SupervisedTrainer.n_steps_per_epoch = 13 +SupervisedTrainer.replay_buffer_capacity = 500000 +SupervisedTrainer.replay_buffer_sampling_hierarchy = ['solved'] +SupervisedTrainer.target = @alpacka.trainers.supervised.target_value + +# Parameters for target_value: +# ============================================================================== +# None.