Using Custom Environments – pt.3
https://pythonprogramming.net/custom-environment-reinforcement-learning-stable-baselines-3-tutorial/
(OpenAI의 gym Environment 대신에) 자신의 custom Environment에서 RL을 수행.
어떤 Environment를 gym Environment에 맞게 변환해 주면 된다. (말이 쉽지 구현이 까다롭다….)
RL에서 가장 까다로운 2가지
– Environment의 observation을 가져오는 일 (data engineering)
– Agent에 적절한 rewards부여하기
Game = 그 자체가 가장 좋은 Environment = 뱀게임 (source)
# Snake game in OpenCV, used later as the basis for a custom gym environment.
# https://theailearner.com/2019/03/10/creating-a-snake-game-using-opencv-python/
# https://github.com/TheAILearner/Snake-Game-using-OpenCV-Python/blob/master/snake_game_using_opencv.ipynb
import numpy as np
import cv2  # imshow(), waitKey(), rectangle(), putText()
import random
import time

# Board is 500x500 px; one grid cell is 10x10 px.
BOARD_SIZE = 500
CELL = 10


### Game Rules:
# 1. The snake dies when its head leaves the board.
def collision_with_boundaries(snake_head):
    """Return 1 if the head [x, y] lies outside the board, else 0."""
    if (snake_head[0] >= BOARD_SIZE or snake_head[0] < 0
            or snake_head[1] >= BOARD_SIZE or snake_head[1] < 0):
        return 1
    return 0


# 2. The snake dies when its head runs into its own body.
def collision_with_self(snake_position):
    """Return 1 if the head (first segment) overlaps any body segment, else 0."""
    snake_head = snake_position[0]
    return 1 if snake_head in snake_position[1:] else 0


# 3-1. Score increases and the apple is moved to a new position.
def collision_with_apple(apple_position, score):
    """Return (new apple position, incremented score).

    NOTE(review): the incoming apple_position is ignored and replaced by a
    fresh random cell; the new apple may respawn on top of the snake's body.
    """
    score += 1
    apple_position = [random.randrange(1, 50) * CELL, random.randrange(1, 50) * CELL]
    return apple_position, score


def main():
    """Run the interactive snake game until death or 'q' is pressed."""
    ### Game Window :: Display game objects
    img = np.zeros((BOARD_SIZE, BOARD_SIZE, 3), dtype='uint8')

    ### Apple and Snake - initial positions (snake stored head-first)
    apple_position = [random.randrange(1, 50) * CELL, random.randrange(1, 50) * CELL]
    snake_position = [[250, 250], [240, 250], [230, 250]]
    score = 0
    # Direction codes: 0-Left, 1-Right, 3-Up, 2-Down
    prev_button_direction = 1
    button_direction = 1
    snake_head = [250, 250]

    while True:
        cv2.imshow('a', img)
        cv2.waitKey(1)
        img = np.zeros((BOARD_SIZE, BOARD_SIZE, 3), dtype='uint8')

        # Draw the apple (red) and every snake segment (green).
        cv2.rectangle(img, (apple_position[0], apple_position[1]),
                      (apple_position[0] + CELL, apple_position[1] + CELL),
                      (0, 0, 255), 3)
        for position in snake_position:
            cv2.rectangle(img, (position[0], position[1]),
                          (position[0] + CELL, position[1] + CELL),
                          (0, 255, 0), 3)

        # Fixed time step: poll the keyboard once, then wait out the rest of
        # the 60 ms so the snake advances at a constant speed.
        t_end = time.time() + 0.06
        k = -1
        while time.time() < t_end:
            if k == -1:
                k = cv2.waitKey(6)
            # else: spin until t_end (keeps the step duration constant)

        # Key map: a-Left, d-Right, w-Up, s-Down, q-Quit.
        # A 180-degree turn (straight back into the body) is disallowed.
        if k == ord('a') and prev_button_direction != 1:
            button_direction = 0
        elif k == ord('d') and prev_button_direction != 0:
            button_direction = 1
        elif k == ord('w') and prev_button_direction != 2:
            button_direction = 3
        elif k == ord('s') and prev_button_direction != 3:
            button_direction = 2
        elif k == ord('q'):
            break
        # any other key (or no key) keeps the current direction
        prev_button_direction = button_direction

        # Move the head one cell in the chosen direction.
        if button_direction == 1:
            snake_head[0] += CELL
        elif button_direction == 0:
            snake_head[0] -= CELL
        elif button_direction == 2:
            snake_head[1] += CELL
        elif button_direction == 3:
            snake_head[1] -= CELL

        # 3-2. Grow on eating an apple; otherwise shift the body forward
        # (new head is always prepended; tail is popped only when not eating).
        snake_position.insert(0, list(snake_head))
        if snake_head == apple_position:
            apple_position, score = collision_with_apple(apple_position, score)
        else:
            snake_position.pop()

        ### On collision kill the snake and display the final score.
        if collision_with_boundaries(snake_head) == 1 or collision_with_self(snake_position) == 1:
            font = cv2.FONT_HERSHEY_SIMPLEX
            img = np.zeros((BOARD_SIZE, BOARD_SIZE, 3), dtype='uint8')
            cv2.putText(img, 'Your Score is {}'.format(score), (140, 250),
                        font, 1, (255, 255, 255), 2, cv2.LINE_AA)
            cv2.imshow('a', img)
            cv2.waitKey(0)
            # cv2.imwrite('D:/downloads/ii.jpg', img)
            break

    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
위 게임을 gym environment로 변환하기 위해 필요한 구조
import gymnasium
from gymnasium import spaces


class CustomEnv(gymnasium.Env):
    """Skeleton of a custom Environment that follows the gymnasium interface."""

    def __init__(self, arg1, arg2):
        # Replace arg1/arg2 with the constructor arguments your env needs.
        # (The original template's `(self, arg1, arg2, ...)` is not valid Python.)
        super().__init__()
        ...

    def step(self, action):
        # gymnasium's step() returns a 5-tuple:
        #   (observation, reward, terminated, truncated, info)
        ...

    def reset(self, seed=None, options=None):
        # gymnasium's reset() returns (observation, info)
        ...

    def render(self, mode='human'):
        ...

    def close(self):
        # Release any resources (windows, files, ...) held by the env.
        ...
"""Custom Environment that follows gym interface"""
import gymnasium
from gymnasium import spaces


# NOTE(review): the original aliased the base class (`env = gymnasium.Env`)
# and subclassed the alias, then called `env.close()` on the class object at
# module level; subclass gymnasium.Env directly and close instances instead.
class CustomEnv(gymnasium.Env):
    """Custom snake Environment that follows the gymnasium interface."""

    def __init__(self, arg1, arg2):
        super().__init__()
        # Define Action and Observation space :: one of the gym.spaces objects.

        ### Action : example when using discrete actions
        # self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        # Snake has 4 actions: 0-Left, 1-Right, 3-Up, 2-Down
        self.action_space = spaces.Discrete(4)

        ### Obs : example for using an image as input
        # (channel-first; channel-last also works):
        # self.observation_space = spaces.Box(
        #     low=0, high=255,
        #     shape=(N_CHANNELS, HEIGHT, WIDTH), dtype=np.uint8
        # )
        #
        # For the snake env a numeric observation is used instead; head_x etc.
        # are computed in step()/reset(), not here (the original assigned this
        # list in __init__, where those names are undefined):
        # observation = [
        #     head_x, head_y,
        #     apple_delta_x, apple_delta_y,
        #     snake_length
        # ] + list(self.prev_actions)

        ### Reward :
        # self.total_reward = len(self.snake_position) - 3

    def step(self, action):
        ...
        # gymnasium returns a 5-tuple (the old gym API returned 4):
        return observation, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        ...
        return observation, info  # reward/terminated/truncated can't be included

    # --------------------------------------------------------------------------
    def render(self, mode='human'):
        ...

    def close(self):
        ...
# ------------------------------------------------------------------------------
defining our action space and observation space.
action space
4 clear possible actions
self.action_space = spaces.Discrete(4)
observation space
게임이미지 보다는.. 어설프더라도 좀더 구체적인 수치.
뱀머리 위치, 사과 위치, 뱀몸통 위치,
observation = [head_x, head_y, apple_delta_x, apple_delta_y, snake_length] + list(self.prev_actions)
reward
뱀길이
self.total_reward = len(self.snake_position) - 3 # start length is 3
reset()
episode시작시 마다 (steps이 시작하기전에) 호출