# RL/1002.py at master · dgtgrade/RL · GitHub
# Follows a random search idea from:
# http://kvfrans.com/simple-algoritms-for-solving-cartpole/

# import gym
import gym.wrappers
import numpy as np
import time

# Print NumPy floating-point values in fixed-point form with six decimals.
def float_formatter(x):
    """Return *x* rendered as a fixed-point string with six decimal places."""
    return "%.6f" % x


np.set_printoptions(formatter={'float_kind': float_formatter})

#
# https://github.com/openai/gym/wiki/CartPole-v0
#
env = gym.make('CartPole-v0')
# Wrap the environment in gym's Monitor, pointing it at an output directory.
env = gym.wrappers.Monitor(env, 'tmp/cartpole-experiment-1', force=True)

# Maximum number of episodes to run before giving up.
EP_MAX = 1000
# NOTE(review): T_MAX is not referenced by the loop below; an episode ends
# only when env.step() reports done. Presumably it mirrors the environment's
# own step limit -- confirm before relying on it.
T_MAX = 200
#
# Solved Requirements:
# Considered solved when the average reward is
# greater than or equal to 195.0 over 100 consecutive trials.
#
# This script applies a stricter per-episode rule: every one of
# CONSECUTIVE_EPS_TO_SOLVE episodes in a row must last at least
# T_TO_SOLVE timesteps.
CONSECUTIVE_EPS_TO_SOLVE = 100
T_TO_SOLVE = 195

# Longest episode seen so far and the parameters that produced it.
t_best = 0
params_best = np.zeros(4)
# Length of the current streak of successful episodes.
consecutive_success = 0

try:
    for ep in range(EP_MAX):

        # noinspection PyRedeclaration
        observation = env.reset()

        # Sample new parameters uniformly from [-1, 1) only when there is no
        # running streak; while a streak is alive the same parameters are
        # re-evaluated episode after episode.
        if consecutive_success == 0:
            params = np.random.random(4) * 2 - 1

        print("ep: {}".format(ep))
        print("params: {}".format(params))

        # Number of timesteps taken in this episode.
        t = 0

        while True:

            t += 1

            env.render()

            # print("observation: {}".format(observation))
            # Linear policy: the sign of <observation, params> picks the action.
            action = 1 if np.dot(observation, params) > 0 else 0

            #
            observation, reward, done, info = env.step(action)

            if done:
                print("Episode finished after {} timesteps".format(t))
                # Pause briefly between episodes.
                time.sleep(1)
                break

        if t > t_best:
            print("Good! Episode ran more than previous best ({}) timesteps".format(
                t_best))
            t_best = t
            # Remember the parameters behind the new best episode (previously
            # params_best was initialised but never updated).
            params_best = params.copy()

        # ">=" matches the documented threshold ("greater than or equal to
        # 195"); the former ">" counted a 195-step episode as a failure.
        if t >= T_TO_SOLVE:
            consecutive_success += 1
        else:
            consecutive_success = 0

        print("Consecutive Successes: {}".format(consecutive_success))

        if consecutive_success == CONSECUTIVE_EPS_TO_SOLVE:
            print("WOW! {} consecutive success! problem solved".format(
                consecutive_success))
            break
finally:
    # Always release the environment (and the Monitor wrapping it), even when
    # the run is interrupted or an exception escapes the loop.
    env.close()