deep_Q_learning.py
import random
import time
import gym
import numpy as np
import deep_neural_network as DNN
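
# Deep Q-learning (DQN) on MountainCar, built on the author's companion
# deep_neural_network module. Assumed API (not shown here): DNNModel exposes
# hypothesis(X), returning per-layer activations; train(X, Y), fitting the
# network toward targets Y; and K, the number of output units (actions).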
class DQNModel(DNN.DNNModel):
    def __init__(self, gamma: float, batch_size, epsilon, S_l: list, learning_rate: float, lamda: int = 0, theta=None):
        DNN.DNNModel.__init__(self, S_l, learning_rate, lamda, theta)
        self.D = []                    # experience-replay buffer of (s, a, r, s', done) tuples
        self.greedy = 1.0              # current exploration rate, decayed toward epsilon
        self.batch_size = batch_size   # minibatch size sampled from the replay buffer
        self.epsilon = epsilon         # floor for the exploration rate
        self.gamma = gamma             # discount factor
def trainDQN(self, myset):
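        # one DQN update: build TD targets for the sampled minibatch, then fit the network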
        # unpack the minibatch column-wise; only the state columns are batched here,
        # the remaining fields are re-read per transition in the loop below
        cur_states, _actions, _rewards, next_states, _dones = zip(*myset)
        cur_states = np.array(cur_states)
        next_states = np.array(next_states)
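        # batched forward passes; hypothesis() is assumed to return every layer's
        # activations, so [-1] selects the output layer, i.e. the Q-value estimates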
        next_states_q = self.hypothesis(next_states)[-1]
        cur_states_q = self.hypothesis(cur_states)[-1]
        for idx in range(len(myset)):
            _cur_state, _action, _r, _next_state, _done = myset[idx]
            if not _done:
                # Bellman target for the taken action: r + gamma * max_a' Q(s', a')
                cur_states_q[idx][_action] = float(_r + self.gamma * np.max(next_states_q[idx]))
            else:
                # terminal transition: the target is just the immediate reward
                cur_states_q[idx][_action] = float(_r)
        # fit toward the targets; only the taken action's entry was changed per row
        self.train(cur_states, cur_states_q)
    def get_best_action(self, _state, prt=False):
        # epsilon-greedy: explore with probability self.greedy, otherwise act greedily
        if random.random() < self.greedy:
            return random.randint(0, self.K - 1)
        actions = self.hypothesis(list(_state))[-1]
        if prt:
            print(actions, np.argmax(actions))
        return int(np.argmax(actions))
if __name__ == "__main__":
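    # training loop: up to 5000 episodes, learning from experience replay after a 10-episode warm-up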
    env = gym.make("MountainCar-v1")  # custom registration: MountainCar-v0 with its maximum step count raised to 10000
    # layer sizes and activation names follow the author's deep_neural_network
    # module: 3 inputs, two 24-unit ReLU hidden layers, 4 "Liner" (linear) outputs
    test_model = DQNModel(0.9, 100, 0.01, [(3, ""), (24, "ReLU"), (24, "ReLU"), (4, "Liner")], 0.001)
    counts = []  # lengths of recent episodes
for times in range(5000):
        count = 0
        state = env.reset()
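        # after the warm-up, decay the exploration rate toward its epsilon floor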
        if times >= 10 and test_model.greedy > test_model.epsilon:
            test_model.greedy *= 0.995
while True:
            count += 1
            action = test_model.get_best_action(state)
            # [action - 2] maps the 4 network outputs onto the environment's action;
            # this matches the author's customized env (assumed to take a signed force)
            next_state, reward, done, _ = env.step([action - 2])
            # store the transition, but skip terminations caused by the step limit
            # (done with position < 0.5 means the goal was never reached)
            if not done or next_state[0] >= 0.5:
                test_model.D.append((list(state), action, reward, list(next_state), done))
            # train once the warm-up is over and the buffer holds a full minibatch
            # (the length check guards random.sample against a ValueError)
            if times >= 10 and len(test_model.D) >= test_model.batch_size:
                sets = random.sample(test_model.D, test_model.batch_size)
                test_model.trainDQN(sets)
            state = next_state
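            # episode over: record its length and check a moving average of recent episodes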
if done:
counts.append(count)
print(count)
                if len(counts) > 10:
                    counts = counts[1:]  # keep a sliding window of the last 10 episodes
                # demo heuristic: once the 10-episode average is at most 160 steps,
                # show one rendered greedy rollout (the training loop then continues)
                if np.mean(counts) <= 160:
                    state = env.reset()
                    done = False
                    while not done:
                        env.render()
                        # apply the same [action - 2] mapping used during training
                        state, reward, done, _ = env.step([test_model.get_best_action(state, True) - 2])
                        time.sleep(0.2)
                    # env.close()
                break
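
    # after training, run one final rendered rollout with the learned policy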
    state = env.reset()
    done = False
    while not done:
        env.render()
        state, reward, done, _ = env.step([test_model.get_best_action(state, True) - 2])
        time.sleep(0.02)
    env.close()