robot_in_a_room.py
import numpy as np

# Reward grid for the 3x4 room: -0.04 living reward per step, +1 at the goal
# (top-right), -1 at the pit, and 0 at the wall (row 1, column 1), which
# cannot be entered.
R = np.array([[-0.04, -0.04, -0.04,  1.0],
              [-0.04,  0.0,  -0.04, -1.0],
              [-0.04, -0.04, -0.04, -0.04]])

n_rows, n_cols = R.shape
n_states = n_rows * n_cols                    # cells flattened row-major: state = row * 4 + col
moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]    # up, down, left, right
Q = np.zeros((n_states, len(moves)))

alpha = 0.8            # discount on the next state's best value
initial_state = 1      # starting cell (row 0, column 1)
goal_state = 3         # the cell with reward +1 (row 0, column 3)
wall_state = 5         # the blocked cell (row 1, column 1)

def available_actions(state):
    # Moves that stay on the grid and do not enter the wall cell
    row, col = divmod(state, n_cols)
    av_act = []
    for idx, (dr, dc) in enumerate(moves):
        r, c = row + dr, col + dc
        if 0 <= r < n_rows and 0 <= c < n_cols and r * n_cols + c != wall_state:
            av_act.append(idx)
    return av_act

def sample_next_action(available_act):
    # Explore by picking one of the valid moves uniformly at random
    return int(np.random.choice(available_act))

def next_state(state, action):
    # Cell reached by applying a move to a cell
    row, col = divmod(state, n_cols)
    dr, dc = moves[action]
    return (row + dr) * n_cols + (col + dc)

def update(current_state, action, alpha):
    nxt = next_state(current_state, action)
    row, col = divmod(nxt, n_cols)
    max_value = np.max(Q[nxt, :])
    # Q-learning formula: reward for entering the next cell plus the
    # discounted best value of that cell
    Q[current_state, action] = R[row, col] + alpha * max_value

# TRAINING: sample random non-wall states and random valid moves
for i in range(10000):
    current_state = np.random.randint(0, n_states)
    if current_state == wall_state:
        continue
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, alpha)

print("Trained Q matrix")
print(Q / np.max(Q) * 100)

# TESTING
# Goal state = 3 (the +1 cell); follow the greedy policy from the start
current_state = initial_state
steps = [current_state]
while current_state != goal_state:
    available_act = available_actions(current_state)
    action = max(available_act, key=lambda a: Q[current_state, a])
    current_state = next_state(current_state, action)
    steps.append(current_state)

print("Selected path")
print(steps)
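
# Optional: a small sketch (not part of the original script) that converts the
# flattened path indices back to (row, column) cells of the 3x4 room, assuming
# the row-major numbering used above.
def path_as_cells(path):
    # e.g. [1, 2, 3] -> [(0, 1), (0, 2), (0, 3)]
    return [divmod(s, n_cols) for s in path]

print(path_as_cells(steps))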