cajcodes committed
Commit b1581fb · verified · 1 Parent(s): 32f0f10

Upload 5 files

checkpoint_11.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddb03f42910dd16bc7ecdad3e571b9c6332eb73e8b655569559cefc1cb5e04ba
+ size 162214
checkpoint_21.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3d6a41666abf63661f2d9cdc59b30fedb4395beca895137377ab44fa93910c4
+ size 163558
checkpoint_31.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9eff40ebbd5628cf30322832ec7c795db8b29700f6201461df566d6ba83b304
+ size 165542
checkpoint_41.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe2ee641727e969c091d68d7fa0fd711c95bd8a1dd79a2f73f9b96068b1b20fe
+ size 166822
train.py ADDED
@@ -0,0 +1,367 @@
+ import torch
+ import torch.optim as optim
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from collections import deque
+ import random
+ import matplotlib.pyplot as plt
+ import matplotlib.animation as animation
+ import heapq  # For the A* algorithm
+ from huggingface_hub import HfApi, HfFolder  # Hugging Face API
+
+ # Function to generate a floorplan
+ def generate_floorplan(size=10, obstacle_density=0.2):
+     floorplan = [[0 for _ in range(size)] for _ in range(size)]
+     target_x, target_y = size - 1, size - 1
+     floorplan[target_x][target_y] = 2  # Mark target position
+     num_obstacles = int(size * size * obstacle_density)
+     for _ in range(num_obstacles):
+         x = random.randint(0, size - 1)
+         y = random.randint(0, size - 1)
+         if floorplan[x][y] == 0 and (x, y) != (0, 0):
+             floorplan[x][y] = 1  # Mark obstacle
+     return floorplan, target_x, target_y
+
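+ # Cell encoding used throughout this script:
+ #   0 = free cell, 1 = obstacle, 2 = target (bottom-right corner),
+ #   3 = robot position (written by Environment.get_cnn_state below).
+ # The start cell (0, 0) is never turned into an obstacle.
+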
+ def a_star(floorplan, start, goal):
+     size = len(floorplan)
+     open_set = []
+     heapq.heappush(open_set, (0, start))
+     came_from = {}
+     g_score = {start: 0}
+     f_score = {start: heuristic(start, goal)}
+
+     while open_set:
+         _, current = heapq.heappop(open_set)
+
+         if current == goal:
+             return reconstruct_path(came_from, current)
+
+         neighbors = get_neighbors(current, size)
+         for neighbor in neighbors:
+             if floorplan[neighbor[0]][neighbor[1]] == 1:
+                 continue  # Ignore obstacles
+
+             tentative_g_score = g_score[current] + 1
+
+             if neighbor not in g_score or tentative_g_score < g_score[neighbor]:
+                 came_from[neighbor] = current
+                 g_score[neighbor] = tentative_g_score
+                 f_score[neighbor] = g_score[neighbor] + heuristic(neighbor, goal)
+                 heapq.heappush(open_set, (f_score[neighbor], neighbor))
+
+     return []
+
+ def heuristic(a, b):
+     return abs(a[0] - b[0]) + abs(a[1] - b[1])
+
+ def get_neighbors(pos, size):
+     neighbors = []
+     x, y = pos
+     if x > 0:
+         neighbors.append((x - 1, y))
+     if x < size - 1:
+         neighbors.append((x + 1, y))
+     if y > 0:
+         neighbors.append((x, y - 1))
+     if y < size - 1:
+         neighbors.append((x, y + 1))
+     return neighbors
+
+ def reconstruct_path(came_from, current):
+     path = [current]
+     while current in came_from:
+         current = came_from[current]
+         path.append(current)
+     return path[::-1]
+
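+ # Illustrative example (not executed here): on an obstacle-free 3x3 grid,
+ # a_star(grid, (0, 0), (2, 2)) returns the cells of one Manhattan-shortest
+ # route from start to goal inclusive, e.g. [(0, 0), (0, 1), (0, 2), (1, 2), (2, 2)];
+ # an empty list is returned when the goal is unreachable.
+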
+ class Environment:
+     def __init__(self, size=10, obstacle_density=0.2):
+         self.size = size
+         self.floorplan, self.target_x, self.target_y = generate_floorplan(size, obstacle_density)
+         self.robot_x = 0
+         self.robot_y = 0
+
+     def reset(self):
+         while True:
+             self.robot_x = random.randint(0, self.size - 1)
+             self.robot_y = random.randint(0, self.size - 1)
+             if self.floorplan[self.robot_x][self.robot_y] == 0:
+                 break
+         return self.get_cnn_state()
+
+     def step(self, action):
+         new_x, new_y = self.robot_x, self.robot_y
+
+         if action == 0:  # Up
+             new_x = max(self.robot_x - 1, 0)
+         elif action == 1:  # Down
+             new_x = min(self.robot_x + 1, self.size - 1)
+         elif action == 2:  # Left
+             new_y = max(self.robot_y - 1, 0)
+         elif action == 3:  # Right
+             new_y = min(self.robot_y + 1, self.size - 1)
+
+         # Check if the new position is an obstacle
+         if self.floorplan[new_x][new_y] != 1:
+             self.robot_x, self.robot_y = new_x, new_y
+
+         done = (self.robot_x == self.target_x and self.robot_y == self.target_y)
+         reward = self.get_reward(self.robot_x, self.robot_y)
+         next_state = self.get_cnn_state()
+         info = {}
+         return next_state, reward, done, info
+
+     def get_reward(self, robot_x, robot_y):
+         if self.floorplan[robot_x][robot_y] == 1:
+             return -5  # Penalty for hitting an obstacle
+         elif robot_x == self.target_x and robot_y == self.target_y:
+             return 10  # Reward for reaching the target
+         else:
+             return -0.1  # Penalty for each step
+
+     def get_cnn_state(self):
+         grid = [row[:] for row in self.floorplan]
+         grid[self.robot_x][self.robot_y] = 3  # Mark the robot's current position
+         return np.array(grid).flatten()  # Flattened 1-D grid, fed to the fully connected DQN
+
+     def render(self, path=None):
+         grid = np.array(self.floorplan)
+         fig, ax = plt.subplots()
+         ax.set_xticks(np.arange(-0.5, self.size, 1))
+         ax.set_yticks(np.arange(-0.5, self.size, 1))
+         ax.grid(which='major', color='k', linestyle='-', linewidth=1)
+         ax.tick_params(which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
+
+         def update(i):
+             ax.clear()
+             ax.imshow(grid, cmap='Greys', interpolation='nearest')
+             if path:
+                 x, y = path[i]
+                 ax.plot(y, x, 'bo')  # Draw robot's path
+             plt.draw()
+
+         ani = animation.FuncAnimation(fig, update, frames=len(path) if path else 1, repeat=False)
+         plt.show()
+
+ class DQN(nn.Module):
+     def __init__(self, input_size, hidden_sizes, output_size):
+         super(DQN, self).__init__()
+         self.input_size = input_size
+         self.hidden_sizes = hidden_sizes
+         self.output_size = output_size
+
+         self.fc_layers = nn.ModuleList()
+         prev_size = input_size
+         for size in hidden_sizes:
+             self.fc_layers.append(nn.Linear(prev_size, size))
+             prev_size = size
+         self.output_layer = nn.Linear(prev_size, output_size)
+
+     def forward(self, x):
+         if len(x.shape) > 2:
+             x = x.view(x.size(0), -1)
+         for layer in self.fc_layers:
+             x = F.relu(layer(x))
+         x = self.output_layer(x)
+         return x
+
+     def choose_action(self, state):
+         with torch.no_grad():
+             state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
+             q_values = self(state_tensor)
+             action = q_values.argmax().item()
+         return action
+
+ class ReplayBuffer:
+     def __init__(self, capacity):
+         self.buffer = deque(maxlen=capacity)
+
+     def push(self, state, action, reward, next_state, done):
+         self.buffer.append((state, action, reward, next_state, done))
+
+     def sample(self, batch_size):
+         batch = random.sample(self.buffer, batch_size)
+         states, actions, rewards, next_states, dones = zip(*batch)
+         return states, actions, rewards, next_states, dones
+
+     def __len__(self):
+         return len(self.buffer)
+
+ # Function to save the model checkpoint
+ def save_checkpoint(state, filename="checkpoint.pth.tar"):
+     torch.save(state, filename)
+
+ # Function to load the model checkpoint
+ def load_checkpoint(filename):
+     checkpoint = torch.load(filename)
+     return checkpoint
+
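+ # Example of resuming from one of the saved checkpoints (illustrative only;
+ # the dict keys match those written by the training loop below):
+ #   checkpoint = load_checkpoint('checkpoint_11.pth.tar')
+ #   dqn.load_state_dict(checkpoint['state_dict'])
+ #   optimizer.load_state_dict(checkpoint['optimizer'])
+ #   start_episode = checkpoint['episode']
+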
+ # Training the DQN
+ env = Environment()
+ input_size = env.size * env.size  # Flattened grid size
+ hidden_sizes = [64, 64]  # Hidden layer sizes
+ output_size = 4  # Number of actions (up, down, left, right)
+
+ dqn = DQN(input_size, hidden_sizes, output_size)
+ dqn_target = DQN(input_size, hidden_sizes, output_size)
+ dqn_target.load_state_dict(dqn.state_dict())
+
+ optimizer = optim.Adam(dqn.parameters(), lr=0.001)
+ scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
+ replay_buffer = ReplayBuffer(10000)
+ num_episodes = 50
+ batch_size = 64
+ gamma = 0.99
+ target_update_freq = 100
+ checkpoint_freq = 10  # Save checkpoint every 10 episodes
+
+ losses = []
+ for episode in range(num_episodes):
+     state = env.reset()
+     total_reward = 0
+     done = False
+
+     # Integrate A* guidance for initial exploration
+     initial_path = a_star(env.floorplan, (env.robot_x, env.robot_y), (env.target_x, env.target_y))
+     path_index = 1  # Skip the first path entry, which is the robot's current cell
+
+     while not done:
+         epsilon = max(0.01, 0.2 - 0.01 * (episode / 2))
+         if np.random.rand() < epsilon:
+             if initial_path and path_index < len(initial_path):
+                 next_pos = initial_path[path_index]
+                 if next_pos[0] < env.robot_x:
+                     action = 0  # Up
+                 elif next_pos[0] > env.robot_x:
+                     action = 1  # Down
+                 elif next_pos[1] < env.robot_y:
+                     action = 2  # Left
+                 else:
+                     action = 3  # Right
+                 path_index += 1
+             else:
+                 action = np.random.randint(output_size)
+         else:
+             state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
+             with torch.no_grad():
+                 q_values = dqn(state_tensor)
+             action = q_values.argmax().item()
+
+         next_state, reward, done, _ = env.step(action)
+         replay_buffer.push(state, action, reward, next_state, done)
+
+         if len(replay_buffer) > batch_size:
+             states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
+             states = torch.tensor(np.array(states), dtype=torch.float32)
+             actions = torch.tensor(actions, dtype=torch.int64)
+             rewards = torch.tensor(rewards, dtype=torch.float32)
+             next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
+             dones = torch.tensor(dones, dtype=torch.float32)
+
+             q_values = dqn(states)
+             q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
+
+             # TD target: r + gamma * (1 - done) * max_a' Q_target(s', a')
+             with torch.no_grad():
+                 next_q_values = dqn_target(next_states)  # Bootstrap from the target network
+                 next_q_values = next_q_values.max(1)[0]
+                 target_q_values = rewards + (1 - dones) * gamma * next_q_values
+
+             loss = F.smooth_l1_loss(q_values, target_q_values)
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+
+             losses.append(loss.item())
+
+         total_reward += reward
+         state = next_state
+
+     if episode % target_update_freq == 0:
+         dqn_target.load_state_dict(dqn.state_dict())
+     scheduler.step()
+
+     # Save checkpoints
+     if episode % checkpoint_freq == 0 or episode == num_episodes - 1:
+         checkpoint = {
+             'episode': episode + 1,
+             'state_dict': dqn.state_dict(),
+             'optimizer': optimizer.state_dict(),
+             'losses': losses
+         }
+         save_checkpoint(checkpoint, f'checkpoint_{episode + 1}.pth.tar')
+
+     print(f"Episode {episode + 1}: Total Reward = {total_reward}, Loss = {np.mean(losses[-batch_size:]) if losses else None}")
+
+ # Save the final model
+ torch.save(dqn.state_dict(), 'dqn_model.pth')
+
+ # Load the trained model
+ dqn = DQN(input_size, hidden_sizes, output_size)
+ dqn.load_state_dict(torch.load('dqn_model.pth'))
+ dqn.eval()
+
+ # Simulate the bot's path using the trained DQN agent
+ state = env.reset()
+ done = False
+ path = [(env.robot_x, env.robot_y)]
+
+ while not done:
+     state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
+     with torch.no_grad():
+         q_values = dqn(state_tensor)
+     action = q_values.argmax().item()  # Choose action from the trained DQN
+     next_state, reward, done, _ = env.step(action)
+     path.append((env.robot_x, env.robot_y))
+     state = next_state
+
+ # Render the environment and the bot's path
+ env.render(path)
+
+ # Evaluate trained DQN
+ def evaluate_agent(env, agent, num_episodes=5):
+     total_rewards = 0
+     successful_episodes = 0
+
+     for episode in range(num_episodes):
+         state = env.reset()
+         episode_reward = 0
+         done = False
+
+         while not done:
+             action = agent.choose_action(state)
+             next_state, reward, done, _ = env.step(action)
+             episode_reward += reward
+             state = next_state
+
+         total_rewards += episode_reward
+         if episode_reward > 0:
+             successful_episodes += 1
+
+     avg_reward = total_rewards / num_episodes
+     success_rate = successful_episodes / num_episodes
+
+     print("Evaluation Results:")
+     print(f"Average Reward: {avg_reward}")
+     print(f"Success Rate: {success_rate}")
+
+     return avg_reward, success_rate
+
+ # Call the evaluation function after rendering
+ avg_reward, success_rate = evaluate_agent(env, dqn, num_episodes=5)
+
+ # Upload the model to Hugging Face
+ # Authenticate with Hugging Face API
+ api = HfApi()
+ api_token = HfFolder.get_token()  # Ensure you have logged in with `huggingface-cli login`
+
+ # Create a model repository if it doesn't exist
+ model_repo = 'cajcodes/dqn-floorplan-finder'
+ api.create_repo(repo_id=model_repo, exist_ok=True, token=api_token)
+
+ # Upload the model
+ api.upload_file(
+     path_or_fileobj='dqn_model.pth',
+     path_in_repo='dqn_model.pth',
+     repo_id=model_repo,
+     token=api_token
+ )
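+
+ # To pull the uploaded weights back down elsewhere (illustrative sketch):
+ #   from huggingface_hub import hf_hub_download
+ #   weights_path = hf_hub_download(repo_id='cajcodes/dqn-floorplan-finder', filename='dqn_model.pth')
+ #   dqn.load_state_dict(torch.load(weights_path))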