import os.path
import random

import numpy as np
import pandas as pd
from absl import app
from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units

DATA_FILE = 'sparse_agent_data'
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_PYLON = 'buildpylon'
ACTION_BUILD_GATEWAY = 'buildgateway'
ACTION_BUILD_ZEALOT = 'buildzealot'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_PYLON,
    ACTION_BUILD_GATEWAY,
    ACTION_BUILD_ZEALOT,
]
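
# The loop below adds one attack action per 32x32 minimap quadrant
# (e.g. 'attack_15_15'); the -16 offset pulls each quadrant's far corner
# back toward its centre.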
for mm_x in range(0, 64):
    for mm_y in range(0, 64):
        if (mm_x + 1) % 32 == 0 and (mm_y + 1) % 32 == 0:
            smart_actions.append(ACTION_ATTACK + '_' + str(mm_x - 16) + '_' + str(mm_y - 16))


# Stolen from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.disallowed_actions = {}

    def choose_action(self, observation, excluded_actions=()):
        self.check_state_exist(observation)

        self.disallowed_actions[observation] = excluded_actions

        state_action = self.q_table.loc[observation, :]
        for excluded_action in excluded_actions:
            del state_action[excluded_action]

        if np.random.uniform() < self.epsilon:
            # Greedy choice; shuffle first so ties between actions
            # with the same value are broken randomly.
            state_action = state_action.reindex(np.random.permutation(state_action.index))
            action = state_action.idxmax()
        else:
            action = np.random.choice(state_action.index)

        return action

    def learn(self, s, a, r, s_):
        if s == s_:
            return

        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        s_rewards = self.q_table.loc[s_, :]
        if s_ in self.disallowed_actions:
            for excluded_action in self.disallowed_actions[s_]:
                del s_rewards[excluded_action]

        if s_ != 'terminal':
            q_target = r + self.gamma * s_rewards.max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # Append a new all-zero row for an unseen state; DataFrame.append
            # was removed in pandas 2.0, so assign the row via .loc instead.
            self.q_table.loc[state] = [0.0] * len(self.actions)
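
# For reference, learn() performs the standard tabular Q-learning update:
#
#     Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# with the max taken only over actions not excluded for s'.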


class ProtossAgent(base_agent.BaseAgent):
    def __init__(self):
        super(ProtossAgent, self).__init__()

        # The Q-table's columns are integer indices into smart_actions.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))

        self.previous_action = None
        self.previous_state = None
        self.move_number = 0

        # Resume from a previously saved Q-table if one exists on disk.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

        self.attack_coordinates = None

    def unit_type_is_selected(self, obs, unit_type):
        if (len(obs.observation.single_select) > 0 and
                obs.observation.single_select[0].unit_type == unit_type):
            return True

        if (len(obs.observation.multi_select) > 0 and
                obs.observation.multi_select[0].unit_type == unit_type):
            return True

        return False

    def get_units_by_type(self, obs, unit_type):
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        return action in obs.observation.available_actions

    def select_available_probe(self, obs):
        probes = self.get_units_by_type(obs, units.Protoss.Probe)
        if len(probes) > 0:
            probe = random.choice(probes)
            return actions.FUNCTIONS.select_point("select_all_type", (probe.x, probe.y))
        # No probe visible on screen; do nothing rather than return None.
        return actions.FUNCTIONS.no_op()

    def build_pylon(self, obs):
        if self.unit_type_is_selected(obs, units.Protoss.Probe):
            if self.can_do(obs, actions.FUNCTIONS.Build_Pylon_screen.id):
                # Random screen location; 83 is the last valid coordinate
                # on the 84x84 screen configured below.
                x = random.randint(0, 83)
                y = random.randint(0, 83)
                return actions.FUNCTIONS.Build_Pylon_screen("now", (x, y))
        return self.select_available_probe(obs)

    def build_gateway(self, obs):
        if self.unit_type_is_selected(obs, units.Protoss.Probe):
            if self.can_do(obs, actions.FUNCTIONS.Build_Gateway_screen.id):
                x = random.randint(0, 83)
                y = random.randint(0, 83)
                return actions.FUNCTIONS.Build_Gateway_screen("now", (x, y))
        return self.select_available_probe(obs)

    def select_army(self, obs):
        if self.can_do(obs, actions.FUNCTIONS.select_army.id):
            return actions.FUNCTIONS.select_army("select")
        # No army to select yet; avoid returning None from step().
        return actions.FUNCTIONS.no_op()

    def attack_enemy(self, obs):
        if self.unit_type_is_selected(obs, units.Protoss.Zealot):
            if self.can_do(obs, actions.FUNCTIONS.Attack_minimap.id):
                return actions.FUNCTIONS.Attack_minimap("now", self.attack_coordinates)
        return self.select_army(obs)
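
    # step() works through a fixed priority list each call: attack once
    # 12 zealots exist, keep free supply above 15 with pylons, build up
    # to 3 gateways, then train zealots from a selected gateway.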
    def step(self, obs):
        super(ProtossAgent, self).step(obs)

        if obs.last():
            reward = obs.reward
            # Only update the Q-table if an action was actually recorded;
            # this scripted agent can finish an episode without one.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, reward, 'terminal')
            self.qlearn.q_table.to_pickle(DATA_FILE + '.gz', compression='gzip')

            self.previous_action = None
            self.previous_state = None
            self.move_number = 0

            return actions.FUNCTIONS.no_op()

        if obs.first():
            # Locate our units on the minimap: if we spawned in the top-left
            # quadrant, attack toward the bottom-right, and vice versa.
            agent_y, agent_x = (obs.observation.feature_minimap.player_relative ==
                                features.PlayerRelative.SELF).nonzero()
            agent_xmean = agent_x.mean()
            agent_ymean = agent_y.mean()

            if agent_xmean <= 31 and agent_ymean <= 31:
                self.attack_coordinates = (49, 49)
            else:
                self.attack_coordinates = (12, 16)

        gateways = self.get_units_by_type(obs, units.Protoss.Gateway)
        zealots = self.get_units_by_type(obs, units.Protoss.Zealot)
        free_supply = (obs.observation.player.food_cap - obs.observation.player.food_used)

        if len(zealots) >= 12:
            return self.attack_enemy(obs)

        if free_supply < 15:
            return self.build_pylon(obs)

        if len(gateways) <= 2:
            return self.build_gateway(obs)

        if self.can_do(obs, actions.FUNCTIONS.Train_Zealot_quick.id):
            return actions.FUNCTIONS.Train_Zealot_quick("now")

        if len(gateways) > 0 and not self.unit_type_is_selected(obs, units.Protoss.Gateway):
            gateway = random.choice(gateways)
            return actions.FUNCTIONS.select_point("select_all_type", (gateway.x, gateway.y))

        return actions.FUNCTIONS.no_op()
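
    # Note: previous_state / previous_action are initialized but never set
    # during play in this version, so the guarded learn() call above only
    # matters once they are wired in. A minimal, purely hypothetical wiring
    # inside step() might look like:
    #
    #     current_state = str((len(gateways), len(zealots), free_supply))
    #     if self.previous_action is not None:
    #         self.qlearn.learn(self.previous_state, self.previous_action,
    #                           0, current_state)
    #     self.previous_action = self.qlearn.choose_action(current_state)
    #     self.previous_state = current_state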


def main(unused_argv):
    agent = ProtossAgent()
    try:
        while True:
            with sc2_env.SC2Env(
                    map_name="Catalyst",
                    players=[sc2_env.Agent(sc2_env.Race.protoss),
                             sc2_env.Bot(sc2_env.Race.random,
                                         sc2_env.Difficulty.easy)],
                    agent_interface_format=features.AgentInterfaceFormat(
                        feature_dimensions=features.Dimensions(screen=84, minimap=64),
                        use_feature_units=True),
                    step_mul=16,
                    game_steps_per_episode=0,
                    visualize=True,
                    save_replay_episodes=1,
                    # Raw string so backslashes in the Windows path are not
                    # read as escape sequences.
                    replay_dir=r'E:\Program Files (x86)\StarCraft II\Replays') as env:
                agent.setup(env.observation_spec(), env.action_spec())

                timesteps = env.reset()
                agent.reset()

                while True:
                    step_actions = [agent.step(timesteps[0])]
                    if timesteps[0].last():
                        break
                    timesteps = env.step(step_actions)

    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    app.run(main)
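
# Usage sketch (assumes a local StarCraft II install with the "Catalyst"
# ladder map in its Maps folder):
#
#     python ProtossAgent.py
#
# The Q-table persists to sparse_agent_data.gz between runs, and one replay
# per episode is saved to the replay_dir configured above.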