2．实验思路

（2）实验思路

As for the Policy Iteration:

(1) Initialize the value matrix

(2) Update the value according to the function:

$V(s) \leftarrow \sum_{a}\pi(a\mid s)\,\bigl[-1 + V(s')\bigr]$, sweeping until $\Delta = \max_s \lvert V'(s) - V(s)\rvert \le \theta$

(3) Repeat until the strategy no longer changes; the algorithm has then converged.

As for the Value Iteration:

(1) Initialize the Matrix1

(2) Update the value according to the function:

$V(s) \leftarrow \max_{a}\,\bigl[-1 + V(s')\bigr]$, sweeping until $\Delta = \max_s \lvert V'(s) - V(s)\rvert \le \theta$

(3) Output the value function, and obtain the best strategy.

3.实验结果

Policy Iteration:

Value Iteration:

class State(object):
    """One cell of the 4x4 grid world used by policy iteration.

    Holds the current policy (probability of moving up/down/left/right)
    and the coordinates of the successor state reached by each action.
    Moves that would leave the grid, or cross a wall around one of the
    barrier cells, keep the agent in place.
    """

    def __init__(self, i, j):
        # Start from the equiprobable random policy.
        self.up = 0.25
        self.down = 0.25
        self.left = 0.25
        self.right = 0.25

        # Successor states; moving off the grid edge leaves the state unchanged.
        self.upState = (i - 1, j) if i > 0 else (i, j)
        self.downState = (i + 1, j) if i < 3 else (i, j)
        self.leftState = (i, j - 1) if j > 0 else (i, j)
        self.rightState = (i, j + 1) if j < 3 else (i, j)

        # Add barrier in position (1, 2) and (2, 1): any move that would
        # enter a barrier cell keeps the agent where it is.
        if i == 2 and j == 2:
            self.upState = (i, j)
            self.leftState = (i, j)
        if i == 3 and j == 1:
            self.upState = (i, j)
        if i == 2 and j == 0:
            self.rightState = (i, j)
        if i == 1 and j == 1:
            self.rightState = (i, j)
            self.downState = (i, j)
        if i == 0 and j == 2:
            self.downState = (i, j)
        if i == 1 and j == 3:
            self.leftState = (i, j)

    def updatePolicy(self, rewardMatrix):
        """Greedily improve this state's policy w.r.t. *rewardMatrix*.

        Bug fix: the original body read the misspelled module-level
        global ``rewardMatirx`` instead of the ``rewardMatrix``
        parameter; it only worked by accident because callers happened
        to pass that same global.

        Returns True when the policy did not change (i.e. it is stable),
        False otherwise.
        """
        oldUp = self.up
        oldDown = self.down
        oldLeft = self.left
        oldRight = self.right

        # One-step lookahead value of each action (uniform step cost -1).
        upValue = -1 + rewardMatrix[self.upState[0]][self.upState[1]]
        downValue = -1 + rewardMatrix[self.downState[0]][self.downState[1]]
        leftValue = -1 + rewardMatrix[self.leftState[0]][self.leftState[1]]
        rightValue = -1 + rewardMatrix[self.rightState[0]][self.rightState[1]]

        # Put all probability on one best action; ties are broken in the
        # order up > down > left > right, matching the original chain.
        self.up = 0
        self.down = 0
        self.left = 0
        self.right = 0
        if upValue >= downValue and upValue >= leftValue and upValue >= rightValue:
            self.up = 1.0
        elif downValue >= upValue and downValue >= leftValue and downValue >= rightValue:
            self.down = 1.0
        elif leftValue >= upValue and leftValue >= downValue and leftValue >= rightValue:
            self.left = 1.0
        else:
            self.right = 1.0

        return (oldUp == self.up and oldDown == self.down
                and oldLeft == self.left and oldRight == self.right)

################################################################################################################
# Update the reward matrix (one policy-evaluation backup).
def updateReward(i, j):
    """Recompute the value of cell (i, j) under the current policy.

    Writes the expected one-step return into the module-level
    ``tempMatrix``: step cost -1 plus the policy-weighted value of the
    successor state for each of the four actions.  Reads the globals
    ``stateMatirx`` (policy / transitions) and ``rewardMatirx``
    (current value estimates).  (Original was a single unreadable line
    whose backslash continuations were lost in extraction.)
    """
    state = stateMatirx[i][j]
    tempMatrix[i][j] = -1 + (
        state.up * rewardMatirx[state.upState[0]][state.upState[1]]
        + state.down * rewardMatirx[state.downState[0]][state.downState[1]]
        + state.left * rewardMatirx[state.leftState[0]][state.leftState[1]]
        + state.right * rewardMatirx[state.rightState[0]][state.rightState[1]]
    )

#################################################################################################################
# Initialize the state matrix: one State object per cell of the 4x4 grid.
# (NOTE: the misspelled names ``stateMatirx`` / ``rewardMatirx`` are kept
# because the rest of the file refers to them.)
stateMatirx = [[] for i in range(4)]
for i in range(4):
    for j in range(4):
        stateMatirx[i].append(State(i, j))

################################################################################################################
# Initialize the value ("reward") matrix to all zeros.
rewardMatirx = [
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
]
# The matrix used to back up the reward matrix during each sweep.
tempMatrix = [
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
]
#################################################################################################################
thresh = 0   # convergence threshold for the evaluation sweeps
#################################################################################################################
# Policy iteration: alternate policy evaluation and greedy policy
# improvement until the policy stops changing.  The terminal cell (0, 0)
# and the barrier cells (1, 2) and (2, 1) are skipped so their values
# stay fixed at 0.
stableFlag = True
while stableFlag:
    # ---- Policy evaluation: sweep until the value function stabilises.
    delta = 0
    for i in range(4):
        for j in range(4):
            if (i == 0 and j == 0) or (i == 1 and j == 2) or (i == 2 and j == 1):
                continue
            v = tempMatrix[i][j]
            updateReward(i, j)
            delta = max(delta, abs(tempMatrix[i][j] - v))
    # Bug fix: the original did ``rewardMatirx = tempMatrix``, which
    # ALIASES the two matrices instead of backing one up as intended;
    # copy row by row so the two stay independent (same fixed point).
    rewardMatirx = [row[:] for row in tempMatrix]
    while delta > thresh:
        delta = 0
        for i in range(4):
            for j in range(4):
                if (i == 0 and j == 0) or (i == 1 and j == 2) or (i == 2 and j == 1):
                    continue
                v = tempMatrix[i][j]
                updateReward(i, j)
                delta = max(delta, abs(tempMatrix[i][j] - v))
        rewardMatirx = [row[:] for row in tempMatrix]

    # ---- Policy improvement: stableFlag stays True only if no state's
    # policy changed during this sweep.
    for i in range(4):
        for j in range(4):
            if (i == 0 and j == 0) or (i == 1 and j == 2) or (i == 2 and j == 1):
                continue
            stableFlag = stableFlag and stateMatirx[i][j].updatePolicy(rewardMatirx)

    # Loop again only while the policy is still changing.
    stableFlag = not stableFlag
################################################################################################################
# Print the converged value function, one cell per line, one blank line
# per row (reconstructed layout — original indentation was lost).
for i in range(4):
    for j in range(4):
        print(rewardMatirx[i][j])
        print(" ")
    print("\n")
# Class named Operation that stores per-cell information for value iteration.
class Operation(object):
    """One cell of the 4x4 grid used by value iteration.

    Stores uniform action probabilities and the successor coordinates
    for each of the four moves; moving off the grid edge leaves the
    state unchanged.  NOTE(review): unlike ``State`` above, no barrier
    cells are modelled here — confirm whether that is intentional.
    """

    def __init__(self, i, j):
        # Equiprobable action probabilities (0.25 each).
        self.up = 0.25
        self.down = 0.25
        self.left = 0.25
        self.right = 0.25

        # Successor states; edges keep the agent in place.
        self.upState = (i - 1, j) if i > 0 else (i, j)
        self.downState = (i + 1, j) if i < 3 else (i, j)
        self.leftState = (i, j - 1) if j > 0 else (i, j)
        self.rightState = (i, j + 1) if j < 3 else (i, j)

# Initialize the value ("reward") matrix for value iteration.
rewardMatrix = [
    [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
]
# Scratch matrix that receives the updated values during each sweep.
tempMatrix = [
    [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
]
# Initialize the state matrix (rebinds the ``stateMatirx`` global,
# replacing the policy-iteration version above).
stateMatirx = [[] for i in range(4)]
for i in range(4):
    for j in range(4):
        stateMatirx[i].append(Operation(i, j))

def updateReward(i, j):
    """Value-iteration backup for cell (i, j).

    Writes the max over the four actions of (step cost -1 + value of
    the successor state) into the module-level ``tempMatrix``.  Reads
    the globals ``stateMatirx`` and ``rewardMatrix``.  Note this
    redefines the policy-evaluation ``updateReward`` defined earlier.
    """
    cell = stateMatirx[i][j]
    tempMatrix[i][j] = max(
        -1 + rewardMatrix[cell.upState[0]][cell.upState[1]],
        -1 + rewardMatrix[cell.downState[0]][cell.downState[1]],
        -1 + rewardMatrix[cell.leftState[0]][cell.leftState[1]],
        -1 + rewardMatrix[cell.rightState[0]][cell.rightState[1]],
    )

# Value iteration.
thresh = 0  # convergence threshold

# First sweep to seed delta.  The terminal cell (0, 0) and the cells
# (1, 2) and (2, 1) are skipped so their values stay fixed at 0.
delta = 0
for i in range(4):
    for j in range(4):
        if (i == 0 and j == 0) or (i == 1 and j == 2) or (i == 2 and j == 1):
            continue
        v = rewardMatrix[i][j]
        updateReward(i, j)
        delta = max(delta, abs(v - tempMatrix[i][j]))
# Bug fix: the original did ``rewardMatrix = tempMatrix``, aliasing the
# two matrices; copy row by row instead (same fixed point is reached).
rewardMatrix = [row[:] for row in tempMatrix]

# Keep sweeping until no value changes by more than the threshold.
while delta > thresh:
    delta = 0
    for i in range(4):
        for j in range(4):
            if (i == 0 and j == 0) or (i == 1 and j == 2) or (i == 2 and j == 1):
                continue
            v = rewardMatrix[i][j]
            updateReward(i, j)
            delta = max(delta, abs(v - tempMatrix[i][j]))
    rewardMatrix = [row[:] for row in tempMatrix]

# Print the resulting value matrix, one cell per line, one blank line
# per row (reconstructed layout — original indentation was lost).
for i in range(4):
    for j in range(4):
        print(rewardMatrix[i][j])
        print(" ")
    print("\n")


人工智能之计算最佳策略

1. 实验要求 题目:计算最佳策略 在下面例子基础上,自行设计一个问题(例如:求解某两点之间的最短路径, 或是在图中加一些障碍物,计算最短路径), 给出该问题对应的 MDP 模型描述, 然后分别使用 value iteration 和 policy iteration 算法计算出最佳策略. 2．实验思路 (1)设计问题(MDP描述) 设计4*4的方格,即初始化的矩阵,每个空格都是一个状态,存在收益情况,在每到达一个点时便可选择上下左右四个方向移动,遇到边缘时状态不变,当移动一步则收益-1 (2)

Unity开发实战探讨-资源的加载释放最佳策略简要心得

Unity开发实战探讨-资源的加载释放最佳策略简要心得 看过我另外一篇关于Unity资源释放随笔<Unity开发实战探讨-资源的加载释放最佳策略>如果觉得略微复杂,那么下面是一些比较简要的心得体会: 概括 常用资源加载的方法有三种:静态,Resources内部资源,AssetBundle外部资源 资源释放的方式 有二种:立刻释放和统一释放. 静态 静态就是资源直接放场景,静态资源无法立刻释放,但场景关闭由引擎统一释放,开发者无法干预,所以最为无脑. 但静态过于死板,除了整个场景生命周期中必须使

计算练习—策略模式

Calculate计算类: public abstract class Calculate //抽象类 Calculate { private double a;//第一个数 public double A//封装字段 { get { return a; } set { a = value; } } private double b;//第二个数字段封装 public double B { get { return b; } set { b = value; } } public double

人工智能-实验一策略迭代和值迭代

1.实验问题 在4x4矩阵中添加终点和障碍点,分别有一个或多个,并且满足以下属性: 终点:value值不变,始终为0,邻接点可到达用大写字母E表示 障碍点:表示该点在矩阵中"不存在",邻接点不可到达该点,且该点没有value值跟状态,使用符号'#'表示 以任意除以上两种结点之外的所有其它结点为起点,求解起点到终点的最短距离,存在多终点时,以相隔最近的终结点为准. 2.实验思路 1)  使用值Policy Iteration和Value Iteration算法分别计算问题产生的最佳策略