/***
Value iteration for a Markov decision process. The key point is that the
first iteration must be treated specially: the goal state is a terminal
state, and if it were updated inside the iteration loop, the value of its
neighboring states would grow without bound and diverge.
The iteration uses asynchronous updates: whenever the inner loop finds a
better return, the maximum return is updated immediately, so that adjacent
states can immediately pick up the improved value.
*/
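/*
For reference, the update implemented below can be read as the deterministic
Bellman backup with discount factor 1 (an interpretation of the code, not a
statement from the original author):

    V(s) = R(s)                                    if s is terminal or a wall
    V(s) = R(s) + max{ V(s') : matrix[s][s']==1 }  otherwise

where V is maxreward[], R is reward[], and the 0.0001 added in the
comparisons is only a tolerance for comparing doubles.
*/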
/****
Value iteration
Synchronous update
12*12*7 (12 states x 12 candidate successors x 7 sweeps)
*/
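/*
Assumed state layout, deduced from the adjacency matrix below and from the
{-4,1,4,-1} action offsets used in the policy-iteration variant at the end
of this file (the classic 4x3 grid world):

    0  1  2  3(+1)
    4  5  6  7(-1)
    8  9 10 11

State 5 is a wall (reward 0, no transitions); states 3 and 7 are terminal.
*/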
#include <iostream>
using namespace std;
#define size 12
int main()
{
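//adjacency matrix: matrix[i][j]==1 means state j is reachable from state i in one step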
int matrix[size][size]=
{
{0,1,0,0,1,0,0,0,0,0,0,0},
{1,0,1,0,0,0,0,0,0,0,0,0},
{0,1,0,1,0,0,1,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{1,0,0,0,0,0,0,0,1,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,0,0,0,0,1,0,0,1,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,1,0,0,0,0,1,0,0},
{0,0,0,0,0,0,0,0,1,0,1,0},
{0,0,0,0,0,0,1,0,0,1,0,1},
{0,0,0,0,0,0,0,1,0,0,1,0}
};
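//per-state reward: -0.02 step cost, +1 at the goal (state 3), -1 at the trap (state 7), 0 at the wall (state 5)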
double reward[size]=
{
-0.02,-0.02,-0.02,1,
-0.02,0,-0.02,-1,
-0.02,-0.02,-0.02,-0.02
};
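//maxreward[i] holds the current estimate of V(i); it is initialized to the immediate reward before the main loop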
double maxreward[size]= {0,0,0,0,0,0,0,0,0,0,0,0};
int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10};//stores the index of the successor state chosen for each state; -1 marks terminal/wall states with no action
int i=0,j=0,count=0;
bool flag=0;
for(i=0;i<size;i++)
maxreward[i]=reward[i];
while(!flag)
{
flag=1;
for(i=0; i<size; i++)
{
if(action[i]>=0)//this state has a chosen successor (terminal and wall states keep -1)
maxreward[i]=reward[i]+maxreward[action[i]];
else
maxreward[i]=reward[i];
}//placing the value update here makes it a synchronous update; count=1008 is 7 times 12*12, i.e., 7 full sweeps
for(i=0; i<size; i++)//for each state, find the maximum V(s)
{
for(j=0; j<size; j++)//with policy iteration this loop could instead scan the action set; that is exactly where it differs from value iteration
{
//cout<<"i="<<i<<" "<<maxreward[i]<<" "<<endl;
if(matrix[i][j]==1&&maxreward[j]>maxreward[i]-reward[i]+0.0001)//update the cumulative return (0.0001 is a tolerance for comparing doubles)
{
action[i]=j;
//if(action[i]>0||action[i]==0)
//maxreward[i]=reward[i]+maxreward[action[i]];//placing the update here would make it asynchronous,
//else
// maxreward[i]=reward[i];
flag=0;//once the cumulative return stops changing (this if is never entered), the iteration terminates
}
count++;
}
}
}
for(i=0; i<size; i++)
{
cout<<action[i]+1<<" ";
}
cout<<endl;
for(i=0; i<size; i++)
{
cout<<maxreward[i]<<" ";
}
cout<<endl<<"count="<<count<<endl;
return 0;
}
/*
Value iteration, asynchronous update, 12*12*4: every improved value is
written back immediately, so it propagates within the same sweep and
convergence takes about 4 sweeps instead of 7.
*/
/*
#include <iostream>
using namespace std;
#define size 12
int main()
{
int matrix[size][size]=
{
{0,1,0,0,1,0,0,0,0,0,0,0},
{1,0,1,0,0,0,0,0,0,0,0,0},
{0,1,0,1,0,0,1,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{1,0,0,0,0,0,0,0,1,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,0,0,0,0,1,0,0,1,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,1,0,0,0,0,1,0,0},
{0,0,0,0,0,0,0,0,1,0,1,0},
{0,0,0,0,0,0,1,0,0,1,0,1},
{0,0,0,0,0,0,0,1,0,0,1,0}
};
double reward[size]=
{
-0.02,-0.02,-0.02,1,
-0.02,0,-0.02,-1,
-0.02,-0.02,-0.02,-0.02
};
double maxreward[size]= {0,0,0,0,0,0,0,0,0,0,0,0};
//double sumreward[size]={0,0,0,0,0,0,0,0,0,0,0,0};
int i=0,j=0,count=0;
bool flag=0;
for(i=0; i<size; i++)
maxreward[i]=reward[i];
while(!flag)
{
flag=1;
for(i=0; i<size; i++)//for each state, find the maximum V(s)
{
for(j=0; j<size; j++) //not policy iteration, so every state must be scanned to find a reachable one with a better value
{
if(matrix[i][j]==1&&maxreward[j]>maxreward[i]-reward[i]+0.0001)//add a small epsilon to absorb floating-point comparison error
{
maxreward[i]=reward[i]+maxreward[j];//asynchronous update: write the improved value back immediately
flag=0;
}
count++;
}
}
}
for(i=0; i<size; i++)
cout<<maxreward[i]<<" ";
cout<<endl<<"count="<<count<<endl;
return 0;
}
*/
/***
Policy iteration + asynchronous update
12*4*4 (12 states x 4 actions per state x about 4 sweeps)
*/
/*
#include <iostream>
using namespace std;
#define size 12
#define ACTION 4
int main()
{
int matrix[size][size]=
{
{0,1,0,0,1,0,0,0,0,0,0,0},
{1,0,1,0,0,0,0,0,0,0,0,0},
{0,1,0,1,0,0,1,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{1,0,0,0,0,0,0,0,1,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,0,0,0,0,1,0,0,1,0},
{0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,1,0,0,0,0,1,0,0},
{0,0,0,0,0,0,0,0,1,0,1,0},
{0,0,0,0,0,0,1,0,0,1,0,1},
{0,0,0,0,0,0,0,1,0,0,1,0}
};
double reward[size]=
{
-0.02,-0.02,-0.02,1,
-0.02,0,-0.02,-1,
-0.02,-0.02,-0.02,-0.02
};
double maxreward[size]= {0,0,0,0,0,0,0,0,0,0,0,0};
int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10}; //initial policy; actions up, right, down, left are printed as {1,2,3,4}
int ac[ACTION]={-4,1,4,-1};//index offsets for moving up, right, down, left on the 4-column grid
int i=0,j=0,count=0;
bool flag=0;
for(i=0;i<size;i++)
maxreward[i]=reward[i];
while(!flag)
{
flag=1;
for(i=0; i<size; i++)//for each state, find the maximum V(s)
{
for(j=0; j<ACTION; j++)//scan the action set instead of every state; this is exactly where policy iteration differs from value iteration
{
//cout<<"i="<<i<<" "<<maxreward[i]<<" "<<endl;
if(ac[j]+i>=0&&ac[j]+i<size&&matrix[i][ac[j]+i]==1&&maxreward[ac[j]+i]>maxreward[i]-reward[i]+0.0001)//bounds check first: ac[j]+i can fall outside 0..size-1 at the grid border
{
action[i]=j;
//if(reward[i]!=1&&reward[i]!=-1)
maxreward[i]=reward[i]+maxreward[ac[j]+i];
//else
// maxreward[i]=reward[i];
flag=0;
}
count++;
}
}
}
for(i=0; i<size; i++)
{
cout<<action[i]+1<<" ";
}
cout<<endl;
for(i=0; i<size; i++)
{
cout<<maxreward[i]<<" ";
}
cout<<endl<<"count="<<count<<endl;
return 0;
}
*/
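/*
Inner-loop iteration counts of the three variants, as given in the headers
of each block above:
    value iteration, synchronous update:   12*12*7 = 1008
    value iteration, asynchronous update:  12*12*4 = 576
    policy iteration, asynchronous update: 12*4*4  = 192
*/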