diff --git a/MDP.cpp b/MDP.cpp index c087463..3e08f72 100644 --- a/MDP.cpp +++ b/MDP.cpp @@ -1,14 +1,14 @@ /*** -马尔科夫决策过程值迭代,关键在于第一次迭代要例外, -因为目标状态是一个终止状态,放到迭代循环里面会出现 -临近的状态回报函数无限的,发散。 -迭代过程采用的是异步迭代,即每一次内层循环找到更优的 -回报就立即更新最大回报,以便与之相邻的状态能立即更新到最优 +椹皵绉戝か鍐崇瓥杩囩▼鍊艰凯浠o紝鍏抽敭鍦ㄤ簬绗竴娆¤凯浠h渚嬪锛 +鍥犱负鐩爣鐘舵佹槸涓涓粓姝㈢姸鎬侊紝鏀惧埌杩唬寰幆閲岄潰浼氬嚭鐜 +涓磋繎鐨勭姸鎬佸洖鎶ュ嚱鏁版棤闄愮殑锛屽彂鏁c +杩唬杩囩▼閲囩敤鐨勬槸寮傛杩唬锛屽嵆姣忎竴娆″唴灞傚惊鐜壘鍒版洿浼樼殑 +鍥炴姤灏辩珛鍗虫洿鏂版渶澶у洖鎶ワ紝浠ヤ究涓庝箣鐩搁偦鐨勭姸鎬佽兘绔嬪嵆鏇存柊鍒版渶浼 */ /**** -值迭代 -同步更新 +鍊艰凯浠 +鍚屾鏇存柊 12*12*7 */ @@ -41,7 +41,7 @@ int main() -0.02,-0.02,-0.02,-0.02 }; double maxreward[size]= {0,0,0,0,0,0,0,0,0,0,0,0}; - int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10}; //上右下左{1,2,3,4} + int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10};//鐩存帴琛ㄧず鍙埌鑺傜偣鐨勪笅鏍 int i=0,j=0,count=0; bool flag=0; for(i=0;imaxreward[i]-reward[i]+0.0001)//更新累积回报 + if(matrix[i][j]==1&&maxreward[j]>maxreward[i]-reward[i]+0.0001)//鏇存柊绱Н鍥炴姤 { action[i]=j; //if(action[i]>0||action[i]==0) - //maxreward[i]=reward[i]+maxreward[action[i]];//放到这是异步更新, + //maxreward[i]=reward[i]+maxreward[action[i]];//鏀惧埌杩欐槸寮傛鏇存柊锛 //else // maxreward[i]=reward[i]; - flag=0;//当累积回报不再更新,即不进入该if,那么就结束迭代 + flag=0;//褰撶疮绉洖鎶ヤ笉鍐嶆洿鏂帮紝鍗充笉杩涘叆璇f锛岄偅涔堝氨缁撴潫杩唬 } count++; } @@ -89,7 +89,7 @@ int main() /* -值迭代 异步更新 12*12*4 +鍊艰凯浠 寮傛鏇存柊 12*12*4 */ /* #include @@ -127,13 +127,13 @@ int main() while(!flag) { flag=1; - for(i=0; imaxreward[i]-reward[i]+0.0001)//double类型比较大小的偏差,加上一个小数作为精度 + if(matrix[i][j]==1&&maxreward[j]>maxreward[i]-reward[i]+0.0001)//double绫诲瀷姣旇緝澶у皬鐨勫亸宸紝鍔犱笂涓涓皬鏁颁綔涓虹簿搴 { - maxreward[i]=reward[i]+maxreward[j];//异步更新 + maxreward[i]=reward[i]+maxreward[j];//寮傛鏇存柊 flag=0; } count++; @@ -149,7 +149,7 @@ int main() /*** -策略迭代+异步更新 +绛栫暐杩唬+寮傛鏇存柊 12*4*4 */ @@ -182,7 +182,7 @@ int main() -0.02,-0.02,-0.02,-0.02 }; double maxreward[size]= {0,0,0,0,0,0,0,0,0,0,0,0}; - int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10}; //上右下左{1,2,3,4} + int action[size]= {4,0,1,-1,8,-1,10,-1,9,8,9,10}; //涓婂彸涓嬪乏{1,2,3,4} int ac[ACTION]={-4,1,4,-1}; int i=0,j=0,count=0; bool flag=0; @@ -191,9 +191,9 @@ int main() while(!flag) { flag=1; - for(i=0; imaxreward[i]-reward[i]+0.0001)