- 에이전트가 환경 내에서 행동을 취하고, 환경은 해당 행동에 따라 변화한 상태와 발생한 보상을 다시 에이전트에게 전달한다.
- 에이전트는 이 정보들을 통해 학습을 진행하고 다시 새로운 상태에 대한 행동을 결정한다.
- 환경은 다시 해당 행동에 대해 에이전트에게 상태와 보상을 전달한다.

pip install mlagents
/// <summary>
/// One-time agent setup: caches the ball's <c>Rigidbody</c>, obtains the
/// environment parameters from the <c>Academy</c> singleton, and then applies
/// the reset parameters via <c>SetResetParameters</c>.
/// </summary>
public override void Initialize() {
    // Look the Rigidbody up once and keep it; it is reused by other methods.
    m_BallRb = ball.GetComponent<Rigidbody>();

    // Keep a handle to the Academy's environment parameters.
    m_ResetParams = Academy.Instance.EnvironmentParameters;

    // Must run after both fields above are assigned.
    SetResetParameters();
}
/// <summary>
/// Writes the agent's vector observations into <paramref name="sensor"/>.
/// Nothing is added when <c>useVecObs</c> is false.
/// </summary>
/// <param name="sensor">Vector sensor that receives the observations.</param>
public override void CollectObservations(VectorSensor sensor) {
    if (!useVecObs) {
        return;
    }

    // Z and X components of the platform's rotation quaternion (two floats).
    sensor.AddObservation(gameObject.transform.rotation.z);
    sensor.AddObservation(gameObject.transform.rotation.x);

    // Ball position relative to the platform.
    sensor.AddObservation(ball.transform.position - gameObject.transform.position);

    // Current velocity of the ball's Rigidbody.
    sensor.AddObservation(m_BallRb.velocity);
}
/// <summary>
/// Applies the two continuous actions as rotations of the platform about its
/// Z and X axes, then assigns the step reward: -1 and episode end when the
/// ball has left the allowed region around the platform, +0.1 otherwise.
/// </summary>
/// <param name="actionBuffers">Actions for this step; continuous indices 0 and 1 are read.</param>
public override void OnActionReceived(ActionBuffers actionBuffers) {
    var continuous = actionBuffers.ContinuousActions;

    // Each action is clamped to [-1, 1] and scaled by 2 before being used as a rotation amount.
    var tiltZ = 2f * Mathf.Clamp(continuous[0], -1f, 1f);
    var tiltX = 2f * Mathf.Clamp(continuous[1], -1f, 1f);

    var platform = gameObject.transform;

    // Rotate about Z only while the quaternion's z component is inside the
    // +/-0.25 limit for the requested direction.
    var rotZ = platform.rotation.z;
    var mayTiltZ = (rotZ < 0.25f && tiltZ > 0f) || (rotZ > -0.25f && tiltZ < 0f);
    if (mayTiltZ) {
        platform.Rotate(Vector3.forward, tiltZ);
    }

    // Same rule for X. The component is read after the Z rotation above has
    // been applied, so it reflects the updated orientation.
    var rotX = platform.rotation.x;
    var mayTiltX = (rotX < 0.25f && tiltX > 0f) || (rotX > -0.25f && tiltX < 0f);
    if (mayTiltX) {
        platform.Rotate(Vector3.right, tiltX);
    }

    // Ball offset from the platform: more than 2 units below, or more than
    // 3 units away along X or Z, counts as a failure.
    var offset = ball.transform.position - platform.position;
    var ballLost = offset.y < -2f
        || Mathf.Abs(offset.x) > 3f
        || Mathf.Abs(offset.z) > 3f;

    if (!ballLost) {
        SetReward(0.1f);
        return;
    }

    SetReward(-1f);
    EndEpisode();
}
/// <summary>
/// Resets the scene for a new episode: levels the platform, gives it a small
/// random tilt about X and Z, stops the ball, places the ball at a random
/// point above the platform, and re-applies the reset parameters.
/// </summary>
public override void OnEpisodeBegin() {
    // Start from the identity rotation. An all-zero quaternion (0, 0, 0, 0)
    // is not a valid rotation, so Quaternion.identity is used instead.
    gameObject.transform.rotation = Quaternion.identity;

    // Random tilt within [-10, 10] about X and then Z.
    gameObject.transform.Rotate(new Vector3(1, 0, 0), Random.Range(-10f, 10f));
    gameObject.transform.Rotate(new Vector3(0, 0, 1), Random.Range(-10f, 10f));

    // Stop the ball and place it 4 units above the platform with a random
    // horizontal offset within [-1.5, 1.5] on X and Z.
    m_BallRb.velocity = Vector3.zero;
    ball.transform.position = new Vector3(Random.Range(-1.5f, 1.5f), 4f, Random.Range(-1.5f, 1.5f))
        + gameObject.transform.position;

    // Reset the parameters when the Agent is reset.
    SetResetParameters();
}
/// <summary>
/// Manual control: fills the two continuous actions from the "Horizontal"
/// and "Vertical" input axes.
/// </summary>
/// <param name="actionsOut">Action buffers to write the heuristic actions into.</param>
public override void Heuristic(in ActionBuffers actionsOut) {
    var manualActions = actionsOut.ContinuousActions;

    // Action 0 takes the negated horizontal axis.
    manualActions[0] = -Input.GetAxis("Horizontal");

    // Action 1 takes the vertical axis as-is.
    manualActions[1] = Input.GetAxis("Vertical");
}
