From 3f922d24925e11f8a2ed1b4c9b64f5d1438707cb Mon Sep 17 00:00:00 2001
From: JulioJerez
Date: Tue, 17 Oct 2023 07:15:09 -0700
Subject: [PATCH] start with continue policy grad (wip)

---
 .../ndSandbox/demos/ndCartpoleDiscrete.cpp    |   6 +-
 .../dBrain/ndBrainAgentContinueVPG_Trainer.h  | 472 ++++++++++++++++++
 .../dBrain/ndBrainAgentDiscreteVPG_Trainer.h  |   4 +-
 ...ePolicyGrad.h => ndBrainAgentPolicyGrad.h} |  36 +-
 newton-4.00/sdk/dBrain/ndBrainInc.h           |   3 +-
 5 files changed, 497 insertions(+), 24 deletions(-)
 create mode 100644 newton-4.00/sdk/dBrain/ndBrainAgentContinueVPG_Trainer.h
 rename newton-4.00/sdk/dBrain/{ndBrainAgentDiscretePolicyGrad.h => ndBrainAgentPolicyGrad.h} (67%)

diff --git a/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp b/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp
index 0a5d61c728..cd79b9a406 100644
--- a/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp
+++ b/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp
@@ -22,7 +22,7 @@
 
 namespace ndCarpole_0
 {
-	//#define D_TRAIN_AGENT
+	#define D_TRAIN_AGENT
 
 	#define D_USE_VANILLA_POLICY_GRAD
 	//#define D_USE_PROXIMA_POLICY_GRAD
@@ -50,7 +50,7 @@ namespace ndCarpole_0
 	public:
 
 #ifdef D_USE_VANILLA_POLICY_GRAD
-	class ndCartpoleAgent : public ndBrainAgentDiscretePolicyGrad
+	class ndCartpoleAgent : public ndBrainAgentPolicyGrad
 #else
 	class ndCartpoleAgent : public ndBrainAgentDQN
 #endif
@@ -58,7 +58,7 @@ namespace ndCarpole_0
 		public:
 #ifdef D_USE_VANILLA_POLICY_GRAD
 		ndCartpoleAgent(ndSharedPtr<ndBrain>& actor)
-			:ndBrainAgentDiscretePolicyGrad(actor)
+			:ndBrainAgentPolicyGrad(actor)
 			,m_model(nullptr)
 		{
 		}
diff --git a/newton-4.00/sdk/dBrain/ndBrainAgentContinueVPG_Trainer.h b/newton-4.00/sdk/dBrain/ndBrainAgentContinueVPG_Trainer.h
new file mode 100644
index 0000000000..30eda9a544
--- /dev/null
+++ b/newton-4.00/sdk/dBrain/ndBrainAgentContinueVPG_Trainer.h
@@ -0,0 +1,472 @@
+/* Copyright (c) <2003-2022>
+* 
+* This software is provided 'as-is', without any express or implied
+* warranty. In no event will the authors be held liable for any damages
+* arising from the use of this software.
+* 
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+* 
+* 1. The origin of this software must not be misrepresented; you must not
+* claim that you wrote the original software. If you use this software
+* in a product, an acknowledgment in the product documentation would be
+* appreciated but is not required.
+* 
+* 2. Altered source versions must be plainly marked as such, and must not be
+* misrepresented as being the original software.
+* 
+* 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _ND_DQN_BRAIN_AGENT_CONTINUE_VPG_TRAINER_H__
+#define _ND_DQN_BRAIN_AGENT_CONTINUE_VPG_TRAINER_H__
+
+#include "ndBrainStdafx.h"
+#include "ndBrain.h"
+#include "ndBrainAgent.h"
+#include "ndBrainTrainer.h"
+#include "ndBrainReplayBuffer.h"
+#include "ndBrainLossLeastSquaredError.h"
+
+// this is an implementation of the vanilla policy Gradient as described in:
+// https://spinningup.openai.com/en/latest/algorithms/vpg.html
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+class ndBrainAgentContinueVPG_Trainer : public ndBrainAgent, public ndBrainThreadPool
+{
+	public:
+	class HyperParameters
+	{
+		public:
+		HyperParameters()
+		{
+			m_bashBufferSize = 32;
+			m_numberOfHiddenLayers = 3;
+			m_maxTrajectorySteps = 1024 * 2;
+			m_extraTrajectorySteps = 1024 * 2;
+
+			m_hiddenLayersNumberOfNeurons = 64;
+
+			m_learnRate = ndBrainFloat(0.0005f);
+			m_regularizer = ndBrainFloat(1.0e-6f);
+			m_discountFactor = ndBrainFloat(0.99f);
+			m_threadsCount = ndMin(ndBrainThreadPool::GetMaxThreads(), ndMin(m_bashBufferSize, 16));
+			//m_threadsCount = 1;
+		}
+
+		ndBrainFloat m_learnRate;
+		ndBrainFloat m_regularizer;
+		ndBrainFloat m_discountFactor;
+
+		ndInt32 m_threadsCount;
+		ndInt32 m_bashBufferSize;
+		ndInt32 m_maxTrajectorySteps;
+		ndInt32 m_extraTrajectorySteps;
+		ndInt32 m_numberOfHiddenLayers;
+		ndInt32 m_hiddenLayersNumberOfNeurons;
+	};
+
+	class ndTrajectoryStep
+	{
+		public:
+		ndTrajectoryStep()
+			:m_observation()
+			,m_reward(ndBrainFloat(0.0f))
+			,m_action(0)
+		{
+		}
+
+		ndTrajectoryStep(const ndTrajectoryStep& src)
+			:m_observation(src.m_observation)
+			,m_reward(src.m_reward)
+			,m_action(src.m_action)
+		{
+		}
+
+		ndTrajectoryStep& operator=(const ndTrajectoryStep& src)
+		{
+			new (this) ndTrajectoryStep(src);
+			return*this;
+		}
+
+		ndBrainFixSizeVector<statesDim> m_observation;
+		ndBrainFloat m_reward;
+		ndBrainFloat m_action;
+	};
+
+	ndBrainAgentContinueVPG_Trainer(const HyperParameters& hyperParameters);
+	virtual ~ndBrainAgentContinueVPG_Trainer();
+
+	ndInt32 GetFramesCount() const;
+	ndInt32 GetEposideCount() const;
+	ndInt32 GetEpisodeFrames() const;
+
+	bool IsTrainer() const;
+
+	protected:
+	void Step();
+	void OptimizeStep();
+
+	void Save(ndBrainSave* const loadSave) const;
+
+	void InitWeights();
+	void InitWeights(ndBrainFloat weighVariance, ndBrainFloat biasVariance);
+
+	bool IsSampling() const;
+	bool IsTerminal() const;
+	ndBrainFloat GetReward() const;
+	void AddExploration(ndBrainFloat* const actions);
+
+	private:
+	void Optimize();
+	void BackPropagate();
+
+	void CalcucateRewards();
+	ndBrainFloat SelectAction(const ndBrainVector& probabilities) const;
+
+	protected:
+	ndBrain m_actor;
+	ndBrainOptimizerAdam* m_optimizer;
+	ndArray<ndBrainTrainer*> m_trainers;
+	ndArray<ndBrainTrainer*> m_weightedTrainer;
+	ndArray<ndBrainTrainer*> m_auxiliaryTrainers;
+
+	ndArray<ndTrajectoryStep> m_trajectory;
+	ndBrainVector m_rewards;
+
+	ndBrainFloat m_gamma;
+	ndBrainFloat m_learnRate;
+	ndInt32 m_frameCount;
+	ndInt32 m_framesAlive;
+	ndInt32 m_eposideCount;
+	ndInt32 m_bashBufferSize;
+	ndInt32 m_maxTrajectorySteps;
+	ndInt32 m_extraTrajectorySteps;
+	ndMovingAverage<256> m_averageQvalue;
+	ndMovingAverage<256> m_averageFramesPerEpisodes;
+};
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::ndBrainAgentContinueVPG_Trainer(const HyperParameters& hyperParameters)
+	:ndBrainAgent()
+	,ndBrainThreadPool()
+	,m_actor()
+	,m_trajectory()
+	,m_rewards()
+	,m_gamma(hyperParameters.m_discountFactor)
+	,m_learnRate(hyperParameters.m_learnRate)
+	,m_frameCount(0)
+	,m_framesAlive(0)
+	,m_eposideCount(0)
+	,m_bashBufferSize(hyperParameters.m_bashBufferSize)
+	,m_maxTrajectorySteps(hyperParameters.m_maxTrajectorySteps)
+	,m_extraTrajectorySteps(hyperParameters.m_extraTrajectorySteps)
+	,m_averageQvalue()
+	,m_averageFramesPerEpisodes()
+{
+	// build neural net
+	ndFixSizeArray<ndBrainLayer*, 32> layers;
+	layers.PushBack(new ndBrainLayerLinear(m_stateSize, hyperParameters.m_hiddenLayersNumberOfNeurons));
+	layers.PushBack(new ndBrainLayerTanhActivation(layers[layers.GetCount() - 1]->GetOutputSize()));
+	for (ndInt32 i = 1; i < hyperParameters.m_numberOfHiddenLayers; ++i)
+	{
+		ndAssert(layers[layers.GetCount() - 1]->GetOutputSize() == hyperParameters.m_hiddenLayersNumberOfNeurons);
+		layers.PushBack(new ndBrainLayerLinear(hyperParameters.m_hiddenLayersNumberOfNeurons, hyperParameters.m_hiddenLayersNumberOfNeurons));
+		layers.PushBack(new ndBrainLayerTanhActivation(hyperParameters.m_hiddenLayersNumberOfNeurons));
+	}
+	layers.PushBack(new ndBrainLayerLinear(hyperParameters.m_hiddenLayersNumberOfNeurons, m_actionsSize));
+	layers.PushBack(new ndBrainLayerSoftmaxActivation(m_actionsSize));
+	for (ndInt32 i = 0; i < layers.GetCount(); ++i)
+	{
+		m_actor.AddLayer(layers[i]);
+		//m_target.AddLayer(layers[i]->Clone());
+	}
+	InitWeights();
+
+	m_trainers.SetCount(0);
+	m_auxiliaryTrainers.SetCount(0);
+	SetThreadCount(hyperParameters.m_threadsCount);
+	for (ndInt32 i = 0; i < m_bashBufferSize; ++i)
+	{
+		ndBrainTrainer* const trainer = new ndBrainTrainer(&m_actor);
+		m_trainers.PushBack(trainer);
+
+		ndBrainTrainer* const auxiliaryTrainer = new ndBrainTrainer(&m_actor);
+		m_auxiliaryTrainers.PushBack(auxiliaryTrainer);
+	}
+
+	m_weightedTrainer.PushBack(m_trainers[0]);
+	m_optimizer = new ndBrainOptimizerAdam();
+	m_optimizer->SetRegularizer(hyperParameters.m_regularizer);
+
+	m_rewards.SetCount(m_maxTrajectorySteps + m_extraTrajectorySteps);
+	m_trajectory.SetCount(m_maxTrajectorySteps + m_extraTrajectorySteps);
+	m_trajectory.SetCount(0);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::~ndBrainAgentContinueVPG_Trainer()
+{
+	for (ndInt32 i = 0; i < m_trainers.GetCount(); ++i)
+	{
+		delete m_trainers[i];
+		delete m_auxiliaryTrainers[i];
+	}
+	delete m_optimizer;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+bool ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::IsTrainer() const
+{
+	return true;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::InitWeights()
+{
+	m_actor.InitWeightsXavierMethod();
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::InitWeights(ndBrainFloat weighVariance, ndBrainFloat biasVariance)
+{
+	ndAssert(0);
+	m_actor.InitWeights(weighVariance, biasVariance);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndInt32 ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::GetFramesCount() const
+{
+	return m_frameCount;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+bool ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::IsSampling() const
+{
+	return false;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndInt32 ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::GetEposideCount() const
+{
+	return m_eposideCount;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndInt32 ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::GetEpisodeFrames() const
+{
+	return m_framesAlive;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::BackPropagate()
+{
+	auto ClearGradients = ndMakeObject::ndFunction([this](ndInt32 threadIndex, ndInt32 threadCount)
+	{
+		const ndStartEnd startEnd(m_trainers.GetCount(), threadIndex, threadCount);
+		for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
+		{
+			ndBrainTrainer* const trainer = m_trainers[i];
+			trainer->ClearGradients();
+		}
+	});
+	ndBrainThreadPool::ParallelExecute(ClearGradients);
+
+	const ndInt32 steps = ndMin(m_maxTrajectorySteps, m_trajectory.GetCount());
+
+	for (ndInt32 base = 0; base < steps; base += m_bashBufferSize)
+	{
+		auto CalculateGradients = ndMakeObject::ndFunction([this, base](ndInt32 threadIndex, ndInt32 threadCount)
+		{
+			class Loss : public ndBrainLossLeastSquaredError
+			{
+				public:
+				Loss(ndBrainTrainer& trainer, ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>* const agent, ndInt32 index)
+					:ndBrainLossLeastSquaredError(trainer.GetBrain()->GetOutputSize())
+					,m_trainer(trainer)
+					,m_agent(agent)
+					,m_index(index)
+				{
+				}
+
+				void GetLoss(const ndBrainVector& output, ndBrainVector& loss)
+				{
+					const ndBrainVector& rewards = m_agent->m_rewards;
+					ndInt32 actionIndex = ndInt32 (m_agent->m_trajectory[m_index].m_action);
+					loss.Set(ndBrainFloat(0.0f));
+					ndBrainFloat negLogProb = -ndLog(output[actionIndex]);
+					loss[actionIndex] = negLogProb * rewards[m_index];
+				}
+
+				ndBrainTrainer& m_trainer;
+				ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>* m_agent;
+				ndInt32 m_index;
+			};
+
+			ndBrainFixSizeVector<statesDim> observations;
+			const ndStartEnd startEnd(m_bashBufferSize, threadIndex, threadCount);
+			for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
+			{
+				ndBrainTrainer& trainer = *m_auxiliaryTrainers[i];
+				Loss loss(trainer, this, base + i);
+				if ((base + i) < m_trajectory.GetCount())
+				{
+					trainer.BackPropagate(m_trajectory[base + i].m_observation, loss);
+				}
+				else
+				{
+					trainer.ClearGradients();
+				}
+			}
+		});
+
+		ndBrainThreadPool::ParallelExecute(CalculateGradients);
+
+		auto AddGradients = ndMakeObject::ndFunction([this](ndInt32 threadIndex, ndInt32 threadCount)
+		{
+			const ndStartEnd startEnd(m_trainers.GetCount(), threadIndex, threadCount);
+			for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
+			{
+				ndBrainTrainer* const trainer = m_trainers[i];
+				const ndBrainTrainer* const auxiliaryTrainer = m_auxiliaryTrainers[i];
+				trainer->AddGradients(auxiliaryTrainer);
+			}
+		});
+		ndBrainThreadPool::ParallelExecute(AddGradients);
+	}
+
+	m_optimizer->AccumulateGradients(this, m_trainers);
+	m_weightedTrainer[0]->ScaleWeights(ndBrainFloat(1.0f) / ndBrainFloat(m_trajectory.GetCount()));
+	m_optimizer->Update(this, m_weightedTrainer, -m_learnRate);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::Save(ndBrainSave* const loadSave) const
+{
+	loadSave->Save(&m_actor);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+bool ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::IsTerminal() const
+{
+	ndAssert(0);
+	return false;
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndBrainFloat ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::GetReward() const
+{
+	ndAssert(0);
+	return ndBrainFloat(0.0f);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::CalcucateRewards()
+{
+	const ndInt32 steps = m_trajectory.GetCount();
+	#if 0
+	for (ndInt32 i = 0; i < steps; ++i)
+	{
+		ndBrainFloat sum = ndBrainFloat(0.0f);
+		ndBrainFloat discount = ndBrainFloat(1.0f);
+		for (ndInt32 j = i; j < steps; ++j)
+		{
+			sum += discount * m_trajectory[j].m_reward;
+			discount *= m_gamma;
+		}
+		m_rewards[i] = sum;
+	}
+	#endif
+
+	// using the Bellman equation.
+	m_rewards.SetCount(steps);
+	m_rewards[steps - 1] = m_trajectory[steps - 1].m_reward;
+	for (ndInt32 i = steps - 2; i >= 0; --i)
+	{
+		m_rewards[i] = m_trajectory[i].m_reward + m_gamma * m_rewards[i + 1];
+	}
+
+	m_averageQvalue.Update(m_rewards[0]);
+	m_averageFramesPerEpisodes.Update(ndBrainFloat(steps));
+	m_rewards.GaussianNormalize();
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::Optimize()
+{
+	CalcucateRewards();
+	BackPropagate();
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::AddExploration(ndBrainFloat* const)
+{
+	ndAssert(0);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+ndBrainFloat ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::SelectAction(const ndBrainVector& probabilities) const
+{
+	ndBrainFixSizeVector<actionDim + 1> pdf;
+
+	pdf.SetCount(0);
+	ndBrainFloat sum = ndBrainFloat(0.0f);
+	for (ndInt32 i = 0; i < actionDim; ++i)
+	{
+		pdf.PushBack (sum);
+		sum += probabilities[i];
+	}
+	pdf.PushBack(sum);
+
+	ndFloat32 r = ndRand();
+	ndInt32 index = actionDim - 1;
+	for (ndInt32 i = actionDim - 1; i >= 0; --i)
+	{
+		index = i;
+		if (pdf[i] < r)
+		{
+			break;
+		}
+	}
+	return ndBrainFloat (index);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::Step()
+{
+	ndTrajectoryStep trajectoryStep;
+	ndBrainFixSizeVector<actionDim> probability;
+
+	GetObservation(&trajectoryStep.m_observation[0]);
+	m_actor.MakePrediction(trajectoryStep.m_observation, probability);
+	trajectoryStep.m_action = SelectAction(probability);
+	ApplyActions(&trajectoryStep.m_action);
+	trajectoryStep.m_reward = GetReward();
+
+	ndAssert(m_trajectory.GetCount() < m_trajectory.GetCapacity());
+	m_trajectory.PushBack(trajectoryStep);
+}
+
+template<ndInt32 statesDim, ndInt32 actionDim>
+void ndBrainAgentContinueVPG_Trainer<statesDim, actionDim>::OptimizeStep()
+{
+	if (!m_frameCount)
+	{
+		ResetModel();
+	}
+
+	bool isTeminal = IsTerminal() || (m_trajectory.GetCount() == (m_extraTrajectorySteps + m_maxTrajectorySteps));
+	if (isTeminal)
+	{
+		Optimize();
+		ResetModel();
+		m_trajectory.SetCount(0);
+		m_eposideCount++;
+		m_framesAlive = 0;
+	}
+
+	m_frameCount++;
+	m_framesAlive++;
+}
+
+#endif 
\ No newline at end of file
diff --git a/newton-4.00/sdk/dBrain/ndBrainAgentDiscreteVPG_Trainer.h b/newton-4.00/sdk/dBrain/ndBrainAgentDiscreteVPG_Trainer.h
index e35dc92a44..121eb16d7c 100644
--- a/newton-4.00/sdk/dBrain/ndBrainAgentDiscreteVPG_Trainer.h
+++ b/newton-4.00/sdk/dBrain/ndBrainAgentDiscreteVPG_Trainer.h
@@ -19,8 +19,8 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */
 
-#ifndef _ND_DQN_BRAIN_AGENT_DESCRETE_POLICY_GRAD_TRAINER_H__
-#define _ND_DQN_BRAIN_AGENT_DESCRETE_POLICY_GRAD_TRAINER_H__
+#ifndef _ND_DQN_BRAIN_AGENT_DESCRETE_VPG_H__
+#define _ND_DQN_BRAIN_AGENT_DESCRETE_VPG_H__
 
 #include "ndBrainStdafx.h"
 #include "ndBrain.h"
diff --git a/newton-4.00/sdk/dBrain/ndBrainAgentDiscretePolicyGrad.h b/newton-4.00/sdk/dBrain/ndBrainAgentPolicyGrad.h
similarity index 67%
rename from newton-4.00/sdk/dBrain/ndBrainAgentDiscretePolicyGrad.h
rename to newton-4.00/sdk/dBrain/ndBrainAgentPolicyGrad.h
index 3999162c52..01ff46108b 100644
--- a/newton-4.00/sdk/dBrain/ndBrainAgentDiscretePolicyGrad.h
+++ b/newton-4.00/sdk/dBrain/ndBrainAgentPolicyGrad.h
@@ -19,8 +19,8 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */
 
-#ifndef _ND_DQN_BRAIN_AGENT_DESCRETE_POLICY_GRAD_H__
-#define _ND_DQN_BRAIN_AGENT_DESCRETE_POLICY_GRAD_H__
+#ifndef _ND_DQN_BRAIN_AGENT_POLICY_GRAD_H__
+#define _ND_DQN_BRAIN_AGENT_POLICY_GRAD_H__
 
 #include "ndBrainStdafx.h"
 #include "ndBrain.h"
@@ -29,11 +29,11 @@
 #include "ndBrainReplayBuffer.h"
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-class ndBrainAgentDiscretePolicyGrad: public ndBrainAgent
+class ndBrainAgentPolicyGrad: public ndBrainAgent
 {
 	public:
-	ndBrainAgentDiscretePolicyGrad(const ndSharedPtr<ndBrain>& actor);
-	~ndBrainAgentDiscretePolicyGrad();
+	ndBrainAgentPolicyGrad(const ndSharedPtr<ndBrain>& actor);
+	~ndBrainAgentPolicyGrad();
 
 	void Step();
@@ -55,7 +55,7 @@ class ndBrainAgentPolicyGrad: public ndBrainAgent
 };
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::ndBrainAgentDiscretePolicyGrad(const ndSharedPtr<ndBrain>& actor)
+ndBrainAgentPolicyGrad<statesDim, actionDim>::ndBrainAgentPolicyGrad(const ndSharedPtr<ndBrain>& actor)
 	:ndBrainAgent()
 	,m_actor(actor)
 {
@@ -63,75 +63,75 @@ ndBrainAgentDiscretePolicyGrad::ndBrainAgentDiscretePolicy
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::~ndBrainAgentDiscretePolicyGrad()
+ndBrainAgentPolicyGrad<statesDim, actionDim>::~ndBrainAgentPolicyGrad()
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-bool ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::IsTrainer() const
+bool ndBrainAgentPolicyGrad<statesDim, actionDim>::IsTrainer() const
 {
 	return false;
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::InitWeights()
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::InitWeights()
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::InitWeights(ndBrainFloat, ndBrainFloat)
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::InitWeights(ndBrainFloat, ndBrainFloat)
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-bool ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::IsTerminal() const
+bool ndBrainAgentPolicyGrad<statesDim, actionDim>::IsTerminal() const
 {
 	ndAssert(0);
 	return false;
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::AddExploration(ndBrainFloat* const)
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::AddExploration(ndBrainFloat* const)
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-ndBrainFloat ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::GetReward() const
+ndBrainFloat ndBrainAgentPolicyGrad<statesDim, actionDim>::GetReward() const
 {
 	ndAssert(0);
 	return ndBrainFloat(0.0f);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::ResetModel() const
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::ResetModel() const
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-ndInt32 ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::GetEpisodeFrames() const
+ndInt32 ndBrainAgentPolicyGrad<statesDim, actionDim>::GetEpisodeFrames() const
 {
 	ndAssert(0);
 	return 0;
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::Save(ndBrainSave* const) const
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::Save(ndBrainSave* const) const
 {
 	ndAssert(0);
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::OptimizeStep()
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::OptimizeStep()
 {
 }
 
 template<ndInt32 statesDim, ndInt32 actionDim>
-void ndBrainAgentDiscretePolicyGrad<statesDim, actionDim>::Step()
+void ndBrainAgentPolicyGrad<statesDim, actionDim>::Step()
 {
 	ndBrainFloat stateBuffer[statesDim * 2];
 	ndBrainFloat actionBuffer[actionDim * 2];
diff --git a/newton-4.00/sdk/dBrain/ndBrainInc.h b/newton-4.00/sdk/dBrain/ndBrainInc.h
index 2e1b1b6328..30f0276456 100644
--- a/newton-4.00/sdk/dBrain/ndBrainInc.h
+++ b/newton-4.00/sdk/dBrain/ndBrainInc.h
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include <ndBrainAgentContinueVPG_Trainer.h>
 #include 
 #include 
 #include 
@@ -50,8 +51,8 @@
 #include 
 #include 
 #include 
-#include <ndBrainAgentDiscretePolicyGrad.h>
 #include 
+#include <ndBrainAgentPolicyGrad.h>
 #include 
 #include 
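
A note on the return computation in the new trainer: CalcucateRewards() above applies the backward reward-to-go recursion G[t] = r[t] + gamma * G[t+1] and then normalizes the returns before they weight the policy-gradient loss. The following standalone sketch shows that same computation with plain std::vector containers instead of the ndBrain types; the function name, the epsilon constant, and the explicit normalization loop are illustrative only and are not part of this patch.

#include <cmath>
#include <vector>

// Discounted reward-to-go, computed backwards: G[t] = r[t] + gamma * G[t+1].
// Assumes rewards has at least one entry. The trainer above does the same
// with ndBrainVector and then calls GaussianNormalize() on the result.
std::vector<float> RewardsToGo(const std::vector<float>& rewards, float gamma)
{
	std::vector<float> g(rewards.size());
	g.back() = rewards.back();
	for (int i = int(rewards.size()) - 2; i >= 0; --i)
	{
		g[i] = rewards[i] + gamma * g[i + 1];
	}

	// zero-mean, unit-variance normalization of the returns
	float mean = 0.0f;
	for (float v : g) mean += v;
	mean /= float(g.size());
	float variance = 0.0f;
	for (float v : g) variance += (v - mean) * (v - mean);
	variance /= float(g.size());
	float invSigma = 1.0f / std::sqrt(variance + 1.0e-12f);
	for (float& v : g) v = (v - mean) * invSigma;
	return g;
}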