From a40b56060c46185fdb91ee1e387d022c1c32f918 Mon Sep 17 00:00:00 2001
From: Benjamin Chastanier
Date: Wed, 27 Oct 2021 22:58:56 +0200
Subject: [PATCH] feat: allow pausing DO clusters

---
 .../bootstrap/doks-master-cluster.j2.tf      |   2 +
 .../digitalocean/kubernetes/mod.rs           | 189 +++++++++++++++++-
 tests/digitalocean/do_kubernetes.rs          |  13 ++
 3 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/lib/digitalocean/bootstrap/doks-master-cluster.j2.tf b/lib/digitalocean/bootstrap/doks-master-cluster.j2.tf
index f44645c6..5b5925a6 100644
--- a/lib/digitalocean/bootstrap/doks-master-cluster.j2.tf
+++ b/lib/digitalocean/bootstrap/doks-master-cluster.j2.tf
@@ -10,6 +10,7 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
 
   tags = local.tags_ks_list
 
+{%- if doks_worker_nodes|length > 0 %}
   node_pool {
     tags = local.tags_ks_list
     name = var.kubernetes_cluster_id
@@ -19,4 +20,5 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
     min_nodes = "{{ doks_worker_nodes[0].min_size }}"
     max_nodes = "{{ doks_worker_nodes[0].max_size }}"
  }
+{%- endif %}
 }
\ No newline at end of file
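
Note on the template guard above: pausing works by rendering this template with an empty doks_worker_nodes list, which drops the whole node_pool block from the generated Terraform while the cluster resource itself stays. Below is a minimal sketch of that behaviour using the tera crate (the engine's template renderer); the inline template is a trimmed, hypothetical stand-in for doks-master-cluster.j2.tf:

    use tera::{Context, Tera};

    fn main() -> Result<(), tera::Error> {
        // Trimmed stand-in for doks-master-cluster.j2.tf: only the guard matters here.
        let template = r#"resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
{%- if doks_worker_nodes|length > 0 %}
  node_pool {}
{%- endif %}
}"#;

        let mut context = Context::new();
        // An empty worker list is exactly what DOKS::on_pause() injects below.
        context.insert("doks_worker_nodes", &Vec::<String>::new());

        let rendered = Tera::one_off(template, &context, false)?;
        assert!(!rendered.contains("node_pool"));
        Ok(())
    }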
{}", self.name(), self.id()).as_str()); + + let temp_dir = workspace_directory( + self.context.workspace_root_dir(), + self.context.execution_id(), + format!("bootstrap/{}", self.id()), + ) + .map_err(|err| self.engine_error(EngineErrorCause::Internal, err.to_string()))?; + + // generate terraform files and copy them into temp dir + let mut context = self.tera_context()?; + + // pause: remove all worker nodes to reduce the bill but keep master to keep all the deployment config, certificates etc... + let worker_nodes: Vec = Vec::new(); + context.insert("doks_worker_nodes", &worker_nodes); + + let _ = cast_simple_error_to_engine_error( + self.engine_error_scope(), + self.context.execution_id(), + crate::template::generate_and_copy_all_files_into_dir( + self.template_directory.as_str(), + temp_dir.as_str(), + &context, + ), + )?; + + // copy lib/common/bootstrap/charts directory (and sub directory) into the lib/digitalocean/bootstrap/common/charts directory. + // this is due to the required dependencies of lib/digitalocean/bootstrap/*.tf files + let common_charts_temp_dir = format!("{}/common/charts", temp_dir.as_str()); + let _ = cast_simple_error_to_engine_error( + self.engine_error_scope(), + self.context.execution_id(), + crate::template::copy_non_template_files( + format!("{}/common/bootstrap/charts", self.context.lib_root_dir()), + common_charts_temp_dir.as_str(), + ), + )?; + + // pause: only select terraform workers elements to pause to avoid applying on the whole config + // this to avoid failures because of helm deployments on removing workers nodes + let tf_workers_resources = match terraform_init_validate_state_list(temp_dir.as_str()) { + Ok(x) => { + let mut tf_workers_resources_name = Vec::new(); + for name in x { + if name.starts_with("digitalocean_kubernetes_node_pool.") { + tf_workers_resources_name.push(name); + } + } + tf_workers_resources_name + } + Err(e) => { + return Err(EngineError { + cause: EngineErrorCause::Internal, + scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()), + execution_id: self.context.execution_id().to_string(), + message: e.message, + }) + } + }; + if tf_workers_resources.is_empty() { + return Err(EngineError { + cause: EngineErrorCause::Internal, + scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()), + execution_id: self.context.execution_id().to_string(), + message: Some("No worker nodes present, can't Pause the infrastructure. This can happen if there where a manual operations on the workers or the infrastructure is already pause.".to_string()), + }); + } + + let kubernetes_config_file_path = self.config_file_path()?; + + // pause: wait 1h for the engine to have 0 running jobs before pausing and avoid getting unreleased lock (from helm or terraform for example) + let metric_name = "taskmanager_nb_running_tasks"; + let wait_engine_job_finish = retry::retry(Fixed::from_millis(60000).take(60), || { + return match kubectl_exec_api_custom_metrics( + &kubernetes_config_file_path, + self.cloud_provider().credentials_environment_variables(), + "qovery", + None, + metric_name, + ) { + Ok(metrics) => { + let mut current_engine_jobs = 0; + + for metric in metrics.items { + match metric.value.parse::() { + Ok(job_count) if job_count > 0 => current_engine_jobs += 1, + Err(e) => { + error!("error while looking at the API metric value {}. 
{:?}", metric_name, e); + return OperationResult::Retry(SimpleError { + kind: SimpleErrorKind::Other, + message: Some(e.to_string()), + }); + } + _ => {} + } + } + + if current_engine_jobs == 0 { + OperationResult::Ok(()) + } else { + OperationResult::Retry(SimpleError { + kind: SimpleErrorKind::Other, + message: Some("can't pause the infrastructure now, Engine jobs are currently running, retrying later...".to_string()), + }) + } + } + Err(e) => { + error!("error while looking at the API metric value {}. {:?}", metric_name, e); + OperationResult::Retry(e) + } + }; + }); + + match wait_engine_job_finish { + Ok(_) => info!("no current running jobs on the Engine, infrastructure pause is allowed to start"), + Err(Operation { error, .. }) => { + return Err(EngineError { + cause: EngineErrorCause::Internal, + scope: EngineErrorScope::Engine, + execution_id: self.context.execution_id().to_string(), + message: error.message, + }) + } + Err(retry::Error::Internal(msg)) => { + return Err(EngineError::new( + EngineErrorCause::Internal, + EngineErrorScope::Engine, + self.context.execution_id(), + Some(msg), + )) + } + } + + let mut terraform_args_string = vec!["apply".to_string(), "-auto-approve".to_string()]; + for x in tf_workers_resources { + terraform_args_string.push(format!("-target={}", x)); + } + let terraform_args = terraform_args_string.iter().map(|x| &**x).collect(); + + let message = format!("Pausing DOKS {} cluster deployment with id {}", self.name(), self.id()); + info!("{}", &message); + send_to_customer(&message); + + match cast_simple_error_to_engine_error( + self.engine_error_scope(), + self.context.execution_id(), + terraform_exec(temp_dir.as_str(), terraform_args), + ) { + Ok(_) => { + let message = format!("Kubernetes cluster {} successfully paused", self.name()); + info!("{}", &message); + send_to_customer(&message); + Ok(()) + } + Err(e) => { + error!("Error while pausing cluster {} with id {}.", self.name(), self.id()); + Err(e) + } + } } fn on_pause_error(&self) -> Result<(), EngineError> { - todo!() + warn!("DOKS.on_pause_error() called for {}", self.name()); + Err(self.engine_error( + EngineErrorCause::Internal, + format!("{} Kubernetes cluster failed to pause", self.name()), + )) } fn on_delete(&self) -> Result<(), EngineError> { diff --git a/tests/digitalocean/do_kubernetes.rs b/tests/digitalocean/do_kubernetes.rs index cb672c1b..b94e0f35 100644 --- a/tests/digitalocean/do_kubernetes.rs +++ b/tests/digitalocean/do_kubernetes.rs @@ -173,6 +173,7 @@ fn create_and_destroy_doks_cluster_ams_3() { create_and_destroy_doks_cluster(region, secrets, false, function_name!()); } +#[cfg(feature = "test-do-infra")] #[test] #[ignore] #[allow(dead_code)] @@ -183,3 +184,15 @@ fn create_upgrade_and_destroy_doks_cluster_in_nyc_3() { let secrets = FuncTestsSecrets::new(); create_upgrade_and_destroy_doks_cluster(region, secrets, "1.19", "1.20", function_name!()); } + +#[cfg(feature = "test-do-infra")] +#[test] +#[ignore] +#[allow(dead_code)] +#[allow(unused_attributes)] +#[named] +fn create_pause_and_destroy_kapsule_cluster_ams_3() { + let zone = Region::Amsterdam3; + let secrets = FuncTestsSecrets::new(); + create_and_destroy_doks_cluster(zone, secrets, true, function_name!()); +}