feat: DO allow to pause cluster

Benjamin Chastanier
2021-10-27 22:58:56 +02:00
parent 9fbc4a512d
commit a40b56060c
3 changed files with 199 additions and 5 deletions


@@ -10,6 +10,7 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
  tags = local.tags_ks_list
{%- if doks_worker_nodes|length > 0 %}
  node_pool {
    tags = local.tags_ks_list
    name = var.kubernetes_cluster_id
@@ -19,4 +20,5 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
    min_nodes = "{{ doks_worker_nodes[0].min_size }}"
    max_nodes = "{{ doks_worker_nodes[0].max_size }}"
  }
{%- endif %}
}
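
The `{%- if %}` guard is what makes pausing possible at the template level: when the engine renders this file with an empty `doks_worker_nodes` list, no `node_pool` block is emitted at all, so applying the result removes every worker while the managed control plane keeps the cluster's deployment config, certificates, etc. A minimal sketch of that rendering behaviour, assuming the `tera` crate (which the engine's `crate::template` helpers appear to wrap, given `tera_context()` below) and a trimmed-down template:

use tera::{Context, Tera};

fn main() -> tera::Result<()> {
    // cut-down version of the template above; `{%- if %}` swallows the whole
    // node_pool block when the worker list is empty
    let template = r#"
resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
{%- if doks_worker_nodes|length > 0 %}
  node_pool {
    min_nodes = "{{ doks_worker_nodes[0].min_size }}"
  }
{%- endif %}
}"#;

    // pause: mirrors `context.insert("doks_worker_nodes", &worker_nodes)` with an empty Vec
    let mut paused = Context::new();
    paused.insert("doks_worker_nodes", &Vec::<String>::new());

    // prints the resource with no node_pool block at all
    println!("{}", Tera::one_off(template, &paused, false)?);
    Ok(())
}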


@@ -24,13 +24,17 @@ use crate::cloud_provider::models::WorkerNodeDataTemplate;
use crate::cloud_provider::qovery::EngineLocation;
use crate::cloud_provider::{kubernetes, CloudProvider};
use crate::cmd::helm::{helm_exec_upgrade_with_chart_info, helm_upgrade_diff_with_chart_info};
use crate::cmd::kubectl::{
    do_kubectl_exec_get_loadbalancer_id, kubectl_exec_api_custom_metrics, kubectl_exec_get_all_namespaces,
};
use crate::cmd::structs::HelmChart;
use crate::cmd::terraform::{terraform_exec, terraform_init_validate_plan_apply, terraform_init_validate_state_list};
use crate::deletion_utilities::{get_firsts_namespaces_to_delete, get_qovery_managed_namespaces};
use crate::dns_provider::DnsProvider;
use crate::error::EngineErrorCause::Internal;
use crate::error::{
    cast_simple_error_to_engine_error, EngineError, EngineErrorCause, EngineErrorScope, SimpleError, SimpleErrorKind,
};
use crate::fs::workspace_directory;
use crate::models::{
Context, Features, Listen, Listener, Listeners, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope,
@@ -39,7 +43,7 @@ use crate::object_storage::spaces::Spaces;
use crate::object_storage::ObjectStorage;
use crate::string::terraform_list_format;
use crate::{cmd, dns_provider};
use retry::delay::{Fibonacci, Fixed};
use retry::Error::Operation;
use retry::OperationResult;
use std::path::PathBuf;
@@ -828,11 +832,186 @@ impl<'a> Kubernetes for DOKS<'a> {
    }

    fn on_pause(&self) -> Result<(), EngineError> {
        info!("DOKS.on_pause() called for {}", self.name());

        let listeners_helper = ListenersHelper::new(&self.listeners);
        let send_to_customer = |message: &str| {
            listeners_helper.pause_in_progress(ProgressInfo::new(
                ProgressScope::Infrastructure {
                    execution_id: self.context.execution_id().to_string(),
                },
                ProgressLevel::Info,
                Some(message),
                self.context.execution_id(),
            ))
        };
        send_to_customer(format!("Preparing DOKS {} cluster pause with id {}", self.name(), self.id()).as_str());

        let temp_dir = workspace_directory(
            self.context.workspace_root_dir(),
            self.context.execution_id(),
            format!("bootstrap/{}", self.id()),
        )
        .map_err(|err| self.engine_error(EngineErrorCause::Internal, err.to_string()))?;

        // generate terraform files and copy them into temp dir
        let mut context = self.tera_context()?;

        // pause: remove all worker nodes to reduce the bill, but keep the master to preserve the deployment config, certificates, etc.
        let worker_nodes: Vec<WorkerNodeDataTemplate> = Vec::new();
        context.insert("doks_worker_nodes", &worker_nodes);

        let _ = cast_simple_error_to_engine_error(
            self.engine_error_scope(),
            self.context.execution_id(),
            crate::template::generate_and_copy_all_files_into_dir(
                self.template_directory.as_str(),
                temp_dir.as_str(),
                &context,
            ),
        )?;

        // copy the lib/common/bootstrap/charts directory (and its sub-directories) into the lib/digitalocean/bootstrap/common/charts directory,
        // as required by the dependencies of the lib/digitalocean/bootstrap/*.tf files
        let common_charts_temp_dir = format!("{}/common/charts", temp_dir.as_str());
        let _ = cast_simple_error_to_engine_error(
            self.engine_error_scope(),
            self.context.execution_id(),
            crate::template::copy_non_template_files(
                format!("{}/common/bootstrap/charts", self.context.lib_root_dir()),
                common_charts_temp_dir.as_str(),
            ),
        )?;

        // pause: only select the workers' Terraform elements instead of applying the whole config,
        // which would fail because of the Helm deployments running on the worker nodes being removed
        let tf_workers_resources = match terraform_init_validate_state_list(temp_dir.as_str()) {
            Ok(x) => {
                let mut tf_workers_resources_name = Vec::new();
                for name in x {
                    if name.starts_with("digitalocean_kubernetes_node_pool.") {
                        tf_workers_resources_name.push(name);
                    }
                }
                tf_workers_resources_name
            }
            Err(e) => {
                return Err(EngineError {
                    cause: EngineErrorCause::Internal,
                    scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()),
                    execution_id: self.context.execution_id().to_string(),
                    message: e.message,
                })
            }
        };

        if tf_workers_resources.is_empty() {
            return Err(EngineError {
                cause: EngineErrorCause::Internal,
                scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()),
                execution_id: self.context.execution_id().to_string(),
                message: Some("No worker nodes present, can't pause the infrastructure. This can happen if there were manual operations on the workers or if the infrastructure is already paused.".to_string()),
            });
        }

        let kubernetes_config_file_path = self.config_file_path()?;

        // pause: wait up to 1h for the engine to have 0 running jobs before pausing, to avoid ending up with an unreleased lock (from Helm or Terraform, for example)
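        // `taskmanager_nb_running_tasks` is read through the Kubernetes custom metrics API; the helper below
        // presumably wraps `kubectl get --raw /apis/custom.metrics.k8s.io/...` and returns one item per
        // engine pod exposing the metric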
        let metric_name = "taskmanager_nb_running_tasks";
        let wait_engine_job_finish = retry::retry(Fixed::from_millis(60000).take(60), || {
            return match kubectl_exec_api_custom_metrics(
                &kubernetes_config_file_path,
                self.cloud_provider().credentials_environment_variables(),
                "qovery",
                None,
                metric_name,
            ) {
                Ok(metrics) => {
                    let mut current_engine_jobs = 0;

                    for metric in metrics.items {
                        match metric.value.parse::<i32>() {
                            Ok(job_count) if job_count > 0 => current_engine_jobs += 1,
                            Err(e) => {
                                error!("error while parsing the value of API metric {}. {:?}", metric_name, e);
                                return OperationResult::Retry(SimpleError {
                                    kind: SimpleErrorKind::Other,
                                    message: Some(e.to_string()),
                                });
                            }
                            _ => {}
                        }
                    }

                    if current_engine_jobs == 0 {
                        OperationResult::Ok(())
                    } else {
                        OperationResult::Retry(SimpleError {
                            kind: SimpleErrorKind::Other,
                            message: Some("can't pause the infrastructure now, Engine jobs are currently running, retrying later...".to_string()),
                        })
                    }
                }
                Err(e) => {
                    error!("error while querying the API metric {}. {:?}", metric_name, e);
                    OperationResult::Retry(e)
                }
            };
        });

        match wait_engine_job_finish {
            Ok(_) => info!("no jobs currently running on the Engine, infrastructure pause is allowed to start"),
            Err(Operation { error, .. }) => {
                return Err(EngineError {
                    cause: EngineErrorCause::Internal,
                    scope: EngineErrorScope::Engine,
                    execution_id: self.context.execution_id().to_string(),
                    message: error.message,
                })
            }
            Err(retry::Error::Internal(msg)) => {
                return Err(EngineError::new(
                    EngineErrorCause::Internal,
                    EngineErrorScope::Engine,
                    self.context.execution_id(),
                    Some(msg),
                ))
            }
        }
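        // build a targeted apply so that only the worker node pools listed in the state are touched,
        // leaving the rest of the Terraform-managed infrastructure alone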
        let mut terraform_args_string = vec!["apply".to_string(), "-auto-approve".to_string()];
        for x in tf_workers_resources {
            terraform_args_string.push(format!("-target={}", x));
        }
        let terraform_args = terraform_args_string.iter().map(|x| &**x).collect();

        let message = format!("Pausing DOKS {} cluster deployment with id {}", self.name(), self.id());
        info!("{}", &message);
        send_to_customer(&message);

        match cast_simple_error_to_engine_error(
            self.engine_error_scope(),
            self.context.execution_id(),
            terraform_exec(temp_dir.as_str(), terraform_args),
        ) {
            Ok(_) => {
                let message = format!("Kubernetes cluster {} successfully paused", self.name());
                info!("{}", &message);
                send_to_customer(&message);
                Ok(())
            }
            Err(e) => {
                error!("Error while pausing cluster {} with id {}.", self.name(), self.id());
                Err(e)
            }
        }
    }
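
The wait-for-idle loop above leans on the `retry` crate: `retry::retry` reports exhausted retries as `Error::Operation` (carrying the closure's last error) and misuse of the crate as `Error::Internal`, which is why both arms are matched. A self-contained sketch of the same pattern, assuming the retry 1.x API imported above, with shorter delays and a hypothetical job counter standing in for the metrics call:

use retry::delay::Fixed;
use retry::{retry, Error, OperationResult};

fn main() {
    // hypothetical stand-in for the engine job count read from custom metrics
    let mut remaining_jobs = 3;

    // 5 attempts, 10 ms apart; on_pause() uses Fixed::from_millis(60000).take(60), i.e. up to 1 h
    let result = retry(Fixed::from_millis(10).take(5), || {
        if remaining_jobs == 0 {
            OperationResult::Ok(())
        } else {
            remaining_jobs -= 1;
            OperationResult::Retry("engine jobs still running")
        }
    });

    match result {
        Ok(()) => println!("idle, safe to pause"),
        // Operation carries the last closure error once every scheduled attempt has been consumed
        Err(Error::Operation { error, tries, .. }) => println!("gave up after {} tries: {}", tries, error),
        Err(Error::Internal(msg)) => println!("retry misuse: {}", msg),
    }
}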
    fn on_pause_error(&self) -> Result<(), EngineError> {
        warn!("DOKS.on_pause_error() called for {}", self.name());

        Err(self.engine_error(
            EngineErrorCause::Internal,
            format!("{} Kubernetes cluster failed to pause", self.name()),
        ))
    }
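
For reference, `terraform_exec(temp_dir.as_str(), terraform_args)` above runs a targeted apply over the worker node pools only. A bare-bones sketch of what such a call boils down to, using only std::process and hypothetical paths and resource addresses (the real ones come from `workspace_directory` and `terraform_init_validate_state_list`):

use std::process::Command;

fn main() -> std::io::Result<()> {
    // hypothetical working directory and -target address; every -target flag
    // pins one worker node pool, mirroring the tf_workers_resources loop above
    let status = Command::new("terraform")
        .current_dir("/tmp/bootstrap/my-cluster-id")
        .arg("apply")
        .arg("-auto-approve")
        .arg("-target=digitalocean_kubernetes_node_pool.example")
        .status()?;

    println!("terraform exited with: {}", status);
    Ok(())
}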
    fn on_delete(&self) -> Result<(), EngineError> {


@@ -173,6 +173,7 @@ fn create_and_destroy_doks_cluster_ams_3() {
    create_and_destroy_doks_cluster(region, secrets, false, function_name!());
}
#[cfg(feature = "test-do-infra")]
#[test]
#[ignore]
#[allow(dead_code)]
@@ -183,3 +184,15 @@ fn create_upgrade_and_destroy_doks_cluster_in_nyc_3() {
    let secrets = FuncTestsSecrets::new();
    create_upgrade_and_destroy_doks_cluster(region, secrets, "1.19", "1.20", function_name!());
}
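// `true` presumably enables the new pause step inside the shared create/destroy helper
// (assumption based on this test's name; the sibling tests above pass `false`)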
#[cfg(feature = "test-do-infra")]
#[test]
#[ignore]
#[allow(dead_code)]
#[allow(unused_attributes)]
#[named]
fn create_pause_and_destroy_doks_cluster_ams_3() {
    let region = Region::Amsterdam3;
    let secrets = FuncTestsSecrets::new();
    create_and_destroy_doks_cluster(region, secrets, true, function_name!());
}