mirror of
https://github.com/jlengrand/engine.git
synced 2026-03-10 08:11:21 +00:00
feat: allow pausing a DigitalOcean (DO) cluster
This commit is contained in:
@@ -10,6 +10,7 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
|
||||
|
||||
tags = local.tags_ks_list
|
||||
|
||||
{%- if doks_worker_nodes|length > 0 %}
|
||||
node_pool {
|
||||
tags = local.tags_ks_list
|
||||
name = var.kubernetes_cluster_id
|
||||
@@ -19,4 +20,5 @@ resource "digitalocean_kubernetes_cluster" "kubernetes_cluster" {
|
||||
min_nodes = "{{ doks_worker_nodes[0].min_size }}"
|
||||
max_nodes = "{{ doks_worker_nodes[0].max_size }}"
|
||||
}
|
||||
{%- endif %}
|
||||
}
|
||||
@@ -24,13 +24,17 @@ use crate::cloud_provider::models::WorkerNodeDataTemplate;
|
||||
use crate::cloud_provider::qovery::EngineLocation;
|
||||
use crate::cloud_provider::{kubernetes, CloudProvider};
|
||||
use crate::cmd::helm::{helm_exec_upgrade_with_chart_info, helm_upgrade_diff_with_chart_info};
|
||||
use crate::cmd::kubectl::{do_kubectl_exec_get_loadbalancer_id, kubectl_exec_get_all_namespaces};
|
||||
use crate::cmd::kubectl::{
|
||||
do_kubectl_exec_get_loadbalancer_id, kubectl_exec_api_custom_metrics, kubectl_exec_get_all_namespaces,
|
||||
};
|
||||
use crate::cmd::structs::HelmChart;
|
||||
use crate::cmd::terraform::{terraform_exec, terraform_init_validate_plan_apply, terraform_init_validate_state_list};
|
||||
use crate::deletion_utilities::{get_firsts_namespaces_to_delete, get_qovery_managed_namespaces};
|
||||
use crate::dns_provider::DnsProvider;
|
||||
use crate::error::EngineErrorCause::Internal;
|
||||
use crate::error::{cast_simple_error_to_engine_error, EngineError, EngineErrorCause, EngineErrorScope, SimpleError};
|
||||
use crate::error::{
|
||||
cast_simple_error_to_engine_error, EngineError, EngineErrorCause, EngineErrorScope, SimpleError, SimpleErrorKind,
|
||||
};
|
||||
use crate::fs::workspace_directory;
|
||||
use crate::models::{
|
||||
Context, Features, Listen, Listener, Listeners, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope,
|
||||
@@ -39,7 +43,7 @@ use crate::object_storage::spaces::Spaces;
|
||||
use crate::object_storage::ObjectStorage;
|
||||
use crate::string::terraform_list_format;
|
||||
use crate::{cmd, dns_provider};
|
||||
use retry::delay::Fibonacci;
|
||||
use retry::delay::{Fibonacci, Fixed};
|
||||
use retry::Error::Operation;
|
||||
use retry::OperationResult;
|
||||
use std::path::PathBuf;
|
||||
@@ -828,11 +832,186 @@ impl<'a> Kubernetes for DOKS<'a> {
|
||||
}
|
||||
|
||||
fn on_pause(&self) -> Result<(), EngineError> {
|
||||
todo!()
|
||||
info!("DOKS.on_pause() called for {}", self.name());
|
||||
|
||||
let listeners_helper = ListenersHelper::new(&self.listeners);
|
||||
let send_to_customer = |message: &str| {
|
||||
listeners_helper.pause_in_progress(ProgressInfo::new(
|
||||
ProgressScope::Infrastructure {
|
||||
execution_id: self.context.execution_id().to_string(),
|
||||
},
|
||||
ProgressLevel::Info,
|
||||
Some(message),
|
||||
self.context.execution_id(),
|
||||
))
|
||||
};
|
||||
send_to_customer(format!("Preparing DOKS {} cluster pause with id {}", self.name(), self.id()).as_str());
|
||||
|
||||
let temp_dir = workspace_directory(
|
||||
self.context.workspace_root_dir(),
|
||||
self.context.execution_id(),
|
||||
format!("bootstrap/{}", self.id()),
|
||||
)
|
||||
.map_err(|err| self.engine_error(EngineErrorCause::Internal, err.to_string()))?;
|
||||
|
||||
// generate terraform files and copy them into temp dir
|
||||
let mut context = self.tera_context()?;
|
||||
|
||||
// pause: remove all worker nodes to reduce the bill, but keep the master so that the deployment config, certificates, etc. are preserved
|
||||
let worker_nodes: Vec<WorkerNodeDataTemplate> = Vec::new();
|
||||
context.insert("doks_worker_nodes", &worker_nodes);
|
||||
|
||||
let _ = cast_simple_error_to_engine_error(
|
||||
self.engine_error_scope(),
|
||||
self.context.execution_id(),
|
||||
crate::template::generate_and_copy_all_files_into_dir(
|
||||
self.template_directory.as_str(),
|
||||
temp_dir.as_str(),
|
||||
&context,
|
||||
),
|
||||
)?;
|
||||
|
||||
// copy the lib/common/bootstrap/charts directory (and its subdirectories) into the lib/digitalocean/bootstrap/common/charts directory.
|
||||
// this is needed because the lib/digitalocean/bootstrap/*.tf files require these charts as dependencies
|
||||
let common_charts_temp_dir = format!("{}/common/charts", temp_dir.as_str());
|
||||
let _ = cast_simple_error_to_engine_error(
|
||||
self.engine_error_scope(),
|
||||
self.context.execution_id(),
|
||||
crate::template::copy_non_template_files(
|
||||
format!("{}/common/bootstrap/charts", self.context.lib_root_dir()),
|
||||
common_charts_temp_dir.as_str(),
|
||||
),
|
||||
)?;
|
||||
|
||||
// pause: only target the terraform worker node resources, to avoid applying the whole config
|
||||
// this avoids failures caused by helm deployments when removing worker nodes
|
||||
let tf_workers_resources = match terraform_init_validate_state_list(temp_dir.as_str()) {
|
||||
Ok(x) => {
|
||||
let mut tf_workers_resources_name = Vec::new();
|
||||
for name in x {
|
||||
if name.starts_with("digitalocean_kubernetes_node_pool.") {
|
||||
tf_workers_resources_name.push(name);
|
||||
}
|
||||
}
|
||||
tf_workers_resources_name
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(EngineError {
|
||||
cause: EngineErrorCause::Internal,
|
||||
scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()),
|
||||
execution_id: self.context.execution_id().to_string(),
|
||||
message: e.message,
|
||||
})
|
||||
}
|
||||
};
|
||||
if tf_workers_resources.is_empty() {
|
||||
return Err(EngineError {
|
||||
cause: EngineErrorCause::Internal,
|
||||
scope: EngineErrorScope::Kubernetes(self.id.clone(), self.name.clone()),
|
||||
execution_id: self.context.execution_id().to_string(),
|
||||
message: Some("No worker nodes present, can't Pause the infrastructure. This can happen if there where a manual operations on the workers or the infrastructure is already pause.".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
let kubernetes_config_file_path = self.config_file_path()?;
|
||||
|
||||
// pause: wait up to 1h (60 retries, 60s apart) for the engine to have 0 running jobs before pausing, to avoid leaving unreleased locks (from helm or terraform, for example)
|
||||
let metric_name = "taskmanager_nb_running_tasks";
|
||||
let wait_engine_job_finish = retry::retry(Fixed::from_millis(60000).take(60), || {
|
||||
return match kubectl_exec_api_custom_metrics(
|
||||
&kubernetes_config_file_path,
|
||||
self.cloud_provider().credentials_environment_variables(),
|
||||
"qovery",
|
||||
None,
|
||||
metric_name,
|
||||
) {
|
||||
Ok(metrics) => {
|
||||
let mut current_engine_jobs = 0;
|
||||
|
||||
for metric in metrics.items {
|
||||
match metric.value.parse::<i32>() {
|
||||
Ok(job_count) if job_count > 0 => current_engine_jobs += 1,
|
||||
Err(e) => {
|
||||
error!("error while looking at the API metric value {}. {:?}", metric_name, e);
|
||||
return OperationResult::Retry(SimpleError {
|
||||
kind: SimpleErrorKind::Other,
|
||||
message: Some(e.to_string()),
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if current_engine_jobs == 0 {
|
||||
OperationResult::Ok(())
|
||||
} else {
|
||||
OperationResult::Retry(SimpleError {
|
||||
kind: SimpleErrorKind::Other,
|
||||
message: Some("can't pause the infrastructure now, Engine jobs are currently running, retrying later...".to_string()),
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error while looking at the API metric value {}. {:?}", metric_name, e);
|
||||
OperationResult::Retry(e)
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
match wait_engine_job_finish {
|
||||
Ok(_) => info!("no current running jobs on the Engine, infrastructure pause is allowed to start"),
|
||||
Err(Operation { error, .. }) => {
|
||||
return Err(EngineError {
|
||||
cause: EngineErrorCause::Internal,
|
||||
scope: EngineErrorScope::Engine,
|
||||
execution_id: self.context.execution_id().to_string(),
|
||||
message: error.message,
|
||||
})
|
||||
}
|
||||
Err(retry::Error::Internal(msg)) => {
|
||||
return Err(EngineError::new(
|
||||
EngineErrorCause::Internal,
|
||||
EngineErrorScope::Engine,
|
||||
self.context.execution_id(),
|
||||
Some(msg),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let mut terraform_args_string = vec!["apply".to_string(), "-auto-approve".to_string()];
|
||||
for x in tf_workers_resources {
|
||||
terraform_args_string.push(format!("-target={}", x));
|
||||
}
|
||||
let terraform_args = terraform_args_string.iter().map(|x| &**x).collect();
|
||||
|
||||
let message = format!("Pausing DOKS {} cluster deployment with id {}", self.name(), self.id());
|
||||
info!("{}", &message);
|
||||
send_to_customer(&message);
|
||||
|
||||
match cast_simple_error_to_engine_error(
|
||||
self.engine_error_scope(),
|
||||
self.context.execution_id(),
|
||||
terraform_exec(temp_dir.as_str(), terraform_args),
|
||||
) {
|
||||
Ok(_) => {
|
||||
let message = format!("Kubernetes cluster {} successfully paused", self.name());
|
||||
info!("{}", &message);
|
||||
send_to_customer(&message);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Error while pausing cluster {} with id {}.", self.name(), self.id());
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn on_pause_error(&self) -> Result<(), EngineError> {
|
||||
todo!()
|
||||
warn!("DOKS.on_pause_error() called for {}", self.name());
|
||||
Err(self.engine_error(
|
||||
EngineErrorCause::Internal,
|
||||
format!("{} Kubernetes cluster failed to pause", self.name()),
|
||||
))
|
||||
}
|
||||
|
||||
fn on_delete(&self) -> Result<(), EngineError> {
|
||||
|
||||
@@ -173,6 +173,7 @@ fn create_and_destroy_doks_cluster_ams_3() {
|
||||
create_and_destroy_doks_cluster(region, secrets, false, function_name!());
|
||||
}
|
||||
|
||||
#[cfg(feature = "test-do-infra")]
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[allow(dead_code)]
|
||||
@@ -183,3 +184,15 @@ fn create_upgrade_and_destroy_doks_cluster_in_nyc_3() {
|
||||
let secrets = FuncTestsSecrets::new();
|
||||
create_upgrade_and_destroy_doks_cluster(region, secrets, "1.19", "1.20", function_name!());
|
||||
}
|
||||
|
||||
#[cfg(feature = "test-do-infra")]
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[allow(dead_code)]
|
||||
#[allow(unused_attributes)]
|
||||
#[named]
|
||||
fn create_pause_and_destroy_kapsule_cluster_ams_3() {
|
||||
let zone = Region::Amsterdam3;
|
||||
let secrets = FuncTestsSecrets::new();
|
||||
create_and_destroy_doks_cluster(zone, secrets, true, function_name!());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user