From 48f2a81c7376d58d7025e5e504bf7f376ae2f955 Mon Sep 17 00:00:00 2001
From: Pierre Mavro
Date: Wed, 16 Feb 2022 15:59:40 +0100
Subject: [PATCH] feat: support scw node change without disruption (#587)

---
 .gitignore                                    |   1 +
 lib/scaleway/bootstrap/ks-workers-nodes.j2.tf |  18 +-
 lib/scaleway/bootstrap/tf-default-vars.j2.tf  |   2 +-
 lib/scaleway/bootstrap/tf-providers.j2.tf     |   4 +-
 src/cloud_provider/aws/kubernetes/mod.rs      |   5 +-
 src/cloud_provider/aws/kubernetes/node.rs     |   1 +
 .../digitalocean/kubernetes/node.rs           |   1 +
 src/cloud_provider/kubernetes.rs              | 110 +++--
 src/cloud_provider/models.rs                  |   1 +
 src/cloud_provider/scaleway/kubernetes/mod.rs | 413 +++++++++++++++++-
 .../scaleway/kubernetes/node.rs               |  42 ++
 src/errors/io.rs                              |   6 +
 src/errors/mod.rs                             | 103 ++++-
 tests/aws/aws_environment.rs                  |  14 +-
 tests/digitalocean/do_environment.rs          |  10 +-
 tests/scaleway/scw_environment.rs             |  10 +-
 16 files changed, 683 insertions(+), 58 deletions(-)

diff --git a/.gitignore b/.gitignore
index a1913e40..7ddac5f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 **/target
 *.iml
+*.orig
 .idea
 .qovery-workspace
 .terraform/
diff --git a/lib/scaleway/bootstrap/ks-workers-nodes.j2.tf b/lib/scaleway/bootstrap/ks-workers-nodes.j2.tf
index b23d5c8a..bc4fdf47 100644
--- a/lib/scaleway/bootstrap/ks-workers-nodes.j2.tf
+++ b/lib/scaleway/bootstrap/ks-workers-nodes.j2.tf
@@ -1,23 +1,27 @@
 {% for scw_ks_worker_node in scw_ks_worker_nodes %}
 resource "scaleway_k8s_pool" "kubernetes_cluster_workers_{{ loop.index }}" {
   cluster_id  = scaleway_k8s_cluster.kubernetes_cluster.id
-  name        = "${var.kubernetes_cluster_id}_{{ loop.index }}"
+  name        = "${var.kubernetes_cluster_id}_{{ scw_ks_worker_node.instance_type }}_{{ loop.index }}"
   node_type   = "{{ scw_ks_worker_node.instance_type }}"
   region      = var.region
   zone        = var.zone

   # use Scaleway built-in cluster autoscaler
-  autoscaling = {{ scw_ks_pool_autoscale }}
-  autohealing = true
-  size        = "{{ scw_ks_worker_node.min_nodes }}"
-  min_size    = "{{ scw_ks_worker_node.min_nodes }}"
-  max_size    = "{{ scw_ks_worker_node.max_nodes }}"
+  autoscaling         = {{ scw_ks_pool_autoscale }}
+  autohealing         = true
+  size                = "{{ scw_ks_worker_node.min_nodes }}"
+  min_size            = "{{ scw_ks_worker_node.min_nodes }}"
+  max_size            = "{{ scw_ks_worker_node.max_nodes }}"
+  wait_for_pool_ready = true

   depends_on = [
     scaleway_k8s_cluster.kubernetes_cluster,
   ]

-  tags = concat(local.tags_ks_list, ["QoveryNodeGroupName:{{ scw_ks_worker_node.name }}", "QoveryNodeGroupId:${var.kubernetes_cluster_id}-{{ loop.index }}"])
+  lifecycle {
+    create_before_destroy = true
+  }
+
+  tags = concat(local.tags_ks_list, ["QoveryNodeGroupName:{{ scw_ks_worker_node.name }}", "QoveryNodeGroupId:${var.kubernetes_cluster_id}_{{ scw_ks_worker_node.instance_type }}_{{ loop.index }}"])
 }
 {% endfor %}
\ No newline at end of file
diff --git a/lib/scaleway/bootstrap/tf-default-vars.j2.tf b/lib/scaleway/bootstrap/tf-default-vars.j2.tf
index 09467d3c..73900b51 100644
--- a/lib/scaleway/bootstrap/tf-default-vars.j2.tf
+++ b/lib/scaleway/bootstrap/tf-default-vars.j2.tf
@@ -77,7 +77,7 @@ variable "kubernetes_cluster_id" {

 variable "kubernetes_cluster_name" {
   description = "Kubernetes cluster name"
-  default     = "qovery-{{ kubernetes_cluster_id }}" # TODO(benjaminch): handle name creation in code
+  default     = "{{ kubernetes_cluster_name }}"
   type        = string
 }

diff --git a/lib/scaleway/bootstrap/tf-providers.j2.tf b/lib/scaleway/bootstrap/tf-providers.j2.tf
index 7ec6853a..5157bc29 100644
--- a/lib/scaleway/bootstrap/tf-providers.j2.tf
+++ b/lib/scaleway/bootstrap/tf-providers.j2.tf
@@ -9,7 +9,7 @@ terraform {
   required_providers {
     scaleway = {
       source = "scaleway/scaleway"
-      version = "~> 2.1.0"
+      version = "~> 2.2.0"
     }
     aws = {
       source = "hashicorp/aws"
@@ -28,7 +28,7 @@ terraform {
       version = "~> 2.24.1"
     }
   }
-  required_version = ">= 0.13"
+  required_version = ">= 0.14"
 }

diff --git a/src/cloud_provider/aws/kubernetes/mod.rs b/src/cloud_provider/aws/kubernetes/mod.rs
index 0c698c54..9816b696 100644
--- a/src/cloud_provider/aws/kubernetes/mod.rs
+++ b/src/cloud_provider/aws/kubernetes/mod.rs
@@ -947,7 +947,10 @@ impl<'a> EKS<'a> {
         };

         if tf_workers_resources.is_empty() {
-            return Err(EngineError::new_cluster_has_no_worker_nodes(event_details.clone()));
+            return Err(EngineError::new_cluster_has_no_worker_nodes(
+                event_details.clone(),
+                None,
+            ));
         }

         let kubernetes_config_file_path = self.get_kubeconfig_file_path()?;
diff --git a/src/cloud_provider/aws/kubernetes/node.rs b/src/cloud_provider/aws/kubernetes/node.rs
index ea3e8d47..8f0bdd0f 100644
--- a/src/cloud_provider/aws/kubernetes/node.rs
+++ b/src/cloud_provider/aws/kubernetes/node.rs
@@ -87,6 +87,7 @@ mod tests {
             NodeGroups::new("".to_string(), 2, 2, "t2.large".to_string(), 20).unwrap(),
             NodeGroups {
                 name: "".to_string(),
+                id: None,
                 min_nodes: 2,
                 max_nodes: 2,
                 instance_type: "t2.large".to_string(),
diff --git a/src/cloud_provider/digitalocean/kubernetes/node.rs b/src/cloud_provider/digitalocean/kubernetes/node.rs
index b90a4d9b..3a5bb7a5 100644
--- a/src/cloud_provider/digitalocean/kubernetes/node.rs
+++ b/src/cloud_provider/digitalocean/kubernetes/node.rs
@@ -133,6 +133,7 @@ mod tests {
             NodeGroups::new("".to_string(), 2, 2, "s-2vcpu-4gb".to_string(), 20).unwrap(),
             NodeGroups {
                 name: "".to_string(),
+                id: None,
                 min_nodes: 2,
                 max_nodes: 2,
                 instance_type: "s-2vcpu-4gb".to_string(),
diff --git a/src/cloud_provider/kubernetes.rs b/src/cloud_provider/kubernetes.rs
index 9fa0ac4c..a234fd96 100644
--- a/src/cloud_provider/kubernetes.rs
+++ b/src/cloud_provider/kubernetes.rs
@@ -34,7 +34,7 @@ use crate::fs::workspace_directory;
 use crate::logger::{LogLevel, Logger};
 use crate::models::ProgressLevel::Info;
 use crate::models::{
-    Action, Context, Listen, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope, QoveryIdentifier,
+    Action, Context, Listen, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope, QoveryIdentifier, StringPath,
 };
 use crate::object_storage::ObjectStorage;
 use crate::unit_conversion::{any_to_mi, cpu_string_to_float};
@@ -78,29 +78,65 @@ pub trait Kubernetes: Listen {
         )
     }

+    fn get_kubeconfig_filename(&self) -> String {
+        format!("{}.yaml", self.id())
+    }
+
     fn get_kubeconfig_file(&self) -> Result<(String, File), EngineError> {
+        let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
         let bucket_name = format!("qovery-kubeconfigs-{}", self.id());
-        let object_key = format!("{}.yaml", self.id());
+        let object_key = self.get_kubeconfig_filename();
         let stage = Stage::General(GeneralStep::RetrieveClusterConfig);
-        let (string_path, file) = match self
-            .config_file_store()
-            .get(bucket_name.as_str(), object_key.as_str(), true)
-        {
-            Ok((path, file)) => (path, file),
-            Err(err) => {
-                let error = EngineError::new_cannot_retrieve_cluster_config_file(
-                    self.get_event_details(stage),
-                    CommandError::new_from_safe_message(
-                        format!(
-                            "Error getting file from store, error: {}",
-                            err.message.unwrap_or_else(|| "no details.".to_string())
-                        )
-                        .to_string(),
-                    ),
-                );
-                self.logger().log(LogLevel::Error, EngineEvent::Error(error.clone()));
-                return Err(error);
+
+        // check if kubeconfig locally exists
+        let local_kubeconfig = match self.get_temp_dir(event_details) {
+            Ok(x) => {
+                let local_kubeconfig_folder_path = format!("{}/{}", &x, &bucket_name);
+                let local_kubeconfig_generated = format!("{}/{}", &local_kubeconfig_folder_path, &object_key);
+                if Path::new(&local_kubeconfig_generated).exists() {
+                    match File::open(&local_kubeconfig_generated) {
+                        Ok(_) => Some(local_kubeconfig_generated),
+                        Err(_) => {
+                            debug!("couldn't open {} file", &local_kubeconfig_generated);
+                            None
+                        }
+                    }
+                } else {
+                    None
+                }
+            }
+            Err(_) => None,
+        };
+
+        // otherwise, try to get it from object storage
+        let (string_path, file) = match local_kubeconfig {
+            Some(local_kubeconfig_generated) => {
+                let kubeconfig_file =
+                    File::open(&local_kubeconfig_generated).expect("couldn't read kubeconfig file, but file exists");
+
+                (StringPath::from(&local_kubeconfig_generated), kubeconfig_file)
+            }
+            None => {
+                match self
+                    .config_file_store()
+                    .get(bucket_name.as_str(), object_key.as_str(), true)
+                {
+                    Ok((path, file)) => (path, file),
+                    Err(err) => {
+                        let error = EngineError::new_cannot_retrieve_cluster_config_file(
+                            self.get_event_details(stage),
+                            CommandError::new_from_safe_message(
+                                format!(
+                                    "Error getting file from store, error: {}",
+                                    err.message.unwrap_or_else(|| "no details.".to_string())
+                                )
+                                .to_string(),
+                            ),
+                        );
+                        self.logger().log(LogLevel::Error, EngineEvent::Error(error.clone()));
+                        return Err(error);
+                    }
+                }
             }
         };
@@ -233,11 +269,12 @@ pub trait Kubernetes: Listen {
     where
         Self: Sized,
     {
+        let kubeconfig = match self.get_kubeconfig_file() {
+            Ok((path, _)) => path,
+            Err(e) => return Err(CommandError::new(e.message(), None)),
+        };
         send_progress_on_long_task(self, Action::Create, || {
-            check_workers_status(
-                self.get_kubeconfig_file_path().expect("Unable to get Kubeconfig"),
-                self.cloud_provider().credentials_environment_variables(),
-            )
+            check_workers_status(&kubeconfig, self.cloud_provider().credentials_environment_variables())
         })
     }
     fn upgrade_with_status(&self, kubernetes_upgrade_status: KubernetesUpgradeStatus) -> Result<(), EngineError>;
@@ -436,11 +473,29 @@ pub fn deploy_environment(
             "deployment",
             CheckAction::Deploy,
         )?;
+    }

-        // Quick fix: adding 100 ms delay to avoid race condition on service status update
-        thread::sleep(std::time::Duration::from_millis(100));
+    // Quick fix: adding 100 ms delay to avoid race condition on service status update
+    thread::sleep(std::time::Duration::from_millis(100));

-        // check all deployed services
+    // check all deployed services
+    for service in &environment.stateful_services {
+        let _ = service::check_kubernetes_service_error(
+            service.exec_check_action(),
+            kubernetes,
+            service,
+            event_details.clone(),
+            &stateless_deployment_target,
+            &listeners_helper,
+            "check deployment",
+            CheckAction::Deploy,
+        )?;
+    }
+
+    // Quick fix: adding 100 ms delay to avoid race condition on service status update
+    thread::sleep(std::time::Duration::from_millis(100));
+
+    for service in &environment.stateless_services {
         let _ = service::check_kubernetes_service_error(
             service.exec_check_action(),
             kubernetes,
             service,
@@ -1192,6 +1247,7 @@ impl NodeGroups {

         Ok(NodeGroups {
             name: group_name,
+            id: None,
             min_nodes,
             max_nodes,
             instance_type,
diff --git a/src/cloud_provider/models.rs b/src/cloud_provider/models.rs
index 97d8f1c3..a02585b6 100644
--- a/src/cloud_provider/models.rs
+++ b/src/cloud_provider/models.rs
@@ -65,6 +65,7 @@ pub struct CpuLimits {
 #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
 pub struct NodeGroups {
     pub name: String,
+    pub id: Option<String>,
     pub min_nodes: i32,
     pub max_nodes: i32,
     pub instance_type: String,
diff --git a/src/cloud_provider/scaleway/kubernetes/mod.rs b/src/cloud_provider/scaleway/kubernetes/mod.rs
index cc2b1493..52239ce6 100644
--- a/src/cloud_provider/scaleway/kubernetes/mod.rs
+++ b/src/cloud_provider/scaleway/kubernetes/mod.rs
@@ -12,7 +12,7 @@ use crate::cloud_provider::models::{NodeGroups, NodeGroupsFormat};
 use crate::cloud_provider::qovery::EngineLocation;
 use crate::cloud_provider::scaleway::application::ScwZone;
 use crate::cloud_provider::scaleway::kubernetes::helm_charts::{scw_helm_charts, ChartsConfigPrerequisites};
-use crate::cloud_provider::scaleway::kubernetes::node::ScwInstancesType;
+use crate::cloud_provider::scaleway::kubernetes::node::{ScwInstancesType, ScwNodeGroup};
 use crate::cloud_provider::utilities::print_action;
 use crate::cloud_provider::{kubernetes, CloudProvider};
 use crate::cmd::kubectl::{kubectl_exec_api_custom_metrics, kubectl_exec_get_all_namespaces, kubectl_exec_get_events};
@@ -29,18 +29,32 @@ use crate::models::{
 };
 use crate::object_storage::scaleway_object_storage::{BucketDeleteStrategy, ScalewayOS};
 use crate::object_storage::ObjectStorage;
+use crate::runtime::block_on;
 use crate::string::terraform_list_format;
 use crate::{cmd, dns_provider};
 use ::function_name::named;
+use reqwest::StatusCode;
 use retry::delay::{Fibonacci, Fixed};
 use retry::Error::Operation;
 use retry::OperationResult;
+use scaleway_api_rs::apis::Error;
+use scaleway_api_rs::models::ScalewayK8sV1Cluster;
 use serde::{Deserialize, Serialize};
 use std::env;
 use std::path::Path;
 use std::str::FromStr;
 use tera::Context as TeraContext;

+#[derive(PartialEq)]
+pub enum ScwNodeGroupErrors {
+    CloudProviderApiError(CommandError),
+    ClusterDoesNotExists(CommandError),
+    MultipleClusterFound,
+    NoNodePoolFound(CommandError),
+    MissingNodePoolInfo,
+    NodeGroupValidationError(CommandError),
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct KapsuleOptions {
     // Qovery
@@ -193,6 +207,210 @@ impl<'a> Kapsule<'a> {
         })
     }

+    fn get_configuration(&self) -> scaleway_api_rs::apis::configuration::Configuration {
+        scaleway_api_rs::apis::configuration::Configuration {
+            api_key: Some(scaleway_api_rs::apis::configuration::ApiKey {
+                key: self.options.scaleway_secret_key.clone(),
+                prefix: None,
+            }),
+            ..scaleway_api_rs::apis::configuration::Configuration::default()
+        }
+    }
+
+    fn get_scw_cluster_info(&self) -> Result<Option<ScalewayK8sV1Cluster>, EngineError> {
+        let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
+
+        // get cluster info
+        let cluster_info = match block_on(scaleway_api_rs::apis::clusters_api::list_clusters(
+            &self.get_configuration(),
+            self.region().as_str(),
+            None,
+            Some(self.options.scaleway_project_id.as_str()),
+            None,
+            None,
+            None,
+            Some(self.cluster_name().as_str()),
+            None,
+            None,
+        )) {
+            Ok(x) => x,
+            Err(e) => {
+                let msg = format!("wasn't able to retrieve SCW cluster information from the API. {:?}", e);
+                return Err(EngineError::new_cannot_get_cluster_error(
+                    event_details.clone(),
+                    CommandError::new(msg.clone(), Some(msg)),
+                ));
+            }
+        };
+
+        // if no cluster exists
+        let cluster_info_content = cluster_info.clusters.unwrap();
+        if cluster_info_content.is_empty() {
+            return Ok(None);
+        } else if cluster_info_content.len() != 1 {
+            let msg = format!(
+                "too many clusters found with this name, when one was expected: {:?}",
+                &cluster_info_content.len()
+            );
+            return Err(EngineError::new_multiple_cluster_found_expected_one_error(
+                event_details,
+                CommandError::new(msg.clone(), Some(msg)),
+            ));
+        }
+
+        Ok(Some(cluster_info_content[0].clone()))
+    }
+
+    fn get_existing_sanitized_node_groups(
+        &self,
+        cluster_info: ScalewayK8sV1Cluster,
+    ) -> Result<Vec<ScwNodeGroup>, ScwNodeGroupErrors> {
+        let error_cluster_id = "expected cluster id for this Scaleway cluster".to_string();
+        let cluster_id = match cluster_info.id {
+            None => {
+                return Err(ScwNodeGroupErrors::NodeGroupValidationError(
+                    CommandError::new_from_safe_message(error_cluster_id),
+                ))
+            }
+            Some(x) => x,
+        };
+
+        let pools = match block_on(scaleway_api_rs::apis::pools_api::list_pools(
+            &self.get_configuration(),
+            self.region().as_str(),
+            cluster_id.as_str(),
+            None,
+            None,
+            None,
+            None,
+            None,
+        )) {
+            Ok(x) => x,
+            Err(e) => {
+                let msg = format!("error while trying to get SCW pool info from cluster {}", &cluster_id);
+                let msg_with_error = format!("{}. {:?}", msg.clone(), e);
+                return Err(ScwNodeGroupErrors::CloudProviderApiError(CommandError::new(
+                    msg_with_error,
+                    Some(msg),
+                )));
+            }
+        };
+
+        // ensure pools are present
+        if pools.pools.is_none() {
+            let msg = format!(
+                "no SCW pool found from the SCW API for cluster {}/{}",
+                &cluster_id,
+                &cluster_info.name.unwrap_or("unknown cluster".to_string())
+            );
+            return Err(ScwNodeGroupErrors::NoNodePoolFound(CommandError::new(
+                msg.clone(),
+                Some(msg),
+            )));
+        }
+
+        // create sanitized node group pools
+        let mut nodegroup_pool: Vec<ScwNodeGroup> = Vec::with_capacity(pools.total_count.unwrap_or(0 as f32) as usize);
+        for ng in pools.pools.unwrap() {
+            if ng.id.is_none() {
+                let msg = format!(
+                    "error while trying to validate SCW pool ID from cluster {}",
+                    &cluster_id
+                );
+                return Err(ScwNodeGroupErrors::NodeGroupValidationError(CommandError::new(
+                    msg.clone(),
+                    Some(msg),
+                )));
+            }
+            let ng_sanitized = self.get_node_group_info(ng.id.unwrap().as_str())?;
+            nodegroup_pool.push(ng_sanitized)
+        }
+
+        Ok(nodegroup_pool)
+    }
+
+    fn get_node_group_info(&self, pool_id: &str) -> Result<ScwNodeGroup, ScwNodeGroupErrors> {
+        let pool = match block_on(scaleway_api_rs::apis::pools_api::get_pool(
+            &self.get_configuration(),
+            self.region().as_str(),
+            pool_id,
+        )) {
+            Ok(x) => x,
+            Err(e) => {
+                return Err(match e {
+                    Error::ResponseError(x) => {
+                        let msg_with_error = format!(
+                            "error code while getting node group: {}, API message: {}",
+                            x.status, x.content
+                        );
+                        match x.status {
+                            StatusCode::NOT_FOUND => ScwNodeGroupErrors::NoNodePoolFound(CommandError::new(
+                                msg_with_error,
+                                Some("no node pool found".to_string()),
+                            )),
+                            _ => ScwNodeGroupErrors::CloudProviderApiError(CommandError::new(
+                                msg_with_error,
+                                Some("Scaleway API error while trying to get node group".to_string()),
+                            )),
+                        }
+                    }
+                    _ => {
+                        let msg = "this Scaleway API error is not yet handled by the engine, please add support for it".to_string();
+                        ScwNodeGroupErrors::NodeGroupValidationError(CommandError::new(msg.clone(), Some(msg)))
+                    }
+                })
+            }
+        };
+
+        // ensure there is no missing info
+        if let Err(e) = self.check_missing_nodegroup_info(&pool.name, "name") {
+            return Err(e);
+        };
+        if let Err(e) = self.check_missing_nodegroup_info(&pool.min_size, "min_size") {
+            return Err(e);
+        };
+        if let Err(e) = self.check_missing_nodegroup_info(&pool.max_size, "max_size") {
+            return Err(e);
+        };
+        if let Err(e) = self.check_missing_nodegroup_info(&pool.status, "status") {
+            return Err(e);
+        };
+
+        match ScwNodeGroup::new(
+            pool.id,
+            pool.name.unwrap(),
+            pool.min_size.unwrap() as i32,
+            pool.max_size.unwrap() as i32,
+            pool.node_type,
+            pool.size as i32,
+            pool.status.unwrap(),
+        ) {
+            Ok(x) => Ok(x),
+            Err(e) => Err(ScwNodeGroupErrors::NodeGroupValidationError(e)),
+        }
+    }
+
+    fn check_missing_nodegroup_info<T>(&self, item: &Option<T>, name: &str) -> Result<(), ScwNodeGroupErrors> {
+        let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
+
+        if item.is_none() {
+            self.logger.log(
+                LogLevel::Error,
+                EngineEvent::Error(EngineError::new_missing_workers_group_info_error(
+                    event_details,
+                    CommandError::new_from_safe_message(format!(
+                        "missing node pool info {} for cluster {}",
+                        name,
+                        self.context.cluster_id()
+                    )),
+                )),
+            );
+            return Err(ScwNodeGroupErrors::MissingNodePoolInfo);
+        };
+
+        Ok(())
+    }
+
     fn kubeconfig_bucket_name(&self) -> String {
         format!("qovery-kubeconfigs-{}", self.id())
     }
@@ -256,7 +474,7 @@ impl<'a> Kapsule<'a> {
         // Kubernetes
         context.insert("test_cluster", &self.context.is_test_cluster());
         context.insert("kubernetes_cluster_id", self.id());
-        context.insert("kubernetes_cluster_name", self.name());
+        context.insert("kubernetes_cluster_name", self.cluster_name().as_str());
         context.insert("kubernetes_cluster_version", self.version());

         // Qovery
@@ -579,6 +797,192 @@ impl<'a> Kapsule<'a> {
             return Err(error);
         }

+        let cluster_info = self.get_scw_cluster_info()?;
+        if cluster_info.is_none() {
+            let msg = "no cluster found from the Scaleway API".to_string();
+            return Err(EngineError::new_no_cluster_found_error(
+                event_details.clone(),
+                CommandError::new(msg.clone(), Some(msg)),
+            ));
+        }
+
+        let current_nodegroups = match self
+            .get_existing_sanitized_node_groups(cluster_info.expect("A cluster should be present at this create stage"))
+        {
+            Ok(x) => x,
+            Err(e) => {
+                match e {
+                    ScwNodeGroupErrors::CloudProviderApiError(c) => {
+                        return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
+                            event_details.clone(),
+                            Some(c),
+                        ))
+                    }
+                    ScwNodeGroupErrors::ClusterDoesNotExists(_) => self.logger().log(
+                        LogLevel::Info,
+                        EngineEvent::Deploying(
+                            event_details.clone(),
+                            EventMessage::new_from_safe(
+                                "cluster does not exist, no node groups can be retrieved for upgrade check".to_string(),
+                            ),
+                        ),
+                    ),
+                    ScwNodeGroupErrors::MultipleClusterFound => {
+                        let msg = "multiple clusters found, can't match the correct node groups".to_string();
+                        return Err(EngineError::new_multiple_cluster_found_expected_one_error(
+                            event_details.clone(),
+                            CommandError::new(msg.clone(), Some(msg)),
+                        ));
+                    }
+                    ScwNodeGroupErrors::NoNodePoolFound(_) => self.logger().log(
+                        LogLevel::Info,
+                        EngineEvent::Deploying(
+                            event_details.clone(),
+                            EventMessage::new_from_safe(
+                                "cluster exists, but no node groups found for upgrade check".to_string(),
+                            ),
+                        ),
+                    ),
+                    ScwNodeGroupErrors::MissingNodePoolInfo => {
+                        let msg = "error with Scaleway API while trying to retrieve node pool info".to_string();
+                        return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
+                            event_details.clone(),
+                            Some(CommandError::new_from_safe_message(msg)),
+                        ));
+                    }
+                    ScwNodeGroupErrors::NodeGroupValidationError(c) => {
+                        return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
+                            event_details.clone(),
+                            Some(c),
+                        ));
+                    }
+                };
+                Vec::with_capacity(0)
+            }
+        };
+
+        // ensure all node groups are in ready state Scaleway side
+        self.logger.log(
+            LogLevel::Info,
+            EngineEvent::Deploying(
+                event_details.clone(),
+                EventMessage::new_from_safe(
+                    "ensuring all node groups are in ready state from the Scaleway API".to_string(),
+                ),
+            ),
+        );
+
+        for ng in current_nodegroups {
+            let res = retry::retry(
+                // retry 10 min max per nodegroup until they are ready
+                Fixed::from_millis(15000).take(40),
+                || {
+                    self.logger().log(
+                        LogLevel::Info,
+                        EngineEvent::Deploying(
+                            event_details.clone(),
+                            EventMessage::new_from_safe(format!(
+                                "checking node group {}/{:?}, current status: {:?}",
+                                &ng.name,
+                                &ng.id.as_ref().unwrap_or(&"unknown".to_string()),
+                                &ng.status
+                            )),
+                        ),
+                    );
+                    let pool_id = match &ng.id {
+                        None => {
+                            let msg =
+                                "node group id was expected to get info, but not found from Scaleway API".to_string();
+                            return OperationResult::Retry(
+                                EngineError::new_missing_api_info_from_cloud_provider_error(
+                                    event_details.clone(),
+                                    Some(CommandError::new_from_safe_message(msg)),
+                                ),
+                            );
+                        }
+                        Some(x) => x,
+                    };
+                    let scw_ng = match self.get_node_group_info(pool_id.as_str()) {
+                        Ok(x) => x,
+                        Err(e) => {
+                            return match e {
+                                ScwNodeGroupErrors::CloudProviderApiError(c) => {
+                                    let current_error = EngineError::new_missing_api_info_from_cloud_provider_error(
+                                        event_details.clone(),
+                                        Some(c),
+                                    );
+                                    self.logger
+                                        .log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
+                                    OperationResult::Retry(current_error)
+                                }
+                                ScwNodeGroupErrors::ClusterDoesNotExists(c) => {
+                                    let current_error =
+                                        EngineError::new_no_cluster_found_error(event_details.clone(), c);
+                                    self.logger
+                                        .log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
+                                    OperationResult::Retry(current_error)
+                                }
+                                ScwNodeGroupErrors::MultipleClusterFound => {
+                                    OperationResult::Retry(EngineError::new_multiple_cluster_found_expected_one_error(
+                                        event_details.clone(),
+                                        CommandError::new_from_safe_message(
+                                            "multiple clusters found while one was expected".to_string(),
+                                        ),
+                                    ))
+                                }
+                                ScwNodeGroupErrors::NoNodePoolFound(_) => OperationResult::Ok(()),
+                                ScwNodeGroupErrors::MissingNodePoolInfo => {
+                                    OperationResult::Retry(EngineError::new_missing_api_info_from_cloud_provider_error(
+                                        event_details.clone(),
+                                        None,
+                                    ))
+                                }
+                                ScwNodeGroupErrors::NodeGroupValidationError(c) => {
+                                    let current_error = EngineError::new_missing_api_info_from_cloud_provider_error(
+                                        event_details.clone(),
+                                        Some(c),
+                                    );
+                                    self.logger
+                                        .log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
+                                    OperationResult::Retry(current_error)
+                                }
+                            }
+                        }
+                    };
+                    match scw_ng.status == scaleway_api_rs::models::scaleway_k8s_v1_pool::Status::Ready {
+                        true => OperationResult::Ok(()),
+                        false => OperationResult::Retry(EngineError::new_k8s_node_not_ready(
+                            event_details.clone(),
+                            CommandError::new_from_safe_message(format!(
+                                "waiting for node group {} to be ready, current status: {:?}",
+                                &scw_ng.name, scw_ng.status
+                            )),
+                        )),
+                    }
+                },
+            );
+            match res {
+                Ok(_) => {}
+                Err(Operation { error, .. }) => return Err(error),
+                Err(retry::Error::Internal(msg)) => {
+                    return Err(EngineError::new_k8s_node_not_ready(
+                        event_details.clone(),
+                        CommandError::new(msg, Some("waited too long for worker nodes to be ready".to_string())),
+                    ))
+                }
+            }
+        }
+        self.logger.log(
+            LogLevel::Info,
+            EngineEvent::Deploying(
+                event_details.clone(),
+                EventMessage::new_from_safe(
+                    "all node groups for this cluster are ready from cloud provider API".to_string(),
+                ),
+            ),
+        );
+
+        // ensure all nodes are ready on Kubernetes
         match self.check_workers_on_create() {
             Ok(_) => {
                 self.send_to_customer(
@@ -793,7 +1197,10 @@ impl<'a> Kapsule<'a> {
         };

         if tf_workers_resources.is_empty() {
-            return Err(EngineError::new_cluster_has_no_worker_nodes(event_details.clone()));
+            return Err(EngineError::new_cluster_has_no_worker_nodes(
+                event_details.clone(),
+                None,
+            ));
         }

         let kubernetes_config_file_path = self.get_kubeconfig_file_path()?;
diff --git a/src/cloud_provider/scaleway/kubernetes/node.rs b/src/cloud_provider/scaleway/kubernetes/node.rs
index e1da625d..e1d85bd5 100644
--- a/src/cloud_provider/scaleway/kubernetes/node.rs
+++ b/src/cloud_provider/scaleway/kubernetes/node.rs
@@ -88,6 +88,47 @@ impl FromStr for ScwInstancesType {
     }
 }

+#[derive(Clone)]
+pub struct ScwNodeGroup {
+    pub name: String,
+    pub id: Option<String>,
+    pub min_nodes: i32,
+    pub max_nodes: i32,
+    pub instance_type: String,
+    pub disk_size_in_gib: i32,
+    pub status: scaleway_api_rs::models::scaleway_k8s_v1_pool::Status,
+}
+
+impl ScwNodeGroup {
+    pub fn new(
+        id: Option<String>,
+        group_name: String,
+        min_nodes: i32,
+        max_nodes: i32,
+        instance_type: String,
+        disk_size_in_gib: i32,
+        status: scaleway_api_rs::models::scaleway_k8s_v1_pool::Status,
+    ) -> Result<ScwNodeGroup, CommandError> {
+        if min_nodes > max_nodes {
+            let msg = format!(
+                "The number of minimum nodes ({}) for group name {} is higher than maximum nodes ({})",
+                &min_nodes, &group_name, &max_nodes
+            );
+            return Err(CommandError::new_from_safe_message(msg));
+        }
+
+        Ok(ScwNodeGroup {
+            name: group_name,
+            id,
+            min_nodes,
+            max_nodes,
+            instance_type,
+            disk_size_in_gib,
+            status,
+        })
+    }
+}
+
 #[cfg(test)]
 mod tests {
     #[cfg(test)]
@@ -104,6 +145,7 @@ mod tests {
             NodeGroups::new("".to_string(), 2, 2, "dev1-l".to_string(), 20).unwrap(),
             NodeGroups {
                 name: "".to_string(),
+                id: None,
                 min_nodes: 2,
                 max_nodes: 2,
                 instance_type: "dev1-l".to_string(),
diff --git a/src/errors/io.rs b/src/errors/io.rs
index 139b5df7..cb77a248 100644
--- a/src/errors/io.rs
+++ b/src/errors/io.rs
@@ -76,6 +76,9 @@ pub enum Tag {
     CannotGetCluster,
     ObjectStorageCannotCreateBucket,
     ObjectStorageCannotPutFileIntoBucket,
+    NoClusterFound,
+    OnlyOneClusterExpected,
+    CloudProviderApiMissingInfo,
 }

 impl From<errors::Tag> for Tag {
@@ -139,6 +142,9 @@ impl From<errors::Tag> for Tag {
             errors::Tag::UnsupportedZone => Tag::UnsupportedZone,
             errors::Tag::K8sNodeIsNotReadyWithTheRequestedVersion => Tag::K8sNodeIsNotReadyWithTheRequestedVersion,
             errors::Tag::K8sNodeIsNotReady => Tag::K8sNodeIsNotReady,
+            errors::Tag::NoClusterFound => Tag::NoClusterFound,
+            errors::Tag::OnlyOneClusterExpected => Tag::OnlyOneClusterExpected,
+            errors::Tag::CloudProviderApiMissingInfo => Tag::CloudProviderApiMissingInfo,
         }
     }
 }
diff --git a/src/errors/mod.rs b/src/errors/mod.rs
index d1648f7d..38da4362 100644
--- a/src/errors/mod.rs
+++ b/src/errors/mod.rs
@@ -88,6 +88,8 @@ pub enum Tag {
     Unknown,
     /// MissingRequiredEnvVariable: represents an error where a required env variable is not set.
     MissingRequiredEnvVariable,
+    /// NoClusterFound: represents an error where no cluster was found.
+    NoClusterFound,
     /// ClusterHasNoWorkerNodes: represents an error where the current cluster doesn't have any worker nodes.
     ClusterHasNoWorkerNodes,
     /// CannotGetWorkspaceDirectory: represents an error while trying to get workspace directory.
@@ -188,10 +190,14 @@ pub enum Tag {
     CannotGetSupportedVersions,
     /// CannotGetCluster: represents an error where we cannot get cluster.
     CannotGetCluster,
+    /// OnlyOneClusterExpected: represents an error where only one cluster was expected but several were found.
+    OnlyOneClusterExpected,
     /// ObjectStorageCannotCreateBucket: represents an error while trying to create a new object storage bucket.
     ObjectStorageCannotCreateBucket,
     /// ObjectStorageCannotPutFileIntoBucket: represents an error while trying to put a file into an object storage bucket.
     ObjectStorageCannotPutFileIntoBucket,
+    /// CloudProviderApiMissingInfo: represents an error while expecting mandatory info from the cloud provider API.
+    CloudProviderApiMissingInfo,
 }

 #[derive(Clone, Debug)]
@@ -353,14 +359,18 @@ impl EngineError {
     /// Arguments:
     ///
     /// * `event_details`: Error linked event details.
-    pub fn new_cluster_has_no_worker_nodes(event_details: EventDetails) -> EngineError {
+    /// * `raw_error`: Raw error message.
+    pub fn new_cluster_has_no_worker_nodes(
+        event_details: EventDetails,
+        raw_error: Option<CommandError>,
+    ) -> EngineError {
         let message = "No worker nodes present, can't proceed with operation.";
         EngineError::new(
             event_details,
             Tag::ClusterHasNoWorkerNodes,
             message.to_string(),
             message.to_string(),
-            None,
+            raw_error,
             None,
             Some(
                 "This can happen if there were manual operations on the workers or the infrastructure is paused."
                     .to_string(),
             ),
         )
     }
+
+    /// Missing API info from the Cloud provider itself.
+    ///
+    /// Arguments:
+    ///
+    /// * `event_details`: Error linked event details.
+    /// * `raw_error`: Raw error message.
+    pub fn new_missing_api_info_from_cloud_provider_error(
+        event_details: EventDetails,
+        raw_error: Option<CommandError>,
+    ) -> EngineError {
+        let message = "Error, missing required information from the Cloud Provider API";
+        EngineError::new(
+            event_details,
+            Tag::CloudProviderApiMissingInfo,
+            message.to_string(),
+            message.to_string(),
+            raw_error,
+            None,
+            Some(
+                "This can happen if the cloud provider is encountering issues. You should try again later.".to_string(),
+            ),
+        )
+    }
+
     /// Creates new error for unsupported instance type.
     ///
     /// Cloud provider doesn't support the requested instance type.
@@ -1678,4 +1714,67 @@ impl EngineError {
             Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
         )
     }
+
+    /// Creates new error when expected worker node group information is missing.
+    ///
+    /// Arguments:
+    ///
+    /// * `event_details`: Error linked event details.
+    /// * `raw_error`: Raw error message.
+    pub fn new_missing_workers_group_info_error(event_details: EventDetails, raw_error: CommandError) -> EngineError {
+        let message = "Error, missing worker node group information.";
+
+        EngineError::new(
+            event_details,
+            Tag::CannotGetCluster,
+            message.to_string(),
+            message.to_string(),
+            Some(raw_error),
+            None,
+            Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
+        )
+    }
+
+    /// No cluster found.
+    ///
+    /// Arguments:
+    ///
+    /// * `event_details`: Error linked event details.
+    /// * `raw_error`: Raw error message.
+    pub fn new_no_cluster_found_error(event_details: EventDetails, raw_error: CommandError) -> EngineError {
+        let message = "Error, no cluster found.";
+
+        EngineError::new(
+            event_details,
+            Tag::NoClusterFound,
+            message.to_string(),
+            message.to_string(),
+            Some(raw_error),
+            None,
+            Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
+        )
+    }
+
+    /// Too many clusters found, while expected only one.
+    ///
+    /// Arguments:
+    ///
+    /// * `event_details`: Error linked event details.
+    /// * `raw_error`: Raw error message.
+    pub fn new_multiple_cluster_found_expected_one_error(
+        event_details: EventDetails,
+        raw_error: CommandError,
+    ) -> EngineError {
+        let message = "Too many clusters found with this name, when one was expected";
+
+        EngineError::new(
+            event_details,
+            Tag::OnlyOneClusterExpected,
+            message.to_string(),
+            message.to_string(),
+            Some(raw_error),
+            None,
+            Some("Please contact Qovery support for investigation.".to_string()),
+        )
+    }
 }
diff --git a/tests/aws/aws_environment.rs b/tests/aws/aws_environment.rs
index 3fe1da2b..cc5f133d 100644
--- a/tests/aws/aws_environment.rs
+++ b/tests/aws/aws_environment.rs
@@ -102,7 +102,7 @@ fn test_build_cache() {
             .as_str(),
     );

-    let mut environment = test_utilities::common::working_minimal_environment(
+    let environment = test_utilities::common::working_minimal_environment(
         &context,
         secrets
             .DEFAULT_TEST_DOMAIN
@@ -118,9 +118,9 @@ fn test_build_cache() {
     let app_build = app.to_build();
     let _ = match local_docker.has_cache(&app_build) {
         Ok(CacheResult::Hit) => assert!(false),
-        Ok(CacheResult::Miss(parent_build)) => assert!(true),
+        Ok(CacheResult::Miss(_)) => assert!(true),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let _ = match ecr.pull(&image).unwrap() {
@@ -147,9 +147,9 @@ fn test_build_cache() {

     let _ = match local_docker.has_cache(&build_result.build) {
         Ok(CacheResult::Hit) => assert!(true),
-        Ok(CacheResult::Miss(parent_build)) => assert!(false),
+        Ok(CacheResult::Miss(_)) => assert!(false),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let start_pull_time = SystemTime::now();
@@ -1111,6 +1111,10 @@ fn deploy_a_non_working_environment_with_a_working_failover_on_aws_eks() {
 fn deploy_2_non_working_environments_with_2_working_failovers_on_aws_eks() {
     init();

+    let test_name = function_name!();
+    let span = span!(Level::INFO, "test", name = test_name);
+    let _enter = span.enter();
+
     let logger = logger();
     let secrets = FuncTestsSecrets::new();
diff --git a/tests/digitalocean/do_environment.rs b/tests/digitalocean/do_environment.rs
index f87d8098..f18c9275 100644
--- a/tests/digitalocean/do_environment.rs
+++ b/tests/digitalocean/do_environment.rs
@@ -104,7 +104,7 @@ fn test_build_cache() {
             .expect("DIGITAL_OCEAN_TEST_CLUSTER_ID is not set"),
     );

-    let mut environment = test_utilities::common::working_minimal_environment(
+    let environment = test_utilities::common::working_minimal_environment(
         &context,
         secrets
             .DEFAULT_TEST_DOMAIN
@@ -120,9 +120,9 @@ fn test_build_cache() {
     let app_build = app.to_build();
     let _ = match local_docker.has_cache(&app_build) {
         Ok(CacheResult::Hit) => assert!(false),
-        Ok(CacheResult::Miss(parent_build)) => assert!(true),
+        Ok(CacheResult::Miss(_)) => assert!(true),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let _ = match docr.pull(&image).unwrap() {
@@ -149,9 +149,9 @@ fn test_build_cache() {

     let _ = match local_docker.has_cache(&build_result.build) {
         Ok(CacheResult::Hit) => assert!(true),
-        Ok(CacheResult::Miss(parent_build)) => assert!(false),
+        Ok(CacheResult::Miss(_)) => assert!(false),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let start_pull_time = SystemTime::now();
diff --git a/tests/scaleway/scw_environment.rs b/tests/scaleway/scw_environment.rs
index 405763cc..41063c19 100644
--- a/tests/scaleway/scw_environment.rs
+++ b/tests/scaleway/scw_environment.rs
@@ -109,7 +109,7 @@ fn test_build_cache() {
             .as_str(),
     );

-    let mut environment = test_utilities::common::working_minimal_environment(
+    let environment = test_utilities::common::working_minimal_environment(
         &context,
         secrets
             .DEFAULT_TEST_DOMAIN
@@ -125,9 +125,9 @@ fn test_build_cache() {
     let app_build = app.to_build();
     let _ = match local_docker.has_cache(&app_build) {
         Ok(CacheResult::Hit) => assert!(false),
-        Ok(CacheResult::Miss(parent_build)) => assert!(true),
+        Ok(CacheResult::Miss(_)) => assert!(true),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let _ = match scr.pull(&image).unwrap() {
@@ -154,9 +154,9 @@ fn test_build_cache() {

     let _ = match local_docker.has_cache(&build_result.build) {
         Ok(CacheResult::Hit) => assert!(true),
-        Ok(CacheResult::Miss(parent_build)) => assert!(false),
+        Ok(CacheResult::Miss(_)) => assert!(false),
         Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
-        Err(err) => assert!(false),
+        Err(_) => assert!(false),
     };

     let start_pull_time = SystemTime::now();