From 110f6eec5e338a7322aa39a4711982fcda9c842c Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Thu, 14 Sep 2023 01:44:53 +0000 Subject: [PATCH 1/3] Moved the function for finding host's FQDN(s) into util/ Extended the contract and the implementation of the function to allow a choice to return a single result or all findings. Migrated the only client of that function that existed before the move. --- src/replica/Registry.cc | 53 ++-------------------------- src/util/CMakeLists.txt | 2 ++ src/util/common.cc | 76 +++++++++++++++++++++++++++++++++++++++++ src/util/common.h | 17 +++++++++ 4 files changed, 98 insertions(+), 50 deletions(-) create mode 100644 src/util/common.cc diff --git a/src/replica/Registry.cc b/src/replica/Registry.cc index e6e1cce8bc..82df3f1479 100644 --- a/src/replica/Registry.cc +++ b/src/replica/Registry.cc @@ -26,21 +26,16 @@ #include "replica/Configuration.h" #include "replica/ConfigWorker.h" #include "replica/HttpClient.h" +#include "util/common.h" // LSST headers #include "lsst/log/Log.h" // Standard headers -#include #include #include -#include -#include #include -// Third-party headers -#include "boost/asio.hpp" - using namespace std; using json = nlohmann::json; @@ -50,49 +45,6 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.Registry"); string context(string const& func) { return "REGISTRY " + func + " "; } -/** - * @brief Get the FQDN of the current host. - * @note If there will be more than one canonical name associated with the host - * then the result will contain them all separated by comma, such as in: - * @code - * ,,... - * @endcode - * @return The FQDN (or FQDNs) - * @throws std::runtime_error In case if the information couldn't be retreived. - */ -string get_current_host_fqdn() { - // Get the short name of the current host first. - boost::system::error_code ec; - string const hostname = boost::asio::ip::host_name(ec); - if (ec.value() != 0) { - throw runtime_error("Registry::" + string(__func__) + - " boost::asio::ip::host_name failed: " + ec.category().name() + string(":") + - to_string(ec.value()) + "[" + ec.message() + "]"); - } - - // Get the host's FQDN(s) - struct addrinfo hints; - memset(&hints, 0, sizeof hints); - hints.ai_family = AF_UNSPEC; /* either IPV4 or IPV6 */ - hints.ai_socktype = SOCK_STREAM; /* IP */ - hints.ai_flags = AI_CANONNAME; /* canonical name */ - struct addrinfo* info; - while (true) { - int const retCode = getaddrinfo(hostname.data(), "http", &hints, &info); - if (retCode == 0) break; - if (retCode == EAI_AGAIN) continue; - throw runtime_error("Registry::" + string(__func__) + - " getaddrinfo failed: " + gai_strerror(retCode)); - } - string fqdn; - for (struct addrinfo* p = info; p != NULL; p = p->ai_next) { - if (!fqdn.empty()) fqdn += ","; - fqdn += p->ai_canonname; - } - freeaddrinfo(info); - return fqdn; -} - } // namespace namespace lsst::qserv::replica { @@ -142,7 +94,8 @@ vector Registry::workers() const { } void Registry::add(string const& name) const { - string const hostName = ::get_current_host_fqdn(); + bool const all = true; + string const hostName = util::get_current_host_fqdn(all); auto const config = _serviceProvider->config(); json const request = json::object({{"instance_id", _serviceProvider->instanceId()}, diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 6ef8eebe15..6140d9302d 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -5,6 +5,7 @@ target_sources(util PRIVATE Bug.cc CmdLineParser.cc Command.cc + common.cc ConfigStore.cc DynamicWorkQueue.cc Error.cc @@ -31,6 +32,7 @@ target_sources(util PRIVATE ) target_link_libraries(util PUBLIC + boost_system log ) diff --git a/src/util/common.cc b/src/util/common.cc new file mode 100644 index 0000000000..f129c0c7f6 --- /dev/null +++ b/src/util/common.cc @@ -0,0 +1,76 @@ +// -*- LSST-C++ -*- +/* + * LSST Data Management System + * Copyright 2015 LSST Corporation. + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +// Class header +#include "util/common.h" + +// Standard headers +#include +#include +#include +#include +#include + +// Third-party headers +#include "boost/asio.hpp" + +using namespace std; + +namespace lsst::qserv::util { + +string get_current_host_fqdn(bool all) { + // Get the short name of the current host first. + boost::system::error_code ec; + string const hostname = boost::asio::ip::host_name(ec); + if (ec.value() != 0) { + throw runtime_error("Registry::" + string(__func__) + + " boost::asio::ip::host_name failed: " + ec.category().name() + string(":") + + to_string(ec.value()) + "[" + ec.message() + "]"); + } + + // Get the host's FQDN(s) + struct addrinfo hints; + memset(&hints, 0, sizeof hints); + hints.ai_family = AF_UNSPEC; /* either IPV4 or IPV6 */ + hints.ai_socktype = SOCK_STREAM; /* IP */ + hints.ai_flags = AI_CANONNAME; /* canonical name */ + struct addrinfo* info; + while (true) { + int const retCode = getaddrinfo(hostname.data(), "http", &hints, &info); + if (retCode == 0) break; + if (retCode == EAI_AGAIN) continue; + throw runtime_error("Registry::" + string(__func__) + + " getaddrinfo failed: " + gai_strerror(retCode)); + } + string fqdn; + for (struct addrinfo* p = info; p != NULL; p = p->ai_next) { + if (!fqdn.empty()) { + if (!all) break; + fqdn += ","; + } + fqdn += p->ai_canonname; + } + freeaddrinfo(info); + return fqdn; +} + +} // namespace lsst::qserv::util diff --git a/src/util/common.h b/src/util/common.h index 5e8071431e..faf7e837a7 100644 --- a/src/util/common.h +++ b/src/util/common.h @@ -38,6 +38,23 @@ namespace lsst::qserv::util { +/** + * Get the FQDN(s) of the current host. + * + * If there will be more than one canonical name associated with the host + * and if a value of the input parameter is set as "all=true" then the result + * will contain all names separated by comma, such as in: + * @code + * ,,... + * @endcode + * @note In most setups there will be just one name. + * @param all The optional parameter that allows returning all names instead + * of the first one that was found. + * @return The FQDN (or FQDNs) + * @throws std::runtime_error In case if the information couldn't be retreived. + */ +std::string get_current_host_fqdn(bool all = false); + template typename Map::mapped_type const& getFromMap(Map const& m, typename Map::key_type const& key, typename Map::mapped_type const& defValue) { From 1b6323ceb1bb6eab495e38f21542b20fd39d95a3 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Thu, 14 Sep 2023 01:51:14 +0000 Subject: [PATCH 2/3] Fixed a bug in the result file URL generator Using FQDN of the worker's host name for the file-based protocol options XROOT and HTTP. The change allow running the application in the Kubernetes environment. --- src/wbase/Task.cc | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index c33a78f573..8f83f97259 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -37,7 +37,6 @@ // Third-party headers #include -#include "boost/asio.hpp" #include "boost/filesystem.hpp" // LSST headers @@ -51,6 +50,7 @@ #include "proto/TaskMsgDigest.h" #include "proto/worker.pb.h" #include "util/Bug.h" +#include "util/common.h" #include "util/HoldTrack.h" #include "util/IterableFormatter.h" #include "util/TimeUtils.h" @@ -69,18 +69,6 @@ namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.Task"); -string get_hostname() { - // Get the short name of the current host. - boost::system::error_code ec; - string const hostname = boost::asio::ip::host_name(ec); - if (ec.value() != 0) { - throw runtime_error("Task::" + string(__func__) + - " boost::asio::ip::host_name failed: " + ec.category().name() + string(":") + - to_string(ec.value()) + "[" + ec.message() + "]"); - } - return hostname; -} - string buildResultFilePath(shared_ptr const& taskMsg, string const& resultsDirname) { if (resultsDirname.empty()) return resultsDirname; @@ -155,14 +143,14 @@ Task::Task(TaskMsgPtr const& t, int fragmentNumber, std::shared_ptrresultDeliveryProtocol(); if (resultDeliveryProtocol != wconfig::WorkerConfig::ResultDeliveryProtocol::SSI) { _resultFilePath = ::buildResultFilePath(t, workerConfig->resultsDirname()); + auto const fqdn = util::get_current_host_fqdn(); if (resultDeliveryProtocol == wconfig::WorkerConfig::ResultDeliveryProtocol::XROOT) { - // NOTE: one extra '/' after the [:] spec is required to make + // NOTE: one extra '/' after the [:] spec is required to make // a "valid" XROOTD url. - _resultFileXrootUrl = "xroot://" + ::get_hostname() + ":" + - to_string(workerConfig->resultsXrootdPort()) + "/" + _resultFilePath; + _resultFileXrootUrl = "xroot://" + fqdn + ":" + to_string(workerConfig->resultsXrootdPort()) + + "/" + _resultFilePath; } else if (resultDeliveryProtocol == wconfig::WorkerConfig::ResultDeliveryProtocol::HTTP) { - _resultFileHttpUrl = - "http://" + ::get_hostname() + ":" + to_string(resultsHttpPort) + _resultFilePath; + _resultFileHttpUrl = "http://" + fqdn + ":" + to_string(resultsHttpPort) + _resultFilePath; } else { throw std::runtime_error("wbase::Task::Task: unsupported results delivery protocol: " + wconfig::WorkerConfig::protocol2str(resultDeliveryProtocol)); From 0a51dad3f0c0a736807e215f009d5ebf9ce61be5 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Thu, 14 Sep 2023 02:20:20 +0000 Subject: [PATCH 3/3] Fixed a bug in the result folder cleanup at the worker start up time In the original implementation of the file-based result delivery protocol, workers were attempting to clean up unclaimed files left in workers' result folders regardless of the protocol option. This clearly was a mistake for the SSI protocol option where the folder wasn't required to exist. As a result of this, the application posts confusing warnings in the logging stream. This was fixed. Another problem that was fixed was related to the protocol options HTTP and XROOT where the application wouldn't abort right away if the required folder didn't exist during the folder cleanup attempt. Not having this folder available in this scenario means a configuration error or a problem with the infrastructure. --- src/wbase/FileChannelShared.cc | 12 +++++++++--- src/xrdsvc/SsiService.cc | 4 +++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index 2254c705d7..df0d617131 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -54,10 +54,15 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.FileChannelShared"); /** * Iterate over the result files at the results folder and remove those * which satisfy the desired criteria. + * @note The folder must exist when this function gets called. Any other + * scenario means a configuration error or a problem with the infrastructure. + * Running into either of these problems should result in the abort of + * the application. * @param context The calling context (used for logging purposes). * @param fileCanBeRemoved The optional validator to be called for each candidate file. * Note that missing validator means "yes" the candidate file can be removed. * @return The total number of removed files. + * @throws std::runtime_error If the results folder doesn't exist. */ size_t cleanUpResultsImpl(string const& context, fs::path const& dirPath, function fileCanBeRemoved = nullptr) { @@ -66,9 +71,10 @@ size_t cleanUpResultsImpl(string const& context, fs::path const& dirPath, boost::system::error_code ec; auto itr = fs::directory_iterator(dirPath, ec); if (ec.value() != 0) { - LOGS(_log, LOG_LVL_WARN, - context << "failed to open the results folder " << dirPath << ", ec: " << ec << "."); - return numFilesRemoved; + string const err = context + "failed to open the results folder '" + dirPath.string() + + "', ec: " + to_string(ec.value()) + "."; + LOGS(_log, LOG_LVL_ERROR, err); + throw runtime_error(err); } for (auto&& entry : boost::make_iterator_range(itr, {})) { auto filePath = entry.path(); diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index 2d98a5df9b..9debd8a786 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -188,7 +188,9 @@ SsiService::SsiService(XrdSsiLogger* log) // ATTENTION: this is the blocking operation since it needs to be run before accepting // new queries to ensure that worker had sufficient resources to process those. if (workerConfig->resultsCleanUpOnStart()) { - wbase::FileChannelShared::cleanUpResultsOnWorkerRestart(); + if (workerConfig->resultDeliveryProtocol() != wconfig::WorkerConfig::ResultDeliveryProtocol::SSI) { + wbase::FileChannelShared::cleanUpResultsOnWorkerRestart(); + } } }