diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..f888799bd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,179 @@ +# +# $Id$ +# +# Created 2006/10/20 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# + +set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required +cmake_minimum_required(VERSION 2.4.6) + +project (KFS) + +if (DEFINED KFS_DIR_PREFIX) + message ("Kfs source dir prefix: ${KFS_DIR_PREFIX}") + set(CMAKE_MODULE_PATH ${KFS_DIR_PREFIX}cmake) +else (DEFINED KFS_DIR_PREFIX) + set(KFS_DIR_PREFIX "") + set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) +endif (DEFINED KFS_DIR_PREFIX) + +# Locate Boost +# set(Boost_LIB_DIAGNOSTIC_DEFINITIONS "-DBOOST_LIB_DIAGNOSTIC") + +if (NOT CYGWIN) + set(Boost_USE_STATIC_LIBS ON) +endif (NOT CYGWIN) + +set(Boost_USE_MULTITHREADED ON) + +IF (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + find_package(Boost COMPONENTS regex system REQUIRED) +ELSE (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + find_package(Boost COMPONENTS regex REQUIRED) +ENDIF (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + +message(STATUS "Boost-includes = ${Boost_INCLUDE_DIRS}") +message(STATUS "Boost-libs = ${Boost_LIBRARIES}") + +# Locate the path to jni.h +find_package(JNI) + +ENABLE_TESTING() + +# Change this to where the install directory is located +if 
(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "." CACHE PATH "installation directory prefix" FORCE) +endif (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + +# Build with statically linked libraries; the value for this variable has to be defined here +# overwriting whatever is in the cache. +# When set to ON, we build with statically linked libraries; when off we +# link with dynamically linked libs + +IF (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + IF (BUILD_CPU_MODE STREQUAL "32") + message (STATUS "Building 32-bit mode on Solaris") + # If we are asked to build 32 bit mode + add_definitions (-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGE_FILES) + ELSE (BUILD_CPU_MODE STREQUAL "32") + # On solaris, use 64-bit mode + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m64") + ENDIF (BUILD_CPU_MODE STREQUAL "32") + # Statically linked binaries don't work on solaris + set (USE_STATIC_LIB_LINKAGE OFF CACHE BOOL "Build binaries with statically linked libraries" FORCE) + # Cmake does whacky relink on solaris and messes things up; avoid this + set (CMAKE_SKIP_RPATH ON) +ELSE (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + set (USE_STATIC_LIB_LINKAGE ON CACHE BOOL "Build binaries with statically linked libraries" FORCE) + IF (CMAKE_SIZEOF_VOID_P MATCHES "4" AND NOT CYGWIN) + message (STATUS "Enabling largefile source flags") + add_definitions (-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_LARGE_FILES) + ENDIF (CMAKE_SIZEOF_VOID_P MATCHES "4" AND NOT CYGWIN) +ENDIF (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +IF (ENABLE_PROFILING) + message (STATUS "Enabling profiling with gprof") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pg") +ENDIF (ENABLE_PROFILING) + +# 
Darwin compilers need to be told about ports +IF (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I/opt/local/include") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L/opt/local/lib") +ENDIF (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + + +# Change the line to Release to build release binaries +# For servers, build with debugging info; for tools, build Release +# + +IF (NOT CMAKE_BUILD_TYPE) + message (STATUS "Setting build type to Debug") + set (CMAKE_BUILD_TYPE "Debug") +ENDIF (NOT CMAKE_BUILD_TYPE) + +IF (CMAKE_BUILD_TYPE STREQUAL "Release") + message(STATUS "Enabling -D NDEBUG flag") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D NDEBUG -g3") +ENDIF(CMAKE_BUILD_TYPE STREQUAL "Release") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DBOOST_SP_USE_QUICK_ALLOCATOR") +string(TOUPPER KFS_OS_NAME_${CMAKE_SYSTEM_NAME} KFS_OS_NAME) +add_definitions (-D${KFS_OS_NAME}) + +# +# Find the path to libfuse.so +# + +SET(Fuse_LIBRARY_DIR "") +IF (EXISTS "/lib64/libfuse.so") + SET(Fuse_LIBRARY_DIR "/lib64") +ELSEIF (EXISTS "/opt/local/lib/libfuse.dylib") + SET(Fuse_LIBRARY_DIR "/opt/local/lib") +ELSEIF (EXISTS "/usr/local/lib/libfuse.dylib" OR EXISTS "/usr/local/lib/libfuse_ino64.dylib") + SET(Fuse_LIBRARY_DIR "/usr/local/lib") + SET(Fuse_INCLUDE_DIR "/usr/local/include/osxfuse") +ELSEIF (EXISTS "/usr/lib/libfuse.a" OR EXISTS "/usr/lib/libfuse.so") + SET(Fuse_LIBRARY_DIR "/usr/lib") +ELSEIF (EXISTS "/lib/libfuse.a" OR EXISTS "/lib/libfuse.so") + SET(Fuse_LIBRARY_DIR "/lib") +ENDIF (EXISTS "/lib64/libfuse.so") + +if(COMMAND cmake_policy) + cmake_policy(SET CMP0003 NEW) +endif(COMMAND cmake_policy) + +# include dirs +include_directories( ${Boost_INCLUDE_DIRS} ${KFS_DIR_PREFIX}src/cc) + +# get the subdirs we want +add_subdirectory (${KFS_DIR_PREFIX}src/cc/common src/cc/common) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/meta src/cc/meta) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/chunk src/cc/chunk) +add_subdirectory 
(${KFS_DIR_PREFIX}src/cc/libclient src/cc/libclient) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/kfsio src/cc/kfsio) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/tools src/cc/tools) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/devtools src/cc/devtools) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/tests src/cc/tests) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/emulator src/cc/emulator) +add_subdirectory (${KFS_DIR_PREFIX}src/test-scripts src/test-scripts) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/qcdio src/cc/qcdio) +add_subdirectory (${KFS_DIR_PREFIX}src/cc/qcrs src/cc/qcrs) + +add_subdirectory (${KFS_DIR_PREFIX}examples/cc examples/cc) + +IF (NOT ${JAVA_INCLUDE_PATH} STREQUAL "") + message(STATUS "Found JNI...building kfs_access") + include_directories ( ${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2} ) + add_subdirectory (${KFS_DIR_PREFIX}src/cc/access src/cc/access) +ENDIF (NOT ${JAVA_INCLUDE_PATH} STREQUAL "") + +IF (NOT ${Fuse_LIBRARY_DIR} STREQUAL "") + message(STATUS "Found fuse") + include_directories ( ${Fuse_INCLUDE_DIR} ) + add_subdirectory (${KFS_DIR_PREFIX}src/cc/fuse src/cc/fuse) +ENDIF (NOT ${Fuse_LIBRARY_DIR} STREQUAL "") diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..419c29d0f --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@ +# $Id$ +# +# Created 2012/07/27 +# Author: Mike Ovsiannikov +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# Do not assume gnumake -- keep it as simple as possible + +release: + cd build && \ + { test -d release || mkdir release; } && \ + cd release && \ + cmake -D CMAKE_BUILD_TYPE=RelWithDebInfo ../.. 
&& \ + make install + if test -x "`which ant 2>/dev/null`"; then ant jar; fi + if test -x "`which python 2>/dev/null`"; then \ + cd build/release && python ../../src/cc/access/kfs_setup.py build; fi + # cd build/release && make test + cd build/release && ../../src/test-scripts/kfstest.sh + +tarball: release + cd build && \ + tar -cvf kfs.tar -C ./release ./bin ./lib ./include && \ + tar -rvf kfs.tar -C ../ ./scripts ./webui ./examples ./benchmarks && \ + gzip kfs.tar + +debug: + cd build && \ + { test -d debug || mkdir debug; } && \ + cd debug && \ + cmake ../.. && \ + make install + if test -x "`which ant 2>/dev/null`"; then ant jar; fi + if test -x "`which python 2>/dev/null`"; then \ + cd build/debug && python ../../src/cc/access/kfs_setup.py build; fi + # cd build/debug && make test + cd build/debug && ../../src/test-scripts/kfstest.sh + +clean: + rm -rf build/release build/debug build/classes build/kfs-*.jar build/*.tar.gz diff --git a/benchmarks/mstress/MStress_Client.java b/benchmarks/mstress/MStress_Client.java new file mode 100644 index 000000000..4cede8e87 --- /dev/null +++ b/benchmarks/mstress/MStress_Client.java @@ -0,0 +1,445 @@ +/** + * $Id$ + * + * Author: Thilee Subramaniam + * + * Copyright 2012 Quantcast Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy + * of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * This Java client performs filesystem meta operations on the Hadoop namenode + * using HDFS DFSClient. 
+ */ + +import java.io.BufferedReader; +import java.io.DataInputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.InetSocketAddress; +import java.util.Arrays; +import java.util.Collections; +import java.util.Date; +import java.util.LinkedList; +import java.util.Queue; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSClient; +import org.apache.hadoop.hdfs.protocol.DirectoryListing; +import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; + +public class MStress_Client +{ + static final String TEST_BASE_DIR = new String("/mstress"); + + static DFSClient dfsClient_ = null; + static StringBuilder path_ = new StringBuilder(4096); + static int pathLen_ = 0; + static int totalCreateCount = 0; + static final int COUNT_INCR = 500; + + //From commandline + static String dfsServer_ = ""; + static int dfsPort_ = 0; + static String testName_ = ""; + static String prefix_ = ""; + static int prefixLen_ = 0; + static String planfilePath_ = ""; + static String hostName_ = ""; + static String processName_ = ""; + + //From plan file + static String type_ = ""; + static int levels_ = 0; + static int inodesPerLevel_ = 0; + static int pathsToStat_ = 0; + + private static void pathPush(String leafStr) { + int leafLen = leafStr.length(); + if (leafLen == 0) { + return; + } + if (leafStr.charAt(0) != '/') { + path_.insert(pathLen_, "/"); + System.out.printf("Leaf = %s, path_ = [%s]\n", leafStr, path_.toString()); + pathLen_ ++; + } + path_.insert(pathLen_, leafStr); + System.out.printf("After push Leaf = %s, path_ = [%s]\n", leafStr, path_.toString()); + pathLen_ += leafLen; + } + + private static void pathPop(String leafStr) { + int leafLen = leafStr.length(); + if (leafLen > pathLen_ - 1) { + System.out.printf("Error in pop: %s from %s, leafLen = %d, pathLen_ = %d\n", leafStr, path_.toString(), leafLen, pathLen_); + return; + } + String lastPart = 
path_.substring(pathLen_ - leafLen, pathLen_); + System.out.printf("lastPart = [%s - %s] leafStr = [%s - %s]\n", lastPart, lastPart.getClass().getName(), leafStr, leafStr.getClass().getName()); + + if (!leafStr.equals(lastPart)) { + System.out.printf("Error in pop: %s from %s\n", leafStr, path_.toString()); + System.exit(1); + return; + } + pathLen_ -= leafLen + 1; + path_.insert(pathLen_, '\0'); + System.out.printf("After pop, path_ = [%s]\n", path_.toString()); + } + + private static void pathReset() { + path_.insert(0, '\0'); + pathLen_ = 0; + } + + + public static void main(String args[]) { + parseOptions(args); + + try { + Configuration conf = new Configuration(true); + String confSet = "hdfs://" + dfsServer_ + ":" + dfsPort_; + conf.set("fs.default.name", confSet); + conf.set("fs.trash.interval", "0"); + InetSocketAddress inet = new InetSocketAddress(dfsServer_, dfsPort_); + dfsClient_ = new DFSClient(inet, conf); + + if (parsePlanFile() < 0) { + return; + } + + if (testName_.equals("create")) { + createDFSPaths(); + } else if (testName_.equals("stat")) { + statDFSPaths(); + } else if (testName_.equals("readdir")) { + listDFSPaths(); + } else if (testName_.equals("delete")) { + removeDFSPaths(); + } else { + System.out.printf("Error: unrecognized test \'%s\'\n", testName_); + } + } catch( IOException e) { + e.printStackTrace(); + } + return; + } + + private static void parseOptions(String args[]) + { + if (!(args.length == 14 || args.length == 12 || args.length == 5)) { + usage(); + } + + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-s") && i+1 < args.length) { + dfsServer_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-p") && i+1 < args.length) { + dfsPort_ = Integer.parseInt(args[i+1]); + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-t") && i+1 < args.length) { + testName_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-a") && i+1 < args.length) { + 
planfilePath_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-c") && i+1 < args.length) { + hostName_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-n") && i+1 < args.length) { + processName_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } else if (args[i].equals("-P") && i+1 < args.length) { + prefix_ = args[i+1]; + System.out.println(args[i+1]); + i++; + } + } + + if (dfsServer_.length() == 0 || + testName_.length() == 0 || + planfilePath_.length() == 0 || + hostName_.length() == 0 || + processName_.length() == 0 || + dfsPort_ == 0) { + usage(); + } + if (prefix_ == null) { + prefix_ = new String("PATH_"); + } + prefixLen_ = prefix_.length(); + } + + private static void usage() + { + String className = MStress_Client.class.getName(); + System.out.printf("Usage: java %s -s dfs-server -p dfs-port [-t [create|stat|readdir|rmdir] -a planfile-path -c host -n process-name -P prefix]\n", + className); + System.out.printf(" -t: this option requires -a, -c, and -n options.\n"); + System.out.printf(" -P: default prefix is PATH_.\n"); + System.out.printf("eg:\n"); + System.out.printf(" java %s -s -p -t create -a -c localhost -n Proc_00\n", className); + System.exit(1); + } + + private static int parsePlanFile() + { + int ret = -1; + try { + FileInputStream fis = new FileInputStream(planfilePath_); + DataInputStream dis = new DataInputStream(fis); + BufferedReader br = new BufferedReader(new InputStreamReader(dis)); + + if (prefix_.isEmpty()) { + prefix_ = "PATH_"; + } + + String line; + while ((line = br.readLine()) != null) { + if (line.length() == 0 || line.startsWith("#")) { + continue; + } + if (line.startsWith("type=")) { + type_ = line.substring(5); + continue; + } + if (line.startsWith("levels=")) { + levels_ = Integer.parseInt(line.substring(7)); + continue; + } + if (line.startsWith("inodes=")) { + inodesPerLevel_ = Integer.parseInt(line.substring(7)); + continue; + } + if 
(line.startsWith("nstat=")) { + pathsToStat_ = Integer.parseInt(line.substring(6)); + continue; + } + } + dis.close(); + if (levels_ > 0 && !type_.isEmpty() && inodesPerLevel_ > 0 && pathsToStat_ > 0) { + ret = 0; + } + } catch (Exception e) { + System.out.println("Error: " + e.getMessage()); + } + return ret; + } + + private static long timeDiffMilliSec(Date alpha, Date zigma) + { + return zigma.getTime() - alpha.getTime(); + } + + private static void CreateDFSPaths(int level, String parentPath) { + Boolean isLeaf = false; + Boolean isDir = false; + if (level + 1 >= levels_) { + isLeaf = true; + } + if (isLeaf) { + if (type_.equals("dir")) { + isDir = true; + } else { + isDir = false; + } + } else { + isDir = true; + } + + for (int i = 0; i < inodesPerLevel_; i++) { + String path = parentPath + "/" + prefix_ + Integer.toString(i); + //System.out.printf("Creating (isdir=%b) [%s]\n", isDir, path.toString()); + + if (isDir) { + try { + dfsClient_.mkdirs(path); + totalCreateCount ++; + if (totalCreateCount % COUNT_INCR == 0) { + System.out.printf("Created paths so far: %d\n", totalCreateCount); + } + if (!isLeaf) { + CreateDFSPaths(level+1, path); + } + } catch( IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + } else { + try { + dfsClient_.create(path, true); + totalCreateCount ++; + if (totalCreateCount % COUNT_INCR == 0) { + System.out.printf("Created paths so far: %d\n", totalCreateCount); + } + } catch( IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + } + } + } + + private static int createDFSPaths() + { + String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; + try { + Boolean ret = dfsClient_.mkdirs(basePath); + if (!ret) { + System.out.printf("Error: failed to create test base dir [%s]\n", basePath); + return -1; + } + } catch( IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + + Date alpha = new Date(); + + CreateDFSPaths(0, basePath); + + Date 
zigma = new Date(); + System.out.printf("Client: %d paths created in %d msec\n", totalCreateCount, timeDiffMilliSec(alpha, zigma)); + return 0; + } + + private static int statDFSPaths() + { + String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; + + Date alpha = new Date(); + Random random = new Random(alpha.getTime()); + + for (int count = 0; count < pathsToStat_; count++) { + String path = basePath; + for (int d = 0; d < levels_; d++) { + int randIdx = random.nextInt(inodesPerLevel_); + String name = new String(prefix_) + Integer.toString(randIdx); + path = path + "/" + name; + } + + //System.out.printf("Doing stat on [%s]\n", path); + HdfsFileStatus stat = null; + try { + stat = dfsClient_.getFileInfo(path); + } catch(IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + if (count % COUNT_INCR == 0) { + System.out.printf("Stat paths so far: %d\n", count); + } + } + Date zigma = new Date(); + System.out.printf("Client: Stat done on %d paths in %d msec\n", pathsToStat_, timeDiffMilliSec(alpha, zigma)); + return 0; + } + + private static int listDFSPaths() + { + Date alpha = new Date(); + int inodeCount = 0; + + String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; + Queue pending = new LinkedList(); + pending.add(basePath); + + while (!pending.isEmpty()) { + String parent = pending.remove(); + DirectoryListing thisListing; + try { + thisListing = dfsClient_.listPaths(parent, HdfsFileStatus.EMPTY_NAME); + if (thisListing == null || thisListing.getPartialListing().length == 0) { + //System.out.println("Empty directory"); + continue; + } + do { + HdfsFileStatus[] children = thisListing.getPartialListing(); + for (int i = 0; i < children.length; i++) { + String localName = children[i].getLocalName(); + //System.out.printf("Readdir going through [%s/%s]\n", parent, localName); + if (localName.equals(".") || localName.equals("..")) { + continue; + } + inodeCount ++; + if (inodeCount % 
COUNT_INCR == 0) { + System.out.printf("Readdir paths so far: %d\n", inodeCount); + } + if (children[i].isDir()) { + pending.add(parent + "/" + localName); + } + } + if (!thisListing.hasMore()) { + break; + } else { + //System.out.println("Remaining entries " + Integer.toString(thisListing.getRemainingEntries())); + } + thisListing = dfsClient_.listPaths(parent, thisListing.getLastName()); + } while (thisListing != null); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + } + + Date zigma = new Date(); + System.out.printf("Client: Directory walk done over %d inodes in %d msec\n", inodeCount, timeDiffMilliSec(alpha, zigma)); + return 0; + } + + private static int removeDFSPaths() + { + String rmPath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; + + System.out.printf("Deleting %s ...\n", rmPath); + + int countLeaf = (int) Math.round(Math.pow(inodesPerLevel_, levels_)); + int[] leafIdxRangeForDel = new int[countLeaf]; + for (int i = 0; i < countLeaf; i++) { + leafIdxRangeForDel[i] = i; + } + + Date alpha = new Date(); + try { + // NOTE(review): this loop header and path-building setup were reconstructed from a + // garbled diff (angle-bracket content was stripped); verify against the upstream file. + for (int del = 0; del < countLeaf; del++) { + int idx = leafIdxRangeForDel[del]; + String path = ""; + for (int level = 0; level < levels_; level++) { + int delta = idx % inodesPerLevel_; + idx = idx / inodesPerLevel_; + if (level > 0) { + path = prefix_ + delta + "/" + path; + } else { + path = prefix_ + delta; + } + } + dfsClient_.delete(rmPath + "/" + path,true); + } + dfsClient_.delete(rmPath, true); + } catch(IOException e) { + e.printStackTrace(); + throw new RuntimeException(); + } + Date zigma = new Date(); + System.out.printf("Client: Deleted %s. Delete took %d msec\n", rmPath, timeDiffMilliSec(alpha, zigma)); + return 0; + } +} diff --git a/benchmarks/mstress/Makefile b/benchmarks/mstress/Makefile new file mode 100644 index 000000000..a87e952e8 --- /dev/null +++ b/benchmarks/mstress/Makefile @@ -0,0 +1,57 @@ +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# + + +UNAME := $(shell uname) +CC = g++ +KFS_BUILD_INCLUDE?=../../build/include +KFS_BUILD_STATLIB?=../../build/lib/static +INCLUDES = -I. -I$(KFS_BUILD_INCLUDE)/kfs +CCFLAGS = -g -O2 $(INCLUDES) +LIBS = -lkfs_tools -lkfs_client -lkfs_qcdio -lpthread -lkfs_io -lkfs_common -lkfs_qcdio -lpthread -lz -lcrypto -lkfs_qcrs -lboost_regex +ifneq ($(UNAME), Darwin) +LIBS += -lrt +endif +LDFLAGS = -L$(KFS_BUILD_STATLIB) -L$(BOOST_LIBRARY_DIR) $(LIBS) + +javaC = javac +javaR = java +javaCP = -cp + +default: ccclient javaclient + +ccclient: mstress_client.cc + $(CC) $(CCFLAGS) mstress_client.cc $(LDFLAGS) -o mstress_client + +javaclient: MStress_Client.java + $(javaC) $(javaCP) $(shell echo mstress_hdfs_client_jars/*.jar | sed 's/ /:/g') MStress_Client.java + +run_ccclient: ccclient + ./mstress_client -h + +run_javaclient: javaclient + $(javaR) $(javaCP) .:$(shell echo mstress_hdfs_client_jars/*.jar | sed 's/ /:/g') MStress_Client -h + +clean: + rm -f *.o mstress_client *.class + rm -rf *.dSYM + diff --git a/benchmarks/mstress/README b/benchmarks/mstress/README new file mode 100644 index 000000000..7b6e0fa6a --- /dev/null +++ b/benchmarks/mstress/README @@ -0,0 +1,312 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# + + +MSTRESS : A framework for metaserver/namenode benchmarking +========================================================== + +Contents: + [1] Framework description + [2] Files in this direcotry + [3] Running benchmark + [4] Setting up DFS metaserver/namenode + + +[1] Framework +============= + +The mstress master would invoke mstress.py in slave mode on the client hosts +through SSH. + +Each mstress slave would invoke the necessary number of load-generating clients, +which would stress the meta server. + + +-----------------------------+ + | +-------------------+ | + | | mstress.py +-----+----------------------+ + | | (--mode master) +-----+------------------+ | + | +-------------------+ | | | + | (master host) | | | + +-----------------------------+ | | + | | + +--------------------------------------+ | | + | | | | + +-----------+ | +--------------+ +--------------+ | | | + | |<---------+-|mstress_client|<--| mstress.py |<-+--+ | + | | | +--------------+ |(--mode slave)| | | + | DFS meta | | +--------------+ | | + | server | | (client host 1) | | + | | +--------------------------------------+ | + | | | + | | | + | | +--------------------------------------+ | + | | | +--------------+ +--------------+ | | + | |<-----------|mstress_client|<--| mstress.py |<-+------+ + +-----------+ | +--------------+ |(--mode slave)| | + | +--------------+ | + | (client host 2) | + +--------------------------------------+ + +The clients will do file or directory tree creation, stat, or directory walk as +specified by the benchmark plan. 
+ + + +[2] Files +========= + + - mstress_initialize.sh + Helper script to be used before compiling the source and deploying the + mstress bundle. + Do ./mstress_initialize.sh --h to see options. + + - Makefile + Used to build the KFS stress client (C++) and HDFS stress client (Java). + Ensure that $JAVA_HOME is set correctly. + + - mstress_client.cc + Produces the mstress_client binary that actually drives the KFS metaserver. + Build using the provided Makefile ('make ccclient') + See 'Benchmarking Procedure' below for details. + + - MStress_Client.java + Produces the java MStress_Client for HDFS namenode. + Build using the provided Makefile ('make javaclient') + See 'Benchmarking Procedure' below for details. + + - mstress_prepare_master_clients.sh + Helper script used to copy the mstress directory to a list of hosts. To be + used after running make. + + - mstress_plan.py + Used to generate a plan file for benchmarking. + Args: client hosts list, number of clients per client host, file tree depth, + nodes per level etc. + The generated plan file is also copied to the /tmp firectory of the + participating client hosts. + Do ./mstress_plan.py --help to see all options. + + - mstress.py + Used to run the metaserver test with the help of the plan file. + Args: dfs server host & port, planfile etc. + This script invokes mstress.py on the remote host through SSH. For this + reason, the mstress path should be the same on the participating hosts. + Do ./mstress.py --help to see all options. + + - mstress_run.py + Essentially a wrapper around mstress_plan.py and mstress.py + Args: client hosts list and DFS server:port information. + Do mstress_run.py --help to see usage. + + - mstress_sample_run.sh + Used to run sample benchmarks on given KFS and HDFS servers by launching + clients on localhost. Essentially a wrapper around mstress_initialize.sh, + make, mstress_prepare_master_clients.sh, and mstress.run.py. 
+ + - mstress_cleanup.py + Used to clean up the plan files and log files created on participating + hosts. + Do ./mstress_cleanup.py --help to see usage. + + + +[3] Benchmarking Procedure +========================== + +In reality, benchmark would use separate physical machines each for compiling, +running the DFS server, running mstress master, and load generating clients. +The procedure below assumes different machines, but one can also run all +on the same box, "localhost". + + +(1) Setup the KFS metaserver and HDFS namenode with the help of + section [4] "Setting up DFS metaserver/namenode" below. + + +(2) You should have SSH key authentication set up on the hosts involved so + that the scripts can do password/passphrase-less login. + + +(3) On the build machine, ensure that you have the Cloudera HDFS client jars. + This is typically at /usr/lib/hadoop/client/*.jars. + If you don't have them, install them by, + 1. Add the following to /etc/yum.repos.d/thirdparty.repo (sudo needed) + ----------------------------------- + [cloudera-cdh4] + name=Cloudera's Distribution for Hadoop, Version 4 + baseurl=http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/4/ + gpgkey = http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera + gpgcheck = 1 + ----------------------------------- + 2. sudo yum install hadoop-client + + +(4) On the build host, execute 'mstress_initialize.sh' to set up jar paths. + ./mstress_initialize.sh /usr/lib/hadoop/client/ + + +(5) On the build host, compile and install KFS using the steps described in the + DeveloperDoc in top-level 'doc' directory. Then change directory to + bench/mstress, and just issuing 'make' should build the Java/C++ clients. + + To manually build C++ client: + - Assuming the kfs code is in ~/code/kfs, compile and install KFS using + the steps described in the DeveloperDoc in top-level 'doc' directory. 
+ - cd ~/code/kfs/bench/mstress + - KFS_BUILD_INCLUDE=~/code/kfs/build/include \ + KFS_BUILD_STATLIB=~/code/kfs/build/lib/static \ + BOOST_LIBRARY_DIR=/opt/local/lib \ + make + If you encounter any build problem, ensure that your KFS_BUILD_INCLUDE etc. + refer to valid paths. + + + To manually build MStress_Client.java + - Compile MStress_client.java with hadoop-client jars in the class path. + theCP=$(echo mstress_hdfs_client_jars/*.jar | sed 's/ /:/g') + javac -cp $theCP MStress_Client.java + + +(6) Determine the master and load generating client hosts that you want to use + to connect to the DFS server. This could just be "localhost" if you want to + run the benchmark locally. + + +(7) From the build host, use "mstress_prepare_master_clients.sh" to copy your + mstress directory to the participating hosts. + Note: Do './mstress_prepare_master_clients.sh localhost' localhost-only run. + The mstress directory paths should be the same on master and client hosts. + + +(8) On the master host change directory to ~/mstress + Create a plan file using mstress_plan.py. + Do ./mstress_plan.py --help to see example usage. + Eg: + ./mstress_plan.py -c localhost,127.0.0.1 -n 3 -t file -l 2 -i 10 -n 139 + + This will create a plan that creates 2 levels of 10 inodes each by 3 + processes on 2 hosts. Since each client creates 110 inodes (10 directories + with 10 files each) and since there are 6 clients (3 x 2), this plan is to + create 660 inodes on the DFS server. + + The planfile will pick N files to stat per client such that + (N x client-host-count x clients-per-host) is just enough to meet 139. + + The plan file gets copied to the /tmp directory where you run it. It will + also get copied to the participating client hosts in the '-c' option. + + +(9) Checklist: check the presence of, + - the plan file on master host and client hosts (step 8 does this for you) + - the mstress_client binaries (KFS and HDFS clients) on master and all + client hosts (step 7). 
+ +(10) Run the benchmark from the master with mstress.py. + Do ./mstress.py --help to see options. + Eg: + ./mstress.py -f kfs -s -p -a + ./mstress.py -f hdfs -s -p -a + +(11) The benchmark name, progress, and time taken will be printed out. + + +[4] DFS Server Setup +==================== + +[4.1] KFS Metaserver Setup +------------------------- + +You can setup the KFS metaserver using the steps described in AdminDoc in the +top-level 'doc' directory. + +If you want to set up a simple metaserver for local testing, please use the +script ~/code/kfs/examples/sampleservers/sample_setup.py. + + +[4.2] HDFS Namenode Setup +------------------------- + +This will setup the HDFS namenode to listen on port 40000. +The webUI will run on default port 50070. +The installation used here is based on Cloudera's CDH4 release. + +(1) Ensure java is installed, and $JAVA_HOME is set. + +(2) Add the following to /etc/yum.repos.d/thirdparty.repo (sudo needed) + ----------------------------------- + [cloudera-cdh4] + name=Cloudera's Distribution for Hadoop, Version 4 + baseurl=http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/4/ + gpgkey = http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera + gpgcheck = 1 + ----------------------------------- + +(3) Install hadoop-hdfs-namenode and update the configs. + sudo yum install hadoop-hdfs-namenode + sudo mv /etc/hadoop/conf /etc/hadoop/conf.orig + sudo cp -r /etc/hadoop/conf.empty /etc/hadoop/conf + +(4) Update /etc/hadoop/conf/core-site.xml (enter your server name instead + of 10.20.30.255) + ---------------------------------- + + + fs.default.name + hdfs://10.20.30.255:40000 + + + ---------------------------------- + +(5) Edit /etc/hadoop/conf/hdfs-site.xml, fix or ensure that there is + a "file://" prefix to avoid warnings. 
+ ---------------------------------- + + + dfs.name.dir + file:///var/lib/hadoop-hdfs/cache/hdfs/dfs/name + + + ---------------------------------- + +(6) Format the namenode: + sudo service hadoop-hdfs-namenode init + +(7) Start namenode. + sudo service hadoop-hdfs-namenode start + +(8) Now namenode should be running. Confirm this by running, + ps aux | grep java + sudo netstat -pan | grep 40000 + +(9) To administer the files and directories, + /usr/lib/hadoop/bin/hadoop fs -ls / + +(10) The user with write access on this namenode is "hdfs". Therefore, give + write permission to "/" folder (for mstress benchmark to use) by logging + in as "hdfs" user. + sudo bash + su hdfs + JAVA_HOME= /usr/lib/hadoop/bin/hadoop fs -chmod 777 / + exit + +(11) Now the namenode is ready for running benchmarks. + diff --git a/benchmarks/mstress/mstress.py b/benchmarks/mstress/mstress.py new file mode 100755 index 000000000..de714d08f --- /dev/null +++ b/benchmarks/mstress/mstress.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python + +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This program invokes given number of client processes on the given set of +# remote clients (Java and C++) and makes use of the plan file to apply load +# on the DFS server. 
+ +import optparse +import sys +import subprocess +import time +import os +import signal +import datetime +import commands +import resource +import re + +class Globals: + MASTER_PATH = '' + CLIENT_PATH = '' + MSTRESS_LOCK = '/tmp/mstress_master.lock' + SIGNALLED = False + SERVER_CMD = "" + SERVER_KEYWORD = "" + KFS_SERVER_CMD = "metaserver" + KFS_SERVER_KEYWORD = "metaserver" + HDFS_SERVER_CMD = "java" + HDFS_SERVER_KEYWORD = "NameNode" + + +def ParseCommandline(): + parser = optparse.OptionParser() + parser.add_option('-m', '--mode', + action='store', + default='master', + type='string', + help='Run as master or slave') + parser.add_option('-f', '--filesystem', + action='store', + default=None, + type='string', + help='Filesystem whose metaserver to test. kfs or hdfs.') + parser.add_option('-s', '--server', + action='store', + default=None, + type='string', + help='Metaserver or Namenode hostname.') + parser.add_option('-p', '--port', + action='store', + default=None, + type='int', + help='Metaserver or Namenode port') + parser.add_option('-c', '--client-hostname', + action='store', + default=None, + type='string', + help='mstress slave\'s hostname (slave only option).') + parser.add_option('-k', '--client-lookup-key', + action='store', + default=None, + type='string', + help='mstress slave\'s lookup key to be used (slave only option).') + parser.add_option('-t', '--client-testname', + action='store', + default=None, + type='string', + help='Test to run on mstress slave (slave only option).') + parser.add_option('-a', '--plan', + action='store', + default=None, + type='string', + help='Plan file containing client instructions.') + parser.add_option('-l', '--leave-files', action='store_true', + default=False, help='Leave files. Does not perform delete test.') + + opts, args = parser.parse_args() + if args: + sys.exit('Unexpected arguments: %s.' 
% str(args)) + + if not opts.filesystem or not opts.server or not opts.port or not opts.plan: + sys.exit('Missing mandatory arguments.') + if opts.mode not in ('master', 'slave'): + sys.exit('Invalid mode.') + if opts.mode == 'master': + # master should not have -c option + if opts.client_hostname is not None: + sys.exit('Master: does not support -c option.') + if opts.client_testname is not None: + sys.exit('Master: does not support -t option.') + else: + # for slave, this is the slave host name. + hosts = opts.client_hostname.split(',') + if len(hosts) != 1: + sys.exit('Slave: Error in client host name.') + if opts.client_testname is None or opts.client_lookup_key is None: + sys.exit('Slave: Error in client test name or lookup key.') + + return opts + + +def PrintMemoryUsage(opts): + proc = subprocess.Popen(['ssh', opts.server, + 'ps -C %s -o rss,pid,cmd |grep %s'%(Globals.SERVER_CMD,Globals.SERVER_KEYWORD)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + print "Memory usage %dKB" % int(re.search(r'^(\d+)\s+', proc.communicate()[0]).group(1)) + + +def RunMStressMaster(opts, hostsList): + """ Called when run in master mode. Calls master funcions for 'create', + 'stat', and 'readdir'. + + Args: + opts: options object, from parsed commandine options. + hostsList: list of hosts obtained from plan file. 
+ + Returns: + None + """ + + # print 'Master: called with %r, %r' % (opts, hostsList) + + startTime = datetime.datetime.now() + if RunMStressMasterTest(opts, hostsList, 'create') == False: + return + deltaTime = datetime.datetime.now() - startTime + print '\nMaster: Create test took %d.%d sec' % (deltaTime.seconds, deltaTime.microseconds/1000000) + PrintMemoryUsage(opts) + print '==========================================' + + startTime = datetime.datetime.now() + if RunMStressMasterTest(opts, hostsList, 'stat') == False: + return + deltaTime = datetime.datetime.now() - startTime + print '\nMaster: Stat test took %d.%d sec' % (deltaTime.seconds, deltaTime.microseconds/1000000) + print '==========================================' + + startTime = datetime.datetime.now() + if RunMStressMasterTest(opts, hostsList, 'readdir') == False: + return + deltaTime = datetime.datetime.now() - startTime + print '\nMaster: Readdir test took %d.%d sec' % (deltaTime.seconds, deltaTime.microseconds/1000000) + print '==========================================' + + if opts.leave_files: + print "\nNot deleting files because of -l option" + return + + startTime = datetime.datetime.now() + if RunMStressMasterTest(opts, hostsList, 'delete') == False: + return + deltaTime = datetime.datetime.now() - startTime + print '\nMaster: Delete test took %d.%d sec' % (deltaTime.seconds, deltaTime.microseconds/1000000) + print '==========================================' + + +def RunMStressMasterTest(opts, hostsList, test): + """ Called when run in master mode. Invokes the slave version of the same + program on the provided hosts list with the given test name. + + Args: + opts: parsed commandline options. + hostsList: list of hosts obtained from plan file. + test: string: test name to call. + + Returns: + False on error, True on success + """ + if Globals.SIGNALLED: + return False + + # invoke remote master client. 
+ ssh_cmd = '%s -m slave -f %s -s %s -p %d -t %s -a %s'%( + Globals.MASTER_PATH, + opts.filesystem, + opts.server, + opts.port, + test, + opts.plan) + clientHostMapping = MapHostnameForTest(hostsList, test) + running_procs = {} + + for client in hostsList: + slaveLogfile = opts.plan + '_' + client + '_' + test + '_' + opts.filesystem + '.slave.log' + p = subprocess.Popen(['/usr/bin/ssh', client, + '%s -c %s -k %s >& %s' % (ssh_cmd, client, clientHostMapping[client], slaveLogfile)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + running_procs[p] = client + + isLine1 = True + while running_procs: + tobedelkeys = [] + for proc in running_procs.iterkeys(): + client = running_procs[proc] + retcode = proc.poll() + if retcode is not None: + sout,serr = proc.communicate() + if sout: + print '\nMaster: output of slave (%s):%s' % (client, sout) + if serr: + print '\nMaster: err of slave (%s):%s' % (client, serr) + tobedelkeys.append(proc) + else: + if Globals.SIGNALLED: + proc.terminate() + + for k in tobedelkeys: + del running_procs[k] + + if running_procs: + if isLine1: + sys.stdout.write('Master: remote slave running \'%s\'' % test) + isLine1 = False + else: + sys.stdout.write('.') + sys.stdout.flush() + time.sleep(0.5) + return True + +def MapHostnameForTest(clients, test): + """ Determines the '-c' argument to use for slave invocation. This argument + is passed to the C++/Java client so that the client can use it as a key + to read the plan file. + + For 'create', this name is the same as the client name. But for doing + a 'stat' or a 'readdir' we want to run the tests on a client different + from the one that created the path. + Args: + clients: list of strings, clients. + test: string, the name of the test. + + Returns: + map of strings, client name to '-c' argument. 
+ """ + mapping = {} + length = len(clients) + for i in range(0, length): + if test == 'stat' or test == 'readdir': + mapping[clients[i]] = clients[(i+1)%length] + else: + mapping[clients[i]] = clients[i] + + return mapping + +def RunMStressSlave(opts, clientsPerHost): + """ Called when the code is run in slave mode, on each slave. + Invokes number of client processes equal to 'clientsPerHost'. + + Args: + opts: parsed commandline options. + clientsPerHost: integer, number of processes to run on each host. + + Returns: + None + """ + + print 'Slave: called with %r, %d' % (opts, clientsPerHost) + os.putenv('KFS_CLIENT_DEFAULT_FATTR_REVALIDATE_TIME',"-1") + + running_procs = [] + for i in range(0, clientsPerHost): + clientLogfile = '%s_%s_proc_%02d_%s_%s.client.log' % (opts.plan, opts.client_hostname, i, opts.client_testname, opts.filesystem) + args = ["%s -s %s -p %s -a %s -c %s -t %s -n proc_%02d >& %s" % ( + Globals.CLIENT_PATH, + opts.server, + str(opts.port), + opts.plan, + opts.client_lookup_key, + opts.client_testname, + i, + clientLogfile)] + print 'Slave: args = %r' % args + p = subprocess.Popen(args, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + running_procs.append(p) + + isLine1 = True + while running_procs: + for proc in running_procs: + ret = proc.poll() + if ret is not None: + sout,serr = proc.communicate() + if sout: + print '\nSlave: output of (ClientHost %s, ClientNo %r):%s' % (opts.client_hostname, proc, sout) + if serr: + print '\nSlave: err of (ClientHost %s, ClientNo %r):%s' % (opts.client_hostname, proc, serr) + running_procs.remove(proc) + else: + if Globals.SIGNALLED: + proc.terminate() + + if running_procs: + if isLine1: + sys.stdout.write('Slave: load client \'%s\' running' % opts.client_testname) + isLine1 = False + else: + sys.stdout.write('.') + sys.stdout.flush() + time.sleep(0.5) + + +def ReadPlanFile(opts): + """ Reads the given plan file to extract the list of client-hosts and + process-count per client-host. 
+ + Args: + opts: parsed commandline options. + + Returns: + hostslist: list of client host names + clientsPerHost: integer: client processes per client host. + """ + + hostsList = None + clientsPerHost = None + leafType = None + numLevels = None + numToStat = None + nodesPerLevel = None + + planfile = open(opts.plan, 'r') + for line in planfile: + if line.startswith('#'): + continue + if line.startswith('hostslist='): + hostsList = line[len('hostslist='):].strip().split(',') + elif line.startswith('clientsperhost='): + clientsPerHost = int(line[len('clientsperhost='):].strip()) + elif line.startswith('type='): + leafType = line[len('type='):].strip() + elif line.startswith('levels='): + numLevels = int(line[len('levels='):].strip()) + elif line.startswith('nstat='): + numToStat = int(line[len('nstat='):].strip()) + elif line.startswith('inodes='): + nodesPerLevel = int(line[len('inodes='):].strip()) + planfile.close() + if None in (hostsList, clientsPerHost, leafType, numLevels, numToStat, nodesPerLevel): + sys.exit('Failed to read plan file') + + nodesPerProcess = 0 + leafNodesPerProcess = 0 + for l in range(1,numLevels+1): + nodesPerProcess += pow(nodesPerLevel,l) + if l == numLevels: + leafNodesPerProcess = pow(nodesPerLevel,l) + inters = nodesPerProcess - leafNodesPerProcess + overallNodes = nodesPerProcess * len(hostsList) * clientsPerHost + overallLeafs = leafNodesPerProcess * len(hostsList) * clientsPerHost + intermediateNodes = inters * len(hostsList) * clientsPerHost + len(hostsList) * clientsPerHost + 1 + totalNumToStat = numToStat * len(hostsList) * clientsPerHost + + print ('Plan:\n' + + ' o %d client processes on each of %d hosts will generate load.\n' % (clientsPerHost, len(hostsList)) + + ' o %d levels of %d nodes (%d leaf nodes, %d total nodes) will be created by each client process.\n' % (numLevels, nodesPerLevel, leafNodesPerProcess, nodesPerProcess) + + ' o Overall, %d leaf %ss will be created, %d intermediate directories will be created.\n' % 
(overallLeafs, leafType, intermediateNodes) + + ' o Stat will be done on a random subset of %d leaf %ss by each client process, totalling %d stats.\n' % (numToStat, leafType, totalNumToStat) + + ' o Readdir (non-overlapping) will be done on the full file tree by all client processes.\n') + return hostsList, clientsPerHost + + +def SetGlobalPaths(opts): + mydir = os.path.dirname(os.path.realpath(__file__)) + Globals.MASTER_PATH = os.path.join(mydir, 'mstress.py') + + if opts.filesystem == 'kfs': + Globals.CLIENT_PATH = os.path.join(mydir, 'mstress_client') + Globals.SERVER_CMD = Globals.KFS_SERVER_CMD + Globals.SERVER_KEYWORD = Globals.KFS_SERVER_KEYWORD + elif opts.filesystem == 'hdfs': + hdfsjars = commands.getoutput("echo %s/mstress_hdfs_client_jars/*.jar | sed 's/ /:/g'" % mydir) + Globals.CLIENT_PATH = 'java -Xmx256m -cp %s:%s MStress_Client' % (mydir,hdfsjars) + Globals.SERVER_CMD = Globals.HDFS_SERVER_CMD + Globals.SERVER_KEYWORD = Globals.HDFS_SERVER_KEYWORD + else: + sys.exit('Invalid filesystem option') + +def CreateLock(opts): + if opts.mode != 'master': + return + if os.path.exists(Globals.MSTRESS_LOCK): + sys.exit('Program already running. 
Please wait till it finishes') + f = open(Globals.MSTRESS_LOCK, 'w') + f.write(str(os.getpid())) + f.close() + +def RemoveLock(opts): + if opts.mode != 'master': + return + if os.path.exists(Globals.MSTRESS_LOCK): + f = open(Globals.MSTRESS_LOCK, 'r') + pid = f.read() + f.close() + if int(pid) == os.getpid(): + os.unlink(Globals.MSTRESS_LOCK) + +def HandleSignal(signum, frame): + print "Received signal, %d" % signum + Globals.SIGNALLED = True + +def main(): + signal.signal(signal.SIGTERM, HandleSignal) + signal.signal(signal.SIGINT, HandleSignal) + signal.signal(signal.SIGHUP, HandleSignal) + + opts = ParseCommandline() + + SetGlobalPaths(opts) + + CreateLock(opts) + + try: + (hostsList,clientsPerHost) = ReadPlanFile(opts) + + if opts.mode == 'master': + RunMStressMaster(opts, hostsList) + else: + RunMStressSlave(opts, clientsPerHost) + finally: + RemoveLock(opts) + +if __name__ == '__main__': + main() + diff --git a/benchmarks/mstress/mstress_cleanup.py b/benchmarks/mstress/mstress_cleanup.py new file mode 100755 index 000000000..f197f4069 --- /dev/null +++ b/benchmarks/mstress/mstress_cleanup.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +# +# $Id$ +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This is a helper script to cleanup the planfile and the logs from all +# participating hosts. 
+# + +import optparse +import sys +import subprocess +import time +import os +import signal +import datetime +import commands + +if len(sys.argv) < 2 or sys.argv[1].startswith('-'): + print 'Usage: %s \nThis will cleanup the planfile and the logs from all participating hosts.' % sys.argv[0] + sys.exit(0) + +if not sys.argv[1].startswith('/tmp'): + print 'Planfile is typically in the /tmp directory. Are you sure?' + sys.exit(1) + +planFile = sys.argv[1] +hostsList = None +f = None + +try: + f = open(planFile, 'r') +except IOError, e: + print 'Planfile not found' + sys.exit(1) + +for line in f: + if line.startswith('#'): + continue + if line.startswith('hostslist='): + hostsList = line[len('hostslist='):].strip().split(',') + break +f.close() + +if len(hostsList) == 0: + print 'No hosts list found in plan file. Exiting.' + sys.exit(1) + +for host in hostsList: + cmd = 'ssh %s "rm -f %s*"' % (host, planFile) + print 'Executing "%s"' % cmd + print commands.getoutput(cmd) + +print 'Done' + diff --git a/benchmarks/mstress/mstress_client.cc b/benchmarks/mstress/mstress_client.cc new file mode 100644 index 000000000..61996f4b0 --- /dev/null +++ b/benchmarks/mstress/mstress_client.cc @@ -0,0 +1,582 @@ +/** + * $Id$ + * + * Author: Thilee Subramaniam + * + * Copyright 2012 Quantcast Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy + * of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * This C++ client performs filesystem meta opetarions on the KFS metaserver + * using kfsClient. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "KfsClient.h" + +FILE* logFile = stdout; + +#define TEST_BASE_DIR "/mstress" +#define COUNT_INCR 500 + +/* + This program is invoked with the following arguments: + - kfs server/port + - test name ('create', 'stat', or 'readdir') + - a planfile + - keys to read the planfile (hostname and process name) + + eg: For the following plan file, + --------------------------------------------- + #File or directory + type=file + #Number of levels in created tree + levels=2 + #Number of inodes per level + inodes=3 + #Number of random leaf paths to stat, per client + nstat=17 + --------------------------------------------- + if 'create' testname, '127.0.0.1' hostname, and 'proc_00' processname + 'PPP' path_prefix are given, then the created tree would be: + /mstress/127.0.0.1_proc_00/PPP_0/PPP_0 + /mstress/127.0.0.1_proc_00/PPP_0/PPP_1 + /mstress/127.0.0.1_proc_00/PPP_0/PPP_2 + /mstress/127.0.0.1_proc_00/PPP_1/PPP_0 + /mstress/127.0.0.1_proc_00/PPP_1/PPP_1 + /mstress/127.0.0.1_proc_00/PPP_1/PPP_2 + /mstress/127.0.0.1_proc_00/PPP_2/PPP_0 + /mstress/127.0.0.1_proc_00/PPP_2/PPP_1 + /mstress/127.0.0.1_proc_00/PPP_2/PPP_2 +*/ + + +//Global datastructure to hold various options. 
+struct Client { + static const size_t INITIAL_SIZE; + struct Path + { + char* actualPath_; + size_t actualSize_; + size_t len_; + + Path() : actualSize_(INITIAL_SIZE), len_(0) { + actualPath_ = (char*)calloc(actualSize_, 1); + } + + void Push(const char* leafStr) { + size_t leafLen = strlen(leafStr); + if (leafLen == 0) { + return; + } + if (len_ + 1 + leafLen + 1 > actualSize_) { + actualSize_ *= 2; + actualPath_ = (char*)realloc(actualPath_, actualSize_); + fprintf(logFile, "Reallocating to %zu bytes\n", actualSize_); + } + if (leafStr[0] != '/') { + strcpy(actualPath_ + len_, "/"); + len_ ++; + } + strcpy(actualPath_ + len_, leafStr); + len_ += leafLen; + } + + void Pop(const char* leafStr) { + size_t leafLen = strlen(leafStr); + if (leafLen > len_ - 1 || + strncmp(actualPath_ + len_ - leafLen, leafStr, leafLen)) { + fprintf(logFile, "Error in pop %s from %s\n", leafStr, actualPath_); + exit(0); + } + len_ -= leafLen + 1; + *(actualPath_ + len_) = 0; + } + + void Reset() { + actualPath_[0] = 0; + len_ = 0; + } + }; + + Path path_; + + //from commadline + string dfsServer_; + int dfsPort_; + string testName_; + string planfilePath_; + string prefix_; + size_t prefixLen_; + string hostName_; + string processName_; + + //from planfile + string type_; + int levels_; + int inodesPerLevel_; + int pathsToStat_; +}; +const size_t Client::INITIAL_SIZE = 1 << 12; + +//A simple AutoDoUndo class to delete kfsClient automatically. 
+class AutoCleanupKfsClient +{ +public: + AutoCleanupKfsClient(Client* client) : initialized(false) + { + kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_); + if (kfsClient) { + initialized = true; + } + } + ~AutoCleanupKfsClient() { + delete kfsClient; + } + bool IsInitialized() { return initialized; } + KFS::KfsClient* GetClient() { return kfsClient; } + +private: + KFS::KfsClient* kfsClient; + bool initialized; +}; + + +void Usage(const char* argv0) +{ + fprintf(logFile, "Usage: %s -s dfs-server -p dfs-port [-t [create|stat|readdir|delete] -a planfile-path -c host -n process-name -P path-prefix]\n", argv0); + fprintf(logFile, " -t: this option requires -a, -c, and -n options.\n"); + fprintf(logFile, " -P: the default value is PATH_.\n"); + fprintf(logFile, "eg:\n%s -s -p -t create -a -c localhost -n Proc_00\n", argv0); + exit(0); +} + +void hexout(char* str, int len) { + for (int i = 0; i < len; i++) { + printf("%02X ", str[i]); + } + printf("\n"); + for (int i = 0; i < len; i++) { + printf("%2c ", str[i] < 30 ? '.' : str[i]); + } + printf("\n"); +} + +void myitoa(int n, char* buf) +{ + static char result[32]; + snprintf(result, 32, "%d", n); + strcpy(buf, result); +} + +//Return a random permutation of numbers in [0..range). 
+void unique_random(vector& result, size_t range) +{ + result.resize(range); + srand(time(NULL)); + + for(size_t i=0; itestName_ = test_name; + client->dfsServer_ = dfs_server; + client->dfsPort_ = atoi(dfs_port); + client->planfilePath_ = planfile; + client->hostName_ = host; + client->processName_ = process_name; + if (!prefix) { + client->prefix_ = "PATH_"; + } else { + client->prefix_ = prefix; + } + client->prefixLen_ = client->prefix_.size(); + + fprintf(logFile, "server=%s\n", dfs_server); + fprintf(logFile, "port=%s\n", dfs_port); + fprintf(logFile, "planfile=%s\n", planfile); + fprintf(logFile, "host=%s\n", host); + fprintf(logFile, "process_name=%s\n", process_name); + fprintf(logFile, "test name=%s\n", test_name); +} + +//Reads the plan file and add the level information to distribution_ vector. +//Also set type_, prefix_, levels_, pathsToStat_ class variables. +void ParsePlanFile(Client* client) +{ + string line; + ifstream ifs(client->planfilePath_.c_str(), ifstream::in); + + while (ifs.good()) { + getline(ifs, line); + if (line.empty() || line[0] == '#') { + continue; + } + if (line.substr(0, 5) == "type=") { + client->type_ = line.substr(5); + continue; + } + if (line.substr(0, 7) == "levels=") { + client->levels_ = atoi(line.substr(7).c_str()); + continue; + } + if (line.substr(0, 7) == "inodes=") { + client->inodesPerLevel_ = atoi(line.substr(7).c_str()); + continue; + } + if (line.substr(0, 6) == "nstat=") { + client->pathsToStat_ = atoi(line.substr(6).c_str()); + continue; + } + } + ifs.close(); + if (client->prefix_.empty() || client->levels_ <= 0 && client->inodesPerLevel_ <= 0) { + fprintf(logFile, "Error parsing plan file\n"); + exit(-1); + } +} + +long TimeDiffMilliSec(struct timeval* alpha, struct timeval* zigma) +{ + long diff = 0; + diff += (zigma->tv_sec - alpha->tv_sec) * 1000; + diff += (zigma->tv_usec - alpha->tv_usec) / 1000; + return diff < 0 ? 
0 : diff; +} + + +int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* createdCount) +{ + + KFS::KfsClient* kfsClient = kfs->GetClient(); + int rc; + bool isLeaf = (level + 1 >= client->levels_); + bool isDir =isLeaf ? (client->type_ == "dir") : true; + char name[512]; + strncpy(name, client->prefix_.c_str(), 512); + for (int i = 0; i < client->inodesPerLevel_; i++) { + myitoa(i, name + client->prefixLen_); + client->path_.Push(name); + //hexout(client->path_.actualPath_, client->path_.len_ + 3); + + if (isDir) { + //fprintf(logFile, "Creating DIR [%s]\n", client->path_.actualPath_); + rc = kfsClient->Mkdir(client->path_.actualPath_); + (*createdCount)++; + if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) { + fprintf(logFile, "Created paths so far: %d\n", *createdCount); + } + if (!isLeaf) { + CreateDFSPaths(client, kfs, level+1, createdCount); + } + } else { + //fprintf(logFile, "Creating file [%s]\n", client->path_.actualPath_); + rc = kfsClient->Create(client->path_.actualPath_); + (*createdCount)++; + if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) { + fprintf(logFile, "Created paths so far: %d\n", *createdCount); + } + } + client->path_.Pop(name); + } +} + +int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs) +{ + KFS::KfsClient* kfsClient = kfs->GetClient(); + ostringstream os; + os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_; + int err = kfsClient->Mkdirs(os.str().c_str()); + //fprintf(logFile, "first mkdir err = %d\n", err); + if (err && err != EEXIST) { + fprintf(logFile, "Error: mkdir test base dir failed\n"); + exit(-1); + } + + int createdCount = 0; + struct timeval tvAlpha; + gettimeofday(&tvAlpha, NULL); + + client->path_.Reset(); + client->path_.Push(os.str().c_str()); + if (CreateDFSPaths(client, kfs, 0, &createdCount) < 0) { + fprintf(logFile, "Error: failed to create DFS paths\n"); + } + + struct timeval tvZigma; + gettimeofday(&tvZigma, NULL); + 
fprintf(logFile, "Client: %d paths created in %ld msec\n", createdCount, TimeDiffMilliSec(&tvAlpha, &tvZigma)); +} + +int StatDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { + KFS::KfsClient* kfsClient = kfs->GetClient(); + + ostringstream os; + os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_; + + srand(time(NULL)); + struct timeval tvAlpha; + gettimeofday(&tvAlpha, NULL); + + for (int count = 0; count < client->pathsToStat_; count++) { + client->path_.Reset(); + client->path_.Push(os.str().c_str()); + char name[4096]; + strncpy(name, client->prefix_.c_str(), client->prefixLen_); + + for (int d = 0; d < client->levels_; d++) { + int randIdx = rand() % client->inodesPerLevel_; + myitoa(randIdx, name + client->prefixLen_); + client->path_.Push(name); + //fprintf(logFile, "Stat: path now is %s\n", client->path_.actualPath_); + } + //fprintf(logFile, "Stat: doing stat on [%s]\n", client->path_.actualPath_); + + KFS::KfsFileAttr attr; + int err = kfsClient->Stat(os.str().c_str(), attr); + if (err) { + fprintf(logFile, "error doing stat on %s\n", os.str().c_str()); + } + + if (count > 0 && count % COUNT_INCR == 0) { + fprintf(logFile, "Stat paths so far: %d\n", count); + } + } + + struct timeval tvZigma; + gettimeofday(&tvZigma, NULL); + fprintf(logFile, "Client: Stat done on %d paths in %ld msec\n", client->pathsToStat_, TimeDiffMilliSec(&tvAlpha, &tvZigma)); + + return 0; +} + +int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { + KFS::KfsClient* kfsClient = kfs->GetClient(); + + srand(time(NULL)); + struct timeval tvAlpha; + gettimeofday(&tvAlpha, NULL); + int inodeCount = 0; + + queue pending; + ostringstream os; + os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_; + pending.push(os.str()); + + while (!pending.empty()) { + string parent = pending.front(); + pending.pop(); + //fprintf(logFile, "readdir on parent [%s]\n", parent.c_str()); + vector children; + int err = 
kfsClient->ReaddirPlus(parent.c_str(), children); + if (err) { + fprintf(logFile, "Error [err=%d] reading directory %s\n", err, parent.c_str()); + continue; + } + while (!children.empty()) { + string child = children.back().filename; + bool isDir = children.back().isDirectory; + children.pop_back(); + //fprintf(logFile, " Child = %s inodeCount=%d\n", child.c_str(), inodeCount); + if (child == "." || + child == "..") { + continue; + } + inodeCount ++; + if (isDir) { + string nextParent = parent + "/" + child; + pending.push(nextParent); + //fprintf(logFile, " Adding next parent [%s]\n", nextParent.c_str()); + } + if (inodeCount > 0 && inodeCount % COUNT_INCR == 0) { + fprintf(logFile, "Readdir paths so far: %d\n", inodeCount); + } + } + } + + struct timeval tvZigma; + gettimeofday(&tvZigma, NULL); + fprintf(logFile, "Client: Directory walk done over %d inodes in %ld msec\n", inodeCount, TimeDiffMilliSec(&tvAlpha, &tvZigma)); + return 0; +} + +int RemoveDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { + KFS::KfsClient* kfsClient = kfs->GetClient(); + KFS::KfsFileAttr attr; + + ostringstream os; + os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_; + + // get a list of leaf indices to remove. Note that this is different from the nodename suffix. + // eg: if we have 3 levels of 4 inodes, then there will be pow(4,3) = 64 leaf nodes. We are + // interested in a subset of indices in (0..63). This gets filled in 'leafIdxRangeForDel'. 
+ long double countLeaf = pow(client->inodesPerLevel_, client->levels_); + vector leafIdxRangeForDel; + unique_random(leafIdxRangeForDel, countLeaf); + fprintf(logFile, "To delete %d paths\n", leafIdxRangeForDel.size()); + + struct timeval tvAlpha; + gettimeofday(&tvAlpha, NULL); + bool isLeafDir = client->type_=="dir"; + + char sfx[32]; + string pathToDel; + string pathSoFar; + size_t idx = 0; + size_t pos = 0; + int delta = 0; + int lev = 0; + + while (!leafIdxRangeForDel.empty()) { + idx = leafIdxRangeForDel.back(); + leafIdxRangeForDel.pop_back(); + pathSoFar.clear(); + pos = 0; + delta = 0; + lev = 0; + while (lev < client->levels_) { + pos = idx / client->inodesPerLevel_; + delta = idx - (pos * client->inodesPerLevel_); + myitoa(delta, sfx); + if (pathSoFar.length()) { + pathSoFar = client->prefix_ + sfx + "/" + pathSoFar; + } else { + pathSoFar = client->prefix_ + sfx; + } + idx = pos; + lev ++; + } + + pathToDel = os.str() + "/" + pathSoFar; + fprintf(logFile, "Client: Deleting %s ...\n", pathToDel.c_str()); + if (isLeafDir) { + kfsClient->Rmdir(pathToDel.c_str()); + } else { + kfsClient->Remove(pathToDel.c_str()); + } + } + + kfsClient->RmdirsFast(os.str().c_str()); + + struct timeval tvZigma; + gettimeofday(&tvZigma, NULL); + fprintf(logFile, "Client: Deleted %s. Delete took %ld msec\n", os.str().c_str(), TimeDiffMilliSec(&tvAlpha, &tvZigma)); + + return 0; +} + + +int main(int argc, char* argv[]) +{ + Client client; + + parse_options(argc, argv, &client); + + AutoCleanupKfsClient kfs(&client); + if (!kfs.IsInitialized()) { + fprintf(logFile, "kfs client failed to initialize. 
exiting.\n"); + exit(-1); + } + + ParsePlanFile(&client); + + if (client.testName_ == "create") { + CreateDFSPaths(&client, &kfs); + } else if (client.testName_ == "stat") { + StatDFSPaths(&client, &kfs); + } else if (client.testName_ == "readdir") { + ListDFSPaths(&client, &kfs); + } else if (client.testName_ == "delete") { + RemoveDFSPaths(&client, &kfs); + } else { + fprintf(logFile, "Error: unrecognized test '%s'", client.testName_.c_str()); + return -1; + } + return 0; +} + diff --git a/benchmarks/mstress/mstress_initialize.sh b/benchmarks/mstress/mstress_initialize.sh new file mode 100755 index 000000000..3929dbcec --- /dev/null +++ b/benchmarks/mstress/mstress_initialize.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# The mstress runs stress tests on KFS metaserver and HDFS namenode. To be able +# to work with the namenode, the java HDFS client needs to be compiled and run +# with hadoop hdfs jars. +# +# To make this process uniform across mstress master and slaves, we copy the +# jars to a local directory and copy it around to all participating hosts at +# a fixed location. +# (Without this, each client would need to add entries to /etc/yum.repos.d/, +# and require root access to install hadoop client rpms). 
+# +# This script packages the jars locally on the build node, so that the +# 'mstress_prepare_clients.sh' script can copy it over to master and clients. +# +# Run this program with the path of hadoop client jars as argument (default +# /usr/lib/hadoop/client/), and it will create a "mstress_hdfs_client_jars" +# directory containing the jars. +# + +if [[ "$1" = -* ]] +then + echo "Usage: $0 [ path/to/hadoop/client/jars ]" + echo " This prepares the build environment on mstress build host" + echo " Default path is '/usr/lib/hadoop/client/'" + exit +fi + +JARS_SOURCE=${1:-"/usr/lib/hadoop/client/"} + +DIR="$( cd "$( dirname "$0" )" && pwd )" +JARS_TARGET=${DIR}/mstress_hdfs_client_jars + +if [ ! -d "$JARS_SOURCE" ] || [ -z "$(ls -A "$JARS_SOURCE"/*.jar)" ]; then + echo ""$JARS_SOURCE" is not a directory or does not have the jars." + exit 1 +fi + +if [ -d "$JARS_TARGET" ]; then + if [ "$(ls -A "$JARS_TARGET"/*.jar 2> /dev/null)" ]; then + echo ""$JARS_TARGET" already has the jars. Nothing to do." + exit 0 + fi +fi + +mkdir -p "$JARS_TARGET" +cp $JARS_SOURCE/*.jar "$JARS_TARGET" + +if [ $? -ne 0 ] +then + echo "Failed to copy jars." + exit 1 +fi + +echo "Hadoop client jars from $JARS_SOURCE copied to $JARS_TARGET." + +exit 0 + diff --git a/benchmarks/mstress/mstress_plan.py b/benchmarks/mstress/mstress_plan.py new file mode 100755 index 000000000..79043d514 --- /dev/null +++ b/benchmarks/mstress/mstress_plan.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python + +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This code is used to generate a plan file for metaserver vs namenode +# benchmarking. +# +import optparse +import sys +import subprocess +import time +import os +import math +import getpass + +""" +This program is used to create the directory/file layout to be used +in metaserver/namenode stress test. + +You basically specify the depth of the directory tree and the number +of elements (files or directories) per level, along with the list of +client-hosts you want to use and the number of clients per client-host +that you want to use. + +This script will generate the plan file, and copy it to the /tmp on the +given list of client hosts. + +Thereafter, you can execute the mstress.py with this plan file. +""" + +class Globals: + PATH_PREFIX = 'Dir_' + PLAN_OUTPUT = './planfile.txt' + +def ParseCommandline(): + epi = ('Example: "%s -c h1,h2 -n 3 -l 4 -i 3 -s 100" would create 4 levels of 3 inodes ' % sys.argv[0] + + '(3+9+27+81=120) per client process. Since there are 3 ' + + 'processes on 2 hosts, we create 120x6=720 inodes. We will attempt ' + + 'to stat 100 random leaf paths using all client processes. 
We will do a readdir ' + + 'all through the directory tree.') + + parser = optparse.OptionParser(epilog=epi) + + parser.add_option('-c', '--client-hosts', + action='store', + default='localhost', + type='string', + help='Comma-separated list of client host names.') + parser.add_option('-n', '--clients-per-host', + action='store', + default=1, + type='int', + help='Number of clients per client host.') + parser.add_option('-l', '--levels', + action='store', + default=1, + type='int', + help='File-tree depth on each client.') + parser.add_option('-i', '--inodes-per-level', + action='store', + default=100, + type='int', + help='Inodes per each level on each client.') + parser.add_option('-t', '--path-type', + action='store', + default='dir', + type='string', + help='Whether to create "dir" or "file" inodes.') + parser.add_option('-s', '--num-to-stat', + action='store', + default=100, + type='int', + help='Number of inodes to stat (<=total leaf inodes).') + parser.add_option('-o', '--output-file', + action='store', + default=None, + type='string', + help='Output plan file.') + + opts, args = parser.parse_args() + if args: + sys.exit('Unexpected arguments: %s.' 
% str(args)) + + if opts.output_file is None: + opts.output_file = '/tmp/mstress_%s_%s.plan' % (getpass.getuser(), time.strftime("%F-%H-%M-%S", time.gmtime())) + + return opts + + +def main(): + opts = ParseCommandline() + hostlist = opts.client_hosts.split(',') + + numClientProcesses = float(len(hostlist) * opts.clients_per_host) + if numClientProcesses == 0.0: + sys.exit('Invalid client processes') + + #get the smallest number larger than 'opts.num_to_stat' that is a multiple of opts.num_to_stat + statPerClient = int(math.ceil(float(opts.num_to_stat) / numClientProcesses)) + + #print opts + outfile = open(opts.output_file, 'w') + outfile.write('# *** DO NOT EDIT THIS FILE BY HAND *** \n# USE mstress_plan.py TO MODIFY INSTEAD\n#\n') + outfile.write('#List of hosts taking part in the plan\nhostslist=%s\n' % opts.client_hosts) + outfile.write('#Number of mstress cliends per client host\nclientsperhost=%d\n' % opts.clients_per_host) + outfile.write('#File or directory\ntype=%s\n' % opts.path_type) + outfile.write('#Number of levels in created tree\nlevels=%d\n' % opts.levels) + outfile.write('#Number of inodes per level\ninodes=%d\n' % opts.inodes_per_level) + outfile.write('#Number of random paths to stat, per client\nnstat=%d\n' % statPerClient) + + """ old code + begin_tree_delta = 0 + for level in range(0,opts.levels): + begin_tree_delta = begin_tree_delta + pow(opts.inodes_per_level, level + 1) + #print "delta = ", begin_tree_delta + + outfile.write('#host\tclient\tlevel\tdistribution\n') + begin_tree_idx = 0 + for host_no in range(0,len(hostlist)): + host = hostlist[host_no] + for client_no in range(0,opts.clients_per_host): + # tree for this level + begin_idx = begin_tree_idx + for level in range(0,opts.levels): + prefix = '%s\tproc_%02d\t%d\t' % (host, client_no, level) + # print '-- h=%d, c=%d level=%d, begin idx = %d' % (host_no, client_no, level, begin_idx) + suffix = '' + for ranges in range(0, pow(opts.inodes_per_level, level)): + if len(suffix) != 0: + 
suffix = suffix + ',' + suffix = suffix + '%d-%d'%(begin_idx, begin_idx + opts.inodes_per_level - 1) + begin_idx = begin_idx + opts.inodes_per_level + outfile.write('%s\t%s\n' % (prefix, suffix)) + begin_tree_idx = begin_tree_idx + begin_tree_delta + #print "next begin tree idx = ", begin_tree_idx + """ + + outfile.close() + print '==> Created planfile: %s' % opts.output_file + print 'copying file %s to all client hosts' % opts.output_file + for client in hostlist: + p = subprocess.Popen(['/usr/bin/scp', os.path.abspath(opts.output_file), '%s:%s' % (client, opts.output_file)]) + while 1: + ret = p.poll() + if ret == None: + time.sleep(0.5) + else: + print 'transfered %s to %s' % (opts.output_file, client) + break + +if __name__ == '__main__': + main() + diff --git a/benchmarks/mstress/mstress_prepare_master_clients.sh b/benchmarks/mstress/mstress_prepare_master_clients.sh new file mode 100755 index 000000000..a561a5cc9 --- /dev/null +++ b/benchmarks/mstress/mstress_prepare_master_clients.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# To run mstress, the participating client hosts and the master host should all +# have the mstress files in the same path. +# +# This script, run with a comma-separated list of hostnames, will copy the +# tar + gz bundle of the mstress directory to the home directory of the hosts +# and untar + unzip them for usage. 
+# + +if [ -z $1 ] || [[ "$1" = -* ]] +then + echo "Usage: $0 " + echo " This copies the mstress bundle to master and client hosts." + exit +fi + +which tar &>/dev/null +if [ $? -ne 0 ] +then + echo "tar command not found." + exit 1 +fi + +script_dir=$(dirname "$0") + +cd $script_dir/.. && tar cvfz mstress.tgz mstress +if [ $? -ne 0 ] +then + echo "failed to create archive." + cd - + exit 1 +fi + +cd - +for v in `echo "$@"|sed 's/,/ /g'` +do + ssh $v "rm -rf ~/mstress*" + scp $script_dir/../mstress.tgz $v:~ + ssh $v "tar xvfz mstress.tgz" +done + diff --git a/benchmarks/mstress/mstress_run.py b/benchmarks/mstress/mstress_run.py new file mode 100755 index 000000000..dfdfc62ea --- /dev/null +++ b/benchmarks/mstress/mstress_run.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python + +# +# $Id$ +# +# Author: Ying Zheng +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This is essentially a wrapper around mstress_plan.py and mstress.py. 
+# + +import optparse +import sys +import subprocess +import time +import os +import datetime +import commands +import resource +import getpass +import re + +class Params: + TARGETS = [] + CLIENT_HOSTS = '' + CLIENTS_PER_HOST = 5 + PATH_TYPE = "dir" + PATH_LEVELS = 3 + INODES_PER_LEVEL = 16 + def NumFiles2Stat(): + return Params.INODES_PER_LEVEL**Params.PATH_LEVELS*Params.CLIENTS_PER_HOST*len(Params.CLIENT_HOSTS.split(","))/2 + NumFiles2Stat = staticmethod(NumFiles2Stat) + +def Usage(): + print 'Usage: %s [clients] [fs_type,fs_host,fs_port] [fs_type,fs_host,fs_port]..' % sys.argv[0] + print ' clients: comma separated list of client host names' + print ' fs_type: kfs or hdfs' + print ' fs_host: metaserver or namenode hostname' + print ' fs_port: metaserver or namenode port' + print 'Eg: %s 10.15.20.25,10.20.25.30 kfs,10.10.10.10,10000 hdfs,20.20.20.20,20000' + sys.exit(0) + + +def PrintMsg(m): + print '\033[34m' + m + '\033[0m' + + +def MakePlan(): + plan_file = '/tmp/mstress_%s_%s.plan' % (getpass.getuser(), time.strftime("%F-%H-%M-%S", time.gmtime())) + PrintMsg("Preparing benchmark plan [%s] ..." % plan_file) + subprocess.call(["./mstress_plan.py", + "--client-hosts", Params.CLIENT_HOSTS, + "--clients-per-host", str(Params.CLIENTS_PER_HOST), + "--path-type", Params.PATH_TYPE, + "--levels", str(Params.PATH_LEVELS), + "--inodes-per-level", str(Params.INODES_PER_LEVEL), + "--num-to-stat", str(Params.NumFiles2Stat()), + "-o", plan_file]) + return plan_file + + +def RunBenchmark(plan_file): + for t in Params.TARGETS: + type = t[0] + server = t[1] + port = t[2] + result = Execute(type, + ["./mstress.py", + "-f", type, + "-s", server, + "-p", port, + "-a", plan_file]) + PrintResult(type, result) + + +def Execute(type, args): + os.putenv("PYTHONUNBUFFERED","TRUE"); + PrintMsg("\n==========================================\nStarting benchmark for '%s'..." 
% type) + + result = "" + proc = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + while proc.poll() == None: + output = proc.stdout.read(1) + result += output + sys.stdout.write(output) + sys.stdout.flush() + + output = proc.stdout.read() + result += output + sys.stdout.write(output) + sys.stdout.flush() + proc.wait() + + return result + + +def PrintResult(type, result): + PrintMsg("\nBenchmark results for '%s':" % type) + for m in re.findall(r"(\w+) test took (\S+) sec",result): + PrintMsg("%-10s: %s sec"%(m[0],m[1])) + PrintMsg("\n%s\n==========================================" % + re.search(r"Memory usage .*$", result, re.MULTILINE).group(0)) + + +def ParseArgs(): + argc = len(sys.argv) + if argc <= 1 or sys.argv[1].startswith('-') or argc < 3: + Usage() + + Params.CLIENT_HOSTS = sys.argv[1].strip() + + triple = sys.argv[2].strip().split(',') + if len(triple) != 3 or triple[0] not in ('kfs', 'hdfs'): + Usage() + Params.TARGETS.append(triple) + + if argc > 3: + triple = sys.argv[3].strip().split(',') + if len(triple) != 3 or triple[0] not in ('kfs', 'hdfs'): + Usage() + Params.TARGETS.append(triple) + + +def main(): + ParseArgs() + plan_file = MakePlan() + RunBenchmark(plan_file) + +if __name__ == '__main__': + main() + diff --git a/benchmarks/mstress/mstress_sample_run.sh b/benchmarks/mstress/mstress_sample_run.sh new file mode 100755 index 000000000..573a32323 --- /dev/null +++ b/benchmarks/mstress/mstress_sample_run.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This is a single-button command of the benchmarking process. You just give +# it the metaserver (perhaps run by example/sampleservers/sample_setup.py) and +# the namenode (see README for how to install and run namenode), and this will +# package the jars, compile the sources, setup the clients on localhost, and +# run the tests. +# + +SERVER_ARGS="" + +function usage() { + echo "Usage: $0 [-k host:port] [-h host:port] + -k: KFS metaserver host and port. + -h: HDFS namenode host and port." >&2 +} + +while getopts 'k:h:' OPTION + do + case $OPTION in + k) + KFS_HOST=$(echo "$OPTARG" | cut -d ":" -f 1) + KFS_PORT=$(echo "$OPTARG" | cut -d ":" -f 2) + SERVER_ARGS=$SERVER_ARGS" kfs,"$KFS_HOST","$KFS_PORT"" + ;; + h) + HDFS_HOST=$(echo "$OPTARG" | cut -d ":" -f 1) + HDFS_PORT=$(echo "$OPTARG" | cut -d ":" -f 2) + SERVER_ARGS=$SERVER_ARGS" hdfs,"$HDFS_HOST","$HDFS_PORT"" + ;; + ?) + usage + exit 1 + ;; + esac +done + +shift $(($OPTIND - 1)) +if [ $# -ne 0 ]; then + usage + exit 2 +fi + +[ -z "$SERVER_ARGS" ] && usage && exit 0 + +[ -z "$JAVA_HOME" ] && echo "Need JAVA_HOME to be set." && exit 1 + +./mstress_initialize.sh +[ $? -ne 0 ] && echo "Failed to prepare hdfs client jars. Please verify hadoop hdfs namenode is installed." && exit 1 + +make +[ $? -ne 0 ] && echo "Failed to compile mstress clients." && exit 1 + +./mstress_prepare_master_clients.sh localhost +[ $? -ne 0 ] && echo "Failed to prepare mstress on localhost." && exit 1 + +pushd ~/mstress + +echo "Running ./mstress_run.py localhost ${SERVER_ARGS} .." 
+./mstress_run.py localhost ${SERVER_ARGS} +ret=$? +popd + +[ $ret -ne 0 ] && echo "Failed to run benchmark on localhost." && exit 1 + +exit 0 diff --git a/build.xml b/build.xml new file mode 100644 index 000000000..3b0b7be9e --- /dev/null +++ b/build.xml @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + + + +
diff --git a/examples/README b/examples/README new file mode 100644 index 000000000..db820693d --- /dev/null +++ b/examples/README @@ -0,0 +1,64 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# + + + +This directory contains the following contents: + +1. cc/kfssample_main.cc + cc/CMakeLists.txt + The C++ client to access KFS. The binary gets compiled as part of kfs build + (described in 'doc' directory at top of source dir), and the sample gets + copied to the /bin/examples directory where is the selected + build directory. + +2. java/KfsSample.java + The Java client to access KFS. + To compile: + javac -classpath ../../build/kfs-*.jar KfsSample.java + To run, assuming that make install was performed in ../../build/release, + or top level make was invoked with no or default target: + libdir="`cd ../../build/release/lib && pwd`" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${libdir}" + export LD_LIBRARY_PATH + kfsjar=`echo ../../build/kfs-*.jar` + java -Djava.library.path="$libdir" -classpath ".:$kfsjar" KfsSample 127.0.0.1 20000 + +3. python/kfssample.py + python/kfssample.cfg + Python support is at experimental stage now. Nevertheless, one can run this + example to access KFS via python. The DeveloperDoc in the top-level 'doc' + directory explains how to build and install the python KFS extension module. 
+ To run the kfssample.py, please set your PYTHONPATH and LD_LIBRARY_PATH + accordingly and point it to a KFS metaserver via the kfssample.cfg. + +4. sampleservers + Directory with a python script to bring up a simple KFS server locally. + + Once you build the C++ binaries (see 'doc' directory at top of source dir) + point the sampleservers/sample_setup.py script to a config file and the + build paths to start the servers. + + A sample config file (sampleservers/sample_setup.cfg file) is provided that + starts up a metaserver, two chunk servers, and a webserver to monitor the + servers. Please refer to the comments/Usage of sampleservers/sample_setup.py. + diff --git a/examples/cc/CMakeLists.txt b/examples/cc/CMakeLists.txt new file mode 100644 index 000000000..30aa979ea --- /dev/null +++ b/examples/cc/CMakeLists.txt @@ -0,0 +1,40 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# Build the C++ example binary +# + +add_executable (kfssample kfssample_main.cc) + +if (USE_STATIC_LIB_LINKAGE) + target_link_libraries (kfssample kfsClient qcdio pthread) + add_dependencies (kfssample kfsClient qcdio) +else (USE_STATIC_LIB_LINKAGE) + target_link_libraries (kfssample kfsClient-shared qcdio-shared pthread) + add_dependencies (kfssample kfsClient-shared qcdio-shared) +endif (USE_STATIC_LIB_LINKAGE) + +if (NOT APPLE) + target_link_libraries(kfssample rt) +endif (NOT APPLE) + +install (TARGETS kfssample + RUNTIME DESTINATION bin/examples) diff --git a/examples/cc/kfssample_main.cc b/examples/cc/kfssample_main.cc new file mode 100644 index 000000000..1c98624f6 --- /dev/null +++ b/examples/cc/kfssample_main.cc @@ -0,0 +1,254 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2007/09/05 +// Author: Sriram Rao (Kosmix Corp.) +// +// Copyright 2012 Quantcast Corp. +// Copyright 2007 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A sample C++ program that demonstrates KFS APIs. To run +// this program, you need: +// - kfs client libraries to link with +// - a KFS deployment +// +// The cmake build system takes care of compiling this. Follow the DeveloperDoc +// in the top-level 'doc' directory for compile instructions. 
+// +// To see compile commandline (assuming KFS code is at ~/code/kfs, and it has +// already been compiled using instructions), +// cd ~/code/kfs/build/examples/cc && make VERBOSE=1 +// +//---------------------------------------------------------------------------- + +#include +#include +#include + +extern "C" { +#include +#include +#include +#include +#include +#include +} + +#include "libclient/KfsClient.h" +#include "libclient/KfsAttr.h" + +using std::cout; +using std::cerr; +using std::endl; +using std::ifstream; +using std::string; +using std::vector; + +KFS::KfsClient *gKfsClient; + +// generate sample data for testing +void generateData(char *buf, int numBytes); + +int +main(int argc, char **argv) +{ + string serverHost = ""; + int port = -1; + bool help = false; + char optchar; + + while ((optchar = getopt(argc, argv, "hp:s:")) != -1) { + switch (optchar) { + case 'p': + port = atoi(optarg); + break; + case 's': + serverHost = optarg; + break; + case 'h': + help = true; + break; + default: + cout << "Unrecognized flag " << optchar << endl; + help = true; + break; + } + } + + if (help || (serverHost == "") || (port < 0)) { + cout << "Usage: " << argv[0] << " -s -p " + << endl; + exit(0); + } + + // + // Get a handle to the KFS client object. This is our entry into + // the KFS namespace. + // + + gKfsClient = KFS::Connect(serverHost, port); + if (!gKfsClient) { + cerr << "kfs client failed to initialize...exiting" << "\n"; + exit(-1); + } + + // Make a directory /ctest + string baseDir = "ctest"; + int res = gKfsClient->Mkdirs(baseDir.c_str()); + if (res < 0 && res != -EEXIST) { + cout << "Mkdir failed: " << KFS::ErrorCodeToStr(res) << endl; + exit(-1); + } + + // What we just created better be a directory + if (!gKfsClient->IsDirectory(baseDir.c_str())) { + cout << "KFS doesn't think: " << baseDir << " is a dir!" 
<< endl; + exit(-1); + } + + // Create a simple file with default replication (at most 3) + string tempFilename = baseDir + "/foo.1"; + int fd; + + // fd is our file-handle to the file we are creating; this + // file handle should be used in subsequent I/O calls on + // the file. + if ((fd = gKfsClient->Create(tempFilename.c_str())) < 0) { + cout << "Create failed: " << KFS::ErrorCodeToStr(fd) << endl; + exit(-1); + } + + // Get the directory listings + vector entries; + + if ((res = gKfsClient->Readdir(baseDir.c_str(), entries)) < 0) { + cout << "Readdir failed! " << KFS::ErrorCodeToStr(res) << endl; + exit(-1); + } + + cout << "Read dir returned: " << endl; + for (size_t i = 0; i < entries.size(); i++) { + cout << entries[i] << endl; + } + + // write something to the file + int numBytes = 2048; + char *dataBuf = new char[numBytes]; + + generateData(dataBuf, numBytes); + + // make a copy and write out using the copy; we keep the original + // so we can validate what we get back is what we wrote. + char *copyBuf = new char[numBytes]; + memcpy(copyBuf, dataBuf, numBytes); + + res = gKfsClient->Write(fd, copyBuf, numBytes); + if (res != numBytes) { + cout << "Was able to write only: " << res << " instead of " << numBytes << endl; + } + + // flush out the changes + gKfsClient->Sync(fd); + + // Close the file-handle + gKfsClient->Close(fd); + + // Determine the file-size + KFS::KfsFileAttr fileAttr; + gKfsClient->Stat(tempFilename.c_str(), fileAttr); + long size = fileAttr.fileSize; + + if (size != numBytes) { + cout << "KFS thinks the file's size is: " << size << " instead of " << numBytes << endl; + } + + // rename the file + string newFilename = baseDir + "/foo.2"; + gKfsClient->Rename(tempFilename.c_str(), newFilename.c_str()); + + if (gKfsClient->Exists(tempFilename.c_str())) { + cout << tempFilename << " still exists after rename!" << endl; + exit(-1); + } + + // Re-create the file and try a rename that should fail... 
+ int fd1 = gKfsClient->Create(tempFilename.c_str()); + + if (!gKfsClient->Exists(tempFilename.c_str())) { + cout << " After rec-create..., " << tempFilename << " doesn't exist!" << endl; + exit(-1); + } + + gKfsClient->Close(fd1); + + // try to rename and don't allow overwrite + if (gKfsClient->Rename(newFilename.c_str(), tempFilename.c_str(), false) == 0) { + cout << "Rename with overwrite disabled succeeded...error!" << endl; + exit(-1); + } + + // Remove the file + gKfsClient->Remove(tempFilename.c_str()); + + // Re-open the file + if ((fd = gKfsClient->Open(newFilename.c_str(), O_RDWR)) < 0) { + cout << "Open on : " << newFilename << " failed!" << KFS::ErrorCodeToStr(fd) << endl; + exit(-1); + } + + // read some bytes + res = gKfsClient->Read(fd, copyBuf, 128); + + // Verify what we read matches what we wrote + for (int i = 0; i < 128; i++) { + if (dataBuf[i] != copyBuf[i]) { + cout << "Data mismatch at : " << i << endl; + } + } + + // seek to offset 40 + gKfsClient->Seek(fd, 40); + + // Seek and verify that we are we think we are + size = gKfsClient->Tell(fd); + if (size != 40) { + cout << "After seek, we are at: " << size << " should be at 40 " << endl; + } + + gKfsClient->Close(fd); + + // remove the file + gKfsClient->Remove(newFilename.c_str()); + + // remove the dir + if ((res = gKfsClient->Rmdir(baseDir.c_str()) < 0)) { + cout << "Unable to remove: " << baseDir << " : " << KFS::ErrorCodeToStr(res) << endl; + } else { + cout << "Tests passed!" << endl; + } +} + +void generateData(char *buf, int numBytes) +{ + int i; + + srand(100); + for (i = 0; i < numBytes; i++) { + buf[i] = (char) ('a' + (rand() % 26)); + } +} + diff --git a/examples/java/KfsSample.java b/examples/java/KfsSample.java new file mode 100644 index 000000000..aa5fb2257 --- /dev/null +++ b/examples/java/KfsSample.java @@ -0,0 +1,219 @@ +/** + * $Id$ + * + * Created 2007/08/25 + * Author: Sriram Rao + * + * Copyright 2012 Quantcast Corp. + * Copyright 2007 Kosmix Corp. 
+ * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * \brief A Sample Java program to access KFSAccess APIs. To run this program, + * you need: + * - kfs.jar in your CLASSPATH + * - libkfs_access.so in your LD_LIBRARY_PATH + * - a KFS deployment + * Eg: + * javac -cp ~/code/kfs/build/.jar KfsSample.java + * DYLD_LIBRARY_PATH=~/code/kfs/build/lib \ + * java -cp .:~/code/kfs/build/.jar KfsSample 172.16.59.127 20000 + */ + + +import java.io.*; +import java.net.*; +import java.util.Random; +import java.nio.ByteBuffer; +import com.quantcast.kfs.access.*; + +public class KfsSample +{ + public static void main(String args[]) { + if (args.length < 1) { + System.out.println("Usage: KfsTest "); + System.exit(1); + } + try { + // From the command line, get the location of the meta-server + int port = Integer.parseInt(args[1].trim()); + + // Initialize a KfsAccess object. The KfsAccess object is + // the glue code that gets us into KFS world. 
+ // + KfsAccess kfsAccess = new KfsAccess(args[0], port); + + String basedir = new String("jtest"); + + // Make a test directory where we can do something + if (!kfsAccess.kfs_exists(basedir)) { + if (kfsAccess.kfs_mkdirs(basedir) != 0) { + System.out.println("Unable to mkdir"); + System.exit(1); + } + } + + // What we just created better be a directory + if (!kfsAccess.kfs_isDirectory(basedir)) { + System.out.println("KFS doesn't think " + basedir + " is a dir!"); + System.exit(1); + + } + + // Create a simple file with default replication (at most 3) + String path = new String(basedir + "/foo.1"); + KfsOutputChannel outputChannel; + + // outputChannel implements a WriteableChannel interface; + // it is our handle to subsequent I/O on the file. + if ((outputChannel = kfsAccess.kfs_create(path)) == null) { + System.out.println("Unable to call create"); + System.exit(1); + } + + // Get the directory listings + String [] entries; + if ((entries = kfsAccess.kfs_readdir(basedir)) == null) { + System.out.println("Readdir failed"); + System.exit(1); + } + + System.out.println("Readdir returned: "); + for (int i = 0; i < entries.length; i++) { + System.out.println(entries[i]); + } + + // write something to the file + int numBytes = 2048; + char [] dataBuf = new char[numBytes]; + + generateData(dataBuf, numBytes); + + String s = new String(dataBuf); + byte[] buf = s.getBytes(); + + int res = outputChannel.write(ByteBuffer.wrap(buf, 0, buf.length)); + if (res != buf.length) { + System.out.println("Was able to write only: " + res); + } + + // flush out the changes + outputChannel.sync(); + + // Close the file-handle + outputChannel.close(); + + // Determine the file-size + long sz = kfsAccess.kfs_filesize(path); + + if (sz != buf.length) { + System.out.println("System thinks the file's size is: " + sz); + } + + // rename the file + String npath = new String(basedir + "/foo.2"); + kfsAccess.kfs_rename(path, npath); + + if (kfsAccess.kfs_exists(path)) { + 
System.out.println(path + " still exists after rename!"); + System.exit(1); + } + + KfsOutputChannel outputChannel1 = kfsAccess.kfs_create(path); + if (outputChannel1 != null) { + outputChannel1.close(); + } + + if (!kfsAccess.kfs_exists(path)) { + System.out.println(path + " doesn't exist"); + System.exit(1); + } + + // try to rename and don't allow overwrite + if (kfsAccess.kfs_rename(npath, path, false) == 0) { + System.out.println("Rename with overwrite disabled succeeded!"); + System.exit(1); + } + + // Remove the file + kfsAccess.kfs_remove(path); + + // Verify that it is gone + if (!kfsAccess.kfs_isFile(npath)) { + System.out.println(npath + " is not a normal file!"); + System.exit(1); + } + + // Re-open the file to read something. For reads/writes, + // Kfs provides a readable/writeable byte channel interface. + KfsInputChannel inputChannel = kfsAccess.kfs_open(npath); + if (inputChannel == null) { + System.out.println("open on " + npath + "failed!"); + System.exit(1); + } + + // read some bytes + buf = new byte[128]; + res = inputChannel.read(ByteBuffer.wrap(buf, 0, 128)); + + // Verify what we read matches what we wrote + s = new String(buf); + for (int i = 0; i < 128; i++) { + if (dataBuf[i] != s.charAt(i)) { + System.out.println("Data mismatch at char: " + i); + } + } + + // seek to offset 40. The KfsInputChannel allows seeking; + // this is an extension to the basic readablebytechannel api. 
+ inputChannel.seek(40); + + // Seek and verify that we are where we think we are + sz = inputChannel.tell(); + if (sz != 40) { + System.out.println("After seek, we are at: " + sz); + } + + inputChannel.close(); + + // remove the file + kfsAccess.kfs_remove(npath); + + // remove the dir + if (kfsAccess.kfs_rmdir(basedir) < 0) { + System.out.println("unable to remove: " + basedir); + System.exit(1); + } + System.out.println("All done...Test passed!"); + + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Unable to setup KfsAccess"); + System.exit(1); + } + } + + private static Random randGen = new Random(100); + + private static void generateData(char buf[], int numBytes) + { + int i; + + for (i = 0; i < numBytes; i++) { + buf[i] = (char) ('a' + (randGen.nextInt(26))); + } + } + +} diff --git a/examples/python/kfssample.cfg b/examples/python/kfssample.cfg new file mode 100644 index 000000000..ff9b43746 --- /dev/null +++ b/examples/python/kfssample.cfg @@ -0,0 +1,32 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# NOTE: The python support for KFS is EXPERIMENTAL at this stage. The +# python extension module has not been tested on large scale +# deployments yet. Please exercise caution while using the +# python wrapper. + +# This config file can be used by the python client to connect to the servers +# setup locally.
The metaserver host and port match the config file: +# examples/sampleservers/sample_setup.cfg + +metaServer.name = localhost +metaServer.port = 20000 diff --git a/examples/python/kfssample.py b/examples/python/kfssample.py new file mode 100644 index 000000000..5c5e74991 --- /dev/null +++ b/examples/python/kfssample.py @@ -0,0 +1,174 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + +# +# NOTE: The python support for KFS is EXPERIMENTAL at this stage. The +# python extension module has not been tested on large scale +# deployments yet. Please exercise caution while using the +# python module. + +""" +This simple test tries to create some files and directories, and write some +data at specific offsets in the created files. Then it tries to ensure that +the created paths are valid, and that the file contents are as expected. + +To run this script, + - Prepare kfs.so as described in the file 'doc/ClientDeveloperDoc' + - Ensure that the KFS metaserver and chunkserver are running. + - Ensure that the metaserver host/port matches the contents of argv[1]. + - Ensure that the PYTHONPATH and LD_LIBRARY_PATH are set accordingly.
+ eg: PYTHONPATH=${PYTHONPATH}:~/code/kfs/build/lib/lib64/python \ + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:~/code/kfs/build/lib \ + python ./kfssample.py kfssample.cfg +""" + +import os +import sys +import time +import errno + +import kfs + +def main(): + if len(sys.argv) < 2: + sys.exit('Usage: %s config_file' % sys.argv[0]) + + client = None + try: + client = kfs.client(sys.argv[1]) + except: + print "Unable to start the KFS client." + print "Make sure that the meta- and chunkservers are running." + sys.exit(1) + + testBaseDir = "kfssample_base" + testDirs = ("dir1", "dir2") + testFile1 = "dir1/file1" + testFile2 = "file2" + file1Content = "Cu populo nusquam alienum vim, graece latine prodesset ex qui, quo ea lucilius intellegat." + file2ContentA = { 0 : "are ", # at offset 0 + 40 : "you ", # at offset 40 + 1030 : "always ", + 1048580 : "wrong?" } + file2ContentB = { 500 : "really " } + + client.cd("/") + + try: # just in case we didn't cleanup last time + client.rmdirs(testBaseDir) + except IOError, err: + pass + + client.mkdir(testBaseDir) + client.cd(testBaseDir) + for td in testDirs: + client.mkdir(td) + time.sleep(1) + print "Created directories." + + client.cd("/" + testBaseDir) + f1 = client.create(testFile1, 2) + f2 = client.create(testFile2, 3) + + f1.write(file1Content) + for offset, content in file2ContentA.items(): + f2.seek(offset) + f2.write(content) + print "Created files." + + f1.sync() + f1.close() + f2.sync() + f2.close() + time.sleep(1) + print "Closed files (first time)." + + f1 = client.open(testFile1, 'r') + f2 = client.open(testFile2, 'w') + print "Opened files." + + for offset, content in file2ContentB.items(): + f2.seek(offset) + f2.write(content) + + f1.sync() + f1.close() + f2.sync() + f2.close() + time.sleep(1) + print "Closed files (second time)." + + # Verify if everything is fine. 
+ client.cd("/") + expected = ("dir1", "dir2", "file2") + for node in client.readdir(testBaseDir): + print node + if node in (".", ".."): + continue + if node not in expected: + sys.exit("%s is not in expected list %r" % (node, expected)) + + expected = ("file1") + for node in client.readdir(testBaseDir + "/dir1"): + print node + if node in (".", ".."): + continue + if node not in expected: + sys.exit("%s is not in expected list %r" % (node, expected)) + print "Created paths are in order." + + filePath1 = testBaseDir + "/" + testFile1 + filePath2 = testBaseDir + "/" + testFile2 + + print "Stat for %s is %r" % (filePath1, client.stat(filePath1)) + print "Stat for %s is %r" % (filePath2, client.stat(filePath2)) + + f1 = client.open(filePath1, 'r') + out = f1.read(2) + if (out != "Cu"): + sys.exit("Error: Expected 'Cu', got '%s'.", out) + f1.seek(31) + out = f1.read(6) + if (out != "graece"): + sys.exit("Error: Expected 'graece', got '%s'.", out) + pos = f1.tell() + if pos != 37: + sys.exit("Error: Expected 'pos = 37', got 'pos = %d'.", pos) + f1.close() + print "File1 contents are in order" + + f2 = client.open(filePath2, 'r') + f2.seek(1032) + out = f2.read(3) + if (out != "way"): + sys.exit("Error: Expected 'way', got '%s'.", out) + f2.seek(1048578) + out = f2.read(7) + if out[2:] != "wrong": + sys.exit("Error: Expected '..wrong', got '%r'.", out) + f2.close() + print "File2 contents are in order" + + client.rmdirs(testBaseDir) + +if __name__ == '__main__': + main() + diff --git a/examples/sampleservers/sample_setup.cfg b/examples/sampleservers/sample_setup.cfg new file mode 100644 index 000000000..226fe0de3 --- /dev/null +++ b/examples/sampleservers/sample_setup.cfg @@ -0,0 +1,50 @@ +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# This simple local setup is for one metaserver, two chunkservers that talk to +# the metaserver, and a webserver to monitor the KFS servers. +# +# Note the unique port numbers and the unique chunkserver directories for each +# chunkserver. + +[metaserver] +hostname = localhost +rundir = ~/qfsbase/meta +clientport = 20000 +chunkport = 20100 +clusterkey = myTestCluster + +[chunkserver1] +hostname = localhost +rundir = ~/qfsbase/chunk1 +chunkport = 21001 +space = 700m + +[chunkserver2] +hostname = localhost +rundir = ~/qfsbase/chunk2 +chunkport = 21002 +space = 500m + +[webui] +hostname = localhost +rundir = ~/qfsbase/web +webport = 22000 diff --git a/examples/sampleservers/sample_setup.py b/examples/sampleservers/sample_setup.py new file mode 100755 index 000000000..e89ce90d7 --- /dev/null +++ b/examples/sampleservers/sample_setup.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python + +# +# $Id$ +# +# Author: Thilee Subramaniam +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License.
+# +""" + +This script helps to set up a simple, local deployment of the KFS servers. One +can define the servers' configuration through a setup file (eg: sample_setup.cfg +below) and run this setup to install, start, stop, or uninstall the sample KFS +deployment. + +eg: +./sample_setup.py -c sample_setup.cfg -a install -b ~/code/kfs/build -s ~/code/kfs + -c: config file + -a: action (one of install, start, stop, uninstall) + -b: dist dir + -s: source dir + +Contents of sample_setup.cfg, that sets up a metaserver and two chunk servers. +--------------------------------- +[metaserver] +hostname = localhost +rundir = ~/kfsbase/meta +clientport = 20000 +chunkport = 20100 +clusterkey = myTestCluster + +[chunkserver1] +hostname = localhost +rundir = ~/kfsbase/chunk1 +chunkport = 21001 +space = 700m + +[chunkserver2] +hostname = localhost +rundir = ~/kfsbase/chunk2 +chunkport = 21002 +space = 500m + +[webui] +hostname = localhost +rundir = ~/kfsbase/web +webport = 22000 +--------------------------------- + +The script sets up the servers' config files as follows: + +meta-run-dir/checkpoints/ + /logs/ + /conf/MetaServer.prp + /metaserver.log + /metaserver.out + +chunk-run-dir/chunkserver1/chunks/ + /conf/ChunkServer.prp + /chunkserver.log + /chunkserver.out + /chunkserver2/chunks/ + /conf/ChunkServer.prp + /chunkserver.log + /chunkserver.out + +webui-run-dir/docroot/ + /conf/WebUI.cfg + /webui.log +""" + +import sys, os, os.path, shutil, errno, signal, posix, re +import ConfigParser +import subprocess + +from optparse import OptionParser, OptionGroup, IndentedHelpFormatter + +class Globals(): + METASERVER = None + CHUNKSERVER = None + WEBSERVER = None + +def get_size_in_bytes(str): + if not str: + return 0 + pos = 0 + while pos < len(str) and not str[pos].isalpha(): + pos = pos + 1 + if pos >= len(str): + return int(str) + val = int(str[0:pos]) + unit = str[pos] + mul = 1 + if unit in ('k', 'K'): + mul = 1000 + elif unit in ('m', 'M'): + mul = 1000000 + elif unit in ('g', 'G'): + mul =
1000000000 + return val * mul + +def check_binaries(releaseDir, sourceDir): + if not os.path.exists(releaseDir + '/bin/metaserver'): + sys.exit('Metaserver missing in build directory') + Globals.METASERVER = releaseDir + '/bin/metaserver' + + if not os.path.exists(releaseDir + '/bin/chunkserver'): + sys.exit('Chunkserver missing in build directory') + Globals.CHUNKSERVER = releaseDir + '/bin/chunkserver' + + if os.path.exists(releaseDir + '/webui/kfsstatus.py'): + Globals.WEBSERVER = releaseDir + '/webui/kfsstatus.py' + elif os.path.exists(sourceDir + '/webui/kfsstatus.py'): + Globals.WEBSERVER = sourceDir + '/webui/kfsstatus.py' + else: + sys.exit('Webserver missing in build and source directories') + print 'Binaries presence checking - OK.' + +def kill_running_program(binaryPath): + #print 'Trying to kill instances of [ %s ] ..' % binaryPath + + if binaryPath.find('kfsstatus') >= 0: + cmd = 'ps -ef | grep %s | grep -v grep | awk \'{print $2}\'' % binaryPath + res = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).communicate() + pid = res[0].strip() + if pid != '': + os.kill(int(pid), signal.SIGTERM) + return + + pids = subprocess.Popen(['pidof', binaryPath], stdout=subprocess.PIPE).communicate() + for pid in pids[0].strip().split(): + os.kill(int(pid), signal.SIGTERM) + +def run_command(cmd): + return subprocess.check_call(cmd, shell=True) + +# copy files & directories from src directory to dst directory. if dst does +# not exist, create it. if dst's children with same src children names exist +# then overwrite them. 
+def duplicate_tree(src, dst): + if os.path.exists(dst) and not os.path.isdir(dst): + sys.exit('Cannot duplicate directory to a non-directory') + + if not os.path.exists(dst): + os.makedirs(dst) + + for li in os.listdir(src): + srcPath = os.path.join(src, li) + dstPath = os.path.join(dst, li) + + if os.path.isdir(dstPath): + shutil.rmtree(dstPath) + else: + if os.path.exists(dstPath): + os.unlink(dstPath) + + if os.path.isdir(srcPath): + shutil.copytree(srcPath, dstPath) + else: + shutil.copyfile(srcPath, dstPath) + +def mkdir_p(dirname): + try: + os.makedirs(dirname) + except OSError, err: + if err.errno != errno.EEXIST: + sys.exit('Failed to create directory') + else: + if not os.path.isdir(dirname): + sys.exit('% exists, but is not a directory!' % dirname) + +def parse_command_line(): + action_keys = { 'install' : True, + 'start' : True, + 'stop' : True, + 'uninstall' : True } + + argv0Dir = os.path.dirname(sys.argv[0]) + + defaultConfig = os.path.join(argv0Dir, 'sample_setup.cfg') + defaultConfig = os.path.abspath(defaultConfig) + + defaultSrcDir = os.path.join(argv0Dir, '../..') + defaultSrcDir = os.path.abspath(defaultSrcDir) + + defaultRelDir = os.path.join(argv0Dir, '../../build/release') + defaultRelDir = os.path.abspath(defaultRelDir) + + if not os.path.exists(defaultRelDir): + defaultRelDir = os.path.join(argv0Dir, '../..') + + formatter = IndentedHelpFormatter(max_help_position=50, width=120) + usage = "usage: ./%prog [options] -a " + parser = OptionParser(usage, formatter=formatter, add_help_option=False) + + parser.add_option('-c', '--config-file', action='store', + default=defaultConfig, metavar='FILE', help='Setup config file.') + + parser.add_option('-a', '--action', action='store', default=None, + metavar='ACTION', help='One of install, uninstall, or stop.') + + parser.add_option('-b', '--release-dir', action='store', + default=defaultRelDir, metavar='DIR', help='QFS release directory.') + + parser.add_option('-s', '--source-dir', action='store', 
+ default=defaultSrcDir, metavar='DIR', help='QFS source directory.') + + parser.add_option('-h', '--help', action='store_true', + help="Print this help message and exit.") + + actions = """ +Actions: + install - setup meta and chunk server directories, restarting/starting them + start - start meta and chunk servers + stop - stop meta and chunk servers + uninstall - remove meta and chunk server directories after stopping them""" + + # an install sets up all config files and (re)starts the servers. + # an uninstall stops the servers and removes the config files. + # a stop stops the servers. + opts, args = parser.parse_args() + + if opts.help: + parser.print_help() + print actions + print + posix._exit(0) + + e = [] + if not os.path.isfile(opts.config_file): + e.append("specified 'config-file' does not exist: %s" % opts.config_file) + + if not opts.action: + e.append("'action' must be specified") + elif not action_keys.has_key(opts.action): + e.append("invalid 'action' specified: %s" % opts.action) + + if not os.path.isdir(opts.release_dir): + e.append("specified 'release-dir' does not exist: %s" % opts.release_dir) + + if not os.path.isdir(opts.source_dir): + e.append("specified 'source-dir' does not exist: %s" % opts.source_dir) + + if len(e) > 0: + parser.print_help() + print actions + print + for error in e: + print "*** %s" % error + print + posix._exit(1) + + return opts + +def do_cleanup(config, doUninstall): + if config.has_section('metaserver'): + metaDir = config.get('metaserver', 'rundir') + if metaDir: + kill_running_program(Globals.METASERVER) + if doUninstall and os.path.isdir(metaDir): + shutil.rmtree(metaDir) + + for section in config.sections(): + if section.startswith('chunkserver'): + chunkDir = config.get(section, 'rundir') + if chunkDir: + kill_running_program(Globals.CHUNKSERVER) + if doUninstall and os.path.isdir(chunkDir): + shutil.rmtree(chunkDir) + + if config.has_section('webui'): + webDir = config.get('webui', 'rundir') + if webDir: + 
kill_running_program(Globals.WEBSERVER) + if doUninstall and os.path.isdir(webDir): + shutil.rmtree(webDir) + if doUninstall: + print 'Uninstall - OK.' + else: + print 'Stop servers - OK.' + +def setup_directories(config): + if config.has_section('metaserver'): + metaDir = config.get('metaserver', 'rundir') + if metaDir: + mkdir_p(metaDir); + mkdir_p(metaDir + '/conf') + mkdir_p(metaDir + '/checkpoints') + mkdir_p(metaDir + '/logs') + + for section in config.sections(): + if section.startswith('chunkserver'): + chunkDir = config.get(section, 'rundir') + if chunkDir: + mkdir_p(chunkDir); + mkdir_p(chunkDir + '/conf') + mkdir_p(chunkDir + '/chunkdir_1') + mkdir_p(chunkDir + '/chunkdir_2') + mkdir_p(chunkDir + '/logs') + + if config.has_section('webui'): + webDir = config.get('webui', 'rundir') + if webDir: + mkdir_p(webDir); + mkdir_p(webDir + '/conf') + mkdir_p(webDir + '/docroot') + print 'Setup directories - OK.' + +def setup_config_files(config): + if 'metaserver' not in config.sections(): + sys.exit('Required metaserver section not found in config') + metaDir = config.get('metaserver', 'rundir') + if not metaDir: + sys.exit('Required metaserver rundir not found in config') + + metaserverHostname = config.get('metaserver', 'hostname') + metaserverClientPort = config.getint('metaserver', 'clientport') + metaserverChunkPort = config.getint('metaserver', 'chunkport') + clusterKey = config.get('metaserver', 'clusterkey') + + #metaserver + metaFile = open(metaDir + '/conf/MetaServer.prp', 'w') + print >> metaFile, "metaServer.clientPort = %d" % metaserverClientPort + print >> metaFile, "metaServer.chunkServerPort = %d" % metaserverChunkPort + print >> metaFile, "metaServer.clusterKey = %s" % clusterKey + print >> metaFile, "metaServer.cpDir = %s/checkpoints" % metaDir + print >> metaFile, "metaServer.logDir = %s/logs" % metaDir + print >> metaFile, "metaServer.createEmptyFs = 1" + print >> metaFile, "metaServer.recoveryInterval = 1" + print >> metaFile, 
"metaServer.msgLogWriter.logLevel = DEBUG" + print >> metaFile, "metaServer.msgLogWriter.maxLogFileSize = 1e6" + print >> metaFile, "metaServer.msgLogWriter.maxLogFiles = 10" + print >> metaFile, "metaServer.minChunkservers = 1" + print >> metaFile, "metaServer.clientThreadCount = 4" + print >> metaFile, "metaServer.rootDirUser = %d" % os.getuid() + print >> metaFile, "metaServer.rootDirGroup = %d" % os.getgid() + print >> metaFile, "metaServer.rootDirMode = 0777" + metaFile.close() + + # chunkservers + for section in config.sections(): + if section.startswith('chunkserver'): + chunkClientPort = config.getint(section, 'chunkport') + spaceStr = config.get(section, 'space') + chunkDir = config.get(section, 'rundir') + if chunkDir: + chunkFile = open(chunkDir + '/conf/ChunkServer.prp', 'w') + print >> chunkFile, "chunkServer.metaServer.hostname = %s" % metaserverHostname + print >> chunkFile, "chunkServer.metaServer.port = %d" % metaserverChunkPort + print >> chunkFile, "chunkServer.clientPort = %d" % chunkClientPort + print >> chunkFile, "chunkServer.clusterKey = %s" % clusterKey + print >> chunkFile, "chunkServer.rackId = 0" + print >> chunkFile, "chunkServer.chunkDir = %s/chunkdir_1 %s/chunkdir_2" % (chunkDir, chunkDir) + print >> chunkFile, "chunkServer.diskIo.crashOnError = 1" + print >> chunkFile, "chunkServer.abortOnChecksumMismatchFlag = 1" + print >> chunkFile, "chunkServer.msgLogWriter.logLevel = DEBUG" + print >> chunkFile, "chunkServer.msgLogWriter.maxLogFileSize = 1e6" + print >> chunkFile, "chunkServer.msgLogWriter.maxLogFiles = 2" + chunkFile.close() + + # webserver + if 'webui' not in config.sections(): + return + webDir = config.get('webui', 'rundir') + if not webDir: + return + webFile = open(webDir + '/conf/WebUI.cfg', 'w') + print >> webFile, "[webserver]" + print >> webFile, "webServer.metaserverHost = %s" % metaserverHostname + print >> webFile, "webServer.metaserverPort = %d" % metaserverClientPort + print >> webFile, "webServer.port = %d" % 
config.getint('webui', 'webport') + print >> webFile, "webServer.docRoot = %s/docroot" % webDir + print >> webFile, "webServer.allmachinesfn = /dev/null" + print >> webFile, "webServer.displayPorts = True" + print >> webFile, "[chunk]" + print >> webFile, "refreshInterval = 5" + print >> webFile, "currentSize = 30" + print >> webFile, "currentSpan = 10" + print >> webFile, "hourlySize = 30" + print >> webFile, "hourlySpan =120" + print >> webFile, "daylySize = 24" + print >> webFile, "daylySpan = 3600" + print >> webFile, "monthlySize = 30" + print >> webFile, "monthlySpan = 86400" + print >> webFile, "displayPorts = True" + print >> webFile, "predefinedHeaders = Buffer-req-wait-usec&D-Timer-overrun-count&D-Timer-overrun-sec&XMeta-server-location&Client-active&D-Buffer-req-denied-bytes&D-CPU-sys&D-CPU-user&D-Disk-read-bytes&D-Disk-read-count&D-Disk-write-bytes&D-Disk-write-count&Write-appenders&D-Disk-read-errors&D-Disk-write-errors" + webFile.close() + print 'Setup config files - OK.' + +def copy_files(config, sourceDir): + # currently, only the web CSS stuff need be copied. 
+ if 'webui' in config.sections(): + webDir = config.get('webui', 'rundir') + if webDir: + webDst = webDir + '/docroot' + webSrc = sourceDir + '/webui/files' + duplicate_tree(webSrc, webDst) + +def start_servers(config, whichServers = 'all'): + startMeta = whichServers in ('meta', 'all') + startChunk = whichServers in ('chunk', 'all') + startWeb = whichServers in ('web', 'all') + + errors = 0 + + if startMeta: + startWeb = True + kill_running_program(Globals.METASERVER) + metaDir = config.get('metaserver', 'rundir') + if metaDir: + metaConf = metaDir + '/conf/MetaServer.prp' + metaLog = metaDir + '/MetaServer.log' + metaOut = metaDir + '/MetaServer.out' + command = '%s -c %s %s > %s 2>&1 &' % ( + Globals.METASERVER, + metaConf, + metaLog, + metaOut) + if run_command(command) > 0: + print "*** metaserver failed to start" + errors += 1 + + if startChunk: + kill_running_program(Globals.CHUNKSERVER) + for section in config.sections(): + if section.startswith('chunkserver'): + chunkDir = config.get(section, 'rundir') + if chunkDir: + chunkConf = chunkDir + '/conf/ChunkServer.prp' + chunkLog = chunkDir + '/ChunkServer.log' + chunkOut = chunkDir + '/ChunkServer.out' + command = '%s %s %s > %s 2>&1 &' % ( + Globals.CHUNKSERVER, + chunkConf, + chunkLog, + chunkOut) + if run_command(command) > 0: + print "*** chunkserver failed to start" + errors += 1 + + if startWeb: + kill_running_program(Globals.WEBSERVER) + webDir = config.get('webui', 'rundir') + if webDir: + webConf = webDir + '/conf/WebUI.cfg' + webLog = webDir + '/webui.log' + command = '%s %s > %s 2>&1 &' % (Globals.WEBSERVER, webConf, webLog) + if run_command(command) > 0: + print "*** webserver failed to start" + errors += 1 + + if errors > 0: + print 'Started servers - FAILED.' + else: + print 'Started servers - OK.' 
+ +def parse_config(configFile): + config = ConfigParser.ConfigParser() + config.read(configFile); + for section in config.sections(): + dir = config.get(section, 'rundir') + config.set(section, 'rundir', os.path.expanduser(dir)) + return config + +if __name__ == '__main__': + opts = parse_command_line() + config = parse_config(opts.config_file) + + check_binaries(opts.release_dir, opts.source_dir) + + if opts.action in ('uninstall', 'stop'): + do_cleanup(config, opts.action == 'uninstall') + posix._exit(0) + + setup_directories(config) + setup_config_files(config) + copy_files(config, opts.source_dir) + start_servers(config) diff --git a/package/conf/ChunkServer.prp b/package/conf/ChunkServer.prp new file mode 100644 index 000000000..2aced28ad --- /dev/null +++ b/package/conf/ChunkServer.prp @@ -0,0 +1,30 @@ +# +# $Id$ +# +# Copyright 2008-2012 Quantcast Corp. +# +# Author: Sriram Rao (Quantcast Corp.) +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# Configuration for chunk server: +# meta server location +chunkServer.metaServer.hostname = localhost +chunkServer.metaServer.port = 30000 +# port to open for client connections +chunkServer.clientPort = 22000 +# Directory for storing the chunks +chunkServer.chunkDir = chunks + diff --git a/package/conf/KfsClient.prp b/package/conf/KfsClient.prp new file mode 100644 index 000000000..1d37209a5 --- /dev/null +++ b/package/conf/KfsClient.prp @@ -0,0 +1,24 @@ +# +# $Id$ +# +# Copyright 2008-2012 Quantcast Corp. +# +# Author: Sriram Rao (Quantcast Corp.) +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# Where is the metaserver +metaServer.name = localhost +metaServer.port = 20000 diff --git a/package/conf/MetaServer.prp b/package/conf/MetaServer.prp new file mode 100644 index 000000000..c9e67ae73 --- /dev/null +++ b/package/conf/MetaServer.prp @@ -0,0 +1,28 @@ +# +# $Id# +# +# Copyright 2008-2012 Quantcast Corp. +# +# Author: Sriram Rao (Quantcast Corp.) +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# Port at which clients can connect to us +metaServer.clientPort = 20000 +# Port at which chunkservers can connect to us +metaServer.chunkServerPort = 30000 +metaServer.logDir = meta/transaction_logs +metaServer.cpDir = meta/checkpoint +metaServer.createEmptyFs = 1 diff --git a/package/deb/.gitignore b/package/deb/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/package/rpm/README b/package/rpm/README new file mode 100644 index 000000000..f48e2780b --- /dev/null +++ b/package/rpm/README @@ -0,0 +1,35 @@ +# +# $Id$ +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# + +The specs directory contains the .spec files needed to build the RPMs for the different KFS components. + +Prerequisite: +Ensure that the $JAVA_HOME variable is set correctly, and the java headers are at $JAVA_HOME/include. +If the java include path is different, then modify the spec file by adding the following flag to cmake command. 
+ "cmake -D JAVA_INCLUDE_PATH=/java/path/include -D CMAKE_BUILD_TYPE=RelWithDebInfo ." + +To build the metaserver RPM: +rpmbuild -v -ba specs/kfs-metaserver.spec + +To build the chunkserver RPM: +rpmbuild -v -ba specs/kfs-chunkserver.spec + +To build the clients RPM: +rpmbuild -v -ba specs/kfs-clients.spec diff --git a/package/rpm/specs/qfs-chunkserver.spec b/package/rpm/specs/qfs-chunkserver.spec new file mode 100644 index 000000000..f9a6e873c --- /dev/null +++ b/package/rpm/specs/qfs-chunkserver.spec @@ -0,0 +1,75 @@ +# +# $Id$ +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# + + +%define debug_package %{nil} +%define debug_packages %{nil} + +Summary: QFS Chunk Server Package +Name: qfs-chunkserver +Version: 1.0.0 +Release: 0 +License: Apache +Group: Applications/Distributed +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +BuildRequires: boost + +%define _install_prefix /opt/qc/qfs/chunkserver + +%description +This package contains the Quantcast Distributed Filesystem chunk server. 
+ +%prep +cd %{_sourcedir} +if [ -d qfs ]; then + rm -rf qfs +fi +git clone https://github.com/quantcast/qfs.git + +%build +cd %{_sourcedir}/qfs +make release + +%install +rm -rf %{buildroot} +mkdir -p %{buildroot}%{_install_prefix}/bin +mkdir -p %{buildroot}%{_install_prefix}/sbin +mkdir -p %{buildroot}%{_install_prefix}/conf +install -m 755 %{_sourcedir}/qfs/build/release/bin/chunk* %{buildroot}%{_install_prefix}/bin +install -m 644 %{_sourcedir}/qfs/package/conf/ChunkServer.prp %{buildroot}%{_install_prefix}/conf + +%clean +rm -rf %{buildroot} +rm -rf %{_sourcedir}/qfs + +%pre + +%post + +%preun + +%files +%defattr(-,root,root,-) +%{_install_prefix}/* + +%postun + +%changelog diff --git a/package/rpm/specs/qfs-client.spec b/package/rpm/specs/qfs-client.spec new file mode 100644 index 000000000..8944244d7 --- /dev/null +++ b/package/rpm/specs/qfs-client.spec @@ -0,0 +1,78 @@ +# +# $Id$ +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# + + +%define debug_package %{nil} +%define debug_packages %{nil} + +Summary: QFS Client +Name: qfs-client +Version: 1.0.0 +Release: 0 +License: Apache +Group: Applications/Distributed +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +BuildRequires: boost + +%define _install_prefix /opt/qc/qfs/client + +%description +This package contains the Quantcast Distributed Filesystem client tools and libraries + +%prep +cd %{_sourcedir} +if [ -d qfs ]; then + rm -rf qfs +fi +git clone https://github.com/quantcast/qfs.git + +%build +cd %{_sourcedir}/qfs +make release + +%install +rm -rf %{buildroot} +mkdir -p %{buildroot}%{_install_prefix}/bin +mkdir -p %{buildroot}%{_install_prefix}/conf +mkdir -p %{buildroot}%{_install_prefix}/lib +mkdir -p %{buildroot}%{_install_prefix}/include +install -m 755 %{_sourcedir}/qfs/build/release/bin/tools/* %{buildroot}%{_install_prefix}/bin +install -m 644 %{_sourcedir}/qfs/package/conf/KfsClient.prp %{buildroot}%{_install_prefix}/conf +install -m 644 %{_sourcedir}/qfs/build/release/lib/lib* %{buildroot}%{_install_prefix}/lib +cp -a %{_sourcedir}/qfs/build/release/include %{buildroot}%{_install_prefix} + +%clean +rm -rf %{buildroot} +rm -rf %{_sourcedir}/qfs + +%pre + +%post + +%preun + +%files +%defattr(-,root,root,-) +%{_install_prefix}/* + +%postun + +%changelog diff --git a/package/rpm/specs/qfs-metaserver.spec b/package/rpm/specs/qfs-metaserver.spec new file mode 100644 index 000000000..f891e2f7f --- /dev/null +++ b/package/rpm/specs/qfs-metaserver.spec @@ -0,0 +1,78 @@ +# +# $Id$ +# +# Copyright 2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# + +%define debug_package %{nil} +%define debug_packages %{nil} + +Summary: QFS Meta Server Package +Name: qfs-metaserver +Version: 1.0.0 +Release: 0 +License: Apache +Group: Applications/Distributed +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +BuildRequires: boost + +%define _install_prefix /opt/qc/qfs/metaserver + +%description +This package contains the Quantcast Distributed Filesystem meta server. + +%prep +cd %{_sourcedir} +if [ -d qfs ]; then + rm -rf qfs +fi +git clone https://github.com/quantcast/qfs.git + +%build +cd %{_sourcedir}/qfs +make release + +%install +rm -rf %{buildroot} +mkdir -p %{buildroot}%{_install_prefix}/bin +mkdir -p %{buildroot}%{_install_prefix}/sbin +mkdir -p %{buildroot}%{_install_prefix}/conf +install -m 755 %{_sourcedir}/qfs/build/release/bin/metaserver %{buildroot}%{_install_prefix}/bin +install -m 755 %{_sourcedir}/qfs/build/release/bin/filelister %{buildroot}%{_install_prefix}/bin +install -m 755 %{_sourcedir}/qfs/build/release/bin/kfsfsck %{buildroot}%{_install_prefix}/bin +install -m 755 %{_sourcedir}/qfs/build/release/bin/logcompactor %{buildroot}%{_install_prefix}/bin +cp -a %{_sourcedir}/qfs/webui %{buildroot}%{_install_prefix}/ +install -m 644 %{_sourcedir}/qfs/package/conf/MetaServer.prp %{buildroot}%{_install_prefix}/conf + +%clean +rm -rf %{buildroot} +rm -rf %{_sourcedir}/qfs + +%pre + +%post + +%preun + +%files +%defattr(-,root,root,-) +%{_install_prefix}/* + +%postun + +%changelog diff --git a/scripts/kfsprune.py b/scripts/kfsprune.py new file mode 100755 index 000000000..b5d6401b3 --- 
/dev/null +++ b/scripts/kfsprune.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# +# $Id$ +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# \file kfsprune.py +# \brief KFS log and checkpoint housekeeping +# +# Housekeeping script to clean up the KFS checkpoint and log files, +# which would otherwise accumulate without limit. Looks for all +# files of the form . in a specified directory and removes +# all but a subset of them according to the rules +# +# - maintain at least the specified minimum number +# - keep more if necessary to cover the specified time span +# - but don't exceed the specified maximum +# +# Two instances of this script should be run periodically from the +# main cleanup script, one for the checkpoints and the other for the +# logs. +# +# Note that the decision to measure the relative age of the files +# by their sequence numbers rather than mtime is deliberate, since +# KFS controls the numbers whereas time is controlled externally +# and can go backwards. +# +# Old files can either be deleted or compressed via gzip. 
+# +import os +import sys +import glob +import stat +import time +import getopt +import gzip +import re + +def age(file): + """return age of file (last mtime) in seconds""" + + now = time.time() + return now - os.stat(file)[stat.ST_MTIME] + +def isNonCompressedFile (filename): + if filename.find(".tmp") >= 0: + return False + extn = os.path.splitext(filename)[1][1:] + if re.match("^\d+$", extn): + return True + return False + +def oldfiles(prefix, minsave, maxsave, mintime): + """return a list of the oldest files of a given type + + Look up all files of the form "prefix.*" in the current + directory and determine which ones are candidates for + deletion. The rules for deciding this are as follows: + + the part of the file after the dot is assumed to be a + monotonically increasing integer, which determines the + relative age of the files; + + at least "minsave" files are to be preserved; + + the youngest removed file should have an mtime at least + "mintime" seconds in the past; we will preserve up to + "maxsave" files to try to satisfy this condition. 
+ + """ + # get all candidates and rearrange by sequence number + files = filter(isNonCompressedFile, glob.glob(prefix + ".*")) + tails = [int(os.path.splitext(f)[1][1:]) for f in files] + tails.sort() + files = ["%s.%d" % (prefix, t) for t in tails] + + # trim off the minimum number + files = files[:-minsave] + + # trim extras required by time constraint + saved = minsave + while len(files) != 0 and saved < maxsave and age(files[-1]) < mintime: + del files[-1] + saved += 1 + + return files + +def compressFiles(oldones): + """ Compress a list of files using gzip """ + for fn in oldones: + f = open(fn, 'rb') + cf = gzip.open(fn + '.gz', 'wb') + while 1: + data = f.read(4096) + if data == "": + break + cf.write(data) + f.close() + cf.close() + +def prunefiles(dir, prefix, minsave, maxsave, mintime, compress): + """remove/compress all sufficiently old files from directory "dir" + + Change directory to "dir", find old files, and delete/compress them; + see oldfiles above for an explanation of the parameters + + """ + os.chdir(dir); + oldones = oldfiles(prefix, minsave, maxsave, mintime) + if compress > 0: + compressFiles(oldones) + for f in oldones: + os.remove(f) + +if (__name__ == "__main__"): + minsave, maxsave, mintime = 10, 100, 3600 + + (opts, args) = getopt.getopt(sys.argv[1:], "m:M:t:z", + ["min=", "max=", "time=", "compress"]) + + compress = 0 + for (o, a) in opts: + if o in ("-m", "--min"): + minsave = int(a) + elif o in ("-M", "--max"): + maxsave = int(a) + elif o in ("-t", "--time"): + mintime = int(a) + elif o in ("-z", "--compress"): + compress = 1 + + if maxsave < 0 or minsave < 0 or mintime < 0 or maxsave < minsave: + raise getopt.GetoptError, "invalid options" + + if len(args) != 2: + raise getopt.GetoptError, "missing arguments" + + prunefiles(args[0], args[1], minsave, maxsave, mintime, compress) diff --git a/scripts/metalogprune.py b/scripts/metalogprune.py new file mode 100755 index 000000000..e2ebd58dc --- /dev/null +++ b/scripts/metalogprune.py 
@@ -0,0 +1,82 @@ +#!/usr/bin/env python +# +# $Id: metalogprune.py 117 2008-08-05 20:51:44Z sriramsrao $ +# +# Copyright 2008 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# \file metalogprune.py +# \brief KFS log and checkpoint housekeeping +# +# We gzip and keep all the old log files around. This is a bit of an +# overkill---we can prune away files that are no longer referenced by +# any checkpoint. We find the oldest checkpoint file and find that +# the log it references; files older than that log file are deleted. +# +import os +import sys +import glob +import stat +import time +import getopt +import gzip + +def age(file): + """return age of file (last mtime) in seconds""" + + now = time.time() + return now - os.stat(file)[stat.ST_MTIME] + +def orderByAge(this, that): + if age(this) > age(that): + return this + return that + +def olderThanLog(logfile, lognum): + """Return True if logfile which is of the form log.# has a + sequence number less than lognum""" + (base, extn) = os.path.splitext(logfile) + extn = extn[1:] + if extn == 'gz': + val = int(os.path.splitext(base)[1][1:]) + else: + val = int(extn) + return val < lognum + +def prunefiles(cpdir, logdir): + """Find the log file that is referenced by the oldest CP file. 
+ Log files that are older than that one can be deleted.""" + oldest = reduce(orderByAge, glob.glob(cpdir + '/chkpt.*')) + if oldest is None: + return + print "Oldest cp: %s" % oldest + # get the log file + for l in open(oldest).xreadlines(): + if l.startswith('log/'): + lognum = int(os.path.splitext(l[4:])[1][1:]) + print lognum + alllogfiles = glob.glob(logdir + '/log.*') + oldones = [f for f in alllogfiles if olderThanLog(f, lognum)] + for f in oldones: + os.remove(f) + break + +if (__name__ == "__main__"): + if len(sys.argv) != 3: + raise getopt.GetoptError, "missing arguments" + + # kfscpdir, kfslogdir + prunefiles(sys.argv[1], sys.argv[2]) diff --git a/src/cc/access/CMakeLists.txt b/src/cc/access/CMakeLists.txt new file mode 100644 index 000000000..a360f85ca --- /dev/null +++ b/src/cc/access/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# + +# Take all the .cc files and build a library out of them +add_library (kfs_access SHARED kfs_access_jni.cc) +add_dependencies (kfs_access kfsClient-shared) +target_link_libraries (kfs_access kfsClient-shared) +if (NOT APPLE) + target_link_libraries(kfs_access rt) +endif (NOT APPLE) + +IF (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + # On Solaris, cmake uses gcc to link rather than g++. 
This + # prevents linking with libstdc++ which prevents Java from loading + # kfs_access. To fix, set the compiler to be C++ + set (CMAKE_C_COMPILER "${CMAKE_CXX_COMPILER}") +ENDIF (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +# +install (TARGETS kfs_access + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib-static) diff --git a/src/cc/access/kfs_access_jni.cc b/src/cc/access/kfs_access_jni.cc new file mode 100644 index 000000000..16a6b4c21 --- /dev/null +++ b/src/cc/access/kfs_access_jni.cc @@ -0,0 +1,1268 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2007/08/24 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2007-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief JNI code in C++ world for accesing KFS Client. 
+// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::vector; +using std::string; +using std::cout; +using std::endl; +using std::ostringstream; + +#include +#include "libclient/KfsClient.h" +using namespace KFS; + +extern "C" { + jlong Java_com_quantcast_kfs_access_KfsAccess_initF( + JNIEnv *jenv, jclass jcls, jstring jpath); + + jlong Java_com_quantcast_kfs_access_KfsAccess_initS( + JNIEnv *jenv, jclass jcls, jstring jmetaServerHost, jint metaServerPort); + + void Java_com_quantcast_kfs_access_KfsAccess_destroy( + JNIEnv *jenv, jclass jcls, jlong jptr); + + jint Java_com_quantcast_kfs_access_KfsAccess_cd( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint Java_com_quantcast_kfs_access_KfsAccess_mkdirs( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode); + + jint Java_com_quantcast_kfs_access_KfsAccess_rmdir( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint Java_com_quantcast_kfs_access_KfsAccess_compareChunkReplicas( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject stringbuffermd5); + + jint Java_com_quantcast_kfs_access_KfsAccess_rmdirs( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jobjectArray Java_com_quantcast_kfs_access_KfsAccess_readdirplus( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jobjectArray Java_com_quantcast_kfs_access_KfsAccess_readdir( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jpreloadattr); + + jint Java_com_quantcast_kfs_access_KfsAccess_remove( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint Java_com_quantcast_kfs_access_KfsAccess_rename( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring joldpath, jstring jnewpath, + jboolean joverwrite); + + jint Java_com_quantcast_kfs_access_KfsAccess_exists( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint 
Java_com_quantcast_kfs_access_KfsAccess_isFile( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint Java_com_quantcast_kfs_access_KfsAccess_isDirectory( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jlong Java_com_quantcast_kfs_access_KfsAccess_filesize( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jobjectArray Java_com_quantcast_kfs_access_KfsAccess_getDataLocation( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen); + + jshort Java_com_quantcast_kfs_access_KfsAccess_getReplication( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jshort Java_com_quantcast_kfs_access_KfsAccess_setReplication( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas); + + jlong Java_com_quantcast_kfs_access_KfsAccess_getModificationTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + jint Java_com_quantcast_kfs_access_KfsAccess_setModificationTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jmsec); + + jint Java_com_quantcast_kfs_access_KfsAccess_open( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring jmode, jint jnumReplicas, + jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, jint jcreateMode); + + jint Java_com_quantcast_kfs_access_KfsAccess_create( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas, jboolean jexclusive, + jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, + jboolean foreceType, jint mode); + + jlong Java_com_quantcast_kfs_access_KfsAccess_setDefaultIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize); + + jlong Java_com_quantcast_kfs_access_KfsAccess_getDefaultIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr); + + jlong Java_com_quantcast_kfs_access_KfsAccess_setDefaultReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize); + + jlong 
Java_com_quantcast_kfs_access_KfsAccess_getDefaultReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr); + + jlong Java_com_quantcast_kfs_access_KfsAccess_setIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize); + + jlong Java_com_quantcast_kfs_access_KfsAccess_getIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + jlong Java_com_quantcast_kfs_access_KfsAccess_setReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize); + + jlong Java_com_quantcast_kfs_access_KfsAccess_getReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + jint Java_com_quantcast_kfs_access_KfsAccess_getStripedType( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); + + void Java_com_quantcast_kfs_access_KfsAccess_setFileAttributeRevalidateTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jint secs); + + jint Java_com_quantcast_kfs_access_KfsAccess_chmod( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jint mode); + + jint Java_com_quantcast_kfs_access_KfsAccess_chmodr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jint mode); + + jint Java_com_quantcast_kfs_access_KfsAccess_fchmod( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jint mode); + + jint Java_com_quantcast_kfs_access_KfsAccess_chowns( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jstring user, jstring group); + + jint Java_com_quantcast_kfs_access_KfsAccess_chownsr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jstring user, jstring group); + + jint Java_com_quantcast_kfs_access_KfsAccess_chown( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jlong user, jlong group); + + jint Java_com_quantcast_kfs_access_KfsAccess_chownr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jlong user, jlong group); + + jint Java_com_quantcast_kfs_access_KfsAccess_fchowns( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jstring user, jstring group); + + jint Java_com_quantcast_kfs_access_KfsAccess_fchown( + JNIEnv 
*jenv, jclass jcls, jlong jptr, jint jfd, jlong user, jlong group); + + jint Java_com_quantcast_kfs_access_KfsAccess_setEUserAndEGroup( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong user, jlong group, jlongArray); + + jint Java_com_quantcast_kfs_access_KfsAccess_stat( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr); + + jstring Java_com_quantcast_kfs_access_KfsAccess_strerror( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr); + + jboolean Java_com_quantcast_kfs_access_KfsAccess_isnotfound( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr); + + jint Java_com_quantcast_kfs_access_KfsAccess_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + jlong Java_com_quantcast_kfs_access_KfsAccess_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset); + + jint Java_com_quantcast_kfs_access_KfsAccess_getUMask( + JNIEnv *jenv, jclass jcls, jlong jptr); + + jint Java_com_quantcast_kfs_access_KfsAccess_setUMask( + JNIEnv *jenv, jclass jcls, jlong jptr, jint umask); + + /* Input channel methods */ + jint Java_com_quantcast_kfs_access_KfsInputChannel_read( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); + + jint Java_com_quantcast_kfs_access_KfsInputChannel_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset); + + jint Java_com_quantcast_kfs_access_KfsInputChannel_tell( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + jint Java_com_quantcast_kfs_access_KfsInputChannel_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + /* Output channel methods */ + jint Java_com_quantcast_kfs_access_KfsOutputChannel_write( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); + + jint Java_com_quantcast_kfs_access_KfsOutputChannel_atomicRecordAppend( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); + + jint Java_com_quantcast_kfs_access_KfsOutputChannel_sync( + JNIEnv *jenv, jclass jcls, jlong jptr, 
jint jfd); + + jint Java_com_quantcast_kfs_access_KfsOutputChannel_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset); + + jint Java_com_quantcast_kfs_access_KfsOutputChannel_tell( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); + + jint Java_com_quantcast_kfs_access_KfsOutputChannel_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); +} + +namespace +{ + inline void setStr(string & dst, JNIEnv * jenv, jstring src) + { + char const * s = jenv->GetStringUTFChars(src, 0); + dst.assign(s); + jenv->ReleaseStringUTFChars(src, s); + } +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_initF( + JNIEnv *jenv, jclass jcls, jstring jpath) +{ + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = Connect(path.c_str()); + return (jlong) clnt; +} + +jint Java_com_quantcast_kfs_access_KfsAccess_compareChunkReplicas( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject stringbuffermd5) +{ + if (! jptr) { + return -EFAULT; + } + string path , md5Sum; + setStr(path, jenv, jpath); + + KfsClient* const clnt = (KfsClient *) jptr; + const int res = clnt->CompareChunkReplicas(path.c_str(), md5Sum); + if (res != 0) { + return res; + } + jcls = jenv->GetObjectClass(stringbuffermd5); + jmethodID mid = jenv->GetMethodID(jcls, "append", + "(Ljava/lang/String;)Ljava/lang/StringBuffer;"); + if(mid == 0) { + return -EFAULT; + } + jstring jstr = jenv->NewStringUTF(md5Sum.c_str()); + if (! 
jstr) { + return -EFAULT; + } + jenv->CallObjectMethod(stringbuffermd5, mid, jstr); + return res; +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_initS( + JNIEnv *jenv, jclass jcls, jstring jmetaServerHost, jint metaServerPort) +{ + string path; + setStr(path, jenv, jmetaServerHost); + KfsClient* const clnt = Connect(path, metaServerPort); + return (jlong) clnt; +} + +void Java_com_quantcast_kfs_access_KfsAccess_destroy( + JNIEnv *jenv, jclass jcls, jlong jptr) +{ + KfsClient* const clnt = (KfsClient*)jptr; + delete clnt; +} + +jint Java_com_quantcast_kfs_access_KfsAccess_cd( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Cd(path.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_mkdirs( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Mkdirs(path.c_str(), (kfsMode_t)mode); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_rmdir( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Rmdir(path.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_rmdirs( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Rmdirs(path.c_str()); +} + +jobjectArray Java_com_quantcast_kfs_access_KfsAccess_readdir( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jpreloadattr) +{ + if (! 
jptr) { + return 0; + } + string path; + setStr(path, jenv, jpath); + + KfsClient* const clnt = (KfsClient*)jptr; + vector fattr; + vector entries; + if ((jpreloadattr ? + clnt->ReaddirPlus(path.c_str(), fattr) : + clnt->Readdir(path.c_str(), entries)) != 0) { + return 0; + } + jclass jstrClass = jenv->FindClass("java/lang/String"); + if (! jstrClass) { + jclass excl = jenv->FindClass("java/lang/ClassNotFoundException"); + if (excl) { + jenv->ThrowNew(excl, 0); + } + return 0; + } + const jsize cnt = jpreloadattr ? fattr.size() : entries.size(); + jobjectArray jentries = jenv->NewObjectArray(cnt, jstrClass, 0); + if (! jentries) { + return 0; + } + for (jsize i = 0; i < cnt; i++) { + jstring s = jenv->NewStringUTF( + jpreloadattr ? fattr[i].filename.c_str() : entries[i].c_str()); + if (! s) { + return 0; + } + jenv->SetObjectArrayElement(jentries, i, s); + jenv->DeleteLocalRef(s); + } + return jentries; +} + +jint Java_com_quantcast_kfs_access_KfsAccess_open( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring jmode, + jint jnumReplicas, jint jnumStripes, jint jnumRecoveryStripes, + jint jstripeSize, jint jstripedType, jint jcreateMode) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path, mode; + int openMode = 0; + + setStr(path, jenv, jpath); + setStr(mode, jenv, jmode); + + if (mode == "opendir") + return clnt->OpenDirectory(path.c_str()); + + if (mode == "r") + openMode = O_RDONLY; + else if (mode == "rw") + openMode = O_RDWR | O_CREAT; + else if (mode == "w") + openMode = O_WRONLY | O_CREAT; + else if (mode == "a") + openMode = O_WRONLY | O_APPEND; + + return clnt->Open(path.c_str(), openMode, jnumReplicas, + jnumStripes, jnumRecoveryStripes, jstripeSize, jstripedType, jcreateMode); +} + +jint Java_com_quantcast_kfs_access_KfsInputChannel_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return clnt->Close(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return clnt->Close(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_create( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas, jboolean jexclusive, + jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, + jboolean foreceType, jint mode) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Create(path.c_str(), jnumReplicas, jexclusive, + jnumStripes, jnumRecoveryStripes, jstripeSize, jstripedType, foreceType, (kfsMode_t)mode); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_remove( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->Remove(path.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_rename( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring joldpath, + jstring jnewpath, jboolean joverwrite) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string opath, npath; + setStr(opath, jenv, joldpath); + setStr(npath, jenv, jnewpath); + + return clnt->Rename(opath.c_str(), npath.c_str(), joverwrite); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_setDefaultIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->SetDefaultIoBufferSize(jsize); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_getDefaultIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->GetDefaultIoBufferSize(); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_setDefaultReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->SetDefaultReadAheadSize(jsize); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_getDefaultReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->GetDefaultReadAheadSize(); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_setIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->SetIoBufferSize(jfd, jsize); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_getIoBufferSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->GetIoBufferSize(jfd); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_setReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->SetReadAheadSize(jfd, jsize); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_getReadAheadSize( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + return (jlong)clnt->GetReadAheadSize(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_getStripedType( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + const bool computeFilesize = false; + KfsFileAttr attr; + return (jint)(clnt->Stat(path.c_str(), attr, computeFilesize) != 0 ? + KFS_STRIPED_FILE_TYPE_UNKNOWN : attr.striperType); +} + +void Java_com_quantcast_kfs_access_KfsAccess_setFileAttributeRevalidateTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jint secs) +{ + if (! jptr) { + return; + } + KfsClient* const clnt = (KfsClient*)jptr; + clnt->SetFileAttributeRevalidateTime(secs); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chmod( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chmod(path.c_str(), (kfsMode_t)mode); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chmodr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->ChmodR(path.c_str(), (kfsMode_t)mode); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_fchmod( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jint mode) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chmod(jfd, (kfsMode_t)mode); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chowns( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring juser, jstring jgroup) +{ + if (! 
jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + string user; + string group; + if (juser) { + setStr(user, jenv, juser); + } + if (jgroup) { + setStr(group, jenv, jgroup); + } + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chown(path.c_str(), group.c_str(), user.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chownsr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring juser, jstring jgroup) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + string user; + string group; + if (juser) { + setStr(user, jenv, juser); + } + if (jgroup) { + setStr(group, jenv, jgroup); + } + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->ChownR(path.c_str(), group.c_str(), user.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chown( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chown(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chownr( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->ChownR(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_chownR( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) +{ + if (! jptr) { + return -EFAULT; + } + string path; + setStr(path, jenv, jpath); + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->ChownR(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_fchowns( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jstring juser, jstring jgroup) +{ + if (! 
jptr) { + return -EFAULT; + } + string user; + string group; + if (juser) { + setStr(user, jenv, juser); + } + if (jgroup) { + setStr(group, jenv, jgroup); + } + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chown(jfd, group.c_str(), user.c_str()); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_fchown( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong user, jlong group) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient *) jptr; + return clnt->Chown(jfd, (kfsUid_t)user, (kfsGid_t)group); +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_sync( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Sync(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_setEUserAndEGroup( + JNIEnv *jenv, jclass jcls, jlong jptr, jlong user, jlong group, jlongArray jgroups) +{ + if (! jptr) { + return -EFAULT; + } + kfsGid_t* groups = 0; + jsize cnt = 0; + if (jgroups) { + cnt = jenv->GetArrayLength(jgroups); + jlong* const jg = jenv->GetLongArrayElements(jgroups, 0); + groups = new kfsGid_t[cnt]; + for (jsize i = 0; i < cnt; i++) { + groups[i] = (kfsGid_t)jg[i]; + } + jenv->ReleaseLongArrayElements(jgroups, jg, 0); + } + KfsClient* const clnt = (KfsClient*)jptr; + const int ret = clnt->SetEUserAndEGroup( + (kfsUid_t)user, (kfsGid_t)group, groups, (int)cnt); + delete [] groups; + return ret; +} + +jint Java_com_quantcast_kfs_access_KfsInputChannel_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Seek(jfd, joffset); +} + +jint Java_com_quantcast_kfs_access_KfsInputChannel_tell( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Tell(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Seek(jfd, joffset); +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_tell( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Tell(jfd); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_exists( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return (clnt->Exists(path.c_str()) ? 1 : 0); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_isFile( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + + return (clnt->IsFile(path.c_str()) ? 1 : 0); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_isDirectory( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + + return (clnt->IsDirectory(path.c_str()) ? 1 : 0); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_filesize( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + KfsFileAttr attr; + string path; + setStr(path, jenv, jpath); + + if (clnt->Stat(path.c_str(), attr) != 0) { + return -1; + } + return attr.fileSize; +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_getModificationTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + KfsFileAttr attr; + string path; + setStr(path, jenv, jpath); + + if (clnt->Stat(path.c_str(), attr) != 0) + return -1; + + // The expected return value is in ms + return ((jlong) attr.mtime.tv_sec) * 1000 + (jlong) (attr.mtime.tv_usec / 1000); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_setModificationTime( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jmsec) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + + struct timeval mtime; + + // the input is in ms + mtime.tv_sec = jmsec / 1000; + mtime.tv_usec = jmsec % 1000; + if (clnt->SetMtime(path.c_str(), mtime) != 0) + return -1; + + return 0; +} + +jobjectArray Java_com_quantcast_kfs_access_KfsAccess_getDataLocation( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen) +{ + if (! jptr) { + return NULL; + } + KfsClient* const clnt = (KfsClient*)jptr; + + // for each block, there could be multiple locations due to replication; return them all here + + string path; + setStr(path, jenv, jpath); + + vector< vector > entries; + const int res = clnt->GetDataLocation(path.c_str(), jstart, jlen, entries); + if (res < 0) { + return 0; + } + jclass jstrArrClass = jenv->FindClass("[Ljava/lang/String;"); + if (! jstrArrClass) { + jclass excl = jenv->FindClass("java/lang/ClassNotFoundException"); + if (excl) { + jenv->ThrowNew(excl, 0); + } + return 0; + } + jclass jstrClass = jenv->FindClass("java/lang/String"); + if (! jstrArrClass) { + jclass excl = jenv->FindClass("java/lang/ClassNotFoundException"); + if (excl) { + jenv->ThrowNew(excl, 0); + } + return 0; + } + // For each block, return its location(s) + const jsize sz = (jsize)entries.size(); + jobjectArray jentries = jenv->NewObjectArray(sz, jstrArrClass, 0); + if (! 
jentries) { + return 0; + } + for (jsize i = 0; i < sz; i++) { + const jsize lsz = (jsize)entries[i].size(); + jobjectArray jlocs = jenv->NewObjectArray(lsz, jstrClass, 0); + if (! jlocs) { + return 0; + } + for (jsize j = 0; j < lsz; j++) { + jstring s = jenv->NewStringUTF(entries[i][j].c_str()); + if (! s) { + return 0; + } + jenv->SetObjectArrayElement(jlocs, j, s); + jenv->DeleteLocalRef(s); + } + jenv->SetObjectArrayElement(jentries, i, jlocs); + jenv->DeleteLocalRef(jlocs); + } + + return jentries; +} + +jshort Java_com_quantcast_kfs_access_KfsAccess_getReplication( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->GetReplicationFactor(path.c_str()); +} + +jshort Java_com_quantcast_kfs_access_KfsAccess_setReplication( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + string path; + setStr(path, jenv, jpath); + return clnt->SetReplicationFactor(path.c_str(), jnumReplicas); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_stat( + JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr) +{ + if (! jptr) { + return -EFAULT; + } + if (! jpath || ! attr) { + return -EINVAL; + } + + jclass const acls = jenv->GetObjectClass(attr); + if (! acls) { + return -EINVAL; + } + + string path; + setStr(path, jenv, jpath); + KfsFileAttr kfsAttr; + KfsClient* const clnt = (KfsClient*)jptr; + int ret = clnt->Stat(path.c_str(), kfsAttr); + if (ret != 0) { + return (jint)ret; + } + string names[3]; + names[0] = kfsAttr.filename; + ret = clnt->GetUserAndGroupNames( + kfsAttr.user, kfsAttr.group, names[1], names[2]); + if (ret != 0) { + return (jint)ret; + } + + jfieldID fid = jenv->GetFieldID(acls, "isDirectory", "Z"); + if (! 
fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jboolean)kfsAttr.isDirectory); + + fid = jenv->GetFieldID(acls, "filesize", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jlong)kfsAttr.fileSize); + + fid = jenv->GetFieldID(acls, "modificationTime", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, + (jlong)kfsAttr.mtime.tv_sec * 1000 + + (jlong)kfsAttr.mtime.tv_usec / 1000 + ); + + fid = jenv->GetFieldID(acls, "replication", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, kfsAttr.numReplicas); + + fid = jenv->GetFieldID(acls, "striperType", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jint)kfsAttr.striperType); + + fid = jenv->GetFieldID(acls, "numStripes", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jint)kfsAttr.numStripes); + + fid = jenv->GetFieldID(acls, "numRecoveryStripes", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jint)kfsAttr.numRecoveryStripes); + + fid = jenv->GetFieldID(acls, "stripeSize", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jint)kfsAttr.stripeSize); + + fid = jenv->GetFieldID(acls, "owner", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, (jlong)kfsAttr.user); + + fid = jenv->GetFieldID(acls, "group", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, (jlong)kfsAttr.group); + + fid = jenv->GetFieldID(acls, "mode", "I"); + if (! fid) { + return -EFAULT; + } + jenv->SetIntField(attr, fid, (jint)kfsAttr.mode); + + fid = jenv->GetFieldID(acls, "fileId", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, (jlong)kfsAttr.fileId); + + fid = jenv->GetFieldID(acls, "dirCount", "J"); + if (! fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, (jlong)kfsAttr.dirCount()); + + fid = jenv->GetFieldID(acls, "fileCount", "J"); + if (! 
fid) { + return -EFAULT; + } + jenv->SetLongField(attr, fid, (jlong)kfsAttr.fileCount()); + + const char* const fieldNames[] = {"filename", "ownerName", "groupName"}; + for (int i = 0; i < 3; i++) { + jstring const nm = jenv->NewStringUTF(names[i].c_str()); + if (! nm) { + return -EFAULT; + } + fid = jenv->GetFieldID(acls, fieldNames[i], "Ljava/lang/String;"); + if (! fid) { + return -EFAULT; + } + jenv->SetObjectField(attr, fid, nm); + } + + return 0; +} + +jstring Java_com_quantcast_kfs_access_KfsAccess_strerror( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr) +{ + const string str = KFS::ErrorCodeToStr((int)jerr); + return jenv->NewStringUTF(str.c_str()); +} + +jboolean Java_com_quantcast_kfs_access_KfsAccess_isnotfound( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr) +{ + return (jboolean)(jerr == -ENOENT); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_close( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return clnt->Close(jfd); +} + +jlong Java_com_quantcast_kfs_access_KfsAccess_seek( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return (jlong)clnt->Seek(jfd, joffset); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_getUMask( + JNIEnv *jenv, jclass jcls, jlong jptr) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + return (jint)(clnt->GetUMask() & 0777); +} + +jint Java_com_quantcast_kfs_access_KfsAccess_setUMask( + JNIEnv *jenv, jclass jcls, jlong jptr, jint umask) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + clnt->SetUMask((kfsMode_t)umask); + return 0; +} + +jint Java_com_quantcast_kfs_access_KfsInputChannel_read( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) +{ + if (! 
jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + if (! buf) { + return 0; + } + void * addr = jenv->GetDirectBufferAddress(buf); + jlong cap = jenv->GetDirectBufferCapacity(buf); + + if (! addr || cap < 0) { + return 0; + } + if(begin < 0 || end > cap || begin > end) { + return 0; + } + addr = (void *)(uintptr_t(addr) + begin); + + ssize_t sz = clnt->Read((int) jfd, (char *) addr, (size_t) (end - begin)); + return (jint)sz; +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_write( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + if(! buf) { + return 0; + } + void* addr = jenv->GetDirectBufferAddress(buf); + jlong cap = jenv->GetDirectBufferCapacity(buf); + + if (! addr || cap < 0) { + return 0; + } + if (begin < 0 || end > cap || begin > end) { + return 0; + } + addr = (void *)(uintptr_t(addr) + begin); + + ssize_t sz = clnt->Write((int) jfd, (const char *) addr, (size_t) (end - begin)); + return (jint)sz; +} + +jint Java_com_quantcast_kfs_access_KfsOutputChannel_atomicRecordAppend( + JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) +{ + if (! jptr) { + return -EFAULT; + } + KfsClient* const clnt = (KfsClient*)jptr; + + if (! buf) { + return 0; + } + void * addr = jenv->GetDirectBufferAddress(buf); + jlong cap = jenv->GetDirectBufferCapacity(buf); + + if (! 
addr || cap < 0) { + return 0; + } + if (begin < 0 || end > cap || begin > end) { + return 0; + } + addr = (void *)(uintptr_t(addr) + begin); + + ssize_t sz = clnt->AtomicRecordAppend((int) jfd, (const char *) addr, (int) (end - begin)); + return (jint)sz; +} diff --git a/src/cc/access/kfs_module_py.cc b/src/cc/access/kfs_module_py.cc new file mode 100644 index 000000000..b6de0f366 --- /dev/null +++ b/src/cc/access/kfs_module_py.cc @@ -0,0 +1,1181 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/08/01 +// Author: Blake Lewis (Kosmix Corp.) +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Glue code for Python apps to access KFS. +// +// Note: The Python Extension Module is in experimental stage. Please use it +// with caution. 
+//---------------------------------------------------------------------------- + +#include "Python.h" +#include "structmember.h" +#include "libclient/KfsClient.h" +#include +#include +#include +#include + +#include +#include +#include + +using std::string; +using std::vector; + +using KFS::KfsClient; +using KFS::chunkOff_t; +using KFS::KfsFileAttr; +using KFS::ErrorCodeToStr; + +struct kfs_Client { + PyObject_HEAD + PyObject *propfile; // Properties file + PyObject *cwd; // Current directory + KfsClient * client; // The client itself +}; + +static PyObject *Client_new(PyTypeObject *type, PyObject *args, PyObject *kwds); +static int Client_init(PyObject *pself, PyObject *args, PyObject *kwds); +static int Client_print(PyObject *pself, FILE *fp, int flags); +static void Client_dealloc(PyObject *pself); + +static PyObject *Client_repr(PyObject *pself); +static PyObject *kfs_isdir(PyObject *pself, PyObject *args); +static PyObject *kfs_isfile(PyObject *pself, PyObject *args); +static PyObject *kfs_mkdir(PyObject *pself, PyObject *args); +static PyObject *kfs_mkdirs(PyObject *pself, PyObject *args); +static PyObject *kfs_rmdir(PyObject *pself, PyObject *args); +static PyObject *kfs_rmdirs(PyObject *pself, PyObject *args); +static PyObject *kfs_readdir(PyObject *pself, PyObject *args); +static PyObject *kfs_readdirplus(PyObject *pself, PyObject *args); +static PyObject *kfs_create(PyObject *pself, PyObject *args); +static PyObject *kfs_stat(PyObject *pself, PyObject *args); +static PyObject *kfs_getNumChunks(PyObject *pself, PyObject *args); +static PyObject *kfs_getChunkSize(PyObject *pself, PyObject *args); +static PyObject *kfs_remove(PyObject *pself, PyObject *args); +static PyObject *kfs_rename(PyObject *pself, PyObject *args); +static PyObject *kfs_coalesceblocks(PyObject *pself, PyObject *args); +static PyObject *kfs_open(PyObject *pself, PyObject *args); +static PyObject *kfs_cd(PyObject *pself, PyObject *args); +static PyObject *kfs_log_level(PyObject *pself, 
PyObject *args); + +inline static void SetPyIoError(int64_t err) +{ + const string s = ErrorCodeToStr((int)err); + PyErr_SetString(PyExc_IOError, s.c_str()); +} + +static PyMemberDef Client_members[] = { + { (char*)"properties", T_OBJECT, offsetof(kfs_Client, propfile), RO, (char*)"properties file" }, + { (char*)"cwd", T_OBJECT, offsetof(kfs_Client, cwd), RO, (char*)"current directory" }, + { NULL } +}; + +static PyMethodDef Client_methods[] = { + { "mkdir", kfs_mkdir, METH_VARARGS, "Create directory."}, + { "mkdirs", kfs_mkdirs, METH_VARARGS, "Create directory tree."}, + { "isdir", kfs_isdir, METH_VARARGS, "Check if a path is a directory."}, + { "rmdir", kfs_rmdir, METH_VARARGS, "Remove directory."}, + { "rmdirs", kfs_rmdirs, METH_VARARGS, "Remove directory tree."}, + { "readdir", kfs_readdir, METH_VARARGS, "Read directory." }, + { "readdirplus", kfs_readdirplus, METH_VARARGS, "Read directory with attributes." }, + { "stat", kfs_stat, METH_VARARGS, "Stat file." }, + { "getNumChunks", kfs_getNumChunks, METH_VARARGS, "Get # of chunks in a file." }, + { "getChunkSize", kfs_getChunkSize, METH_VARARGS, "Get default chunksize for a file." }, + { "create", kfs_create, METH_VARARGS, "Create file." }, + { "remove", kfs_remove, METH_VARARGS, "Remove file." }, + { "rename", kfs_rename, METH_VARARGS, "Rename file or directory." }, + { "coalesce_blocks", kfs_coalesceblocks, METH_VARARGS, "Coalesce blocks from src->dest." }, + { "open", kfs_open, METH_VARARGS, "Open file." }, + { "isfile", kfs_isfile, METH_VARARGS, "Check if a path is a file."}, + { "cd", kfs_cd, METH_VARARGS, "Change directory." }, + { "log_level", kfs_log_level, METH_VARARGS, "Set log4cpp log level." }, + { NULL } +}; + +PyDoc_STRVAR(Client_doc, +"A kfs.client object is an instance of the KFS client library\n" +"that sends RPC's to the KFS metadata and chunk servers in order\n" +"to perform file system operations. 
In addition, its 'open' method\n" +"creates kfs.file objects that represent files in KFS.\n\n" +"To create a client, you must supply a 'properties file' which\n" +"defines the hostname and port number for the metaserver, e.g.\n\n" +"\tmy_client = kfs.client('KfsTester.properties')\n\n" +"Methods:\n" +"\tmkdir(path) -- create a directory\n" +"\tmkdirs(path) -- create a directory tree\n" +"\trmdir(path) -- remove a directory (path should be an empty dir)\n" +"\trmdirs(path) -- remove a directory tree\n" +"\treaddir(path) -- return a tuple of directory contents\n" +"\treaddirplus(path) -- directory entries plus attributes\n" +"\tisdir(path) -- return TRUE if path is a directory\n" +"\tisfile(path) -- return TRUE if path is a file\n" +"\tstat(path) -- file attributes, compatible with os.stat\n" +"\tgetNumChunks(path) -- return the # of chunks in a file\n" +"\tgetChunkSize(path) -- return the default size of chunks in a file\n" +"\tcreate(path, numReplicas=3) -- create a file and return a kfs.file object for it\n" +"\tremove(path) -- remove a file\n" +"\tcoalesceblocks(src, dst) -- append blocks from src->dest\n" +"\topen(path[, mode]) -- open a file and return an object for it\n" +"\tcd(path) -- change current directory\n" +"\tlog_level(level) -- change the message log level\n" +"\n\nData:\n" +"\tproperties -- the name of the properties file\n" +"\tcwd -- the current directory (for relative paths)\n"); + +static PyTypeObject kfs_ClientType = { + PyObject_HEAD_INIT(NULL) + 0, // ob_size + "kfs.client", // tp_name + sizeof (kfs_Client), // tp_basicsize + 0, // tp_itemsize + Client_dealloc, // tp_dealloc + Client_print, // tp_print + 0, // tp_getattr + 0, // tp_setattr + 0, // tp_compare + Client_repr, // tp_repr + 0, // tp_as_number + 0, // tp_as_sequence + 0, // tp_as_mapping + 0, // tp_hash + 0, // tp_call + 0, // tp_str + 0, // tp_getattro + 0, // tp_setattro + 0, // tp_as_buffer + Py_TPFLAGS_DEFAULT, // tp_flags + Client_doc, // tp_doc + 0, // tp_traverse + 0, // 
tp_clear + 0, // tp_richcompare + 0, // tp_weaklistoffest + 0, // tp_iter + 0, // tp_iternext + Client_methods, // tp_methods + Client_members, // tp_members + 0, // tp_getset + 0, // tp_base + 0, // tp_dict + 0, // tp_descr_get + 0, // tp_descr_set + 0, // tp_dictoffset + Client_init, // tp_init + 0, // tp_alloc + Client_new // tp_new +}; + + +struct kfs_File { + PyObject_HEAD + PyObject *name; // File name + PyObject *mode; // Access mode + PyObject *pclient; // Python object for KFS client + int fd; // File descriptor +}; + +static PyObject * +File_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + static PyObject *noname = NULL; + if (noname == NULL) { + noname = PyString_FromString(""); + if (noname == NULL) + return NULL; + } + kfs_File *self = (kfs_File *)type->tp_alloc(type, 0); + if (self != NULL) { + Py_INCREF(noname); + self->name = noname; + Py_INCREF(noname); + self->mode = noname; + Py_INCREF(noname); + self->pclient = noname; + self->fd = -1; + } + return (PyObject *)self; +} + +static void +File_dealloc(PyObject *pself) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + if (self->fd != -1) + cl->client->Close(self->fd); + Py_DECREF(self->name); + Py_DECREF(self->mode); + Py_DECREF(self->pclient); + self->ob_type->tp_free((PyObject *)self); +} + +static int +modeflag(const char *modestr) +{ + // convert mode string to flag + int mode = -1; + + if (strcmp(modestr, "r") == 0) + mode = O_RDWR; + else if (strcmp(modestr, "w") == 0) + mode = O_WRONLY; + else if (strcmp(modestr, "r+") == 0 || strcmp(modestr, "w+") == 0) + mode = O_RDWR; + else if (strcmp(modestr, "a") == 0) + mode = O_WRONLY | O_APPEND; + + return mode; +} + +static int +set_file_members( + kfs_File *self, const char *path, + const char *modestr, kfs_Client *client, int fd) +{ + int mode; + + // convert mode string to flag + mode = modeflag(modestr); + if (mode == -1) + return -1; + + // open the file if necessary + if (fd < 0) + fd = 
client->client->Open(path, mode); + + if (fd < 0) { + SetPyIoError(fd); + return -1; + } + + // set all of the fields in the kfs_File structure + Py_DECREF(self->name); + self->name = PyString_FromString(path); + Py_DECREF(self->mode); + self->mode = PyString_FromString(modestr); + PyObject *pclient = (PyObject *)client; + Py_INCREF(pclient); + Py_DECREF(self->pclient); + self->pclient = pclient; + self->fd = fd; + + return 0; +} + +static int +File_init(PyObject *pself, PyObject *args, PyObject *kwds) +{ + kfs_File *self = (kfs_File *)pself; + const char *nm, *md = "r"; + PyObject *cl = NULL; + + static char *kwlist[] = { + (char*)"client", (char*)"name", (char*)"mode", NULL + }; + + int ok = !PyArg_ParseTupleAndKeywords( + args, kwds, "O!s|s", kwlist, &kfs_ClientType, + &cl, &nm, &md); + if (!ok) + return -1; + + return set_file_members(self, nm, md, (kfs_Client *)cl, -1); +} + +static PyObject * +File_repr(PyObject *pself) +{ + kfs_File *self = (kfs_File *)pself; + return PyString_FromFormat("kfs.file<%s, %s, %d>", + PyString_AsString(self->name), + PyString_AsString(self->mode), + self->fd); +} + +static int +File_print(PyObject *pself, FILE *fp, int flags) +{ + kfs_File *self = (kfs_File *)pself; + fprintf(fp, "kfs.file<%s, %s, %d>\n", + PyString_AsString(self->name), + PyString_AsString(self->mode), + self->fd); + return 0; +} + +static PyObject * +kfs_reopen(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + char *modestr = PyString_AsString(self->mode); + + if (!PyArg_ParseTuple(args, "|s", &modestr)) + return NULL; + + int mode = modeflag(modestr); + if (mode == -1) + return NULL; + + int fd = cl->client->Open(PyString_AsString(self->name), mode); + if (fd == -1) + return NULL; + + self->fd = fd; + self->mode = PyString_FromString(modestr); + Py_RETURN_NONE; +} + +static PyObject * +kfs_close(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = 
(kfs_Client *)self->pclient; + if (self->fd != -1) { + cl->client->Close(self->fd); + self->fd = -1; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_read(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + ssize_t rsize = -1l; + + if (!PyArg_ParseTuple(args, "l", &rsize)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return NULL; + } + + PyObject *v = PyString_FromStringAndSize((char *)NULL, rsize); + if (v == NULL) + return NULL; + + char *buf = PyString_AsString(v); + ssize_t nr = cl->client->Read(self->fd, buf, rsize); + if (nr < 0) { + Py_DECREF(v); + SetPyIoError(nr); + return NULL; + } + if (nr != rsize) + _PyString_Resize(&v, nr); + return v; +} + +static PyObject * +kfs_write(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + int wsize = -1; + char *buf = NULL; + + if (!PyArg_ParseTuple(args, "s#", &buf, &wsize)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(EBADF); + return NULL; + } + + ssize_t nw = cl->client->Write(self->fd, buf, (ssize_t)wsize); + if (nw < 0) { + SetPyIoError(nw); + return NULL; + } + if (nw != wsize) { + PyObject *msg = PyString_FromFormat( + "requested write of %d bytes but %ld were written", wsize, nw); + return msg; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_chunkLocations(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + int off, len; + + if (!PyArg_ParseTuple(args, "i|i", &off, &len)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return NULL; + } + + vector > results; + + int s = cl->client->GetDataLocation(self->fd, off, len, results); + if (s < 0) { + SetPyIoError(s); + return NULL; + } + size_t n = results.size(); + PyObject *outer = PyTuple_New(n); + for (size_t i = 0; i < n; i++) { + size_t nlocs = results[i].size(); + vector locs = results[i]; + 
PyObject *inner = PyTuple_New(nlocs); + for (size_t j = 0; j < nlocs; j++) { + PyTuple_SetItem(inner, j, PyString_FromString(locs[j].c_str())); + } + PyTuple_SetItem(outer, i, inner); + } + return outer; +} + +static PyObject * +kfs_dataVerify(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + int wsize = -1; + char *buf = NULL; + + if (!PyArg_ParseTuple(args, "s#", &buf, &wsize)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return NULL; + } + + bool res = cl->client->VerifyDataChecksums(self->fd); + return Py_BuildValue("b", res); +} + +static PyObject * +kfs_truncate(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + off_t off; + + if (!PyArg_ParseTuple(args, "L|i", &off)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return NULL; + } + + int s = cl->client->Truncate(self->fd, off); + if (s < 0) { + SetPyIoError(s); + return NULL; + } + Py_RETURN_NONE; +} + + +static PyObject * +kfs_sync(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + int s = cl->client->Sync(self->fd); + if (s < 0) { + SetPyIoError(s); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_seek(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + off_t off; + int whence = SEEK_SET; + + if (!PyArg_ParseTuple(args, "L|i", &off, &whence)) + return NULL; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return NULL; + } + + off_t s = cl->client->Seek(self->fd, off, whence); + if (s < 0) { + SetPyIoError(s); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_tell(PyObject *pself, PyObject *args) +{ + kfs_File *self = (kfs_File *)pself; + kfs_Client *cl = (kfs_Client *)self->pclient; + + if (self->fd == -1) { + SetPyIoError(-EBADF); + return 
NULL; + } + + off_t pos = (off_t)cl->client->Tell(self->fd); + if (pos < 0) { + SetPyIoError(pos); + return NULL; + } + return Py_BuildValue("L", pos); +} + +static PyMethodDef File_methods[] = { + { "open", kfs_reopen, METH_VARARGS, "Open a closed file." }, + { "close", kfs_close, METH_NOARGS, "Close file." }, + { "read", kfs_read, METH_VARARGS, "Read from file." }, + { "write", kfs_write, METH_VARARGS, "Write to file." }, + { "truncate", kfs_truncate, METH_VARARGS, "Truncate a file." }, + { "chunk_locations", kfs_chunkLocations, METH_VARARGS, "Get location(s) of a chunk." }, + { "seek", kfs_seek, METH_VARARGS, "Seek to file offset." }, + { "tell", kfs_tell, METH_NOARGS, "Return current offset." }, + { "sync", kfs_sync, METH_NOARGS, "Flush file data." }, + { "data_verify", kfs_dataVerify, METH_VARARGS, "Verify data matches what is in KFS."}, + { NULL } +}; + +static PyMemberDef File_members[] = { + { (char*)"name", T_OBJECT, offsetof(kfs_File, name), RO, (char*)"file name" }, + { (char*)"mode", T_OBJECT, offsetof(kfs_File, mode), RO, (char*)"access mode" }, + { (char*)"fd", T_INT, offsetof(kfs_File, fd), RO, (char*)"file descriptor" }, + { NULL } +}; + +PyDoc_STRVAR(File_doc, +"These objects represent KFS files. They include a file descriptor (fd)\n" +"that identifies the file to the KFS client library for reads, writes,\n" +"and syncs. 
When a file is closed, its fd becomes -1 and no further\n" +"operations can be done on it unless it is reopened with the kfs.file\n" +"open method.\n\n" +"Methods:\n" +"\topen([mode]) -- reopen closed file\n" +"\tclose() -- close file\n" +"\tread(len) -- read len bytes, return as string\n" +"\twrite(str) -- write string to file\n" +"\ttruncate(off) -- truncate file at specified offset\n" +"\tseek(off) -- seek to specified offset\n" +"\ttell() -- return current offest\n" +"\tsync() -- flush file data to server\n" +"\tchunk_locations(path, offset) -- location(s) of the chunk corresponding to offset\n" +"\tdata_verify(str) -- verify that the data in KFS matches what is passed in\n" +"\nData:\n\n" +"\tname -- the name of the file\n" +"\tmode -- access mode ('r', 'w', 'r+', or 'w+')\n" +"\tfd -- file descriptor (-1 if closed)\n"); + +static PyTypeObject kfs_FileType = { + PyObject_HEAD_INIT(NULL) + 0, // ob_size + "kfs.file", // tp_name + sizeof (kfs_File), // tp_basicsize + 0, // tp_itemsize + File_dealloc, // tp_dealloc + File_print, // tp_print + 0, // tp_getattr + 0, // tp_setattr + 0, // tp_compare + File_repr, // tp_repr + 0, // tp_as_number + 0, // tp_as_sequence + 0, // tp_as_mapping + 0, // tp_hash + 0, // tp_call + 0, // tp_str + 0, // tp_getattro + 0, // tp_setattro + 0, // tp_as_buffer + Py_TPFLAGS_DEFAULT, // tp_flags + File_doc, // tp_doc + 0, // tp_traverse + 0, // tp_clear + 0, // tp_richcompare + 0, // tp_weaklistoffest + 0, // tp_iter + 0, // tp_iternext + File_methods, // tp_methods + File_members, // tp_members + 0, // tp_getset + 0, // tp_base + 0, // tp_dict + 0, // tp_descr_get + 0, // tp_descr_set + 0, // tp_dictoffset + File_init, // tp_init + 0, // tp_alloc + File_new // tp_new +}; + +static void +Client_dealloc(PyObject *pself) +{ + kfs_Client *self = (kfs_Client *)pself; + Py_XDECREF(self->propfile); + Py_XDECREF(self->cwd); + delete self->client; + self->ob_type->tp_free(pself); +} + +static PyObject * +Client_new(PyTypeObject *type, 
PyObject *args, PyObject *kwds) +{ + kfs_Client *self = (kfs_Client *)type->tp_alloc(type, 0); + + if (self == NULL) + return NULL; + + PyObject *p = PyString_FromString(""); + PyObject *c = PyString_FromString("/"); + if (p == NULL || c == NULL) { + Py_DECREF(self); + return NULL; + } + + self->propfile = p; + self->cwd = c; + + return (PyObject *)self; +} + +static int +Client_init(PyObject *pself, PyObject *args, PyObject *kwds) +{ + kfs_Client *self = (kfs_Client *)pself; + char *pf = NULL; + + if (!PyArg_ParseTuple(args, "s", &pf)) + return -1; + + KfsClient * client = KFS::Connect(pf); + if (!client) { + PyErr_SetString(PyExc_IOError, "Unable to start client."); + return -1; + } + self->client = client; + PyObject *tmp = self->propfile; + self->propfile = PyString_FromString(pf); + Py_XDECREF(tmp); + + return 0; +} + +static PyObject * +Client_repr(PyObject *pself) +{ + kfs_Client *self = (kfs_Client *)pself; + return PyString_FromFormat("kfs.client<%s, %s>", + PyString_AsString(self->propfile), + PyString_AsString(self->cwd)); +} + +static int +Client_print(PyObject *pself, FILE *fp, int flags) +{ + kfs_Client *self = (kfs_Client *)pself; + fprintf(fp, "kfs.client<%s, %s>\n", + PyString_AsString(self->propfile), + PyString_AsString(self->cwd)); + return 0; +} + +static string +strip_dots(string path) +{ + vector component; + string result; + string::size_type start = 0; + + while (start != string::npos) { + assert(path[start] == '/'); + string::size_type slash = path.find('/', start + 1); + string nextc = path.substr(start, slash - start); + start = slash; + if (nextc.compare("/..") == 0) { + if (!component.empty()) + component.pop_back(); + } else if (nextc.compare("/.") != 0) + component.push_back(nextc); + } + + if (component.empty()) + component.push_back(string("/")); + + for (vector ::iterator c = component.begin(); + c != component.end(); c++) { + result += *c; + } + return result; +} + +/* + * Take a path name that was supplied as an argument for a 
KFS operation. + * If it is not absolute, add the current directory to the front of it and + * in either case, call strip_dots to strip out any "." and ".." components. + */ +static string +build_path(PyObject *cwd, const char *input) +{ + string tail(input); + if (input[0] == '/') + return strip_dots(tail); + + const char *c = PyString_AsString(cwd); + bool is_root = (c[0] == '/' && c[1] == '\0'); + string head(c); + if (!is_root) + head.append("/"); + return strip_dots(head + tail); +} + +static PyObject * +kfs_cd(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + KfsFileAttr attr; + int status = self->client->Stat(path.c_str(), attr); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + if (! attr.isDirectory) { + SetPyIoError(-ENOTDIR); + return NULL; + } + PyObject *newcwd = PyString_FromString(path.c_str()); + if (newcwd != NULL) { + Py_DECREF(self->cwd); + self->cwd = newcwd; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_log_level(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *logLevel; + + if (!PyArg_ParseTuple(args, "s", &logLevel)) + return NULL; + + self->client->SetLogLevel(logLevel); + Py_RETURN_NONE; +} + +static PyObject * +kfs_isdir(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + bool res = self->client->IsDirectory(path.c_str()); + return Py_BuildValue("b", res); +} + +static PyObject * +kfs_isfile(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + bool res = self->client->IsFile(path.c_str()); + return Py_BuildValue("b", 
res); +} + +static PyObject * +kfs_mkdir(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int status = self->client->Mkdir(path.c_str()); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_mkdirs(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int status = self->client->Mkdirs(path.c_str()); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_rmdir(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int status = self->client->Rmdir(path.c_str()); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_rmdirs(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int status = self->client->Rmdirs(path.c_str()); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +/*! 
+ * \brief read directory + * + * Return directory contents as a tuple of names + * XXX It should return a tuple of (name, fid) pairs, but + * the KFS client readdir code currently only gives names + */ +static PyObject * +kfs_readdir(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + vector result; + int status = self->client->Readdir(path.c_str(), result); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + size_t n = result.size(); + PyObject *tuple = PyTuple_New(n); + for (size_t i = 0; i != n; i++) { + PyTuple_SetItem(tuple, i, + PyString_FromString(result[i].c_str())); + } + return tuple; +} + +/*! + * \brief Package a KfsFileAttr into a tuple + */ +static PyObject * +package_fattr(KfsFileAttr &fa) +{ + PyObject *tuple = PyTuple_New(7); + PyTuple_SetItem(tuple, 0, PyString_FromString(fa.filename.c_str())); + PyTuple_SetItem(tuple, 1, PyLong_FromLongLong(fa.fileId)); + PyTuple_SetItem(tuple, 2, PyString_FromString(ctime(&fa.mtime.tv_sec))); + PyTuple_SetItem(tuple, 3, PyString_FromString(ctime(&fa.ctime.tv_sec))); + PyTuple_SetItem(tuple, 4, PyString_FromString(ctime(&fa.crtime.tv_sec))); + PyTuple_SetItem(tuple, 5, PyString_FromString( + fa.isDirectory ? "dir" : "file")); + PyTuple_SetItem(tuple, 6, PyLong_FromLongLong(fa.fileSize)); + + return tuple; +} + +/*! 
+ * \brief read directory with attributes + * + * Returns a tuple of tuples, each with the following data: + * + * (name, fid, mtime, ctime, crtime, type, size) + */ +static PyObject * +kfs_readdirplus(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + + vector result; + int status = self->client->ReaddirPlus(path.c_str(), result); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + size_t n = result.size(); + PyObject *outer = PyTuple_New(n); + for (size_t i = 0; i != n; i++) { + PyObject *inner = package_fattr(result[i]); + PyTuple_SetItem(outer, i, inner); + } + return outer; +} + +static PyObject * +kfs_stat(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + KfsFileAttr attr; + int status = self->client->Stat(path.c_str(), attr, true); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + /* + * Return the stat information in the same format as + * os.stat() so that we can use the standard stat module + * on it. + */ + PyObject *pstat = PyTuple_New(10); + PyTuple_SetItem(pstat, 0, PyInt_FromLong( + attr.mode | (attr.isDirectory ? 
S_IFDIR : 0))); + PyTuple_SetItem(pstat, 1, PyLong_FromLongLong(attr.fileId)); + PyTuple_SetItem(pstat, 2, PyLong_FromLong(0)); + PyTuple_SetItem(pstat, 3, PyInt_FromLong(1)); + PyTuple_SetItem(pstat, 4, PyInt_FromLong(attr.user)); + PyTuple_SetItem(pstat, 5, PyInt_FromLong(attr.group)); + PyTuple_SetItem(pstat, 6, PyLong_FromLongLong(attr.fileSize)); + PyTuple_SetItem(pstat, 7, PyInt_FromLong(attr.ctime.tv_sec)); + PyTuple_SetItem(pstat, 8, PyInt_FromLong(attr.mtime.tv_sec)); + PyTuple_SetItem(pstat, 9, PyInt_FromLong(attr.crtime.tv_sec)); + return pstat; +} + +static PyObject * +kfs_getNumChunks(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int chunkCount = self->client->GetNumChunks(path.c_str()); + if (chunkCount < 0) { + SetPyIoError(chunkCount); + return NULL; + } + return Py_BuildValue("i", chunkCount); +} + +static PyObject * +kfs_getChunkSize(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + string path = build_path(self->cwd, patharg); + int chunksz = self->client->GetChunkSize(path.c_str()); + return Py_BuildValue("i", chunksz); +} + +static PyObject * +kfs_create(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *patharg; + int numReplicas = 3; + + if (!PyArg_ParseTuple(args, "s|i", &patharg, &numReplicas)) + return NULL; + + string path = build_path(self->cwd, patharg); + int fd = self->client->Create(path.c_str(), numReplicas); + if (fd < 0) { + SetPyIoError(fd); + return NULL; + } + + kfs_File *f = (kfs_File *)kfs_FileType.tp_new(&kfs_FileType, NULL, NULL); + if (f == NULL || set_file_members(f, path.c_str(), "w", self, fd) < 0) + return NULL; + + return (PyObject *)f; +} + +static PyObject * +kfs_remove(PyObject *pself, PyObject *args) +{ + 
kfs_Client *self = (kfs_Client *)pself; + char *patharg; + + if (!PyArg_ParseTuple(args, "s", &patharg)) + return NULL; + + string path = build_path(self->cwd, patharg); + int status = self->client->Remove(path.c_str()); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_rename(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *srcpath, *dstpath; + bool overwrite = true; + + if (!PyArg_ParseTuple(args, "ss|b", &srcpath, &dstpath, &overwrite)) + return NULL; + + string spath = build_path(self->cwd, srcpath); + string dpath = build_path(self->cwd, dstpath); + int status = self->client->Rename(spath.c_str(), dpath.c_str(), overwrite); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +kfs_coalesceblocks(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + char *srcpath, *dstpath; + + if (!PyArg_ParseTuple(args, "ss", &srcpath, &dstpath)) + return NULL; + + string spath = build_path(self->cwd, srcpath); + string dpath = build_path(self->cwd, dstpath); + chunkOff_t dstStartOffset; + int status = self->client->CoalesceBlocks( + spath.c_str(), dpath.c_str(), &dstStartOffset); + if (status < 0) { + SetPyIoError(status); + return NULL; + } + return Py_BuildValue("l", dstStartOffset); +} + +static PyObject * +kfs_open(PyObject *pself, PyObject *args) +{ + kfs_Client *self = (kfs_Client *)pself; + const char *patharg, *modestr = "r"; + + if (!PyArg_ParseTuple(args, "s|s", &patharg, &modestr)) + return NULL; + + string path = build_path(self->cwd, patharg); + + kfs_File *f = (kfs_File *)kfs_FileType.tp_new(&kfs_FileType, NULL, NULL); + if (f == NULL || + set_file_members(f, path.c_str(), modestr, self, -1) < 0) { + return NULL; + } + return (PyObject *)f; +} + +PyDoc_STRVAR(module_doc, +"This module links to the KFS client library to provide simple KFS\n" +"file services akin to those for built-in Python 
file objects. To use\n" +"it, you must first create a kfs.client object. This provides the\n" +"connection to KFS; the appropriate KFS servers (i.e., the metaserver\n" +" and chunkservers) must already be active.\n\n" +"Once you have a kfs.client, you can perform file system operations\n" +"corresponding to the KFS client library interfaces and create kfs.file\n" +"objects that represent files in KFS.\n"); + + +PyMODINIT_FUNC +initkfs() +{ + if (PyType_Ready(&kfs_ClientType) < 0 || + PyType_Ready(&kfs_FileType) < 0) + return; + + PyObject *m = Py_InitModule3("kfs", NULL, module_doc); + + Py_INCREF(&kfs_ClientType); + PyModule_AddObject(m, "client", (PyObject *)&kfs_ClientType); + Py_INCREF(&kfs_FileType); + PyModule_AddObject(m, "file", (PyObject *)&kfs_FileType); +} diff --git a/src/cc/access/kfs_setup.py b/src/cc/access/kfs_setup.py new file mode 100644 index 000000000..a10fa893e --- /dev/null +++ b/src/cc/access/kfs_setup.py @@ -0,0 +1,80 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# +# Note: The Python Extension Module is in experimental stage. Please use it +# with caution. +# +# This script uses python distutils to build and install the python KFS +# access module. Execute this script from KFS build directory after building +# libclient. 
+# +# Python apps can access KFS after installation of the KFS access module. +# +# Instructions for using this script is in top level 'doc' directory in the +# file doc/DeveloperDoc. +# + +from distutils.core import setup, Extension +import sys +import os +import os.path + +kfs_access_dir=os.path.dirname(sys.argv[0]) + +kfsext = Extension( + 'kfs', + include_dirs = [ + os.path.abspath(os.path.join(kfs_access_dir, "..")) + ], + libraries = [ + 'boost_regex', + 'kfs_client', + 'kfs_common', + 'kfs_io', + 'kfs_qcdio', + 'kfs_qcrs' + ], + library_dirs = [ + 'src/cc/libclient', + 'src/cc/common', + 'src/cc/kfsio', + 'src/cc/qcdio', + 'src/cc/qcrs', + ], + runtime_library_dirs = [], + sources = [ + os.path.abspath(os.path.join(kfs_access_dir, "kfs_module_py.cc")) + ] +) + +# OSX boost ports typically end up at /opt/local/lib +if sys.platform in ('darwin', 'Darwin'): + kfsext.library_dirs.append('/opt/local/lib') + +setup( + name = "kfs", version = "0.3", + description="KFS client module", + author="Blake Lewis and Sriram Rao", + ext_modules = [kfsext] +) diff --git a/src/cc/chunk/AtomicRecordAppender.cc b/src/cc/chunk/AtomicRecordAppender.cc new file mode 100644 index 000000000..72cc1d1bc --- /dev/null +++ b/src/cc/chunk/AtomicRecordAppender.cc @@ -0,0 +1,3079 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id: AtomicRecordAppender.cc $ +// +// Created 2009/03/19 +// Author: Sriram Rao +// +// 2009/10/19 +// Mike Ovsiannikov +// Fault tolerant write append protocol +// +// Copyright 2009-2012 Quantcast Corporation. All rights reserved. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Data structure for tracking record appends to chunks. +//---------------------------------------------------------------------------- + +/* + +Fault tolerant (reliable) write append protocol overview. + +At the high level the synchronous replication is the same as the replication +used in the "normal" (random) KFS write. + +The client sends data to the first chunk server in the replication chain: +"append master", then the "master" sends to the first "slave", then first slave +sends the data to the next slave, and so on, until the last participant in the +chain is reached. + +The replication acknowledgment is propagated trough the replication chain in +the opposite direction. Positive ACK received by the master guarantees that all +other replication participants have successfully received the data. + +Each chunk server maintains protocol memory for each chunk replica opened for +"write append". + +The protocol state is needed to deal with unreliable communications, including +crash. Crash can be regarded as special case of communication failure. Timeout +can be regarded as another special case. + +The protocol state is a table (WriteIdState) that keeps track of the status of +the last write append operation for a every client that issued (or can +potentially issue write append to the chunk). In KFS terms this corresponds to +"write id allocation". Thus the table will have one entry per write id (normally + / or 64M/1M = 64 entries). 
+ +From protocol point of view the append operation status in the table can be one +of the three following: +"in progress" (kErrStatusInProgress), +"success" (kErrNone), +"fail" (!= kErrStatusInProgress && != kErrNone). + +The client (appender) can query the table in order to unambiguously recover from +the communication failure by first querying [write append] master for the last +operation status for the given chunk, and write id. + +In the case where the communication with chunk master fails, the client queries +other participating chunk servers for the append operation status. + +The status inquiry has the following side effect. +The status inquiry always makes write id "read only", and un-reserves the space, +thus disallowing further appends with this write id. This guarantees that after +status inquiry returns, no other [new, or stale] appends with this write id can +succeed. + +For status recovery to work correctly it is required that if "Begin make chunk +stable" is executed, then the replica is truncated to the last known +successfully replicated size ("commit offset"). See comment in +BeginMakeStable(). + +To reduce append status recovery latency chunk master piggies back its view of +replication status on the replication RPC. See comment in +UpdateMasterCommittedOffset(). + +The chunk can be in two states "unstable" -- dirty / non readable, and "stable" +-- read only. Only meta server can make a decition to transition chunk replica +into the "stable" state. + +When chunk master declares write append synchronous replication failure, or +decides to stop accepting write appends for other [normal] reasons (chunk full, +no clients etc), it stops accepting write appends, sends meta server its replica +state: chunk checksum and size. "Lease Relinquish" RPC with the chunk size and +chunk checksum is used in this case. + +The meta server broadcasts this information to all participants, including +master. 
Each participant attempts to converge its replica to the state +broadcasted. + +In case of failure the chunk replica is declared "corrupted", and scheduled for +re-replication by the meta server. + +The meta server keeps track of the chunk master operational status, as it does +for "normal" writes: heartbeat, and write lease renewal mechanisms. + +If meta server decides that the chunk master is non operational, it takes over, +and performs recovery as follows. + +First meta server instructs all chunk servers hosting the chunk to stop +accepting write appends to the chunk, and report back the chunk replica state: +size, and checksum. "Begin Make Chunk Stable" RPC is used for this purpose. + +Once meta server gets sufficient # of replies, it picks the shortest replica, +and instructs all participants to converge replicas to the selected state. +"Make Chunk Stable" RPC is used to convey this information. + +Then meta server updates list of chunk servers hosting the chunk based on the +"Make Chunk Stable" RPC reply status. 
+ +*/ + +#include +#include +#include +#include +#include +#include + +#include "common/MsgLogger.h" +#include "common/StdAllocator.h" +#include "kfsio/Globals.h" +#include "qcdio/QCDLList.h" +#include "AtomicRecordAppender.h" +#include "ChunkManager.h" +#include "LeaseClerk.h" +#include "DiskIo.h" +#include "BufferManager.h" +#include "ClientSM.h" +#include "ChunkServer.h" +#include "MetaServerSM.h" +#include "DiskIo.h" +#include "utils.h" + +namespace KFS { +using std::map; +using std::less; +using std::pair; +using std::make_pair; +using std::min; +using std::max; +using std::string; +using std::ostringstream; +using std::istringstream; +using std::ws; + +#define WAPPEND_LOG_STREAM_PREFIX << "w_append: I" << mInstanceNum << "I " +#define WAPPEND_LOG_STREAM(pri) KFS_LOG_STREAM(pri) WAPPEND_LOG_STREAM_PREFIX +#define WAPPEND_LOG_STREAM_DEBUG KFS_LOG_STREAM_DEBUG WAPPEND_LOG_STREAM_PREFIX +#define WAPPEND_LOG_STREAM_WARN KFS_LOG_STREAM_WARN WAPPEND_LOG_STREAM_PREFIX +#define WAPPEND_LOG_STREAM_INFO KFS_LOG_STREAM_INFO WAPPEND_LOG_STREAM_PREFIX +#define WAPPEND_LOG_STREAM_ERROR KFS_LOG_STREAM_ERROR WAPPEND_LOG_STREAM_PREFIX +#define WAPPEND_LOG_STREAM_FATAL KFS_LOG_STREAM_FATAL WAPPEND_LOG_STREAM_PREFIX + +typedef QCDLList AppendReplicationList; + +RecordAppendOp::RecordAppendOp(kfsSeq_t s) + : KfsOp(CMD_RECORD_APPEND, s), + clientSeq(s), + chunkId(-1), + chunkVersion(-1), + numBytes(0), + writeId(-1), + offset(-1), + fileOffset(-1), + numServers(0), + checksum(0), + servers(), + masterCommittedOffset(-1), + clientSeqStr(), + dataBuf(), + origClnt(0), + origSeq(s), + replicationStartTime(0) +{ + AppendReplicationList::Init(*this); +} + +RecordAppendOp::~RecordAppendOp() +{ + assert(! origClnt && ! 
QCDLListOp::IsInList(*this)); +} + +string +RecordAppendOp::Show() const +{ + ostringstream os; + os << "record-append:" + " seq: " << seq << + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " file-offset: " << fileOffset << + " writeId = " << writeId << + " offset: " << offset << + " numBytes: " << numBytes << + " servers: " << servers << + " checksum: " << checksum << + " client-seq: " << clientSeq << + " master-committed: " << masterCommittedOffset + ; + return os.str(); +} + +typedef QCDLList PendingFlushList; + +inline AtomicRecordAppendManager::Counters& AtomicRecordAppendManager::Cntrs() + { return mCounters; } + +inline void +AtomicRecordAppendManager::IncAppendersWithWidCount() +{ + mAppendersWithWidCount++; + assert(mAppendersWithWidCount > 0); +} + +inline void +AtomicRecordAppendManager::DecAppendersWithWidCount() +{ + assert(mAppendersWithWidCount > 0); + if (mAppendersWithWidCount > 0) { + mAppendersWithWidCount--; + } +} + +// One per chunk +class AtomicRecordAppender : public KfsCallbackObj +{ +public: + enum + { + kErrNone = 0, + kErrParameters = -EINVAL, + kErrProtocolState = -EPERM, + kErrStatusInProgress = -EAGAIN, + kErrWidReadOnly = -EROFS, + kErrFailedState = -EFAULT, + kErrOutOfSpace = -ENOSPC, + kErrNotFound = -ENOENT, + kErrReplicationFailed = -EHOSTUNREACH + }; + + AtomicRecordAppender( + const DiskIo::FilePtr& chunkFileHandle, + kfsChunkId_t chunkId, + int64_t chunkVersion, + uint32_t numServers, + const string& servers, + ServerLocation peerLoc, + int replicationPos, + int64_t chunkSize); + void MakeChunkStable(MakeChunkStableOp* op = 0); + bool IsOpen() const + { return (mState == kStateOpen); } + bool IsChunkStable() const + { + return ( + mState != kStateOpen && + mState != kStateClosed && + mState != kStateReplicationFailed + ); + } + void Timeout(); + int GetAlignmentAndFwdFlag(bool& forwardFlag) const + { + forwardFlag = IsOpen() && mPeer; + return (mBuffer.BytesConsumableLast() + mBufFrontPadding); + } + 
kfsChunkId_t GetChunkId() const + { return mChunkId; } + size_t SpaceReserved() const + { return mBytesReserved; } + bool IsMaster() const + { return (mReplicationPos == 0); } + bool WantsToKeepLease() const; + void AllocateWriteId(WriteIdAllocOp *op, int replicationPos, + ServerLocation peerLoc, const DiskIo::FilePtr& chunkFileHandle); + int ChangeChunkSpaceReservaton( + int64_t writeId, size_t nBytesIn, bool releaseFlag, string* errMsg); + int InvalidateWriteId(int64_t writeId, bool declareFailureFlag); + void AppendBegin(RecordAppendOp *op, int replicationPos, + ServerLocation peerLoc); + void GetOpStatus(GetRecordAppendOpStatus* op); + void BeginMakeStable(BeginMakeChunkStableOp* op = 0); + void CloseChunk(CloseOp* op, int64_t writeId, bool& forwardFlag); + bool CanDoLowOnBuffersFlush() const + { return mCanDoLowOnBuffersFlushFlag; } + void LowOnBuffersFlush() + { FlushFullBlocks(); } + void UpdateFlushLimit(int flushLimit) + { + if (mBuffer.BytesConsumable() > flushLimit) { + FlushFullBlocks(); + } + } + int EventHandler(int code, void *data); + void DeleteChunk(); + bool Delete(); + int CheckParameters( + int64_t chunkVersion, uint32_t numServers, const string& servers, + int replicationPos, ServerLocation peerLoc, + const DiskIo::FilePtr& fileHandle, string& msg); + static bool ComputeChecksum( + kfsChunkId_t chunkId, int64_t chunkVersion, + int64_t& chunkSize, uint32_t& chunkChecksum); + void FatalError() + { die("AtomicRecordAppender internal error"); } + +private: + enum State + { + kStateNone = 0, + kStateOpen = 1, + kStateClosed = 2, + kStateReplicationFailed = 3, + kStateStable = 4, + kStateChunkLost = 5, + kStatePendingDelete = 6, + kNumStates + }; + struct WIdState + { + WIdState() + : mBytesReserved(0), + mLength(0), + mOffset(-1), + mSeq(0), + mAppendCount(0), + mStatus(0), + mReadOnlyFlag(false) + {} + size_t mBytesReserved; + size_t mLength; + int64_t mOffset; + kfsSeq_t mSeq; + uint64_t mAppendCount; + int mStatus; + bool mReadOnlyFlag; + }; + 
typedef map, + StdFastAllocator > + > WriteIdState; + typedef NetManager::Timer Timer; + + const int mReplicationPos; + const uint32_t mNumServers; + const kfsChunkId_t mChunkId; + const int64_t mChunkVersion; + // Bump file ref. count to prevent chunk manager from closing the file and + // unloading checksums. + DiskIo::FilePtr mChunkFileHandle; + const string mCommitAckServers; + const ServerLocation mPeerLocation; + // Protocol state. + State mState; + // The list of ops to be notified once finalize is + // finished (as in, all dirty data is flushed out to disk and + // the metadata is also pushed down). + MakeChunkStableOp* mMakeChunkStableOp; + MakeChunkStableOp* mLastMakeChunkStableOp; + BeginMakeChunkStableOp* mBeginMakeChunkStableOp; + BeginMakeChunkStableOp* mLastBeginMakeChunkStableOp; + // Timer + time_t mLastActivityTime; + time_t mLastAppendActivityTime; + time_t mLastFlushTime; + // when records are streamed in from clients, they are + // buffered by this object and then committed to disk. This + // field tracks the next offset in the file at which a record + // should be appended. + int64_t mNextOffset; + // Next committed append offset. + int64_t mNextCommitOffset; + int64_t mCommitOffsetAckSent; + // Disk write position. + int64_t mNextWriteOffset; + int64_t mMasterCommittedOffset; + // Disk write buffer. 
+ IOBuffer mBuffer; + int mBufFrontPadding; + int mIoOpsInFlight; + int mReplicationsInFlight; + size_t mBytesReserved; + uint64_t mAppendCommitCount; + uint32_t mChunkChecksum; + int64_t mChunkSize; // To report to meta server + bool mStaggerRMWInFlightFlag:1; + bool mRestartFlushFlag:1; + bool mFlushFullBlocksFlag:1; + bool mCanDoLowOnBuffersFlushFlag:1; + bool mMakeStableSucceededFlag:1; + const uint64_t mInstanceNum; + int mConsecutiveOutOfSpaceCount; + WriteIdState mWriteIdState; + Timer mTimer; + const RemoteSyncSMPtr mPeer; + RecordAppendOp* mReplicationList[1]; + AtomicRecordAppender* mPrevPtr[1]; + AtomicRecordAppender* mNextPtr[1]; + friend class QCDLListOp; + + ~AtomicRecordAppender(); + static inline time_t Now() + { return libkfsio::globalNetManager().Now(); } + void SetState(State state, bool notifyIfLostFlag = true); + const char* GetStateAsStr() const + { return GetStateAsStr(mState); } + const char* GetStateAsStr( + State state) const + { + return ((state < kNumStates && state >= 0) ? + sStateNames[state] : "invalid"); + } + int64_t GetChunkSize() const + { + const ChunkInfo_t* const info = gChunkManager.GetChunkInfo(mChunkId); + return (info ? info->chunkSize : -1); + } + bool IsChunkOpen() const + { + const ChunkInfo_t* const info = gChunkManager.GetChunkInfo(mChunkId); + return (info && (info->chunkBlockChecksum || info->chunkSize == 0)); + } + inline void SetCanDoLowOnBuffersFlushFlag(bool flag); + void UpdateMasterCommittedOffset(int64_t masterCommittedOffset); + void AppendCommit(RecordAppendOp *op); + // helper function that flushes the buffered data. the input + // argument specifies whether the flush on the buffered data + // should be aligned to checksum blocks. 
+ void FlushSelf(bool flushFullChecksumBlocks); + void FlushFullBlocks() + { FlushSelf(true); } + void FlushAll() + { FlushSelf(false); } + int GetNextReplicationTimeout() const; + void OpDone(WriteOp* op); + void OpDone(RecordAppendOp* op); + void OpDone(ReadOp* op); + bool DeleteIfNeeded() + { + if (mState == kStatePendingDelete) { + Delete(); + return true; + } + return false; + } + void CheckLeaseAndChunk(const char* prefix); + void MetaWriteDone(int status); + void MakeChunkStableDone(); + bool ComputeChecksum(); + void SubmitResponse(MakeChunkStableOp& op); + void SubmitResponse(BeginMakeChunkStableOp& op); + bool TryToCloseChunk(); + void TrimToLastCommit(const char* inMsgPtr); + void NotifyChunkClosed(); + void SendCommitAck(); + void IncAppendersWithWidCount() + { gAtomicRecordAppendManager.IncAppendersWithWidCount(); } + void DecAppendersWithWidCount() + { gAtomicRecordAppendManager.DecAppendersWithWidCount(); } + int GetCloseEmptyWidStateSec() + { + if (mConsecutiveOutOfSpaceCount > + gAtomicRecordAppendManager.GetCloseOutOfSpaceThreshold()) { + return gAtomicRecordAppendManager.GetCloseOutOfSpaceSec(); + } + return gAtomicRecordAppendManager.GetCloseEmptyWidStateSec(); + } + + template void SubmitResponse(OpT*& listHead, OpT*& listTail) + { + OpT* op = listHead; + listHead = 0; + listTail = 0; + while (op) { + OpT& cur = *op; + op = op->next; + SubmitResponse(cur); + } + } + template void PushBack( + OpT*& listHead, OpT*& listTail, OpT* op) + { + if (listHead) { + listTail->next = op; + while (listTail->next) { + listTail = listTail->next; + } + } else { + listHead = op; + listTail = op; + } + } + static string MakeCommitAckServers( + uint32_t numServers, string servers) + { + string ret; + istringstream is(servers); + for (uint32_t i = 0; is && i < numServers; ) { + string token; + is >> ws >> token; // Host + ret += token + " "; + is >> ws >> token; // Port + ret += token + (++i < numServers ? " -1 " : " -1"); // Write id. 
+ } + return ret; + } + static uint64_t sInstanceNum; + static const char* const sStateNames[kNumStates]; + static AtomicRecordAppendManager::Counters& Cntrs() + { return gAtomicRecordAppendManager.Cntrs(); } +private: + // No copy. + AtomicRecordAppender(const AtomicRecordAppender&); + AtomicRecordAppender& operator=(const AtomicRecordAppender&); +}; + +const char* const AtomicRecordAppender::sStateNames[kNumStates] = +{ + "none", + "open", + "closed", + "replication failed", + "stable", + "chunk lost", + "pending delete" +}; +uint64_t AtomicRecordAppender::sInstanceNum = 10000; + +inline void +AtomicRecordAppendManager::UpdatePendingFlush(AtomicRecordAppender& appender) +{ + if (appender.CanDoLowOnBuffersFlush()) { + if (! PendingFlushList::IsInList(mPendingFlushList, appender)) { + PendingFlushList::PushFront(mPendingFlushList, appender); + } + } else { + PendingFlushList::Remove(mPendingFlushList, appender); + } +} + +inline void +AtomicRecordAppendManager::Detach(AtomicRecordAppender& appender) +{ + const size_t cnt = mAppenders.erase(appender.GetChunkId()); + if (cnt != 1) { + WAPPEND_LOG_STREAM_FATAL << + "appender detach: " << (const void*)&appender << + " chunkId: " << appender.GetChunkId() << + " appenders count: " << mAppenders.size() << + KFS_LOG_EOM; + appender.FatalError(); + } + PendingFlushList::Remove(mPendingFlushList, appender); +} + +inline void +AtomicRecordAppendManager::DecOpenAppenderCount() +{ + if (mOpenAppendersCount > 0) { + mOpenAppendersCount--; + } +} + +inline void +AtomicRecordAppender::SetCanDoLowOnBuffersFlushFlag(bool flag) +{ + if (mCanDoLowOnBuffersFlushFlag != flag) { + mCanDoLowOnBuffersFlushFlag = flag; + gAtomicRecordAppendManager.UpdatePendingFlush(*this); + } +} + +AtomicRecordAppender::AtomicRecordAppender( + const DiskIo::FilePtr& chunkFileHandle, + kfsChunkId_t chunkId, + int64_t chunkVersion, + uint32_t numServers, + const string& servers, + ServerLocation peerLoc, + int replicationPos, + int64_t chunkSize) + : 
KfsCallbackObj(), + mReplicationPos(replicationPos), + mNumServers(numServers), + mChunkId(chunkId), + mChunkVersion(chunkVersion), + mChunkFileHandle(chunkFileHandle), + mCommitAckServers(MakeCommitAckServers(numServers, servers)), + mPeerLocation(peerLoc), + mState(kStateOpen), + mMakeChunkStableOp(0), + mLastMakeChunkStableOp(0), + mBeginMakeChunkStableOp(0), + mLastBeginMakeChunkStableOp(0), + mLastActivityTime(Now()), + mLastAppendActivityTime(mLastActivityTime), + mLastFlushTime(Now()), + mNextOffset(chunkSize), + mNextCommitOffset(chunkSize), + mCommitOffsetAckSent(mNextCommitOffset), + mNextWriteOffset(chunkSize), + mMasterCommittedOffset(-1), + mBuffer(), + mBufFrontPadding(0), + mIoOpsInFlight(0), + mReplicationsInFlight(0), + mBytesReserved(0), + mAppendCommitCount(0), + mChunkChecksum(0), + mChunkSize(-1), + mStaggerRMWInFlightFlag(false), + mRestartFlushFlag(false), + mFlushFullBlocksFlag(false), + mCanDoLowOnBuffersFlushFlag(false), + mMakeStableSucceededFlag(false), + mInstanceNum(++sInstanceNum), + mConsecutiveOutOfSpaceCount(0), + mWriteIdState(), + mTimer( + libkfsio::globalNetManager(), + *this, + gAtomicRecordAppendManager.GetCleanUpSec() + ), + mPeer(uint32_t(mReplicationPos + 1) < mNumServers ? + new RemoteSyncSM(mPeerLocation) : 0) +{ + assert( + chunkSize >= 0 && + mChunkFileHandle && mChunkFileHandle->IsOpen() && + IsChunkOpen() + ); + SET_HANDLER(this, &AtomicRecordAppender::EventHandler); + PendingFlushList::Init(*this); + AppendReplicationList::Init(mReplicationList); + mNextOffset = GetChunkSize(); + WAPPEND_LOG_STREAM_DEBUG << + "ctor" << + " chunk: " << mChunkId << + " offset: " << mNextOffset << + KFS_LOG_EOM; +} + +AtomicRecordAppender::~AtomicRecordAppender() +{ + assert( + mState == kStatePendingDelete && + mIoOpsInFlight == 0 && + mReplicationsInFlight == 0 && + mWriteIdState.empty() && + AppendReplicationList::IsEmpty(mReplicationList) && + ! 
gChunkManager.IsWriteAppenderOwns(mChunkId) + ); + WAPPEND_LOG_STREAM_DEBUG << + "dtor" << + " chunk: " << mChunkId << + " offset: " << mNextOffset << + KFS_LOG_EOM; + if (mPeer) { + mPeer->Finish(); + } + mState = kStateNone; // To catch double free; +} + +void +AtomicRecordAppender::SetState(State state, bool notifyIfLostFlag /* = true */) +{ + if (state == mState || mState == kStatePendingDelete) { + return; + } + const State prevState = mState; + const bool wasStableFlag = IsChunkStable(); + mState = state; + const bool nowStableFlag = IsChunkStable(); + if ((wasStableFlag && ! nowStableFlag) || + (mState == kStateReplicationFailed && prevState != kStateOpen)) { + // Presently transition from stable to open is not allowed. + WAPPEND_LOG_STREAM_FATAL << + " invalid state transition:" + " from: " << GetStateAsStr(prevState) << + " to: " << GetStateAsStr() << + " chunk: " << mChunkId << + " offset: " << mNextOffset << + " wid count: " << mWriteIdState.size() << + KFS_LOG_EOM; + FatalError(); + } + if (prevState == kStateOpen) { + gAtomicRecordAppendManager.DecOpenAppenderCount(); + } + if (wasStableFlag != nowStableFlag) { + gAtomicRecordAppendManager.UpdateAppenderFlushLimit(this); + } + if (mState == kStateStable || mState == kStateChunkLost) { + mTimer.SetTimeout(gAtomicRecordAppendManager.GetCleanUpSec()); + } + mMakeStableSucceededFlag = + mMakeStableSucceededFlag || mState == kStateStable; + if (nowStableFlag) { + mBuffer.Clear(); // no more ios. + } + if (mState == kStateStable) { + mChunkFileHandle.reset(); + } else if (mState == kStateChunkLost) { + if (notifyIfLostFlag) { + // Currently smart pointer copy is not strictly necessary. + // All ChunkIOFailed does is pointer comparison, thus the pointer + // does not have to be valid. The copy is more fool proof though. 
+ DiskIo::FilePtr const chunkFileHandle(mChunkFileHandle); + assert(nowStableFlag); + mChunkFileHandle.reset(); + gChunkManager.ChunkIOFailed(mChunkId, 0, chunkFileHandle.get()); + } + Cntrs().mLostChunkCount++; + } else if (mState == kStateReplicationFailed) { + TryToCloseChunk(); + } +} + +bool +AtomicRecordAppender::Delete() +{ + if (mState != kStatePendingDelete) { + if (int(mState) <= kStateNone || int(mState) >= kNumStates) { + // Invalid state, most likely double free. + FatalError(); + } + mTimer.RemoveTimeout(); + mBuffer.Clear(); + SetCanDoLowOnBuffersFlushFlag(false); + if (! mWriteIdState.empty()) { + DecAppendersWithWidCount(); + } + mWriteIdState.clear(); + gAtomicRecordAppendManager.Detach(*this); + SetState(kStatePendingDelete); + } + if (mIoOpsInFlight > 0 || mReplicationsInFlight > 0) { + return false; // wait for in flight ops to finish + } + delete this; + return true; +} + +int +AtomicRecordAppender::CheckParameters( + int64_t chunkVersion, uint32_t numServers, const string& servers, + int replicationPos, ServerLocation peerLoc, + const DiskIo::FilePtr& fileHandle, string& msg) +{ + int status = 0; + if (chunkVersion != mChunkVersion) { + msg = "invalid chunk version"; + status = kErrParameters; + } else if (mReplicationPos != replicationPos) { + status = kErrParameters; + msg = "invalid replication chain position"; + } else if (mPeerLocation != peerLoc) { + status = kErrParameters; + msg = "invalid replication chain peer: " + + peerLoc.ToString() + " expected: " + mPeerLocation.ToString(); + } else if (mNumServers != numServers) { + status = kErrParameters; + msg = "invalid replication factor"; + } else if (mState != kStateOpen) { + msg = GetStateAsStr(); + status = kErrProtocolState; + } else if (MakeCommitAckServers(numServers, servers) != + mCommitAckServers) { + status = kErrParameters; + msg = "invalid replication chain"; + } else if (fileHandle.get() != mChunkFileHandle.get()) { + status = kErrParameters; + msg = "invalid file handle"; + 
} + return status; +} + +void +AtomicRecordAppender::DeleteChunk() +{ + WAPPEND_LOG_STREAM_DEBUG << + "delete: " << + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " offset: " << mNextOffset << + " wid count: " << mWriteIdState.size() << + KFS_LOG_EOM; + if (mState == kStatePendingDelete) { + // Only AtomicRecordAppendManager calls this method. + // Pending delete shouldn't be in AtomicRecordAppendManager::mAppenders. + FatalError(); + } + // Prevent recursion: + // SetState(kStateChunkLost) => StaleChunk() => here + // make sure that StaleChunk() will not be invoked. + // Never invoke Delete() here. + SetState(kStateChunkLost, false); +} + +int +AtomicRecordAppender::EventHandler(int code, void* data) +{ + switch(code) { + case EVENT_INACTIVITY_TIMEOUT: + Timeout(); + break; + case EVENT_DISK_ERROR: + case EVENT_DISK_WROTE: + case EVENT_DISK_RENAME_DONE: { + const int status = data ? + (code == EVENT_DISK_RENAME_DONE ? + (int)*reinterpret_cast(data) : + *reinterpret_cast(data) + ) : -1; + MetaWriteDone( + (code == EVENT_DISK_ERROR && status > 0) ? -1 : status + ); + } + break; + case EVENT_CMD_DONE: { + KfsOp* const op = reinterpret_cast(data); + assert(op && op->clnt == this); + switch (op->op) { + case CMD_WRITE: + OpDone(static_cast(op)); + break; + case CMD_RECORD_APPEND: + OpDone(static_cast(op)); + break; + case CMD_READ: + OpDone(static_cast(op)); + break; + default: + WAPPEND_LOG_STREAM_FATAL << "unexpected op: " << op->Show() << + KFS_LOG_EOM; + FatalError(); + break; + } + } + break; + default: + WAPPEND_LOG_STREAM_FATAL << "unexpected event code: " << code << + KFS_LOG_EOM; + FatalError(); + break; + } + return 0; +} + +void +AtomicRecordAppender::CheckLeaseAndChunk(const char* prefix) +{ + if (! IsChunkStable() && + (! mChunkFileHandle || ! mChunkFileHandle->IsOpen())) { + WAPPEND_LOG_STREAM_ERROR << (prefix ? prefix : "") << + ": chunk manager discarded chunk: " << mChunkId << "?" 
<< + " state: " << GetStateAsStr() << + KFS_LOG_EOM; + SetState(kStateChunkLost); + } else if (mState == kStateOpen && IsMaster() && + ! gLeaseClerk.IsLeaseValid(mChunkId)) { + WAPPEND_LOG_STREAM_ERROR << (prefix ? prefix : "") << + ": write lease has expired, no further append allowed" << + " chunk: " << mChunkId << + KFS_LOG_EOM; + // Handle this exactly the same way as replication failure: trim to last + // commit, and relinquish the lease. + // Transitioning into closed won't relinquish the lease. Without + // explicit lease release it might stay in closed state until no + // activity timer goes off. Status inquiry is considered an + // activity: op status recovery cannot succeed because make chunk + // stable will not be issued until no activity timer goes off. + Cntrs().mLeaseExpiredCount++; + SetState(kStateReplicationFailed); + } +} + +void +AtomicRecordAppender::AllocateWriteId( + WriteIdAllocOp *op, int replicationPos, ServerLocation peerLoc, + const DiskIo::FilePtr& chunkFileHandle) +{ + mLastActivityTime = Now(); + if (! 
IsChunkStable() && chunkFileHandle != mChunkFileHandle) { + WAPPEND_LOG_STREAM_FATAL << + "invalid chunk file handle: " << + (const void*)chunkFileHandle.get() << + " / " << (const void*)mChunkFileHandle.get() << + KFS_LOG_EOM; + FatalError(); + } + CheckLeaseAndChunk("allocate write id"); + + int status = 0; + string msg; + if (op->chunkId != mChunkId) { + msg = "invalid chunk id"; + status = kErrParameters; + } else if (op->chunkVersion != mChunkVersion) { + msg = "invalid chunk version"; + status = kErrParameters; + } else if (mReplicationPos != replicationPos) { + status = kErrParameters; + msg = "invalid replication chain position"; + } else if (mPeerLocation != peerLoc) { + status = kErrParameters; + msg = "invalid replication chain peer: " + + peerLoc.ToString() + " expected: " + mPeerLocation.ToString(); + } else if (mNumServers != op->numServers) { + status = kErrParameters; + msg = "invalid replication factor"; + } else if (mState != kStateOpen) { + msg = GetStateAsStr(); + status = kErrProtocolState; + } else if (int(mWriteIdState.size()) >= + gAtomicRecordAppendManager.GetMaxWriteIdsPerChunk()) { + msg = "too many write ids"; + status = kErrOutOfSpace; + } else { + const bool waEmptyFlag = mWriteIdState.empty(); + pair res = mWriteIdState.insert( + make_pair(op->writeId, WIdState())); + if (! res.second) { + WAPPEND_LOG_STREAM_FATAL << + "allocate write id: duplicate write id: " << op->writeId << + KFS_LOG_EOM; + FatalError(); + } else { + res.first->second.mSeq = op->clientSeq; + if (waEmptyFlag) { + IncAppendersWithWidCount(); + } + op->appendPeer = mPeer; + } + } + op->status = status; + if (status != 0) { + op->statusMsg = msg; + } + WAPPEND_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "allocate write id: " << + (status != 0 ? 
msg : string("ok")) << + " state: " << GetStateAsStr() << + " chunk: " << mChunkId << + " wid count: " << mWriteIdState.size() << + " offset: " << mNextOffset << + " reserved: " << mBytesReserved << + " writeId: " << op->writeId << + " seq: " << op->seq << + " cli seq: " << op->clientSeq << + " status: " << status << + KFS_LOG_EOM; +} + +// Ideally space reservation should *not* use write id, but its own token +// "space reservation id" instead. Separate reservation token makes write append +// pipelining more efficient by reducing # of round trips required by the +// protocol: after single reservation multiple write append ops can be started. +// At the time of writing this type of request pipelining is not implemented and +// is not planned to be implemented in the near future by the client. +int +AtomicRecordAppender::ChangeChunkSpaceReservaton( + int64_t writeId, size_t nBytes, bool releaseFlag, string* errMsg) +{ + mLastActivityTime = Now(); + CheckLeaseAndChunk(releaseFlag ? "space reserve" : "space release"); + + int status = 0; + const char* msg = "ok"; + const size_t prevReserved = mBytesReserved; + WriteIdState::iterator it; + if (! IsMaster()) { + msg = "not master"; + status = kErrParameters; + } else if (mState != kStateOpen) { + msg = GetStateAsStr(); + status = kErrProtocolState; + } else if ((it = mWriteIdState.find(writeId)) == mWriteIdState.end()) { + if (! 
releaseFlag) { + msg = "invalid write id"; + status = kErrParameters; + } + } else if (releaseFlag) { + if (it->second.mReadOnlyFlag) { + if (it->second.mBytesReserved > 0) { + WAPPEND_LOG_STREAM_FATAL << + "invalid write id state: " << + it->second.mBytesReserved << + " bytes reserved in read only state" << + KFS_LOG_EOM; + FatalError(); + } + } else if (it->second.mBytesReserved >= (size_t)nBytes) { + it->second.mBytesReserved -= nBytes; + mBytesReserved -= nBytes; + } else { + mBytesReserved -= it->second.mBytesReserved; + it->second.mBytesReserved = 0; + } + } else if (it->second.mReadOnlyFlag) { + msg = "no appends allowed with this write id"; + status = kErrParameters; + } else { + if (mNextOffset + mBytesReserved + nBytes > int64_t(CHUNKSIZE)) { + msg = "out of space"; + status = kErrOutOfSpace; + mConsecutiveOutOfSpaceCount++; + } else { + mBytesReserved += nBytes; + it->second.mBytesReserved += nBytes; + mConsecutiveOutOfSpaceCount = 0; + } + } + if (errMsg) { + (*errMsg) += msg; + } + if (mBytesReserved <= 0) { + mTimer.ScheduleTimeoutNoLaterThanIn( + gAtomicRecordAppendManager.GetCleanUpSec()); + } + WAPPEND_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelINFO) << + (releaseFlag ? "release: " : "reserve: ") << msg << + " state: " << GetStateAsStr() << + " chunk: " << mChunkId << + " writeId: " << writeId << + " bytes: " << nBytes << + " offset: " << mNextOffset << + " reserved: " << mBytesReserved << + " delta: " << ssize_t(prevReserved - mBytesReserved) << + " status: " << status << + KFS_LOG_EOM; + return status; +} + +int +AtomicRecordAppender::InvalidateWriteId(int64_t writeId, bool declareFailureFlag) +{ + int status = 0; + WriteIdState::iterator const it = mWriteIdState.find(writeId); + if (it != mWriteIdState.end() && + it->second.mStatus == 0 && + it->second.mAppendCount == 0 && + ! it->second.mReadOnlyFlag) { + // Entry with no appends, clean it up. + // This is not orderly close, do not shorten close timeout. 
+ mWriteIdState.erase(it); + if (mWriteIdState.empty()) { + DecAppendersWithWidCount(); + } + if (declareFailureFlag && mState == kStateOpen) { + SetState(kStateReplicationFailed); + } + } else { + status = kErrParameters; + } + WAPPEND_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "invalidate write id" << + (declareFailureFlag ? " declare failure:" : ":") << + " wid: " << writeId << + " chunk: " << mChunkId << + " status: " << status << + " state: " << GetStateAsStr() << + " wids count: " << mWriteIdState.size() << + KFS_LOG_EOM; + return status; +} + +void +AtomicRecordAppender::UpdateMasterCommittedOffset(int64_t masterCommittedOffset) +{ + // Master piggy back its ack on the write append replication. + // The ack can lag because of replication request pipelining. The ack is + // used to determine op status if / when client has to use slave to perform + // "get op status" in the case when the client can not communicate with the + // master. + // This is needed only to reduce client's failure resolution latency. By + // comparing append end with this ack value it might be possible to + // determine the append op status. If the ack value is greater than the + // append end, then the append is successfully committed by all replication + // participants. + if (masterCommittedOffset >= mMasterCommittedOffset && + masterCommittedOffset <= mNextCommitOffset) { + mMasterCommittedOffset = masterCommittedOffset; + } else { + WAPPEND_LOG_STREAM_ERROR << + "out of window master committed" + " offset: " << masterCommittedOffset << + "[" << mMasterCommittedOffset << + "," << mNextCommitOffset << "]" << + KFS_LOG_EOM; + } +} + +void +AtomicRecordAppender::AppendBegin( + RecordAppendOp *op, int replicationPos, ServerLocation peerLoc) +{ + if (op->numBytes < size_t(op->dataBuf.BytesConsumable()) || + op->origClnt) { + WAPPEND_LOG_STREAM_FATAL << + "begin: short op buffer: " << + " req. 
size: " << op->numBytes << + " buffer: " << op->dataBuf.BytesConsumable() << + " or non null" + " orig client: " << op->origClnt << + " " << op->Show() << + KFS_LOG_EOM; + FatalError(); + } + mLastActivityTime = Now(); + CheckLeaseAndChunk("begin"); + + int status = 0; + string msg; + ClientSM* client = 0; + if (op->chunkId != mChunkId) { + status = kErrParameters; + msg = "invalid chunk id"; + } else if (mState != kStateOpen) { + msg = GetStateAsStr(); + status = kErrProtocolState; + } else if (mPeerLocation != peerLoc) { + status = kErrParameters; + msg = "invalid replication chain peer: " + + peerLoc.ToString() + " expected: " + mPeerLocation.ToString(); + } else if (mReplicationPos != replicationPos) { + status = kErrParameters; + msg = "invalid replication chain position"; + } else if (IsMaster() && op->fileOffset >= 0) { + status = kErrParameters; + msg = "protocol error: offset specified for master"; + } else if (! IsMaster() && op->fileOffset < 0) { + status = kErrParameters; + msg = "protocol error: offset not specified for slave"; + } else if (mNumServers != op->numServers) { + status = kErrParameters; + msg = "invalid replication factor"; + } else if (mNextOffset + op->numBytes > int64_t(CHUNKSIZE)) { + msg = "out of chunk space"; + status = kErrParameters; + } else if (IsMaster() && op->clnt != this && + (client = op->GetClientSM()) && + client->GetReservedSpace(mChunkId, op->writeId) < op->numBytes) { + status = kErrParameters; + msg = "out of client reserved space"; + } + if ((status != 0 || ! IsMaster()) && op->clnt == this) { + WAPPEND_LOG_STREAM_FATAL << + "begin: bad internal op: " << op->Show() << + KFS_LOG_EOM; + FatalError(); + } + + // Check if it is master 0 ack: no payload just commit offset. + const bool masterAckflag = status == 0 && op->numBytes == 0 && + op->writeId == -1 && + (IsMaster() ? (op->clnt == this) : (op->masterCommittedOffset >= 0)); + WriteIdState::iterator const widIt = (masterAckflag || status != 0) ? 
+ mWriteIdState.end() : mWriteIdState.find(op->writeId); + if (masterAckflag) { + if (IsMaster()) { + op->fileOffset = mNextOffset; + } else if (op->fileOffset != mNextOffset) { + // Out of order. + msg = "master 0 ack has invalid offset"; + status = kErrParameters; + SetState(kStateReplicationFailed); + } else { + UpdateMasterCommittedOffset(op->masterCommittedOffset); + } + } else if (status == 0) { + if (widIt == mWriteIdState.end()) { + status = kErrParameters; + msg = "invalid write id"; + } else { + WIdState& ws = widIt->second; + if (! IsMaster()) { + if (op->fileOffset != mNextOffset) { + // Out of order replication. + msg = "invalid append offset"; + status = kErrParameters; + SetState(kStateReplicationFailed); + } else { + UpdateMasterCommittedOffset(op->masterCommittedOffset); + if (ws.mStatus == kErrStatusInProgress && + mMasterCommittedOffset >= + ws.mOffset + int64_t(ws.mLength)) { + ws.mStatus = 0; // Master committed. + } + } + } + if (status != 0) { + // Failed already. + } if (ws.mReadOnlyFlag) { + status = kErrWidReadOnly; + msg = "no appends allowed with this write id"; + } else if (ws.mStatus != 0) { + // Force client to use multiple write ids to do request + // pipelining, and allocate new write id after a failure. + status = kErrParameters; + msg = ws.mStatus == kErrStatusInProgress ? + "has operation in flight" : + "invalid write id: previous append failed"; + } + } + if (status == 0) { + const uint32_t checksum = + ComputeBlockChecksum(&op->dataBuf, op->numBytes); + if (op->checksum != checksum) { + ostringstream os; + os << "checksum mismatch: " + " received: " << op->checksum << + " actual: " << checksum + ; + msg = os.str(); + status = kErrParameters; + Cntrs().mChecksumErrorCount++; + if (! 
IsMaster()) { + SetState(kStateReplicationFailed); + } + } + } + if (status == 0) { + assert(IsChunkOpen()); + if (IsMaster()) { + // only on the write master is space reserved + if (widIt->second.mBytesReserved < op->numBytes) { + status = kErrParameters; + msg = "write id out of reserved space"; + } else if (mBytesReserved < op->numBytes) { + msg = "out of reserved space"; + status = kErrParameters; + } else { + assert(mNextOffset + mBytesReserved <= int64_t(CHUNKSIZE)); + // Commit the execution. + assert( + widIt != mWriteIdState.end() && + widIt->second.mStatus == 0 + ); + op->fileOffset = mNextOffset; + mBytesReserved -= op->numBytes; + widIt->second.mBytesReserved -= op->numBytes; + mNextOffset += op->numBytes; + // Decrease space reservation for this client connection. + // ClientSM space un-reservation in case of the subsequent + // failures is not needed because these failures will at + // least prevent any further writes with this write id. + if (client) { + client->UseReservedSpace( + mChunkId, op->writeId, op->numBytes); + } + } + } else { + mNextOffset += op->numBytes; + } + } + } + // Empty appends (0 bytes) are always forwarded. + // This is used by propagate master commit ack, and potentially can be used + // for replication health check. + if (status == 0 && mPeer) { + op->origSeq = op->seq; + op->origClnt = op->clnt; + op->clnt = this; + } + if (status == 0 && ! masterAckflag) { + // Write id table is updated only in the case when execution is + // committed. Otherwise the op is discarded, and treated like + // it was never received. + assert(widIt != mWriteIdState.end() && widIt->second.mStatus == 0); + WIdState& ws = widIt->second; + ws.mStatus = kErrStatusInProgress; + ws.mLength = op->numBytes; + ws.mOffset = op->fileOffset; + ws.mSeq = op->clientSeq; + + // Move blocks into the internal buffer. + // The main reason to do this now, and not to wait for the replication + // completion is to save io buffer space. 
Io buffers can be reclaimed + // immediately after the data goes onto disk, and on the wire. Another + // way to look at this: socket buffer space becomes an extension to + // write appender io buffer space. + // The price is "undoing" writes, which might be necessary in the case of + // replication failure. Undoing writes is a simple truncate, and the the + // failures aren't expected to be frequent enough to matter. + const int prevNumBytes = mBuffer.BytesConsumable(); + // Always try to append to the last buffer. + // Flush() keeps track of the write offset and "slides" buffers + // accordingly. + if (op->numBytes > 0) { + assert(mBufFrontPadding == 0 || mBuffer.IsEmpty()); + IOBuffer dataBuf; + if (op->origClnt) { + // Copy buffer before moving data into appender's write buffer. + // Enqueue for replication at the end. + // Replication completion invokes AppendCommit(). + dataBuf.Copy(&op->dataBuf, op->numBytes); + } + mBuffer.ReplaceKeepBuffersFull( + op->origClnt ? &dataBuf : &op->dataBuf, + mBuffer.BytesConsumable() + mBufFrontPadding, + op->numBytes + ); + if (mBufFrontPadding > 0) { + mBuffer.Consume(mBufFrontPadding); + mBufFrontPadding = 0; + assert(! mBuffer.IsEmpty()); + } + } + // Do space accounting and flush if needed. + if (mBuffer.BytesConsumable() >= + gAtomicRecordAppendManager.GetFlushLimit(*this, + mBuffer.BytesConsumable() - prevNumBytes)) { + // Align the flush to checksum boundaries. + FlushFullBlocks(); + } else { + if (! mBuffer.IsEmpty()) { + mTimer.ScheduleTimeoutNoLaterThanIn( + gAtomicRecordAppendManager.GetFlushIntervalSec()); + } + SetCanDoLowOnBuffersFlushFlag(! mBuffer.IsEmpty()); + } + } + op->status = status; + if (status != 0) { + op->statusMsg = msg; + } + WAPPEND_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "begin: " << msg << + (masterAckflag ? 
" master ack" : "") << + " state: " << GetStateAsStr() << + " reserved: " << mBytesReserved << + " offset: next: " << mNextOffset << + " commit: " << mNextCommitOffset << + " master: " << mMasterCommittedOffset << + " in flight:" + " replicaton: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + " status: " << status << + " " << op->Show() << + KFS_LOG_EOM; + mReplicationsInFlight++; + op->replicationStartTime = Now(); + AppendReplicationList::PushBack(mReplicationList, *op); + if (op->origClnt || ! mPeer) { + mLastAppendActivityTime = Now(); + } + if (op->origClnt) { + assert(status == 0); + if (IsMaster()) { + op->masterCommittedOffset = mNextCommitOffset; + mCommitOffsetAckSent = mNextCommitOffset; + } + if (mReplicationsInFlight == 1) { + mTimer.ScheduleTimeoutNoLaterThanIn( + gAtomicRecordAppendManager.GetReplicationTimeoutSec()); + } + mPeer->Enqueue(op); + } else { + OpDone(op); + } +} + +int +AtomicRecordAppender::GetNextReplicationTimeout() const +{ + if (mReplicationsInFlight <= 0 || mState != kStateOpen) { + return -1; + } + const int timeout = gAtomicRecordAppendManager.GetReplicationTimeoutSec(); + if (timeout < 0) { + return -1; + } + const time_t now = Now(); + const RecordAppendOp* const op = + AppendReplicationList::Front(mReplicationList); + assert(op); + const time_t end = op->replicationStartTime + timeout; + return (now < end ? end - now : 0); +} + +void +AtomicRecordAppender::OpDone(RecordAppendOp* op) +{ + assert( + mReplicationsInFlight > 0 && + AppendReplicationList::IsInList(mReplicationList, *op) + ); + mReplicationsInFlight--; + AppendReplicationList::Remove(mReplicationList, *op); + if (mReplicationsInFlight > 0 && mState == kStateOpen) { + mTimer.ScheduleTimeoutNoLaterThanIn(GetNextReplicationTimeout()); + } + // Do not commit malformed client requests. + const bool commitFlag = ! 
IsMaster() || op->origClnt || op->status == 0;
    if (op->origClnt) {
        // Restore the original client/sequence saved in AppendBegin before
        // the op was forwarded down the replication chain.
        op->seq  = op->origSeq;
        op->clnt = op->origClnt;
        op->origClnt = 0;
    }
    if (commitFlag) {
        AppendCommit(op);
    }
    // Delete if commit ack.
    const bool deleteOpFlag = op->clnt == this;
    if (deleteOpFlag) {
        delete op;
    }
    DeleteIfNeeded();
    if (! deleteOpFlag) {
        // Externally submitted op: update counters and send the response.
        Cntrs().mAppendCount++;
        if (op->status >= 0) {
            Cntrs().mAppendByteCount += op->numBytes;
        } else {
            Cntrs().mAppendErrorCount++;
            if (mState == kStateReplicationFailed) {
                Cntrs().mReplicationErrorCount++;
            }
        }
        KFS::SubmitOpResponse(op);
    }
}

// Second phase of an append: commits the op whose buffer was (tentatively)
// appended in AppendBegin, after replication (if any) has completed.
// Advances mNextCommitOffset and the write id state on success; declares the
// replication failed (kStateReplicationFailed) when the forwarded op came
// back with a non zero status. Calls FatalError() on protocol violations
// (out of order commit, chunk id/version mismatch).
void
AtomicRecordAppender::AppendCommit(RecordAppendOp *op)
{
    mLastActivityTime = Now();
    if (mState != kStateOpen) {
        // Appender already closed: the outcome of this append is unknown to
        // the client until it runs status inquiry.
        op->status = kErrStatusInProgress; // Don't know
        op->statusMsg = "closed for append; op status undefined; state: ";
        op->statusMsg += GetStateAsStr();
        WAPPEND_LOG_STREAM_ERROR <<
            "commit: " << op->statusMsg <<
            " status: " << op->status <<
            " " << op->Show() <<
        KFS_LOG_EOM;
        return;
    }
    // Always declare failure here if op status is non zero.
    // Theoretically it is possible to recover from errors such "write id is
    // read only", *by moving buffer append here* in AppendCommit, from
    // AppendBegin. In such case the protocol ensures that no partial
    // replication can succeed if an error status is *received*.
    // The problem is that there might be more than one replications in flight,
    // and waiting for all replications that are currently in flight to fail is
    // required to successfully recover.
    // On the other hand the price for declaring a failure is only partial
    // (non full) chunk.
    // For now assume that the replication failures are infrequent enough to
    // have any significant effect on the chunk size, and more efficient use of
    // io buffer space is more important (see comment in AppendBegin).
    if (op->status != 0) {
        op->statusMsg += " op (forwarding) failed, op status undefined"
            "; state: ";
        op->statusMsg += GetStateAsStr();
        WAPPEND_LOG_STREAM_ERROR <<
            "commit: " << op->statusMsg <<
            " status: " << op->status <<
            " reserved: " << mBytesReserved <<
            " offset: " << mNextCommitOffset <<
            " nextOffset: " << mNextOffset <<
            " " << op->Show() <<
        KFS_LOG_EOM;
        op->status = kErrStatusInProgress; // Don't know
        SetState(kStateReplicationFailed);
        return;
    }
    // AppendBegin checks if write id is read only.
    // If write id wasn't read only in the append begin, it cannot transition
    // into into read only between AppendBegin and AppendCommit, as it should
    // transition into "in progress" in the AppendBegin, and stay "in progress"
    // at least until here.
    // If the op is internally generated 0 ack verify that it has no payload.
    WriteIdState::iterator const widIt = op->clnt == this ?
        mWriteIdState.end() : mWriteIdState.find(op->writeId);
    if (op->fileOffset != mNextCommitOffset ||
            op->chunkId != mChunkId ||
            op->chunkVersion != mChunkVersion ||
            (widIt == mWriteIdState.end() ?
                ((IsMaster() ? (op->clnt != this) :
                    (op->masterCommittedOffset < 0)) ||
                    op->numBytes != 0 || op->writeId != -1) :
                widIt->second.mStatus != kErrStatusInProgress)) {
        WAPPEND_LOG_STREAM_FATAL <<
            "commit: out of order or invalid op" <<
            " chunk: " << mChunkId <<
            " chunkVersion: " << mChunkVersion <<
            " reserved: " << mBytesReserved <<
            " offset: " << mNextCommitOffset <<
            " nextOffset: " << mNextOffset <<
            " " << op->Show() <<
        KFS_LOG_EOM;
        FatalError();
        return;
    }
    // Do not pay attention to the lease expiration here.
    // If lease has expired, but no make stable was received, then
    // commit the append anyway.
    op->status = 0;
    if (widIt != mWriteIdState.end()) {
        // Real (non master-ack) append: advance the commit offset and the
        // per write id append count.
        mNextCommitOffset += op->numBytes;
        mAppendCommitCount++;
        widIt->second.mAppendCount++;
    }
    if (IsMaster()) {
        // Only write master can declare a success, he is the last to commit,
        // and only in the case if all slaves committed.
        if (widIt != mWriteIdState.end()) {
            widIt->second.mStatus = 0;
        }
        // Schedule to send commit ack.
        if (mNumServers > 1 && mReplicationsInFlight <= 0 &&
                mNextCommitOffset > mCommitOffsetAckSent) {
            mTimer.ScheduleTimeoutNoLaterThanIn(
                gAtomicRecordAppendManager.GetSendCommitAckTimeoutSec());
        }
    }
    WAPPEND_LOG_STREAM_DEBUG <<
        "commit:"
        " state: " << GetStateAsStr() <<
        " reserved: " << mBytesReserved <<
        " offset: next: " << mNextOffset <<
        " commit: " << mNextCommitOffset <<
        " master: " << mMasterCommittedOffset <<
        " in flight:"
        " replicaton: " << mReplicationsInFlight <<
        " ios: " << mIoOpsInFlight <<
        " status: " << op->status <<
        " " << op->Show() <<
    KFS_LOG_EOM;
}

void
AtomicRecordAppender::GetOpStatus(GetRecordAppendOpStatus* op)
{
    mLastActivityTime = Now();

    int status = 0;
    const char* msg = "ok";
    if (op->chunkId != mChunkId) {
        msg    = "invalid chunk id";
        status = kErrParameters;
    } else {
        WriteIdState::iterator const widIt = mWriteIdState.find(op->writeId);
        if (widIt == mWriteIdState.end()) {
            msg    = "no such write id";
            status = kErrNotFound;
        } else {
            WIdState& ws = widIt->second;
            assert(
                ws.mBytesReserved == 0 ||
                (IsMaster() && ! ws.mReadOnlyFlag &&
                mBytesReserved >= ws.mBytesReserved)
            );
            if (ws.mStatus == kErrStatusInProgress) {
                const int64_t end = ws.mOffset + ws.mLength;
                if (mMakeStableSucceededFlag) {
                    ws.mStatus = mChunkSize >= end ? 0 : kErrFailedState;
                } else if (!
IsMaster() && mMasterCommittedOffset >= end) { + ws.mStatus = 0; + } + if (ws.mStatus != kErrStatusInProgress) { + WAPPEND_LOG_STREAM_DEBUG << + "get op status:" + " changed status from in progress" + " to: " << ws.mStatus << + " chunk: " << mChunkId << + " writeId: " << op->writeId << + " op end: " << (ws.mOffset + ws.mLength) << + " master committed: " << mMasterCommittedOffset << + " state: " << GetStateAsStr() << + " chunk size: " << mChunkSize << + KFS_LOG_EOM; + } + } + op->opStatus = ws.mStatus; + op->opLength = ws.mLength; + op->opOffset = ws.mOffset; + op->opSeq = ws.mSeq; + op->widAppendCount = ws.mAppendCount; + op->widBytesReserved = ws.mBytesReserved; + op->widWasReadOnlyFlag = ws.mReadOnlyFlag; + + op->chunkVersion = mChunkVersion; + op->chunkBytesReserved = mBytesReserved; + op->remainingLeaseTime = IsMaster() ? + gLeaseClerk.GetLeaseExpireTime(mChunkId) - Now() : -1; + op->masterFlag = IsMaster(); + op->stableFlag = mState == kStateStable; + op->appenderState = mState; + op->appenderStateStr = GetStateAsStr(); + op->openForAppendFlag = mState == kStateOpen; + op->masterCommitOffset = mMasterCommittedOffset; + op->nextCommitOffset = mNextCommitOffset; + + // The status inquiry always makes write id read only, and + // un-reserves the space, thus disallowing further appends with this + // write id. This guarantees that after status inquiry returns, no + // other [new, or stale] appends with this write id can succeed. + // + // For now this the behaviour is the same for all replication + // participants. + // This speeds up recovery for the clients that can not communicate + // with the replication master. + // The price for this is lower resistance to "DoS", where status + // inquiry with any replication slave when append replication is + // still in flight but haven't reached the slave can make chunks + // less full. 
+ if (mBytesReserved >= ws.mBytesReserved) { + mBytesReserved -= ws.mBytesReserved; + } else { + mBytesReserved = 0; + } + ws.mBytesReserved = 0; + ws.mReadOnlyFlag = true; + op->widReadOnlyFlag = ws.mReadOnlyFlag; + } + } + op->status = status; + if (status != 0) { + op->statusMsg = msg; + } + WAPPEND_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelINFO) << + "get op status: " << msg << + " state: " << GetStateAsStr() << + " chunk: " << mChunkId << + " wid count: " << mWriteIdState.size() << + " offset: " << mNextOffset << + " reserved: " << mBytesReserved << + " writeId: " << op->writeId << + " status: " << status << + " master comitted: " << mMasterCommittedOffset << + " " << op->Show() << + KFS_LOG_EOM; +} + +void +AtomicRecordAppender::CloseChunk(CloseOp* op, int64_t writeId, bool& forwardFlag) +{ + mLastActivityTime = Now(); + + int status = 0; + const char* msg = "ok"; + if (op->chunkId != mChunkId) { + msg = "invalid chunk id"; + status = kErrParameters; + } else if (op->hasWriteId) { + WriteIdState::iterator const widIt = mWriteIdState.find(writeId); + if (widIt == mWriteIdState.end()) { + msg = "no such write id"; + status = kErrNotFound; + } else { + if (! IsMaster()) { + // Update master commit offset, and last op status if needed, + // and possible. + if (op->masterCommitted >= 0) { + UpdateMasterCommittedOffset(op->masterCommitted); + } + WIdState& ws = widIt->second; + if (ws.mStatus == kErrStatusInProgress) { + const int64_t end = ws.mOffset + ws.mLength; + if ((mMakeStableSucceededFlag ? 
+ mChunkSize : mMasterCommittedOffset) >= end) { + ws.mStatus = 0; + } + } + } + if (widIt->second.mStatus == kErrStatusInProgress) { + msg = "write id has op in flight"; + status = kErrStatusInProgress; + } else if (widIt->second.mReadOnlyFlag) { + msg = "write id is read only"; + status = kErrParameters; + } else if (widIt->second.mStatus != 0) { + msg = "append failed with this write id"; + status = kErrParameters; + } else { + // The entry is in good state, and the client indicates that he + // does not intend to use this write id for any purpose in the + // future. Reclaim the reserved space, and discard the entry. + if (mBytesReserved >= widIt->second.mBytesReserved) { + mBytesReserved -= widIt->second.mBytesReserved; + } else { + mBytesReserved = 0; + } + mWriteIdState.erase(widIt); + if (mWriteIdState.empty()) { + DecAppendersWithWidCount(); + } + if (IsMaster() && mState == kStateOpen && + mWriteIdState.empty()) { + // For orderly close case shorten close timeout. + mTimer.ScheduleTimeoutNoLaterThanIn(Timer::MinTimeout( + gAtomicRecordAppendManager.GetCleanUpSec(), + GetCloseEmptyWidStateSec() + )); + } + } + } + } + forwardFlag = forwardFlag && status == 0; + op->status = status; + if (status != 0) { + op->statusMsg = msg; + } + if (IsMaster()) { + // Always send commit offset, it might be need to transition write id + // into "good" state. + // Write ids are deleted only if these are in good state, mostly for + // extra "safety", and to simplify debugging. + // For now do not update the acked: close is "no reply", it can be + // simply dropped. + op->masterCommitted = (forwardFlag && op->hasWriteId) ? + mNextCommitOffset : -1; + } + if (forwardFlag || ! mPeer) { + mLastAppendActivityTime = Now(); + } + if (forwardFlag && mPeer) { + forwardFlag = false; + CloseOp* const fwdOp = new CloseOp(0, op); + fwdOp->needAck = false; + SET_HANDLER(fwdOp, &CloseOp::HandlePeerReply); + mPeer->Enqueue(fwdOp); + } + WAPPEND_LOG_STREAM(status == 0 ? 
    MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) <<
        "close chunk status: " << msg <<
        " state: " << GetStateAsStr() <<
        " chunk: " << mChunkId <<
        " wid count: " << mWriteIdState.size() <<
        " offset: " << mNextOffset <<
        " reserved: " << mBytesReserved <<
        " writeId: " << writeId <<
        " status: " << status <<
        " " << op->Show() <<
    KFS_LOG_EOM;
}

// Recomputes this appender's whole chunk checksum and size from the chunk
// manager's per block checksums. On failure resets the cached checksum/size
// and declares the chunk lost.
bool
AtomicRecordAppender::ComputeChecksum()
{
    const bool ok = ComputeChecksum(
        mChunkId, mChunkVersion, mChunkSize, mChunkChecksum);
    if (! ok) {
        mChunkChecksum = 0;
        mChunkSize = -1;
        SetState(kStateChunkLost);
    }
    return ok;
}

// Computes a single checksum over all of a chunk's block checksums, and
// returns the chunk size, by consulting the chunk manager.
// Returns false when the chunk info is unavailable, the block checksums are
// missing for a non empty chunk, or the version does not match; in that case
// the out parameters are not both set.
bool
AtomicRecordAppender::ComputeChecksum(
    kfsChunkId_t chunkId, int64_t chunkVersion,
    int64_t& chunkSize, uint32_t& chunkChecksum)
{
    const ChunkInfo_t* const info = gChunkManager.GetChunkInfo(chunkId);
    if (! info ||
            (! info->chunkBlockChecksum && info->chunkSize != 0) ||
            chunkVersion != info->chunkVersion) {
        return false;
    }
    chunkSize = info->chunkSize;
    const uint32_t* checksums = info->chunkBlockChecksum;
    // Print it as text, to make byte order independent.
    ostringstream os;
    for (int64_t i = 0; i < chunkSize; i += CHECKSUM_BLOCKSIZE) {
        os << *checksums++;
    }
    const string str = os.str();
    chunkChecksum = ComputeBlockChecksum(str.c_str(), str.length());
    return true;
}

void
AtomicRecordAppender::SubmitResponse(BeginMakeChunkStableOp& op)
{
    mLastActivityTime = Now();
    if (op.status == 0) {
        op.status = mState == kStateClosed ? 0 : kErrFailedState;
        if (op.status != 0) {
            op.statusMsg = "record append failure; state: ";
            op.statusMsg += GetStateAsStr();
        } else {
            if (mChunkSize < 0) {
                WAPPEND_LOG_STREAM_FATAL <<
                    "begin make stable response: "
                    " chunk: " << mChunkId <<
                    " invalid size: " << mChunkSize <<
                KFS_LOG_EOM;
                FatalError();
            }
            op.status        = 0;
            op.chunkSize     = mChunkSize;
            op.chunkChecksum = mChunkChecksum;
        }
    }
    WAPPEND_LOG_STREAM(op.status == 0 ?
+ MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "begin make stable done: " << op.statusMsg << + " " << op.Show() << + " size: " << mChunkSize << + " checksum: " << mChunkChecksum << + " state: " << GetStateAsStr() << + " wid count: " << mWriteIdState.size() << + KFS_LOG_EOM; + Cntrs().mBeginMakeStableCount++; + if (op.status < 0) { + Cntrs().mBeginMakeStableErrorCount++; + } + KFS::SubmitOpResponse(&op); +} + +void +AtomicRecordAppender::BeginMakeStable( + BeginMakeChunkStableOp* op /* = 0 */) +{ + WAPPEND_LOG_STREAM_DEBUG << + "begin make stable: " + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " wid count: " << mWriteIdState.size() << + " offset: " << mNextOffset << + " reserved: " << mBytesReserved << + " in flight:" + " replicaton: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + " " << (op ? op->Show() : string("no op")) << + KFS_LOG_EOM; + + mLastActivityTime = Now(); + if (op) { + if (mChunkVersion != op->chunkVersion) { + op->statusMsg = "invalid chunk version"; + op->status = kErrParameters; + SubmitResponse(*op); + return; + } + op->status = 0; + } + // Only meta server issues this command, when it decides that + // the append master is not operational. + PushBack(mBeginMakeChunkStableOp, mLastBeginMakeChunkStableOp, op); + if (mMakeChunkStableOp) { + return; // Wait for make stable to finish, it will send the reply. + } + if (mState == kStateOpen || mState == kStateReplicationFailed) { + SetState(kStateClosed); + } + if (mState == kStateClosed) { + FlushAll(); + if (mNextCommitOffset < mNextWriteOffset) { + // Always truncate to the last commit offset. + // This is need to properly handle status inquiry that makes write + // id read only (prevents further ops with this write id to + // succeed). 
// If the append is in flight and status inquiry op
            // reaches this node first, then this will guarantee that no
            // participant will commit the in flight append, as it will never
            // receive the append (down the replicaton chain), or replication
            // succeeded status (up the replicaton chain).
            // If the status inquiry op comes in after the append op, then
            // the last op status will be "in progress" (undefined), or already
            // known.
            TrimToLastCommit("begin make stable");
            return;
        }
    }
    if (mState == kStateClosed && mIoOpsInFlight > 0) {
        return; // Completion will be invoked later.
    }
    if (mState == kStateClosed && mChunkSize < 0) {
        ComputeChecksum();
    }
    SubmitResponse(mBeginMakeChunkStableOp, mLastBeginMakeChunkStableOp);
}

// The master keeps the write lease until the chunk becomes stable; slaves
// never ask to keep it.
bool
AtomicRecordAppender::WantsToKeepLease() const
{
    return (IsMaster() && ! IsChunkStable());
}

void
AtomicRecordAppender::Timeout()
{
    if (DeleteIfNeeded()) {
        return;
    }
    int nextTimeout = -1; // infinite
    const int flushInterval =
        gAtomicRecordAppendManager.GetFlushIntervalSec();
    // Slaves keep chunk open longer, waiting for [begin] make stable from meta.
    // Ideally the slave timeout should come from master.
    // For now assume that the master and slave have the same value.
    const int kSlaveTimeoutRatio  = 4;
    const int kMasterMaxIdleRatio = 2;
    const int cleanupSec = gAtomicRecordAppendManager.GetCleanUpSec();
    const int cleanupTimeout = cleanupSec *
        ((IsMaster() || mState != kStateOpen) ? 1 : kSlaveTimeoutRatio);
    const time_t now = Now();
    if (mBuffer.BytesConsumable() >=
            gAtomicRecordAppendManager.GetFlushLimit(*this) ||
            (flushInterval >= 0 && mLastFlushTime + flushInterval <= now)) {
        FlushFullBlocks();
    } else if (!
mBuffer.IsEmpty() && flushInterval >= 0) { + nextTimeout = Timer::MinTimeout(nextTimeout, + int(mLastFlushTime + flushInterval - now)); + } + if (IsMaster() && mState == kStateOpen) { + const int ackTm = + gAtomicRecordAppendManager.GetSendCommitAckTimeoutSec(); + if (ackTm > 0 && + mNumServers > 1 && mReplicationsInFlight <= 0 && + mNextCommitOffset > mCommitOffsetAckSent) { + if (mLastActivityTime + ackTm <= now) { + SendCommitAck(); + } else { + nextTimeout = Timer::MinTimeout(nextTimeout, + int(mLastActivityTime + ackTm - now)); + } + } + // If no activity, and no reservations, then master closes the chunk. + const int closeTimeout = mWriteIdState.empty() ? Timer::MinTimeout( + cleanupTimeout, + GetCloseEmptyWidStateSec() + ) : cleanupTimeout; + if (closeTimeout >= 0 && mState == kStateOpen && + (mBytesReserved <= 0 || (cleanupSec >= 0 && + mLastAppendActivityTime + + min(int64_t(cleanupSec) * kMasterMaxIdleRatio, + int64_t(cleanupSec) * (int64_t)CHUNKSIZE / + max(int64_t(1), mNextWriteOffset)) <= now))) { + if (mLastAppendActivityTime + closeTimeout <= now) { + if (! TryToCloseChunk()) { + // TryToCloseChunk hasn't scheduled new activity, most + // likely are replications in flight. If this is the case + // then the cleanup timeout is too short, just retry in 3 + // sec. + // To avoid this "busy wait" with short timeout an + // additional state is needed in the state machine. + assert(mReplicationsInFlight > 0); + if (mLastAppendActivityTime + closeTimeout <= now) { + nextTimeout = Timer::MinTimeout(nextTimeout, 3); + } + } + } else { + nextTimeout = Timer::MinTimeout(nextTimeout, + int(mLastAppendActivityTime + closeTimeout - now)); + } + } + } else if (cleanupTimeout >= 0 && + mIoOpsInFlight <= 0 && mReplicationsInFlight <= 0 && + mLastActivityTime + cleanupTimeout <= now) { + time_t metaUptime; + int minMetaUptime; + if (! 
mMakeStableSucceededFlag && mState != kStateChunkLost && + (metaUptime = gMetaServerSM.ConnectionUptime()) < + (minMetaUptime = max( + cleanupTimeout, + gAtomicRecordAppendManager.GetMetaMinUptimeSec() + ))) { + WAPPEND_LOG_STREAM_INFO << "timeout:" + " short meta connection uptime;" + " required uptime: " << minMetaUptime << " sec." + " last activty: " << (now - mLastActivityTime) << " ago" << + " new last activity: " << metaUptime << " ago" << + KFS_LOG_EOM; + mLastActivityTime = now - max(time_t(0), metaUptime); + } else { + WAPPEND_LOG_STREAM(mMakeStableSucceededFlag ? + MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + "timeout: deleting write appender" + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " size: " << mNextWriteOffset << " / " << mChunkSize << + " wid count: " << mWriteIdState.size() << + KFS_LOG_EOM; + if (! mMakeStableSucceededFlag) { + Cntrs().mTimeoutLostCount++; + } + if (mState != kStateStable) { + SetState(kStateChunkLost); + } + Delete(); + return; + } + } + nextTimeout = Timer::MinTimeout(nextTimeout, + mLastActivityTime + cleanupTimeout > now ? 
+ int(mLastActivityTime + cleanupTimeout - now) : cleanupTimeout); + if (mState == kStateOpen && mReplicationsInFlight > 0) { + const int replicationTimeout = GetNextReplicationTimeout(); + if (replicationTimeout == 0) { + const RecordAppendOp* const op = + AppendReplicationList::Front(mReplicationList); + WAPPEND_LOG_STREAM_ERROR << + "replication timeout:" + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " optime: " << (now - op->replicationStartTime) << + " cmd: " << op->Show() << + KFS_LOG_EOM; + Cntrs().mReplicationTimeoutCount++; + SetState(kStateReplicationFailed); + } else { + nextTimeout = Timer::MinTimeout(replicationTimeout, nextTimeout); + } + } + WAPPEND_LOG_STREAM_DEBUG << + "set timeout:" + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " timeout: " << nextTimeout << + KFS_LOG_EOM; + mTimer.SetTimeout(nextTimeout); +} + +void +AtomicRecordAppender::SubmitResponse(MakeChunkStableOp& op) +{ + op.status = mState == kStateStable ? 0 : kErrFailedState; + if (op.status != 0) { + if (! op.statusMsg.empty()) { + op.statusMsg += " "; + } + op.statusMsg += "record append failure; state: "; + op.statusMsg += GetStateAsStr(); + } + WAPPEND_LOG_STREAM(op.status == 0 ? 
    MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) <<
        "make chunk stable done:"
        " chunk: " << mChunkId <<
        " state: " << GetStateAsStr() <<
        " size: " << mNextWriteOffset << " / " << mChunkSize <<
        " checksum: " << mChunkChecksum <<
        " in flight:"
        " replication: " << mReplicationsInFlight <<
        " ios: " << mIoOpsInFlight <<
        " " << op.Show() <<
    KFS_LOG_EOM;
    if (op.clnt == this) {
        // Internally generated op: no response to send, just dispose of it.
        delete &op;
    } else {
        Cntrs().mMakeStableCount++;
        if (op.status != 0) {
            Cntrs().mMakeStableErrorCount++;
        }
        if (mChunkSize >= 0) {
            // Record size/checksum disagreements with the meta server's view.
            if (mChunkSize < op.chunkSize && op.chunkSize >= 0) {
                Cntrs().mMakeStableLengthErrorCount++;
            }
            if (op.hasChecksum && mChunkChecksum != op.chunkChecksum) {
                Cntrs().mMakeStableChecksumErrorCount++;
            }
        }
        KFS::SubmitOpResponse(&op);
    }
}

// Invoked when make-chunk-stable processing completes: responds to all
// queued begin-make-stable and make-stable ops.
void
AtomicRecordAppender::MakeChunkStableDone()
{
    mLastActivityTime = Now();
    SubmitResponse(mBeginMakeChunkStableOp, mLastBeginMakeChunkStableOp);
    SubmitResponse(mMakeChunkStableOp, mLastMakeChunkStableOp);
}

void
AtomicRecordAppender::OpDone(ReadOp* op)
{
    // Only read to truncate the chunk should ever get here, and it should be
    // the only one op in flight.
    if (! op ||
        !
mMakeChunkStableOp || + mIoOpsInFlight != 1 || + (mState != kStateClosed && + mState != kStatePendingDelete && + mState != kStateChunkLost) || + (mMakeChunkStableOp->chunkSize >= 0 && + mMakeChunkStableOp->chunkSize != + op->offset + int64_t(op->numBytes)) || + op->offset < 0 || + op->numBytes <= 0 || + op->offset % CHECKSUM_BLOCKSIZE != 0 || + op->numBytes >= CHECKSUM_BLOCKSIZE) { + WAPPEND_LOG_STREAM_FATAL << + "make chunk stable read:" << + " internal error" + " chunk: " << mChunkId << + " read op: " << (const void*)op << + " make stable op: " << (const void*)mMakeChunkStableOp << + " in flight:" + " replication: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + KFS_LOG_EOM; + FatalError(); + } + mIoOpsInFlight--; + if (DeleteIfNeeded()) { + delete op; + return; + } + if (op->status >= 0 && ssize_t(op->numBytes) == op->numBytesIO) { + ChunkInfo_t* const info = gChunkManager.GetChunkInfo(mChunkId); + if (! info || (! info->chunkBlockChecksum && info->chunkSize != 0)) { + WAPPEND_LOG_STREAM_FATAL << + "make chunk stable read:" + " failed to get chunk info" << + " chunk: " << mChunkId << + " checksums: " << + (const void*)(info ? info->chunkBlockChecksum : 0) << + " size: " << (info ? info->chunkSize : -1) << + KFS_LOG_EOM; + FatalError(); + SetState(kStateChunkLost); + } else { + const int64_t newSize = op->offset + op->numBytes; + if (info->chunkSize < newSize) { + mChunkSize = -1; + SetState(kStateChunkLost); + } else { + op->dataBuf->ZeroFill(CHECKSUM_BLOCKSIZE - op->numBytes); + info->chunkBlockChecksum[OffsetToChecksumBlockNum(newSize)] = + ComputeBlockChecksum(op->dataBuf, + op->dataBuf->BytesConsumable()); + // Truncation done, set the new size. + gChunkManager.SetChunkSize(*info, newSize); + mNextWriteOffset = newSize; + } + } + } else { + Cntrs().mReadErrorCount++; + SetState(kStateChunkLost); + } + WAPPEND_LOG_STREAM(mState != kStateChunkLost ? 
+ MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "make chunk stable read:" + " status: " << op->status << + " requested: " << op->numBytes << + " read: " << op->numBytesIO << + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " size: " << mNextWriteOffset << " / " << mChunkSize << + " checksum: " << mChunkChecksum << " / " << + mMakeChunkStableOp->chunkChecksum << + " in flight:" + " replication: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + KFS_LOG_EOM; + delete op; + MakeChunkStable(); +} + +void +AtomicRecordAppender::MakeChunkStable(MakeChunkStableOp *op /* = 0 */) +{ + mLastActivityTime = Now(); + if (op) { + MakeChunkStableOp* eo = mMakeChunkStableOp; + if (eo && eo->clnt == this) { + eo = eo->next; // get "external" op + if (eo && eo->clnt == this) { + FatalError(); // only one "internal" op + } + } + if (op->chunkId != mChunkId) { + op->status = kErrParameters; + op->statusMsg = "invalid chunk id"; + } else if (op->chunkVersion != mChunkVersion) { + op->status = kErrParameters; + op->statusMsg = "invalid chunk version"; + } else if (eo && ( + eo->chunkVersion != op->chunkVersion || + ((eo->chunkSize >= 0 && op->chunkSize >= 0 && + eo->chunkSize != op->chunkSize) || + (eo->hasChecksum && op->hasChecksum && + eo->chunkChecksum != op->chunkChecksum)))) { + op->status = kErrParameters; + op->statusMsg = + "request parameters differ from the initial request"; + } + if (op->status != 0) { + WAPPEND_LOG_STREAM_ERROR << + "make chunk stable: bad request ignored " << op->statusMsg << + " chunk: " << mChunkId << + " version: " << mChunkVersion << + " " << op->Show() << + KFS_LOG_EOM; + if (op->clnt == this) { + FatalError(); + delete op; + } else { + Cntrs().mMakeStableCount++; + Cntrs().mMakeStableErrorCount++; + KFS::SubmitOpResponse(op); + } + return; + } + PushBack(mMakeChunkStableOp, mLastMakeChunkStableOp, op); + } + WAPPEND_LOG_STREAM(mMakeChunkStableOp ? 
+ MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelFATAL) << + "make chunk stable " << + ((mMakeChunkStableOp && mMakeChunkStableOp->clnt == this) ? + "internal" : "external") << ":" << + " chunk: " << mChunkId << + " state: " << GetStateAsStr() << + " size: " << mNextWriteOffset << + " in flight:" + " replication: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + (op ? " " : "") << (op ? op->Show() : "") << + KFS_LOG_EOM; + if (! mMakeChunkStableOp) { + FatalError(); + } + if (mState == kStateOpen || mState == kStateReplicationFailed) { + SetState(kStateClosed); + FlushAll(); + } + // Wait for previously submitted ops to finish + if (mIoOpsInFlight > 0) { + return; + } + if (mState != kStateClosed) { + MakeChunkStableDone(); + return; + } + if (mMakeChunkStableOp->chunkSize >= 0) { + const int64_t newSize = mMakeChunkStableOp->chunkSize; + if (newSize > mNextWriteOffset) { + SetState(kStateChunkLost); + } else if (newSize < mNextWriteOffset) { + WAPPEND_LOG_STREAM_INFO << + "make chunk stable: truncating chunk to: " << newSize << + " current size: " << mNextWriteOffset << " / " << mChunkSize << + KFS_LOG_EOM; + if (newSize > 0 && newSize % CHECKSUM_BLOCKSIZE != 0) { + ReadOp* const rop = new ReadOp(0); + rop->chunkId = mChunkId; + rop->chunkVersion = mChunkVersion; + rop->offset = + newSize / CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE; + rop->numBytes = newSize - rop->offset; + rop->clnt = this; + rop->dataBuf = new IOBuffer; + mIoOpsInFlight++; + const int res = gChunkManager.ReadChunk(rop); + if (res < 0) { + rop->status = res; + OpDone(rop); + } + return; + } + // No last block read and checksum update is needed. + ChunkInfo_t* const info = gChunkManager.GetChunkInfo(mChunkId); + if (! info || (! info->chunkBlockChecksum && info->chunkSize != 0)) { + WAPPEND_LOG_STREAM_FATAL << + "make chunk stable:" + " failed to get chunk info" << + " chunk: " << mChunkId << + " checksums: " << + (const void*)(info ? 
info->chunkBlockChecksum : 0) << + " size: " << (info ? info->chunkSize : -1) << + KFS_LOG_EOM; + FatalError(); + SetState(kStateChunkLost); + } + if (info->chunkSize < newSize) { + SetState(kStateChunkLost); + } else { + // Truncation done, set the new size. + gChunkManager.SetChunkSize(*info, newSize); + mNextWriteOffset = newSize; + } + } + } + if (mState == kStateClosed && ( + ! ComputeChecksum() || + (mMakeChunkStableOp->hasChecksum && + mChunkChecksum != mMakeChunkStableOp->chunkChecksum) || + (mMakeChunkStableOp->chunkSize >= 0 && + mChunkSize != mMakeChunkStableOp->chunkSize))) { + SetState(kStateChunkLost); + } + if (mState != kStateClosed) { + MakeChunkStableDone(); + return; + } + if (mMakeChunkStableOp->clnt == this) { + // Internally generated op done, see if there are other ops. + MakeChunkStableOp* const iop = mMakeChunkStableOp; + mMakeChunkStableOp = iop->next; + delete iop; + if (! mMakeChunkStableOp) { + mLastMakeChunkStableOp = 0; + if (mBeginMakeChunkStableOp) { + BeginMakeStable(); // Restart (send response) begin make stable. + } else { + // Internal make chunk stable doesn't transition into the + // "stable" state, it only truncates the chunk, recalculates the + // checksum, and notifies meta server that chunk append is done. 
+ NotifyChunkClosed(); + } + return; + } + if ((mMakeChunkStableOp->hasChecksum && + mChunkChecksum != mMakeChunkStableOp->chunkChecksum) || + (mMakeChunkStableOp->chunkSize >= 0 && + mChunkSize != mMakeChunkStableOp->chunkSize)) { + SetState(kStateChunkLost); + MakeChunkStableDone(); + return; + } + } + WAPPEND_LOG_STREAM_INFO << + "make chunk stable:" + " starting sync of the metadata" + " chunk: " << mChunkId << + " size: " << mNextWriteOffset << " / " << GetChunkSize() << + KFS_LOG_EOM; + mIoOpsInFlight++; + const bool appendFlag = true; + const int res = gChunkManager.MakeChunkStable( + mChunkId, + mChunkVersion, + appendFlag, + this, + mMakeChunkStableOp->statusMsg + ); + if (res < 0) { + MetaWriteDone(res); + } +} + +void +AtomicRecordAppender::FlushSelf(bool flushFullChecksumBlocks) +{ + mLastFlushTime = Now(); + SetCanDoLowOnBuffersFlushFlag(false); + if (mStaggerRMWInFlightFlag) { + mRestartFlushFlag = ! mBuffer.IsEmpty(); + mFlushFullBlocksFlag = mFlushFullBlocksFlag || flushFullChecksumBlocks; + return; + } + mRestartFlushFlag = false; + while (mState == kStateOpen || + mState == kStateClosed || + mState == kStateReplicationFailed) { + const int nBytes = mBuffer.BytesConsumable(); + if (nBytes <= (flushFullChecksumBlocks ? int(CHECKSUM_BLOCKSIZE) : 0)) { + return; + } + assert(! mStaggerRMWInFlightFlag); + size_t bytesToFlush(nBytes <= int(CHECKSUM_BLOCKSIZE) ? + nBytes : nBytes / CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + // assert(IsChunkOpen()); // OK to flush deleted chunk. + + // The chunk manager write code requires writes where the # of bytes == + // size of checksum block to be aligned to checksum boundaries; + // otherwise, the writes should be less than a checksum block. + // + // Set RMW flag to allow *only one* concurrent partial checksum block + // write withing the same checksum block. This is need because io + // completion order is undefined, and partial checksum block write does + // read modify write. 
Obviously two such writes withing the same block + // need to be ordered. + // + const int blkOffset(mNextWriteOffset % CHECKSUM_BLOCKSIZE); + if (blkOffset > 0) { + mStaggerRMWInFlightFlag = + blkOffset + bytesToFlush < CHECKSUM_BLOCKSIZE; + if (! mStaggerRMWInFlightFlag) { + bytesToFlush = CHECKSUM_BLOCKSIZE - blkOffset; + } + assert(! mStaggerRMWInFlightFlag || bytesToFlush == size_t(nBytes)); + } else { + mStaggerRMWInFlightFlag = bytesToFlush < CHECKSUM_BLOCKSIZE; + } + WriteOp* const wop = new WriteOp(mChunkId, mChunkVersion); + wop->InitForRecordAppend(); + wop->clnt = this; + wop->offset = mNextWriteOffset; + wop->numBytes = bytesToFlush; + if (bytesToFlush < CHECKSUM_BLOCKSIZE) { + // Buffer don't have to be full and aligned, chunk manager will have + // to do partial checksum block write, and invoke + // ReplaceKeepBuffersFull() anyway. + wop->dataBuf->Move(&mBuffer, bytesToFlush); + } else { + // Buffer size should always be multiple of checksum block size. + assert(mNextWriteOffset % IOBufferData::GetDefaultBufferSize() == 0); + wop->dataBuf->ReplaceKeepBuffersFull(&mBuffer, 0, bytesToFlush); + } + const int newLimit = gAtomicRecordAppendManager.GetFlushLimit( + *this, mBuffer.BytesConsumable() - nBytes); + + WAPPEND_LOG_STREAM_DEBUG << + "flush write" + " state: " << GetStateAsStr() << + " chunk: " << wop->chunkId << + " offset: " << wop->offset << + " bytes: " << wop->numBytes << + " buffered: " << mBuffer.BytesConsumable() << + " flush limit: " << newLimit << + " in flight:" + " replicaton: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + KFS_LOG_EOM; + + mNextWriteOffset += bytesToFlush; + mBufFrontPadding = 0; + if (bytesToFlush < CHECKSUM_BLOCKSIZE) { + const int off( + mNextWriteOffset % IOBufferData::GetDefaultBufferSize()); + if (off == 0) { + mBuffer.MakeBuffersFull(); + } else { + assert(off > 0 && mBuffer.IsEmpty()); + mBufFrontPadding = off; + } + } + mIoOpsInFlight++; + int res = gChunkManager.WriteChunk(wop); + if (res < 0) 
{ + // Failed to start write, call error handler and return immediately. + // Assume that error handler can delete this. + wop->status = res; + wop->HandleEvent(EVENT_DISK_ERROR, &res); + return; + } + } +} + +void +AtomicRecordAppender::OpDone(WriteOp *op) +{ + assert( + op->chunkId == mChunkId && mIoOpsInFlight > 0 && + (mStaggerRMWInFlightFlag || op->numBytes >= CHECKSUM_BLOCKSIZE) + ); + mIoOpsInFlight--; + const bool failedFlag = + op->status < 0 || size_t(op->status) < op->numBytes; + WAPPEND_LOG_STREAM(failedFlag ? + MsgLogger::kLogLevelERROR : MsgLogger::kLogLevelDEBUG) << + "write " << (failedFlag ? "FAILED" : "done") << + " chunk: " << mChunkId << + " offset: " << op->offset << + " size: " << op->numBytes << + " commit: " << mNextCommitOffset << + " in flight:" + " replicaton: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + " status: " << op->status << + " chunk size: " << GetChunkSize() << + KFS_LOG_EOM; + const int64_t end = op->offset + op->numBytes; + delete op; + if (DeleteIfNeeded()) { + return; + } + if (failedFlag) { + Cntrs().mWriteErrorCount++; + SetState(kStateChunkLost); + } + // There could be more that one write in flight, but only one stagger. + // The stagger end, by definition, is not on checksum block boundary, but + // all other write should end exactly on checksum block boundary. + if (mStaggerRMWInFlightFlag && end % CHECKSUM_BLOCKSIZE != 0) { + mStaggerRMWInFlightFlag = false; + if (mRestartFlushFlag) { + FlushSelf(mFlushFullBlocksFlag); + } + } + if (mIoOpsInFlight <= 0 && mBeginMakeChunkStableOp) { + BeginMakeStable(); + } + if (mIoOpsInFlight <= 0 && mMakeChunkStableOp) { + MakeChunkStable(); + } +} + +void +AtomicRecordAppender::MetaWriteDone(int status) +{ + assert(mIoOpsInFlight > 0); + mIoOpsInFlight--; + WAPPEND_LOG_STREAM(status >= 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "meta write " << (status < 0 ? 
"FAILED" : "done") << + " chunk: " << mChunkId << + " in flight:" + " replicaton: " << mReplicationsInFlight << + " ios: " << mIoOpsInFlight << + " commit: " << mNextCommitOffset << + " status: " << status << + " owner: " << gChunkManager.IsWriteAppenderOwns(mChunkId) << + KFS_LOG_EOM; + if (DeleteIfNeeded()) { + return; + } + if (status < 0) { + Cntrs().mWriteErrorCount++; + SetState(kStateChunkLost); + } + if (mState == kStateClosed) { + SetState(kStateStable); + } + MakeChunkStable(); +} + +bool +AtomicRecordAppender::TryToCloseChunk() +{ + if (! IsMaster()) { + return false; + } + SendCommitAck(); + if (mState == kStateOpen && mReplicationsInFlight > 0) { + return false; + } + if (mState == kStateOpen || mState == kStateReplicationFailed) { + if (mMakeChunkStableOp || mBeginMakeChunkStableOp) { + FatalError(); + } + TrimToLastCommit("try to close"); + } + return true; +} + +void +AtomicRecordAppender::TrimToLastCommit( + const char* inMsgPtr) +{ + if (mMakeChunkStableOp) { + FatalError(); + } + WAPPEND_LOG_STREAM_DEBUG << + (inMsgPtr ? inMsgPtr : "trim to last commit") << + " chunk: " << mChunkId << + " version: " << mChunkVersion << + " size: " << mNextCommitOffset << + KFS_LOG_EOM; + // Trim the chunk on failure to the last committed offset, if needed. + MakeChunkStableOp* const op = new MakeChunkStableOp(0); + op->chunkId = mChunkId; + op->chunkVersion = mChunkVersion; + op->clnt = this; + op->chunkSize = mNextCommitOffset; + MakeChunkStable(op); +} + +void +AtomicRecordAppender::NotifyChunkClosed() +{ + assert(IsMaster() && mState == kStateClosed); + WAPPEND_LOG_STREAM_DEBUG << + "notify closed:" + " chunk: " << mChunkId << + " size: " << mChunkSize << + " checksum: " << mChunkChecksum << + KFS_LOG_EOM; + gLeaseClerk.RelinquishLease(mChunkId, mChunkSize, true, mChunkChecksum); +} + +void +AtomicRecordAppender::SendCommitAck() +{ + CheckLeaseAndChunk("send commit ack"); + if (! 
IsMaster() || mState != kStateOpen || + mNumServers <= 1 || mReplicationsInFlight > 0 || + mNextCommitOffset <= mCommitOffsetAckSent) { + return; + } + WAPPEND_LOG_STREAM_DEBUG << + "send commit ack" + " chunk: " << mChunkId << + " last ack: " << mCommitOffsetAckSent << + " size: " << mNextCommitOffset << + " unacked: " << (mNextCommitOffset - mCommitOffsetAckSent) << + KFS_LOG_EOM; + // Use write offset as seq. # for debugging + RecordAppendOp* const op = new RecordAppendOp(mNextWriteOffset); + op->clnt = this; + op->chunkId = mChunkId; + op->chunkVersion = mChunkVersion; + op->numServers = mNumServers; + op->servers = mCommitAckServers; + op->numBytes = 0; + AppendBegin(op, mReplicationPos, mPeerLocation); +} + +AtomicRecordAppendManager::AtomicRecordAppendManager() + : mAppenders(), + mCleanUpSec(4 * 60), + mCloseEmptyWidStateSec(60), + mFlushIntervalSec(60), + mSendCommitAckTimeoutSec(2), + mReplicationTimeoutSec(3 * 60), + mMinMetaUptimeSec(8 * 60), + mFlushLimit(1 << 20), + mMaxAppenderBytes(0), + mTotalBuffersBytes(0), + mTotalPendingBytes(0), + mActiveAppendersCount(0), + mOpenAppendersCount(0), + mAppendersWithWidCount(0), + mBufferLimitRatio(0.6), + mMaxWriteIdsPerChunk(16 << 10), + mCloseOutOfSpaceThreshold(4), + mCloseOutOfSpaceSec(5), + mInstanceNum(0), + mCounters() +{ + PendingFlushList::Init(mPendingFlushList); + mCounters.Clear(); +} + +AtomicRecordAppendManager::~AtomicRecordAppendManager() +{ + assert(mAppenders.empty()); +} + +void +AtomicRecordAppendManager::SetParameters(const Properties& props) +{ + mCleanUpSec = props.getValue( + "chunkServer.recAppender.cleanupSec", mCleanUpSec); + mCloseEmptyWidStateSec = props.getValue( + "chunkServer.recAppender.closeEmptyWidStateSec", + mCloseEmptyWidStateSec); + mFlushIntervalSec = props.getValue( + "chunkServer.recAppender.flushIntervalSec", mFlushIntervalSec), + mSendCommitAckTimeoutSec = props.getValue( + "chunkServer.recAppender.sendCommitAckTimeoutSec", + mSendCommitAckTimeoutSec); + 
mReplicationTimeoutSec = props.getValue( + "chunkServer.recAppender.replicationTimeoutSec", + mReplicationTimeoutSec); + mMinMetaUptimeSec = props.getValue( + "chunkServer.recAppender.minMetaUptimeSec", + mMinMetaUptimeSec); + mFlushLimit = props.getValue( + "chunkServer.recAppender.flushLimit", mFlushLimit), + mBufferLimitRatio = props.getValue( + "chunkServer.recAppender.bufferLimitRatio", mBufferLimitRatio), + mMaxWriteIdsPerChunk = props.getValue( + "chunkServer.recAppender.maxWriteIdsPerChunk", mMaxWriteIdsPerChunk); + mCloseOutOfSpaceThreshold = props.getValue( + "chunkServer.recAppender.closeOutOfSpaceThreshold", + mCloseOutOfSpaceThreshold); + mCloseOutOfSpaceSec = props.getValue( + "chunkServer.recAppender.closeOutOfSpaceSec", mCloseOutOfSpaceSec); + mTotalBuffersBytes = 0; + if (! mAppenders.empty()) { + UpdateAppenderFlushLimit(); + } +} + +int +AtomicRecordAppendManager::GetFlushLimit( + AtomicRecordAppender& /* appender */, int addBytes /* = 0 */) +{ + if (addBytes != 0) { + assert(mTotalPendingBytes + addBytes >= 0); + mTotalPendingBytes += addBytes; + UpdateAppenderFlushLimit(); + } + return mMaxAppenderBytes; +} + +void +AtomicRecordAppendManager::UpdateAppenderFlushLimit( + const AtomicRecordAppender* appender /* = 0 */) +{ + assert(mActiveAppendersCount >= 0); + if (appender) { + if (appender->IsChunkStable()) { + assert(mActiveAppendersCount > 0); + mActiveAppendersCount--; + } else { + assert((size_t)mActiveAppendersCount < mAppenders.size()); + mActiveAppendersCount++; + } + } + if (mTotalBuffersBytes <= 0) { + mTotalBuffersBytes = (int64_t)( + DiskIo::GetBufferManager().GetTotalCount() * + mBufferLimitRatio); + if (mTotalBuffersBytes <= 0) { + mTotalBuffersBytes = mFlushLimit; + } + } + const int64_t prevLimit = mMaxAppenderBytes; + mMaxAppenderBytes = min(int64_t(mFlushLimit), + (mTotalBuffersBytes + mTotalPendingBytes) / + max(int64_t(1), mActiveAppendersCount) + ); + if (prevLimit * 15 / 16 > mMaxAppenderBytes) { + 
PendingFlushList::Iterator it(mPendingFlushList); + AtomicRecordAppender* appender; + while ((appender = it.Next())) { + appender->UpdateFlushLimit(mMaxAppenderBytes); + } + } +} + +void +AtomicRecordAppendManager::AllocateChunk( + AllocChunkOp* op, int replicationPos, ServerLocation peerLoc, + const DiskIo::FilePtr& chunkFileHandle) +{ + assert(op); + pair const res = mAppenders.insert( + make_pair(op->chunkId, (AtomicRecordAppender*)0)); + if (res.second) { + assert(! res.first->second); + const ChunkInfo_t* info = 0; + if (! chunkFileHandle || + ! chunkFileHandle->IsOpen() || + ! (info = gChunkManager.GetChunkInfo(op->chunkId)) || + (! info->chunkBlockChecksum && info->chunkSize != 0)) { + op->statusMsg = "chunk manager closed this chunk"; + op->status = AtomicRecordAppender::kErrParameters; + } else if (op->chunkVersion != info->chunkVersion) { + op->statusMsg = "invalid chunk version"; + op->status = AtomicRecordAppender::kErrParameters; + } + WAPPEND_LOG_STREAM(op->status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "allocate chunk: creating new appender: " << + op->statusMsg << + " status: " << op->status << + " appender count: " << mAppenders.size() << + " chunk: " << op->chunkId << + " checksums: " << + (const void*)(info ? info->chunkBlockChecksum : 0) << + " size: " << (info ? info->chunkSize : int64_t(-1)) << + " version: " << (info ? info->chunkVersion : (int64_t)-1) << + " file handle: " << (const void*)chunkFileHandle.get() << + " file open: " << (chunkFileHandle->IsOpen() ? 
"yes" : "no") << + " " << op->Show() << + KFS_LOG_EOM; + if (op->status != 0) { + mAppenders.erase(res.first); + } else { + res.first->second = new AtomicRecordAppender( + chunkFileHandle, op->chunkId, op->chunkVersion, op->numServers, + op->servers, peerLoc, replicationPos, info->chunkSize + ); + mOpenAppendersCount++; + UpdateAppenderFlushLimit(res.first->second); + } + } else if (res.first->second->IsOpen()) { + op->status = res.first->second->CheckParameters( + op->chunkVersion, op->numServers, + op->servers, replicationPos, peerLoc, chunkFileHandle, op->statusMsg); + WAPPEND_LOG_STREAM(op->status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "allocate chunk: appender exists: " << + " chunk: " << op->chunkId << + " status: " << op->status << + " appender count: " << mAppenders.size() << + " " << op->Show() << + KFS_LOG_EOM; + } else { + // This should not normally happen, but this could happen when meta + // server restarts with partially (or completely) lost meta data, and + // meta server re-uses the same chunk id. + // Cleanup lingering appedner and retry. + res.first->second->Delete(); + AllocateChunk(op, replicationPos, peerLoc, chunkFileHandle); + return; // Tail recursion. 
+ } + if (replicationPos == 0) { + mCounters.mAppenderAllocMasterCount++; + } + mCounters.mAppenderAllocCount++; + if (op->status != 0) { + mCounters.mAppenderAllocErrorCount++; + } +} + +void +AtomicRecordAppendManager::AllocateWriteId( + WriteIdAllocOp *op, int replicationPos, ServerLocation peerLoc, + const DiskIo::FilePtr& chunkFileHandle) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it == mAppenders.end()) { + op->statusMsg = "not open for append; no appender"; + op->status = AtomicRecordAppender::kErrParameters; + mCounters.mWriteIdAllocNoAppenderCount++; + } else { + it->second->AllocateWriteId( + op, replicationPos, peerLoc, chunkFileHandle); + } + mCounters.mWriteIdAllocCount++; + if (op->status != 0) { + mCounters.mWriteIdAllocErrorCount++; + } +} + +void +AtomicRecordAppendManager::Timeout() +{ + FlushIfLowOnBuffers(); +} + +void +AtomicRecordAppendManager::FlushIfLowOnBuffers() +{ + if (! DiskIo::GetBufferManager().IsLowOnBuffers()) { + return; + } + PendingFlushList::Iterator it(mPendingFlushList); + AtomicRecordAppender* appender; + while ((appender = it.Next())) { + appender->LowOnBuffersFlush(); + } +} + +bool +AtomicRecordAppendManager::IsChunkStable(kfsChunkId_t chunkId) const +{ + // Cast until mac std::tr1::unordered_map gets "find() const" + ARAMap::const_iterator const it = + const_cast(mAppenders).find(chunkId); + return (it == mAppenders.end() || it->second->IsChunkStable()); +} + +bool +AtomicRecordAppendManager::IsSpaceReservedInChunk(kfsChunkId_t chunkId) +{ + ARAMap::const_iterator const it = mAppenders.find(chunkId); + return (it != mAppenders.end() && it->second->SpaceReserved() > 0); +} + +int +AtomicRecordAppendManager::ChunkSpaceReserve( + kfsChunkId_t chunkId, int64_t writeId, size_t nBytes, string* errMsg /* = 0 */) +{ + ARAMap::iterator const it = mAppenders.find(chunkId); + int status; + if (it == mAppenders.end()) { + if (errMsg) { + (*errMsg) += "chunk does not exist or not open for 
append"; + } + status = AtomicRecordAppender::kErrParameters; + } else { + status = it->second->ChangeChunkSpaceReservaton( + writeId, nBytes, false, errMsg); + } + mCounters.mSpaceReserveCount++; + if (status != 0) { + mCounters.mSpaceReserveErrorCount++; + if (status == AtomicRecordAppender::kErrOutOfSpace) { + mCounters.mSpaceReserveDeniedCount++; + } + } else { + mCounters.mSpaceReserveByteCount += nBytes; + } + return status; +} + +int +AtomicRecordAppendManager::ChunkSpaceRelease( + kfsChunkId_t chunkId, int64_t writeId, size_t nBytes, string* errMsg /* = 0 */) +{ + ARAMap::iterator const it = mAppenders.find(chunkId); + if (it == mAppenders.end()) { + if (errMsg) { + (*errMsg) += "chunk does not exist or not open for append"; + } + return AtomicRecordAppender::kErrParameters; + } + return it->second->ChangeChunkSpaceReservaton( + writeId, nBytes, true, errMsg); +} + +int +AtomicRecordAppendManager::InvalidateWriteId( + kfsChunkId_t chunkId, int64_t writeId, bool declareFailureFlag) +{ + ARAMap::const_iterator const it = mAppenders.find(chunkId); + return (it == mAppenders.end() ? 0 : + it->second->InvalidateWriteId(writeId, declareFailureFlag)); +} + +int +AtomicRecordAppendManager::GetAlignmentAndFwdFlag(kfsChunkId_t chunkId, + bool& forwardFlag) const +{ + forwardFlag = false; + ARAMap::const_iterator const it = mAppenders.find(chunkId); + return (it == mAppenders.end() ? 
0 : + it->second->GetAlignmentAndFwdFlag(forwardFlag)); +} + +bool +AtomicRecordAppendManager::BeginMakeChunkStable(BeginMakeChunkStableOp* op) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it == mAppenders.end()) { + op->statusMsg = "chunk does not exist or not open for append"; + op->status = AtomicRecordAppender::kErrParameters; + WAPPEND_LOG_STREAM_ERROR << + "begin make stable: no write appender" + " chunk: " << op->chunkId << + " status: " << op->status << + " msg: " << op->statusMsg << + " " << op->Show() << + KFS_LOG_EOM; + mCounters.mBeginMakeStableCount++; + mCounters.mBeginMakeStableErrorCount++; + return false; // Submit response now. + } + it->second->BeginMakeStable(op); + // Completion handler is already invoked or will be invoked later. + return true; +} + +bool +AtomicRecordAppendManager::CloseChunk( + CloseOp* op, int64_t writeId, bool& forwardFlag) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it == mAppenders.end()) { + return false; // let chunk manager handle it + } + it->second->CloseChunk(op, writeId, forwardFlag); + return true; +} + +bool +AtomicRecordAppendManager::MakeChunkStable(MakeChunkStableOp* op) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it != mAppenders.end()) { + it->second->MakeChunkStable(op); + // Completion handler is already invoked or will be invoked later. + return true; + } + int64_t chunkSize = -1; + uint32_t chunkChecksum = 0; + if (op->hasChecksum) { + // The following is pretty much redundant now when write appender + // created at the time of chunk allocation. 
+ if (AtomicRecordAppender::ComputeChecksum( + op->chunkId, op->chunkVersion, chunkSize, chunkChecksum) && + chunkSize == op->chunkSize && + chunkChecksum == op->chunkChecksum) { + op->status = 0; + } else { + op->statusMsg = "no write appender, checksum or size mismatch"; + op->status = AtomicRecordAppender::kErrFailedState; + // Wait for meta sever to tell what to do with the chunk. + // It is possible that this is stale make stable completion. + // gChunkManager.ChunkIOFailed(op->chunkId, -EIO, 0); + mCounters.mLostChunkCount++; + } + } + WAPPEND_LOG_STREAM(op->status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "make stable: no write appender" + " chunk: " << op->chunkId << + " status: " << op->status << + " msg: " << op->statusMsg << + " size: " << chunkSize << + " checksum: " << chunkChecksum << + " " << op->Show() << + KFS_LOG_EOM; + mCounters.mMakeStableCount++; + if (op->status != 0) { + mCounters.mMakeStableErrorCount++; + if (op->hasChecksum) { + if (chunkSize != op->chunkSize) { + mCounters.mMakeStableLengthErrorCount++; + } + if (chunkChecksum != op->chunkChecksum) { + mCounters.mMakeStableChecksumErrorCount++; + } + } + } else { + const bool appendFlag = false; + const int res = gChunkManager.MakeChunkStable( + op->chunkId, op->chunkVersion, appendFlag, op, op->statusMsg); + if (res >= 0) { + return true; + } + op->status = res; + } + return false; // Submit response now. 
+} + +void +AtomicRecordAppendManager::AppendBegin( + RecordAppendOp* op, int replicationPos, ServerLocation peerLoc) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it == mAppenders.end()) { + op->status = AtomicRecordAppender::kErrParameters; + op->statusMsg = "chunk does not exist or not open for append"; + mCounters.mAppendCount++; + mCounters.mAppendErrorCount++; + KFS::SubmitOpResponse(op); + } else { + it->second->AppendBegin(op, replicationPos, peerLoc); + } +} + +void +AtomicRecordAppendManager::GetOpStatus(GetRecordAppendOpStatus* op) +{ + assert(op); + ARAMap::iterator const it = mAppenders.find(op->chunkId); + if (it == mAppenders.end()) { + op->status = AtomicRecordAppender::kErrParameters; + op->statusMsg = "chunk does not exist or not open for append"; + } else { + it->second->GetOpStatus(op); + } + mCounters.mGetOpStatusCount++; + if (op->status != 0) { + mCounters.mGetOpStatusErrorCount++; + } else if (op->opStatus != AtomicRecordAppender::kErrStatusInProgress) { + mCounters.mGetOpStatusKnownCount++; + } +} + +bool +AtomicRecordAppendManager::WantsToKeepLease(kfsChunkId_t chunkId) const +{ + ARAMap::const_iterator const it = mAppenders.find(chunkId); + return (it != mAppenders.end() && it->second->WantsToKeepLease()); +} + +void +AtomicRecordAppendManager:: DeleteChunk(kfsChunkId_t chunkId) +{ + ARAMap::const_iterator const it = mAppenders.find(chunkId); + if (it != mAppenders.end()) { + it->second->DeleteChunk(); + } +} + +void +AtomicRecordAppendManager::Shutdown() +{ + while (! 
mAppenders.empty()) { + mAppenders.begin()->second->Delete(); + } +} + +AtomicRecordAppendManager gAtomicRecordAppendManager; + +} diff --git a/src/cc/chunk/AtomicRecordAppender.h b/src/cc/chunk/AtomicRecordAppender.h new file mode 100644 index 000000000..14a77d036 --- /dev/null +++ b/src/cc/chunk/AtomicRecordAppender.h @@ -0,0 +1,208 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id: AtomicRecordAppender.h $ +// +// Created 2009/03/19 +// Author: Sriram Rao +// +// Copyright 2009-2012 Quantcast Corporation. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Data structure for tracking record appends to chunks. +//---------------------------------------------------------------------------- + +#ifndef CHUNK_ATOMICRECORDAPPENDER_H +#define CHUNK_ATOMICRECORDAPPENDER_H + +#include +#include + +#include "DiskIo.h" +#include "KfsOps.h" +#include "common/kfsdecls.h" + +namespace KFS +{ +class AtomicRecordAppender; +class Properties; + +// Write append append globals. 
+class AtomicRecordAppendManager { +public: + struct Counters + { + typedef int64_t Counter; + + Counter mAppendCount; + Counter mAppendByteCount; + Counter mAppendErrorCount; + Counter mReplicationErrorCount; + Counter mReplicationTimeoutCount; + Counter mAppenderAllocCount; + Counter mAppenderAllocMasterCount; + Counter mAppenderAllocErrorCount; + Counter mWriteIdAllocCount; + Counter mWriteIdAllocErrorCount; + Counter mWriteIdAllocNoAppenderCount; + Counter mSpaceReserveCount; + Counter mSpaceReserveByteCount; + Counter mSpaceReserveDeniedCount; + Counter mSpaceReserveErrorCount; + Counter mBeginMakeStableCount; + Counter mBeginMakeStableErrorCount; + Counter mMakeStableCount; + Counter mMakeStableErrorCount; + Counter mMakeStableLengthErrorCount; + Counter mMakeStableChecksumErrorCount; + Counter mChecksumErrorCount; + Counter mReadErrorCount; + Counter mWriteErrorCount; + Counter mGetOpStatusCount; + Counter mGetOpStatusErrorCount; + Counter mGetOpStatusKnownCount; + Counter mLeaseExpiredCount; + Counter mTimeoutLostCount; + Counter mLostChunkCount; + + void Clear() + { + mAppendCount = 0; + mAppendByteCount = 0; + mAppendErrorCount = 0; + mReplicationErrorCount = 0; + mReplicationTimeoutCount = 0; + mAppenderAllocCount = 0; + mAppenderAllocMasterCount = 0; + mAppenderAllocErrorCount = 0; + mWriteIdAllocCount = 0; + mWriteIdAllocErrorCount = 0; + mWriteIdAllocNoAppenderCount = 0; + mSpaceReserveCount = 0; + mSpaceReserveByteCount = 0; + mSpaceReserveDeniedCount = 0; + mSpaceReserveErrorCount = 0; + mBeginMakeStableCount = 0; + mBeginMakeStableErrorCount = 0; + mMakeStableCount = 0; + mMakeStableErrorCount = 0; + mMakeStableLengthErrorCount = 0; + mMakeStableChecksumErrorCount = 0; + mChecksumErrorCount = 0; + mReadErrorCount = 0; + mWriteErrorCount = 0; + mGetOpStatusCount = 0; + mGetOpStatusErrorCount = 0; + mGetOpStatusKnownCount = 0; + mLeaseExpiredCount = 0; + mTimeoutLostCount = 0; + mLostChunkCount = 0; + } + }; + AtomicRecordAppendManager(); + 
~AtomicRecordAppendManager(); + void SetParameters(const Properties& props); + void AllocateChunk(AllocChunkOp *op, int replicationPos, + ServerLocation peerLoc, const DiskIo::FilePtr& chunkFileHandle); + void AllocateWriteId(WriteIdAllocOp *op, int replicationPos, + ServerLocation peerLoc, const DiskIo::FilePtr& chunkFileHandle); + int GetCleanUpSec() const { return mCleanUpSec; } + int GetCloseEmptyWidStateSec() const { return mCloseEmptyWidStateSec; } + int GetFlushIntervalSec() const { return mFlushIntervalSec; } + int GetSendCommitAckTimeoutSec() const { return mSendCommitAckTimeoutSec; } + int GetReplicationTimeoutSec() const { return mReplicationTimeoutSec; } + int GetMetaMinUptimeSec() const { return mMinMetaUptimeSec; } + int GetFlushLimit() const { return mFlushLimit; } + double GetBufferLimitRatio() const { return mBufferLimitRatio; } + int GetMaxWriteIdsPerChunk() const { return mMaxWriteIdsPerChunk; } + int GetCloseOutOfSpaceThreshold() const { return mCloseOutOfSpaceThreshold; } + int GetCloseOutOfSpaceSec() const { return mCloseOutOfSpaceSec; } + bool IsChunkStable(kfsChunkId_t chunkId) const; + /// For record appends, (1) clients will reserve space in a chunk and + /// then write and (2) clients can release their reserved space. + /// As long as space is reserved on a chunk, the chunkserver will + /// renew the write lease with the metaserver. 
+ /// @param[in] chunkId id of the chunk for which space should be + /// reserved/released + /// @param[in] nbytes # of bytes of space reservation/release + /// @retval status code + /// + int ChunkSpaceReserve(kfsChunkId_t chunkId, int64_t writeId, size_t nBytes, std::string* errMsg = 0); + int ChunkSpaceRelease(kfsChunkId_t chunkId, int64_t writeId, size_t nBytes, std::string* errMsg = 0); + int InvalidateWriteId(kfsChunkId_t chunkId, int64_t writeId, bool declareFailureFlag = false); + int InvalidateWriteIdDeclareFailure(kfsChunkId_t chunkId, int64_t writeId) { + return InvalidateWriteId(chunkId, writeId, true); + } + int64_t GetOpenAppendersCount() const { + return mOpenAppendersCount; + } + int64_t GetAppendersWithWidCount() const { + return mAppendersWithWidCount; + } + bool IsSpaceReservedInChunk(kfsChunkId_t chunkId); + int GetAlignmentAndFwdFlag(kfsChunkId_t chunkId, bool& forwardFlag) const; + bool CloseChunk(CloseOp* op, int64_t writeId, bool& forwardFlag); + bool BeginMakeChunkStable(BeginMakeChunkStableOp* op); + bool MakeChunkStable(MakeChunkStableOp* op); + void AppendBegin(RecordAppendOp* op, int replicationPos, ServerLocation peerLoc); + void GetOpStatus(GetRecordAppendOpStatus* op); + bool WantsToKeepLease(kfsChunkId_t chunkId) const; + void Timeout(); + void FlushIfLowOnBuffers(); + void DeleteChunk(kfsChunkId_t chunkId); + void Shutdown(); + size_t GetAppendersCount() const + { return mAppenders.size(); } + void GetCounters(Counters& outCounters) + { outCounters = mCounters; } + + void UpdateAppenderFlushLimit(const AtomicRecordAppender* appender = 0); + int GetFlushLimit(AtomicRecordAppender& appender, int addBytes = 0); + inline void UpdatePendingFlush(AtomicRecordAppender& appender); + inline void Detach(AtomicRecordAppender& appender); + inline void DecOpenAppenderCount(); + inline void IncAppendersWithWidCount(); + inline void DecAppendersWithWidCount(); + inline Counters& Cntrs(); + +private: + typedef std::tr1::unordered_map ARAMap; + + 
ARAMap mAppenders; + int mCleanUpSec; + int mCloseEmptyWidStateSec; + int mFlushIntervalSec; + int mSendCommitAckTimeoutSec; + int mReplicationTimeoutSec; + int mMinMetaUptimeSec; + int mFlushLimit; + int mMaxAppenderBytes; + int64_t mTotalBuffersBytes; + int64_t mTotalPendingBytes; + int64_t mActiveAppendersCount; + int64_t mOpenAppendersCount; + int64_t mAppendersWithWidCount; + double mBufferLimitRatio; + int mMaxWriteIdsPerChunk; + int mCloseOutOfSpaceThreshold; + int mCloseOutOfSpaceSec; + AtomicRecordAppender* mPendingFlushList[1]; + const uint64_t mInstanceNum; + Counters mCounters; +}; + +extern AtomicRecordAppendManager gAtomicRecordAppendManager; +} + +#endif // CHUNK_ATOMICRECORDAPPENDER_H diff --git a/src/cc/chunk/BufferManager.cc b/src/cc/chunk/BufferManager.cc new file mode 100644 index 000000000..355d9abbb --- /dev/null +++ b/src/cc/chunk/BufferManager.cc @@ -0,0 +1,455 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/06/06 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include + +#include "BufferManager.h" +#include "qcdio/QCUtils.h" +#include "kfsio/NetManager.h" +#include "kfsio/Globals.h" + +namespace KFS +{ + +using libkfsio::globalNetManager; +using std::min; +using std::max; + +// Chunk server disk and network io buffer manager implementation. +BufferManager::Client::Client() + : mManagerPtr(0), + mByteCount(0), + mWaitingForByteCount(0), + mWaitStart(0) +{ + WaitQueue::Init(*this); +} + +BufferManager::BufferManager( + bool inEnabledFlag /* = true */) + : ITimeout(), + mTotalCount(0), + mMaxClientQuota(0), + mRemainingCount(0), + mWaitingByteCount(0), + mGetRequestCount(0), + mPutRequestCount(0), + mClientsWihtBuffersCount(0), + mMinBufferCount(0), + mWaitingCount(0), + mInitedFlag(false), + mDiskOverloadedFlag(false), + mEnabledFlag(inEnabledFlag), + mWaitingAvgIntervalIdx(0), + mWaitingAvgExp(0), + mWaitingAvgUsecsLast(microseconds()), + mWaitingAvgNext(globalNetManager().Now()), + mWaitingAvgBytes(0), + mWaitingAvgCount(0), + mWaitingAvgUsecs(0), + mCounters() +{ + WaitQueue::Init(mWaitQueuePtr); + mCounters.Clear(); + BufferManager::SetWaitingAvgInterval(20); +} + +BufferManager::~BufferManager() +{ + QCRTASSERT(WaitQueue::IsEmpty(mWaitQueuePtr)); + globalNetManager().UnRegisterTimeoutHandler(this); +} + + void +BufferManager::Init( + QCIoBufferPool* inBufferPoolPtr, + BufferManager::ByteCount inTotalCount, + BufferManager::ByteCount inMaxClientQuota, + int inMinBufferCount) +{ + QCRTASSERT(! 
+// Init() continues: one-shot setup (guarded by the QCRTASSERT on mInitedFlag
+// split across the line boundary above). Captures the pool pointer and the
+// total / per-client-quota / min-free-buffer limits, then registers this
+// manager for periodic Timeout() callbacks with the global net manager.
+// Modify() is the single entry point for both Get (inByteCount > 0) and
+// Put (inByteCount < 0); it keeps the invariant
+// mRemainingCount + sum(client mByteCount) == mTotalCount.
mInitedFlag); + mInitedFlag = true; + mWaitingCount = 0; + mWaitingByteCount = 0; + mGetRequestCount = 0; + mPutRequestCount = 0; + mClientsWihtBuffersCount = 0; + mBufferPoolPtr = inBufferPoolPtr; + mTotalCount = inTotalCount; + mRemainingCount = mTotalCount; + mMinBufferCount = inMinBufferCount; + mMaxClientQuota = min(mTotalCount, inMaxClientQuota); + mDiskOverloadedFlag = false; + globalNetManager().RegisterTimeoutHandler(this); +} + + bool +BufferManager::Modify( + BufferManager::Client& inClient, + BufferManager::ByteCount inByteCount, + bool inForDiskIoFlag) +{ + if (! mEnabledFlag) { + return true; + } + assert(inClient.mByteCount >= 0 && inClient.mWaitingForByteCount >= 0); + assert(inClient.mManagerPtr || + inClient.mWaitingForByteCount + inClient.mByteCount == 0); + assert(! inClient.mManagerPtr || inClient.mManagerPtr == this); + assert(inClient.IsWaiting() || inClient.mWaitingForByteCount == 0); + assert(mRemainingCount + inClient.mByteCount <= mTotalCount); + + const bool theHadBuffersFlag = inClient.mByteCount > 0; + mRemainingCount += inClient.mByteCount; + if (inByteCount < 0) { + mPutRequestCount++; + inClient.mByteCount += inByteCount; + if (inClient.mByteCount < 0) { + inClient.mByteCount = 0; + } + mRemainingCount -= inClient.mByteCount; + if (theHadBuffersFlag && inClient.mByteCount <= 0) { + mClientsWihtBuffersCount--; + } + return true; + } + mCounters.mRequestCount++; + mCounters.mRequestByteCount += inByteCount; + mGetRequestCount++; + inClient.mManagerPtr = this; + const ByteCount theReqCount = + inClient.mWaitingForByteCount + inClient.mByteCount + inByteCount; + const bool theGrantedFlag = ! inClient.IsWaiting() && ( + theReqCount <= 0 || ( + (! inForDiskIoFlag || ! mDiskOverloadedFlag) && + ! IsLowOnBuffers() && + theReqCount < mRemainingCount && + !
+// (grant condition continues on the next line: deny while the client is
+// already queued, over its quota, the pool is low, the request does not fit
+// in mRemainingCount, or -- for disk io only -- the disk is overloaded;
+// denied requests are queued FIFO and mWaitStart is stamped for wait stats)
IsOverQuota(inClient) + ) + ); + if (theGrantedFlag) { + inClient.mByteCount = theReqCount; + mRemainingCount -= theReqCount; + mCounters.mRequestGrantedCount++; + mCounters.mRequestGrantedByteCount += inByteCount; + } else { + mCounters.mRequestDeniedCount++; + mCounters.mRequestDeniedByteCount += inByteCount; + // If already waiting leave him in the same place in the queue. + if (! inClient.IsWaiting()) { + inClient.mWaitStart = microseconds(); + WaitQueue::PushBack(mWaitQueuePtr, inClient); + mWaitingCount++; + } + mWaitingByteCount += inByteCount; + mRemainingCount -= inClient.mByteCount; + inClient.mWaitingForByteCount += inByteCount; + } + assert(mRemainingCount >= 0 && mRemainingCount <= mTotalCount); + assert(inClient.IsWaiting() || inClient.mWaitingForByteCount == 0); + if (! theHadBuffersFlag && inClient.mByteCount > 0) { + mClientsWihtBuffersCount++; + } + return theGrantedFlag; +} + + void +BufferManager::Unregister( + BufferManager::Client& inClient) +{ + if (! inClient.mManagerPtr) { + return; + } + QCRTASSERT(inClient.mManagerPtr == this); + if (IsWaiting(inClient)) { + mWaitingCount--; + mWaitingByteCount -= inClient.mWaitingForByteCount; + } + WaitQueue::Remove(mWaitQueuePtr, inClient); + inClient.mWaitingForByteCount = 0; + Put(inClient, inClient.mByteCount); + assert(! inClient.IsWaiting() && inClient.mByteCount == 0); +} + + void +BufferManager::CancelRequest( + Client& inClient) +{ + if (! inClient.mManagerPtr) { + return; + } + QCRTASSERT(inClient.mManagerPtr == this); + if (!
+// CancelRequest() continues: drops only the pending (not yet granted)
+// portion of a waiting client's request; already granted bytes stay with the
+// client until Put/Unregister. Timeout() runs from the net manager: while the
+// disk is not overloaded and buffers are not low, grant the first waiting
+// client that is within quota and whose full pending request fits in
+// mRemainingCount -- FIFO order except that over-quota clients are skipped.
IsWaiting(inClient)) { + assert(inClient.mWaitingForByteCount == 0); + return; + } + WaitQueue::Remove(mWaitQueuePtr, inClient); + mWaitingCount--; + mWaitingByteCount -= inClient.mWaitingForByteCount; + inClient.mWaitingForByteCount = 0; +} + + bool +BufferManager::IsLowOnBuffers() const +{ + return ( + mBufferPoolPtr && + mBufferPoolPtr->GetFreeBufferCount() < max( + ByteCount(mMinBufferCount), + mRemainingCount / mBufferPoolPtr->GetBufferSize() + 1 + ) + ); +} + + /* virtual */ void +BufferManager::Timeout() +{ + bool theSetTimeFlag = true; + int64_t theNowUsecs = 0; + while (! mDiskOverloadedFlag && ! IsLowOnBuffers()) { + WaitQueue::Iterator theIt(mWaitQueuePtr); + Client* theClientPtr; + while ((theClientPtr = theIt.Next())) { + // Skip all that are over quota. + if (! IsOverQuota(*theClientPtr)) { + break; + } + } + if (! theClientPtr || + theClientPtr->mWaitingForByteCount > mRemainingCount) { + break; + } + WaitQueue::Remove(mWaitQueuePtr, *theClientPtr); + mWaitingCount--; + const ByteCount theGrantedCount = theClientPtr->mWaitingForByteCount; + assert(theGrantedCount > 0); + mRemainingCount -= theGrantedCount; + assert(mRemainingCount <= mTotalCount); + mWaitingByteCount -= theGrantedCount; + if (theClientPtr->mByteCount <= 0 && theGrantedCount > 0) { + mClientsWihtBuffersCount++; + } + if (theSetTimeFlag) { + theSetTimeFlag = false; + theNowUsecs = microseconds(); + } + mCounters.mRequestWaitUsecs += max(int64_t(0), + theNowUsecs - theClientPtr->mWaitStart); + mCounters.mRequestGrantedCount++; + mCounters.mRequestGrantedByteCount += theGrantedCount; + theClientPtr->mByteCount += theGrantedCount; + theClientPtr->mWaitingForByteCount = 0; + theClientPtr->Granted(theGrantedCount); + } + UpdateWaitingAvg(); +} + +static const int64_t kWaitingAvgExp[] = { +1507,2484,2935,3190,3354,3467,3551,3615,3665,3706, +3740,3769,3793,3814,3832,3848,3862,3875,3886,3896, +3906,3914,3922,3929,3935,3941,3947,3952,3957,3962,
+// kWaitingAvgExp continues below: precomputed decay factors in 12-bit fixed
+// point (one entry per averaging interval in seconds), generated by the awk
+// script following the table -- f1 / exp(samp_interval / i).
+3966,3970,3974,3977,3981,3984,3987,3990,3992,3995, +3997,4000,4002,4004,4006,4008,4010,4012,4013,4015, +4016,4018,4019,4021,4022,4024,4025,4026,4027,4028, +4029,4030,4031,4032,4033,4034,4035,4036,4037,4038, +4039,4040,4040,4041,4042,4042,4043,4044,4044,4045, +4046,4046,4047,4048,4048,4049,4049,4050,4050,4051, +4051,4052,4052,4053,4053,4054,4054,4054,4055,4055, +4056,4056,4056,4057,4057,4058,4058,4058,4059,4059, +4059,4060,4060,4060,4061,4061,4061,4061,4062,4062, +4062,4063,4063,4063,4063,4064,4064,4064,4064,4065, +4065,4065,4065,4066,4066,4066,4066,4066,4067,4067, +4067,4067,4067,4068,4068,4068,4068,4068,4069,4069, +4069,4069,4069,4069,4070,4070,4070,4070,4070,4070, +4071,4071,4071,4071,4071,4071,4072,4072,4072,4072, +4072,4072,4072,4073,4073,4073,4073,4073,4073,4073, +4073,4074,4074,4074,4074,4074,4074,4074,4074,4074, +4075,4075,4075,4075,4075,4075,4075,4075,4075,4076, +4076,4076,4076,4076,4076,4076,4076,4076,4076,4077, +4077,4077,4077,4077,4077,4077,4077,4077,4077,4077, +4078,4078,4078,4078,4078,4078,4078,4078,4078,4078, +4078,4078,4078,4079,4079,4079,4079,4079,4079,4079, +4079,4079,4079,4079,4079,4079,4079,4080,4080,4080, +4080,4080,4080,4080,4080,4080,4080,4080,4080,4080, +4080,4080,4080,4081,4081,4081,4081,4081,4081,4081, +4081,4081,4081,4081,4081,4081,4081,4081,4081,4081, +4081,4082,4082,4082,4082,4082,4082,4082,4082,4082, +4082,4082,4082,4082,4082,4082,4082,4082,4082,4082, +4082,4082,4083,4083,4083,4083,4083,4083,4083,4083, +4083,4083,4083,4083,4083,4083,4083,4083,4083,4083, +4083,4083,4083,4083,4083,4083,4083,4084,4084,4084, +4084,4084,4084,4084,4084,4084,4084,4084,4084,4084, +4084,4084,4084,4084,4084,4084,4084,4084,4084,4084, +4084,4084,4084,4084,4084,4085,4085,4085,4085,4085, +4085,4085,4085,4085,4085,4085,4085,4085,4085,4085, +4085,4085,4085,4085,4085,4085,4085,4085,4085,4085, +4085,4085,4085,4085,4085,4085,4085,4085,4085,4086, +4086,4086,4086,4086,4086,4086,4086,4086,4086,4086, +4086,4086,4086,4086,4086,4086,4086,4086,4086,4086, 
+4086,4086,4086,4086,4086,4086,4086,4086,4086,4086, +4086,4086,4086,4086,4086,4086,4086,4086,4086,4086, +4087,4087,4087,4087,4087,4087,4087,4087,4087,4087, +4087,4087,4087,4087,4087,4087,4087,4087,4087,4087, +4087,4087,4087,4087,4087,4087,4087,4087,4087,4087, +4087,4087,4087,4087,4087,4087,4087,4087,4087,4087, +4087,4087,4087,4087,4087,4087,4087,4087,4087,4087, +4087,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4088,4088,4088,4088,4088, +4088,4088,4088,4088,4088,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4089, +4089,4089,4089,4089,4089,4089,4089,4089,4089,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4090,4090,4090,4090,4090,4090, +4090,4090,4090,4090,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, 
+4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4091, +4091,4091,4091,4091,4091,4091,4091,4091,4091,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4092, +4092,4092,4092,4092,4092,4092,4092,4092,4092,4093, 
+// (tail of the precomputed fixed-point decay table opened above)
+// Averaging machinery: once per kWaitingAvgSampleIntervalSec,
+// UpdateWaitingAvg() folds mWaitingByteCount, mWaitingCount, and the
+// head-of-queue wait time into exponentially decaying fixed-point averages;
+// SetWaitingAvgInterval() selects the decay factor for the requested window.
+4093,4093,4093,4093,4093,4093,4093,4093,4093,4093, +4093,4093,4093,4093,4093,4093,4093,4093,4093,4093, +4093,4093,4093,4093,4093,4093,4093,4093,4093,4093 +}; + +// Generate the exp table above, for 1 to 1200 sec averaging intervals +/* +#!/bin/sh + +awk -v mx=1200 -v samp_interval=1 -v frac_bits=12 'BEGIN { + f1 = lshift(1, frac_bits); + for(i = samp_interval; i <= mx; i += samp_interval) { + printf("%.f,%s", f1 / exp(samp_interval / i), \ + i % (10 * samp_interval) == 0 ? "\n" : ""); + } + exit(0); +}' +*/ + +void +BufferManager::SetWaitingAvgInterval( + int inSecs) +{ + mWaitingAvgIntervalIdx = (int)min( + size_t(max(1, (inSecs + kWaitingAvgSampleIntervalSec - 1) / + kWaitingAvgSampleIntervalSec)), + sizeof(kWaitingAvgExp) / sizeof(kWaitingAvgExp[0])) - 1; + mWaitingAvgExp = kWaitingAvgExp[mWaitingAvgIntervalIdx]; +} + +int64_t +BufferManager::CalcWaitingAvg( + int64_t inAvg, + int64_t inSample) const +{ + // IIR filter: decay the running average toward the new sample using the + // precomputed fixed-point factor. + const int64_t kWaitingAvgFixed_1 = int64_t(1) << kWaitingAvgFracBits; + return (( + inAvg * mWaitingAvgExp + + (inSample << kWaitingAvgFracBits) * + (kWaitingAvgFixed_1 - mWaitingAvgExp) + ) >> kWaitingAvgFracBits); +} + +void +BufferManager::UpdateWaitingAvg() +{ + const int64_t kWaitingAvgIntervalUsec = + int64_t(kWaitingAvgSampleIntervalSec) * 1000 * 1000; + + const time_t theNow = globalNetManager().Now(); + if (theNow < mWaitingAvgNext) { + return; + } + const int64_t theNowUsecs = microseconds(); + const int64_t theEnd = theNowUsecs - kWaitingAvgIntervalUsec; + const int64_t theWaitUsecs = WaitQueue::IsEmpty(mWaitQueuePtr) ?
+// (catch-up loop below runs one IIR step per elapsed sample interval, so
+// averages stay consistent even when Timeout() calls were delayed)
+ int64_t(0) : max(int64_t(0), + theNowUsecs - WaitQueue::Front(mWaitQueuePtr)->mWaitStart); + while (mWaitingAvgUsecsLast <= theEnd) { + mWaitingAvgBytes = CalcWaitingAvg(mWaitingAvgBytes, mWaitingByteCount); + mWaitingAvgCount = CalcWaitingAvg(mWaitingAvgCount, mWaitingCount); + mWaitingAvgUsecs = CalcWaitingAvg(mWaitingAvgUsecs, theWaitUsecs); + mWaitingAvgUsecsLast += kWaitingAvgIntervalUsec; + mWaitingAvgNext += kWaitingAvgSampleIntervalSec; + } +} + +} /* namespace KFS */ diff --git a/src/cc/chunk/BufferManager.h b/src/cc/chunk/BufferManager.h new file mode 100644 index 000000000..8143a8538 --- /dev/null +++ b/src/cc/chunk/BufferManager.h @@ -0,0 +1,253 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/06/06 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef BUFFER_MANAGER_H +#define BUFFER_MANAGER_H + +#include +#include "qcdio/QCDLList.h" +#include "qcdio/QCIoBufferPool.h" +#include "kfsio/ITimeout.h" + +namespace KFS +{ + +// Chunk server disk and network io buffer manager. The intent is "fair" io +// buffer allocation between clients [connections]. The buffer pool size is fixed +// at startup.
+// (class description continues from the previous line)
Clients are added to the wait queue if not enough buffers are available +// to serve the request, and the client request processing is suspended. The +// client's request processing is resumed once enough buffers become available. +// +// An exponentially decaying average of the request wait time is presently used +// by the meta +// server as a feedback chunk server "load" metric in chunk placement. The load +// metric presently has the most effect for write append chunk placement with +// large number of append clients in radix sort. +class BufferManager : private ITimeout +{ +public: + typedef int64_t ByteCount; + typedef int64_t RequestCount; + struct Counters + { + typedef int64_t Counter; + + Counter mRequestCount; + Counter mRequestByteCount; + Counter mRequestDeniedCount; + Counter mRequestDeniedByteCount; + Counter mRequestGrantedCount; + Counter mRequestGrantedByteCount; + Counter mRequestWaitUsecs; + + void Clear() + { + mRequestCount = 0; + mRequestByteCount = 0; + mRequestDeniedCount = 0; + mRequestDeniedByteCount = 0; + mRequestGrantedCount = 0; + mRequestGrantedByteCount = 0; + mRequestWaitUsecs = 0; + } + }; + + // Base class for anything that consumes managed buffers; Granted() is the + // resume callback invoked when a previously denied request is satisfied. + class Client + { + public: + typedef BufferManager::ByteCount ByteCount; + + virtual void Granted( + ByteCount inByteCount) = 0; + // virtual int EmeregencyRelease( + // ByteCount inByteCount) + // { return 0; } + ByteCount GetByteCount() const + { return mByteCount; } + ByteCount GetWaitingForByteCount() const + { return mWaitingForByteCount; } + bool IsWaiting() const + { return (mManagerPtr && mManagerPtr->IsWaiting(*this)); } + void CancelRequest() + { + if (mManagerPtr) { + mManagerPtr->CancelRequest(*this); + } + } + void Unregister() + { + if (mManagerPtr) { + mManagerPtr->Unregister(*this); + } + } + protected: + Client(); + virtual ~Client() + { Client::Unregister(); } + private: + Client* mPrevPtr[1]; + Client* mNextPtr[1]; + BufferManager* mManagerPtr; + ByteCount mByteCount; + ByteCount mWaitingForByteCount; + int64_t mWaitStart; + + friend class BufferManager;
+ friend class QCDLListOp; + }; + BufferManager( + bool inEnabledFlag); + ~BufferManager(); + void Init( + QCIoBufferPool* inBufferPoolPtr, + ByteCount inTotalCount, + ByteCount inMaxClientQuota, + int inMinBufferCount) ; + ByteCount GetMaxClientQuota() const + { return mMaxClientQuota; } + bool IsOverQuota( + Client& inClient, + ByteCount inByteCount = 0) + { + return (mMaxClientQuota < + inClient.mByteCount + inClient.mWaitingForByteCount + inByteCount); + } + bool Get( + Client& inClient, + ByteCount inByteCount, + bool inForDiskIoFlag = false) + { + return (inByteCount <= 0 || + Modify(inClient, inByteCount, inForDiskIoFlag)); + } + bool Put( + Client& inClient, + ByteCount inByteCount) + { return (inByteCount <= 0 || Modify(inClient, -inByteCount, false)); } + bool GetForDiskIo( + Client& inClient, + ByteCount inByteCount) + { return Get(inClient, inByteCount, true); } + ByteCount GetTotalCount() const + { return mTotalCount; } + bool IsLowOnBuffers() const; + virtual void Timeout(); + bool IsWaiting( + const Client& inClient) const + { return WaitQueue::IsInList(mWaitQueuePtr, inClient); } + void Unregister( + Client& inClient); + void CancelRequest( + Client& inClient); + ByteCount GetTotalByteCount() const + { return mTotalCount; } + ByteCount GetRemainingByteCount() const + { return mRemainingCount; } + ByteCount GetUsedByteCount() const + { return (mTotalCount - mRemainingCount); } + int GetFreeBufferCount() const + { return (mBufferPoolPtr ? mBufferPoolPtr->GetFreeBufferCount() : 0); } + int GetMinBufferCount() const + { return mMinBufferCount; } + int GetTotalBufferCount() const + { + const int theSize = mBufferPoolPtr ? mBufferPoolPtr->GetBufferSize() : 0; + return (theSize > 0 ?
mTotalCount / theSize : 0); + } + int GetWaitingCount() const + { return mWaitingCount; } + int GetWaitingByteCount() const + { return mWaitingByteCount; } + RequestCount GetGetRequestCount() const + { return mGetRequestCount; } + RequestCount GetPutRequestCount() const + { return mPutRequestCount; } + int GetClientsWihtBuffersCount() const + { return mClientsWihtBuffersCount; } + void GetCounters( + Counters& outCounters) const + { outCounters = mCounters; } + void SetDiskOverloaded( + bool inFlag) + { mDiskOverloadedFlag = inFlag; } + int64_t GetWaitingAvgBytes() const + { return (mWaitingAvgBytes >> kWaitingAvgFracBits); } + int64_t GetWaitingAvgUsecs() const + { return (mWaitingAvgUsecs >> kWaitingAvgFracBits); } + int64_t GetWaitingAvgCount() const + { return (mWaitingAvgCount >> kWaitingAvgFracBits); } + void SetWaitingAvgInterval( + int inSecs); + int GetWaitingAvgInterval() const + { + return ((mWaitingAvgIntervalIdx + 1) * kWaitingAvgSampleIntervalSec); + } +private: + typedef QCDLList WaitQueue; + // 39 bits integer part -- max 0.5TB bytes waiting + // 24 bits after 12 bits fractional part multiplication -- should be sufficient + // for 2 sec resolution.
+// Fixed-point parameters and state for the decaying waiting averages:
+ enum { kWaitingAvgFracBits = 12 }; + enum { kWaitingAvgSampleIntervalSec = 1 }; + + Client* mWaitQueuePtr[1]; + QCIoBufferPool* mBufferPoolPtr; + ByteCount mTotalCount; + ByteCount mMaxClientQuota; + ByteCount mRemainingCount; + ByteCount mWaitingByteCount; + RequestCount mGetRequestCount; + RequestCount mPutRequestCount; + int mClientsWihtBuffersCount; + int mMinBufferCount; + int mWaitingCount; + bool mInitedFlag; + bool mDiskOverloadedFlag; + const bool mEnabledFlag; + int mWaitingAvgIntervalIdx; + int64_t mWaitingAvgExp; + int64_t mWaitingAvgUsecsLast; + time_t mWaitingAvgNext; + int64_t mWaitingAvgBytes; + int64_t mWaitingAvgCount; + int64_t mWaitingAvgUsecs; + Counters mCounters; + + bool Modify( + Client& inClient, + ByteCount inByteCount, + bool inForDiskIoFlag); + void UpdateWaitingAvg(); + int64_t CalcWaitingAvg( + int64_t inAvg, + int64_t inSample) const; + + BufferManager( + const BufferManager& inManager); + BufferManager& operator=( + const BufferManager& inManager); +}; + +} /* namespace KFS */ +#endif /* BUFFER_MANAGER_H */ diff --git a/src/cc/chunk/CMakeLists.txt b/src/cc/chunk/CMakeLists.txt new file mode 100644 index 000000000..bc9ed22b9 --- /dev/null +++ b/src/cc/chunk/CMakeLists.txt @@ -0,0 +1,74 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2009-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License.
+# +# + +add_executable (chunkserver + chunkserver_main.cc + AtomicRecordAppender.cc + BufferManager.cc + ChunkManager.cc + ChunkServer.cc + ClientManager.cc + ClientSM.cc + DiskIo.cc + KfsOps.cc + LeaseClerk.cc + Logger.cc + MetaServerSM.cc + RemoteSyncSM.cc + Replicator.cc + utils.cc + DirChecker.cc +) +add_executable (chunkscrubber chunkscrubber_main.cc) +add_executable (chunktrimmer chunktrimmer_main.cc) + +set (exe_files chunkserver chunkscrubber chunktrimmer) + +foreach (exe_file ${exe_files}) + if (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsIO kfsCommon kfsClient qcdio pthread crypto) + add_dependencies (${exe_file} kfsCommon kfsClient-shared kfsIO qcdio) + else (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsIO-shared kfsCommon-shared kfsClient-shared qcdio-shared pthread crypto) + add_dependencies (${exe_file} kfsCommon-shared kfsIO-shared kfsClient-shared qcdio-shared) + endif (USE_STATIC_LIB_LINKAGE) +endforeach (exe_file) + +if (NOT APPLE) + target_link_libraries(chunkserver rt) +endif (NOT APPLE) + +if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + target_link_libraries(chunkserver umem) +endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +# +# Install them +# +install (TARGETS ${exe_files} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) + diff --git a/src/cc/chunk/Chunk.h b/src/cc/chunk/Chunk.h new file mode 100644 index 000000000..50b4d9860 --- /dev/null +++ b/src/cc/chunk/Chunk.h @@ -0,0 +1,263 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/22 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _CHUNKSERVER_CHUNK_H +#define _CHUNKSERVER_CHUNK_H + +#include +#include +#include +#include + +#include +#include + +#include "common/MsgLogger.h" +#include "common/kfstypes.h" +#include "kfsio/FileHandle.h" +#include "kfsio/checksum.h" +#include "utils.h" + +namespace KFS +{ + +/// +/// \file Chunk.h +/// \brief Declarations related to a Chunk in KFS. +/// + + +/// +/// \brief ChunkInfo_t +/// For each chunk, the chunkserver maintains a meta-data file. This +/// file defines the chunk attributes such as, the file it is +/// associated with, the chunk version #, and the checksums for each +/// block of the file. For each chunk, this structure is read in at +/// startup time. +/// + +/// The max # of checksum blocks we have for a given chunk +const uint32_t MAX_CHUNK_CHECKSUM_BLOCKS = CHUNKSIZE / CHECKSUM_BLOCKSIZE; + +/// In the chunk header, we store upto 256 char of the file that +/// originally created the chunk. 
+const size_t MAX_FILENAME_LEN = 256;
+
+const uint32_t CHUNK_META_MAGIC = 0xCAFECAFE;
+const uint32_t CHUNK_META_VERSION = 0x1;
+
+// This structure is on-disk
+struct DiskChunkInfo_t {
+    DiskChunkInfo_t() : metaMagic (CHUNK_META_MAGIC), metaVersion(CHUNK_META_VERSION) { }
+    DiskChunkInfo_t(kfsFileId_t f, kfsChunkId_t c, int64_t s, kfsSeq_t v) :
+        metaMagic (CHUNK_META_MAGIC), metaVersion(CHUNK_META_VERSION),
+        fileId(f), chunkId(c), chunkVersion(v), chunkSize(s), numReads(0), unused(0) {
+        memset(filename, 0, MAX_FILENAME_LEN);
+    }
+    void SetChecksums(const uint32_t *checksums) {
+        memcpy(chunkBlockChecksum, checksums, MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t));
+    }
+
+    int Validate() const {
+        if (metaMagic != CHUNK_META_MAGIC) {
+            KFS_LOG_STREAM_INFO <<
+                "Magic # mismatch (got: " << std::hex << metaMagic <<
+                ", expect: " << CHUNK_META_MAGIC << ")" << std::dec <<
+            KFS_LOG_EOM;
+            return -KFS::EBADCKSUM;
+        }
+        if (metaVersion != CHUNK_META_VERSION) {
+            KFS_LOG_STREAM_INFO <<
+                "Version # mismatch (got: " << std::hex << metaVersion <<
+                ", expect: " << CHUNK_META_VERSION << ")" << std::dec <<
+            KFS_LOG_EOM;
+            return -KFS::EBADCKSUM;
+        }
+        if (chunkSize > (uint64_t)CHUNKSIZE) {
+            KFS_LOG_STREAM_INFO <<
+                "Invalid chunk size: " << chunkSize <<
+            KFS_LOG_EOM;
+            return -KFS::EBADCKSUM;
+        }
+        return 0;
+    }
+
+    int Validate(kfsChunkId_t cid, kfsSeq_t vers) const {
+        const int ret = Validate();
+        if (ret < 0) {
+            return ret;
+        }
+        if ((kfsChunkId_t)chunkId != cid) {
+            KFS_LOG_STREAM_INFO <<
+                "Chunkid mismatch (got: " << chunkId << ", expect: " << cid << ")" <<
+            KFS_LOG_EOM;
+            return -KFS::EBADCKSUM;
+        }
+        if ((kfsSeq_t)chunkVersion != vers) {
+            KFS_LOG_STREAM_INFO <<
+                "Chunk version mismatch (got: " << chunkVersion << ", expect: " << vers << ")" <<
+            KFS_LOG_EOM;
+            return -KFS::EBADCKSUM;
+        }
+        return 0;
+    }
+
+    uint32_t metaMagic;
+    uint32_t metaVersion;
+
+    uint64_t fileId;
+    uint64_t chunkId;
+    uint64_t chunkVersion;
+    uint64_t chunkSize;
+    uint32_t chunkBlockChecksum[MAX_CHUNK_CHECKSUM_BLOCKS];
+    // some statistics about the chunk:
+    // -- version # has an estimate of the # of writes
+    // -- track the # of reads
+    // ...
+    uint32_t numReads;
+    char filename[MAX_FILENAME_LEN];
+    uint32_t unused; // legacy padding
+} __attribute__ ((__packed__));
+
+// This structure is in-core
+struct ChunkInfo_t {
+
+    ChunkInfo_t() : fileId(0), chunkId(0), chunkVersion(0), chunkSize(0),
+        chunkBlockChecksum(NULL)
+    {
+        // memset(chunkBlockChecksum, 0, sizeof(chunkBlockChecksum));
+    }
+    ~ChunkInfo_t() {
+        delete [] chunkBlockChecksum;
+    }
+    ChunkInfo_t(const ChunkInfo_t &other) :
+        fileId(other.fileId), chunkId(other.chunkId), chunkVersion(other.chunkVersion),
+        chunkSize(other.chunkSize), chunkBlockChecksum(NULL) {
+    }
+    ChunkInfo_t& operator= (const ChunkInfo_t &other)
+    {
+        fileId = other.fileId;
+        chunkId = other.chunkId;
+        chunkVersion = other.chunkVersion;
+        chunkSize = other.chunkSize;
+        SetChecksums(other.chunkBlockChecksum);
+
+        return *this;
+    }
+
+    void Init(kfsFileId_t f, kfsChunkId_t c, int64_t v) {
+        fileId = f;
+        chunkId = c;
+        chunkVersion = v;
+        chunkBlockChecksum = new uint32_t[MAX_CHUNK_CHECKSUM_BLOCKS];
+        memset(chunkBlockChecksum, 0, MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t));
+    }
+
+    bool AreChecksumsLoaded() const {
+        return chunkBlockChecksum != NULL;
+    }
+
+    void UnloadChecksums() {
+        delete [] chunkBlockChecksum;
+        chunkBlockChecksum = NULL;
+        KFS_LOG_STREAM_DEBUG <<
+            "Unloading chunk checksum for chunk " << chunkId <<
+        KFS_LOG_EOM;
+    }
+
+    void SetChecksums(const uint32_t *checksums) {
+        delete [] chunkBlockChecksum;
+        if (checksums == NULL) {
+            chunkBlockChecksum = NULL;
+            return;
+        }
+
+        chunkBlockChecksum = new uint32_t[MAX_CHUNK_CHECKSUM_BLOCKS];
+        memcpy(chunkBlockChecksum, checksums, MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t));
+    }
+
+    void VerifyChecksumsLoaded() const {
+        assert(chunkBlockChecksum != NULL);
+        if (chunkBlockChecksum == NULL)
+            die("Checksums are not loaded!");
+    }
+
+    // save the chunk meta-data to the buffer;
+    void Serialize(IOBuffer *dataBuf) {
+        DiskChunkInfo_t dci(fileId, chunkId, chunkSize, chunkVersion);
+
+        assert(chunkBlockChecksum != NULL);
+        dci.SetChecksums(chunkBlockChecksum);
+
+        dataBuf->CopyIn((char *) &dci, sizeof(DiskChunkInfo_t));
+    }
+
+    int Deserialize(const DiskChunkInfo_t &dci, bool validate) {
+        if (validate) {
+            if (dci.metaMagic != CHUNK_META_MAGIC) {
+                KFS_LOG_STREAM_INFO <<
+                    "Magic # mismatch (got: " << std::hex << dci.metaMagic <<
+                    ", expect: " << CHUNK_META_MAGIC << ")" << std::dec <<
+                KFS_LOG_EOM;
+                return -EINVAL;
+            }
+            if (dci.metaVersion != CHUNK_META_VERSION) {
+                KFS_LOG_STREAM_INFO <<
+                    "Version # mismatch (got: " << std::hex << dci.metaVersion <<
+                    ", expect: " << CHUNK_META_VERSION << ")" << std::dec <<
+                KFS_LOG_EOM;
+                return -EINVAL;
+            }
+        }
+        fileId = dci.fileId;
+        chunkId = dci.chunkId;
+        chunkSize = dci.chunkSize;
+        chunkVersion = dci.chunkVersion;
+
+        delete [] chunkBlockChecksum;
+        chunkBlockChecksum = new uint32_t[MAX_CHUNK_CHECKSUM_BLOCKS];
+        memcpy(chunkBlockChecksum, dci.chunkBlockChecksum,
+            MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t));
+        KFS_LOG_STREAM_DEBUG <<
+            "Loading chunk checksum for chunk " << chunkId <<
+        KFS_LOG_EOM;
+
+        return 0;
+    }
+
+    kfsFileId_t fileId;
+    kfsChunkId_t chunkId;
+    kfsSeq_t chunkVersion;
+    int64_t chunkSize;
+    // uint32_t chunkBlockChecksum[MAX_CHUNK_CHECKSUM_BLOCKS];
+    // this is unpinned; whenever we open the chunk, this has to be
+    // paged in...damn..would've been nice if this was at the end
+    uint32_t *chunkBlockChecksum;
+};
+
+}
+
+#endif // _CHUNKSERVER_CHUNK_H
diff --git a/src/cc/chunk/ChunkManager.cc b/src/cc/chunk/ChunkManager.cc
new file mode 100644
index 000000000..5ca1654c4
--- /dev/null
+++ b/src/cc/chunk/ChunkManager.cc
@@ -0,0 +1,4373 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Created 2006/03/28
+// Author: Sriram Rao
+//
+// Copyright 2008-2012 Quantcast Corp.
+// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include "common/MsgLogger.h" +#include "common/kfstypes.h" + +#include "ChunkManager.h" +#include "ChunkServer.h" +#include "MetaServerSM.h" +#include "LeaseClerk.h" +#include "AtomicRecordAppender.h" +#include "utils.h" +#include "Logger.h" +#include "DiskIo.h" +#include "Replicator.h" + +#include "kfsio/Counter.h" +#include "kfsio/checksum.h" +#include "kfsio/Globals.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include + +namespace KFS +{ +using std::ofstream; +using std::ostringstream; +using std::min; +using std::max; +using std::string; +using std::vector; +using std::make_pair; +using std::sort; +using std::unique; +using std::greater; +using std::binary_function; + +using namespace KFS::libkfsio; + +ChunkManager gChunkManager; + +typedef QCDLList ChunkList; +typedef QCDLList ChunkDirList; +typedef ChunkList ChunkLru; + +// Chunk directory state. The present production deployment use one chunk +// directory per physical disk. 
+struct ChunkManager::ChunkDirInfo +{ + ChunkDirInfo() + : dirname(), + usedSpace(0), + availableSpace(0), + totalSpace(0), + pendingReadBytes(0), + pendingWriteBytes(0), + corruptedChunksCount(0), + evacuateCheckIoErrorsCount(0), + evacuateStartByteCount(0), + evacuateStartChunkCount(-1), + chunkCount(0), + diskTimeoutCount(0), + evacuateInFlightCount(0), + rescheduleEvacuateThreshold(0), + diskQueue(0), + deviceId(-1), + dirLock(), + countFsSpaceAvailableFlag(true), + fsSpaceAvailInFlightFlag(false), + checkDirReadableFlightFlag(false), + checkEvacuateFileInFlightFlag(false), + evacuateChunksOpInFlightFlag(false), + evacuateFlag(false), + evacuateStartedFlag(false), + evacuateDoneFlag(false), + evacuateFileRenameInFlightFlag(false), + placementSkipFlag(false), + lastEvacuationActivityTime( + globalNetManager().Now() - 365 * 24 * 60 * 60), + fsSpaceAvailCb(), + checkDirReadableCb(), + checkEvacuateFileCb(), + evacuateChunksCb(), + evacuateChunksOp(0, &evacuateChunksCb) + { + fsSpaceAvailCb.SetHandler(this, + &ChunkDirInfo::FsSpaceAvailDone); + checkDirReadableCb.SetHandler(this, + &ChunkDirInfo::CheckDirReadableDone); + checkEvacuateFileCb.SetHandler(this, + &ChunkDirInfo::CheckEvacuateFileDone); + evacuateChunksCb.SetHandler(this, + &ChunkDirInfo::EvacuateChunksDone); + renameEvacuateFileCb.SetHandler(this, + &ChunkDirInfo::RenameEvacuateFileDone); + for (int i = 0; i < kChunkDirListCount; i++) { + ChunkList::Init(chunkLists[i]); + ChunkDirList::Init(chunkLists[i]); + } + } + int FsSpaceAvailDone(int code, void* data); + int CheckDirReadableDone(int code, void* data); + int CheckEvacuateFileDone(int code, void* data); + int RenameEvacuateFileDone(int code, void* data); + void DiskError(int sysErr); + int EvacuateChunksDone(int code, void* data); + void ScheduleEvacuate(int maxChunkCount = -1); + void RestartEvacuation(); + void UpdateLastEvacuationActivityTime() + { + lastEvacuationActivityTime = globalNetManager().Now(); + } + void ChunkEvacuateDone() + { + 
UpdateLastEvacuationActivityTime(); + if (evacuateInFlightCount > 0 && + --evacuateInFlightCount <= rescheduleEvacuateThreshold) { + ScheduleEvacuate(); + } + } + void Stop() + { + for (int i = 0; i < kChunkDirListCount; i++) { + if (! ChunkDirList::IsEmpty(chunkLists[i])) { + die("chunk dir stop: chunk list is not empty"); + } + } + if (chunkCount != 0) { + die("chunk dir stop: invalid chunk count"); + chunkCount = 0; + } + if (diskQueue) { + string err; + if (! DiskIo::StopIoQueue( + diskQueue, dirname.c_str(), deviceId, &err)) { + die("failed to stop io queue: " + err); + } + deviceId = -1; + diskQueue = 0; + } + availableSpace = -1; + rescheduleEvacuateThreshold = 0; + evacuateFlag = false; + evacuateStartedFlag = false; + evacuateDoneFlag = false; + diskTimeoutCount = 0; + countFsSpaceAvailableFlag = false; + usedSpace = 0; + totalSpace = 0; + evacuateStartChunkCount = -1; + evacuateStartByteCount = -1; + } + void SetEvacuateStarted() + { + evacuateStartedFlag = true; + evacuateStartChunkCount = max(evacuateStartChunkCount, chunkCount); + evacuateStartByteCount = max(evacuateStartByteCount, usedSpace); + } + int GetEvacuateDoneChunkCount() const + { + return (max(evacuateStartChunkCount, chunkCount) - chunkCount); + } + int64_t GetEvacuateDoneByteCount() const + { + return (max(evacuateStartByteCount, usedSpace) - usedSpace); + } + + string dirname; + int64_t usedSpace; + int64_t availableSpace; + int64_t totalSpace; + int64_t pendingReadBytes; + int64_t pendingWriteBytes; + int64_t corruptedChunksCount; + int64_t evacuateCheckIoErrorsCount; + int64_t evacuateStartByteCount; + int evacuateStartChunkCount; + int chunkCount; + int diskTimeoutCount; + int evacuateInFlightCount; + int rescheduleEvacuateThreshold; + DiskQueue* diskQueue; + DirChecker::DeviceId deviceId; + DirChecker::LockFdPtr dirLock; + bool countFsSpaceAvailableFlag:1; + bool fsSpaceAvailInFlightFlag:1; + bool checkDirReadableFlightFlag:1; + bool checkEvacuateFileInFlightFlag:1; + bool 
evacuateChunksOpInFlightFlag:1; + bool evacuateFlag:1; + bool evacuateStartedFlag:1; + bool evacuateDoneFlag:1; + bool evacuateFileRenameInFlightFlag:1; + bool placementSkipFlag:1; + time_t lastEvacuationActivityTime; + KfsCallbackObj fsSpaceAvailCb; + KfsCallbackObj checkDirReadableCb; + KfsCallbackObj checkEvacuateFileCb; + KfsCallbackObj evacuateChunksCb; + KfsCallbackObj renameEvacuateFileCb; + EvacuateChunksOp evacuateChunksOp; + + enum { kChunkInfoHDirListCount = kChunkInfoHandleListCount + 1 }; + enum ChunkListType + { + kChunkDirList = 0, + kChunkDirEvacuateList = 1, + kChunkDirListNone = 2 + }; + enum { kChunkDirListCount = kChunkDirEvacuateList + 1 }; + typedef ChunkInfoHandle* ChunkLists[kChunkInfoHDirListCount]; + ChunkLists chunkLists[kChunkDirListCount]; + +private: + ChunkDirInfo(const ChunkDirInfo&); + ChunkDirInfo& operator=(const ChunkDirInfo&); +}; + +inline ChunkManager::ChunkDirs::~ChunkDirs() +{ + delete [] mChunkDirs; +} + +inline ChunkManager::ChunkDirs::iterator +ChunkManager::ChunkDirs::end() +{ + return mChunkDirs + mSize; +} + +inline ChunkManager::ChunkDirs::const_iterator +ChunkManager::ChunkDirs::end() const +{ + return mChunkDirs + mSize; +} + +inline ChunkManager::ChunkDirInfo& +ChunkManager::ChunkDirs::operator[](size_t i) +{ + return mChunkDirs[i]; +} + +inline const ChunkManager::ChunkDirInfo& +ChunkManager::ChunkDirs::operator[](size_t i) const +{ + return mChunkDirs[i]; +} + +void +ChunkManager::ChunkDirs::Allocate(size_t size) +{ + delete [] mChunkDirs; + mChunkDirs = 0; + mSize = 0; + mChunkDirs = new ChunkDirInfo[size]; + mSize = size; +} + +// OP for reading/writing out the meta-data associated with each chunk. This +// is an internally generated op (ops that generate this one are +// allocate/write/truncate/change-chunk-vers). 
+struct WriteChunkMetaOp : public KfsOp { + kfsChunkId_t const chunkId; + DiskIo* const diskIo; /* disk connection used for writing data */ + IOBuffer dataBuf; /* buffer with the data to be written */ + WriteChunkMetaOp* next; + const kfsSeq_t targetVersion; + const bool renameFlag; + const bool stableFlag; + + WriteChunkMetaOp( + kfsChunkId_t c, + KfsCallbackObj* o, + DiskIo* d, + bool rename, + bool stable, + kfsSeq_t version) + : KfsOp(CMD_WRITE_CHUNKMETA, 0, o), + chunkId(c), + diskIo(d), + dataBuf(), + next(0), + targetVersion(version), + renameFlag(rename), + stableFlag(stable) + { + SET_HANDLER(this, &WriteChunkMetaOp::HandleDone); + } + ~WriteChunkMetaOp() { + delete diskIo; + } + void Execute() {} + inline bool IsRenameNeeded(const ChunkInfoHandle* cih) const; + bool IsWaiting() const { + return (! diskIo && ! renameFlag); + } + int Start(ChunkInfoHandle* cih); + string Show() const { + ostringstream os; + os << "write-chunk-meta: " + " chunkid: " << chunkId << + " rename: " << renameFlag << + " stable: " << stableFlag << + " version: " << targetVersion + ; + return os.str(); + + } + // Notify the op that is waiting for the write to finish that all + // is done + int HandleDone(int code, void *data) { + if (clnt) { + clnt->HandleEvent(code, data); + } + delete this; + return 0; + } +}; + +/// Encapsulate a chunk file descriptor and information about the +/// chunk such as name and version #. 
+class ChunkInfoHandle : public KfsCallbackObj +{ +public: + typedef ChunkManager::ChunkLists ChunkLists; + typedef ChunkManager::ChunkDirInfo ChunkDirInfo; + typedef ChunkDirInfo::ChunkLists ChunkDirLists; + + ChunkInfoHandle(ChunkDirInfo& chunkdir, bool stableFlag = true) + : KfsCallbackObj(), + chunkInfo(), + dataFH(), + lastIOTime(0), + readChunkMetaOp(0), + isBeingReplicated(false), + mDeleteFlag(false), + mWriteAppenderOwnsFlag(false), + mWaitForWritesInFlightFlag(false), + mMetaDirtyFlag(false), + mStableFlag(stableFlag), + mInDoneHandlerFlag(false), + mKeepFlag(false), + mChunkList(ChunkManager::kChunkLruList), + mChunkDirList(ChunkDirInfo::kChunkDirList), + mRenamesInFlight(0), + mWritesInFlight(0), + mWriteMetaOpsHead(0), + mWriteMetaOpsTail(0), + mChunkDir(chunkdir) + { + ChunkList::Init(*this); + ChunkDirList::Init(*this); + ChunkDirList::PushBack(mChunkDir.chunkLists[mChunkDirList], *this); + SET_HANDLER(this, &ChunkInfoHandle::HandleChunkMetaWriteDone); + mChunkDir.chunkCount++; + assert(mChunkDir.chunkCount > 0); + } + + void Delete(ChunkLists* chunkInfoLists) { + const bool evacuateFlag = IsEvacuate(); + ChunkList::Remove(chunkInfoLists[mChunkList], *this); + DetachFromChunkDir(evacuateFlag); + if (mWriteAppenderOwnsFlag) { + mWriteAppenderOwnsFlag = false; + gAtomicRecordAppendManager.DeleteChunk(chunkInfo.chunkId); + } + if (mWriteMetaOpsHead || mInDoneHandlerFlag) { + mDeleteFlag = true; + const bool runHanlder = ! mInDoneHandlerFlag && + mWritesInFlight > 0 && mWaitForWritesInFlightFlag; + mWaitForWritesInFlightFlag = false; + mWritesInFlight = 0; + if (runHanlder) { + int res = -1; + HandleEvent(EVENT_DISK_ERROR, &res); + } + } else { + delete this; + } + } + + bool IsEvacuate() const { + return (! IsStale() && + mChunkDirList == ChunkDirInfo::kChunkDirEvacuateList); + } + + bool SetEvacuate(bool flag) { + if (IsStale()) { + return false; + } + if (IsEvacuate() == flag) { + return true; + } + mChunkDir.evacuateInFlightCount += (flag ? 
1 : -1); + if (mChunkDir.evacuateInFlightCount < 0) { + mChunkDir.evacuateInFlightCount = 0; + } + ChunkDirList::Remove(mChunkDir.chunkLists[mChunkDirList], *this); + mChunkDirList = flag ? + ChunkDirInfo::kChunkDirEvacuateList : + ChunkDirInfo::kChunkDirList; + ChunkDirList::PushBack(mChunkDir.chunkLists[mChunkDirList], *this); + return true; + } + + ChunkInfo_t chunkInfo; + /// Chunks are stored as files in he underlying filesystem; each + /// chunk file is named by the chunkId. Each chunk has a header; + /// this header is hidden from clients; all the client I/O is + /// offset by the header amount + DiskIo::FilePtr dataFH; + // when was the last I/O done on this chunk + time_t lastIOTime; + /// keep track of the op that is doing the read + ReadChunkMetaOp* readChunkMetaOp; + + void Release(ChunkLists* chunkInfoLists); + + bool IsFileOpen() const { + return (dataFH && dataFH->IsOpen()); + } + + bool IsFileInUse() const { + return (IsFileOpen() && ! dataFH.unique()); + } + + bool IsStable() const { + return mStableFlag; + } + + void StartWrite(WriteOp* /* op */) { + assert(mWritesInFlight >= 0); + mWritesInFlight++; + mMetaDirtyFlag = true; + } + + void SetMetaDirty() { + mMetaDirtyFlag = true; + } + + void WriteDone(const WriteOp* op = 0) { + assert(mWritesInFlight > 0); + mWritesInFlight--; + if (mWritesInFlight == 0 && mWaitForWritesInFlightFlag) { + assert(mWriteMetaOpsHead); + mWaitForWritesInFlightFlag = false; + int res = mWriteMetaOpsHead->Start(this); + if (res < 0) { + HandleEvent(EVENT_DISK_ERROR, &res); + } + } + } + + bool IsFileEquals(const DiskIo::File* file) const { + return (file && file == dataFH.get()); + } + + bool IsFileEquals(const DiskIo* diskIo) const { + return (diskIo && IsFileEquals(diskIo->GetFilePtr().get())); + } + + bool IsFileEquals(const DiskIoPtr& diskIoPtr) const { + return IsFileEquals(diskIoPtr.get()); + } + + bool SyncMeta() { + if (mWriteMetaOpsHead || mWritesInFlight > 0) { + return true; + } + if (mMetaDirtyFlag) { + 
WriteChunkMetadata(); + return true; + } + return false; + } + + inline void LruUpdate(ChunkLists* chunkInfoLists); + inline void SetWriteAppenderOwns(ChunkLists* chunkInfoLists, bool flag); + inline bool IsWriteAppenderOwns() const; + int WriteChunkMetadata( + KfsCallbackObj* cb, + bool renameFlag, + bool stableFlag, + kfsSeq_t targetVersion); + int WriteChunkMetadata( + KfsCallbackObj* cb = 0) + { + return WriteChunkMetadata(cb, false, mStableFlag, + mStableFlag ? chunkInfo.chunkVersion : kfsSeq_t(0)); + } + kfsSeq_t GetTargetStateAndVersion(bool& stableFlag) const { + if (! mWriteMetaOpsTail || mRenamesInFlight <= 0) { + stableFlag = mStableFlag; + return chunkInfo.chunkVersion; + } + if (mWriteMetaOpsTail->renameFlag) { + stableFlag = mWriteMetaOpsTail->stableFlag; + return mWriteMetaOpsTail->targetVersion; + } + stableFlag = mStableFlag; + kfsSeq_t theRet = chunkInfo.chunkVersion; + for (const WriteChunkMetaOp* + op = mWriteMetaOpsHead; op; op = op->next) { + if (op->renameFlag) { + theRet = op->targetVersion; + stableFlag = mWriteMetaOpsTail->stableFlag; + } + } + return theRet; + } + bool CanHaveVersion(kfsSeq_t vers) const { + if (vers == chunkInfo.chunkVersion) { + return true; + } + for (const WriteChunkMetaOp* + op = mWriteMetaOpsHead; op; op = op->next) { + if (op->renameFlag && vers == op->targetVersion) { + return true; + } + } + return false; + } + bool IsChunkReadable() const { + return (! 
mWriteMetaOpsHead && mStableFlag && mWritesInFlight <= 0); + } + bool IsRenameInFlight() const { + return (mRenamesInFlight > 0); + } + bool HasWritesInFlight() const { + return (mWritesInFlight > 0); + } + bool IsStale() const { + return (mChunkList == ChunkManager::kChunkStaleList || + mChunkList == ChunkManager::kChunkPendingStaleList); + } + bool IsKeep() const { + return mKeepFlag; + } + void MakeStale(ChunkLists* chunkInfoLists, bool keepFlag) { + if (IsStale()) { + return; + } + mKeepFlag = keepFlag; + if (mWriteAppenderOwnsFlag) { + mWriteAppenderOwnsFlag = false; + gAtomicRecordAppendManager.DeleteChunk(chunkInfo.chunkId); + } + UpdateStale(chunkInfoLists); + // Chunk is no longer in the chunk table, no further write ops + // completion notification will get here. Clear write op counter and + // restart the next op if needed. + if (mWritesInFlight > 0) { + mWritesInFlight = 1; + WriteDone(); + } + } + void UpdateStale(ChunkLists* chunkInfoLists) { + const bool evacuateFlag = IsEvacuate(); + ChunkList::Remove(chunkInfoLists[mChunkList], *this); + mChunkList = mRenamesInFlight > 0 ? + ChunkManager::kChunkPendingStaleList : + ChunkManager::kChunkStaleList; + ChunkList::PushBack(chunkInfoLists[mChunkList], *this); + DetachFromChunkDir(evacuateFlag); + } + const string& GetDirname() const { return mChunkDir.dirname; } + const ChunkDirInfo& GetDirInfo() const { return mChunkDir; } + ChunkDirInfo& GetDirInfo() { return mChunkDir; } + + bool isBeingReplicated:1; // is the chunk being replicated from + // another server +private: + bool mDeleteFlag:1; + bool mWriteAppenderOwnsFlag:1; + bool mWaitForWritesInFlightFlag:1; + bool mMetaDirtyFlag:1; + bool mStableFlag:1; + bool mInDoneHandlerFlag:1; + bool mKeepFlag:1; + ChunkManager::ChunkListType mChunkList:2; + ChunkDirInfo::ChunkListType mChunkDirList:2; + unsigned int mRenamesInFlight:19; + // Chunk meta data updates need to be executed in order, allow only one + // write in flight. 
+ int mWritesInFlight; + WriteChunkMetaOp* mWriteMetaOpsHead; + WriteChunkMetaOp* mWriteMetaOpsTail; + ChunkDirInfo& mChunkDir; + ChunkInfoHandle* mPrevPtr[ChunkDirInfo::kChunkInfoHDirListCount]; + ChunkInfoHandle* mNextPtr[ChunkDirInfo::kChunkInfoHDirListCount]; + + void DetachFromChunkDir(bool evacuateFlag) { + if (mChunkDirList == ChunkDirInfo::kChunkDirListNone) { + return; + } + ChunkDirList::Remove(mChunkDir.chunkLists[mChunkDirList], *this); + assert(mChunkDir.chunkCount > 0); + mChunkDir.chunkCount--; + mChunkDirList = ChunkDirInfo::kChunkDirListNone; + if (evacuateFlag) { + mChunkDir.ChunkEvacuateDone(); + } + } + + int HandleChunkMetaWriteDone(int code, void *data); + virtual ~ChunkInfoHandle() { + if (mWriteMetaOpsHead) { + // Object is the "client" of this op. + die("attempt to delete chunk info handle " + "with meta data write in flight"); + } + if (IsFileOpen()) { + globals().ctrOpenDiskFds.Update(-1); + } + } + void UpdateState() { + if (mInDoneHandlerFlag) { + return; + } + if (mDeleteFlag || IsStale()) { + if (! mWriteMetaOpsHead) { + if (IsStale()) { + gChunkManager.UpdateStale(*this); + } else { + delete this; + } + } + } else { + gChunkManager.LruUpdate(*this); + } + } + friend class QCDLListOp; + friend class QCDLListOp; +private: + ChunkInfoHandle(const ChunkInfoHandle&); + ChunkInfoHandle& operator=(const ChunkInfoHandle&); +}; + +inline bool ChunkManager::IsInLru(const ChunkInfoHandle& cih) const { + return (! cih.IsStale() && + ChunkList::IsInList(mChunkInfoLists[kChunkLruList], cih)); +} + +inline void ChunkInfoHandle::LruUpdate(ChunkInfoHandle::ChunkLists* chunkInfoLists) { + if (IsStale()) { + return; + } + lastIOTime = globalNetManager().Now(); + if (! mWriteAppenderOwnsFlag && ! isBeingReplicated && ! mWriteMetaOpsHead) { + ChunkList::PushBack(chunkInfoLists[mChunkList], *this); + assert(gChunkManager.IsInLru(*this)); + } else { + ChunkList::Remove(chunkInfoLists[mChunkList], *this); + assert(! 
gChunkManager.IsInLru(*this)); + } +} + +inline void ChunkInfoHandle::SetWriteAppenderOwns(ChunkInfoHandle::ChunkLists* chunkInfoLists, bool flag) { + if (mDeleteFlag || IsStale() || flag == mWriteAppenderOwnsFlag) { + return; + } + mWriteAppenderOwnsFlag = flag; + if (mWriteAppenderOwnsFlag) { + ChunkList::Remove(chunkInfoLists[mChunkList], *this); + assert(! gChunkManager.IsInLru(*this)); + } else { + LruUpdate(chunkInfoLists); + } +} + +inline bool ChunkInfoHandle::IsWriteAppenderOwns() const +{ + return mWriteAppenderOwnsFlag; +} + +inline void ChunkManager::LruUpdate(ChunkInfoHandle& cih) { + cih.LruUpdate(mChunkInfoLists); +} + +inline void ChunkManager::Release(ChunkInfoHandle& cih) { + cih.Release(mChunkInfoLists); +} + +inline void ChunkManager::Delete(ChunkInfoHandle& cih) { + if (! cih.IsStale() && ! mPendingWrites.Delete( + cih.chunkInfo.chunkId, cih.chunkInfo.chunkVersion)) { + ostringstream os; + os << "delete failed to cleanup pending writes: " + " chunk: " << cih.chunkInfo.chunkId << + " version: " << cih.chunkInfo.chunkVersion + ; + die(os.str()); + } + cih.Delete(mChunkInfoLists); +} + +inline void ChunkManager::UpdateStale(ChunkInfoHandle& cih) { + assert(cih.IsStale()); + cih.UpdateStale(mChunkInfoLists); + RunStaleChunksQueue(); +} + +void +ChunkInfoHandle::Release(ChunkInfoHandle::ChunkLists* chunkInfoLists) +{ + chunkInfo.UnloadChecksums(); + if (! IsFileOpen()) { + return; + } + string errMsg; + if (! 
dataFH->Close( + chunkInfo.chunkSize + KFS_CHUNK_HEADER_SIZE, + &errMsg)) { + KFS_LOG_STREAM_INFO << + "chunk " << chunkInfo.chunkId << " close error: " << errMsg << + KFS_LOG_EOM; + dataFH.reset(); + } + KFS_LOG_STREAM_INFO << + "Closing chunk " << chunkInfo.chunkId << " and might give up lease" << + KFS_LOG_EOM; + gLeaseClerk.RelinquishLease(chunkInfo.chunkId, chunkInfo.chunkSize); + + ChunkList::Remove(chunkInfoLists[mChunkList], *this); + globals().ctrOpenDiskFds.Update(-1); +} + +inline bool +WriteChunkMetaOp::IsRenameNeeded(const ChunkInfoHandle* cih) const +{ + return ( + renameFlag && + ((cih->IsStable() && cih->chunkInfo.chunkVersion != targetVersion) || + cih->IsStable() != stableFlag) + ); +} + +int +WriteChunkMetaOp::Start(ChunkInfoHandle* cih) +{ + gChunkManager.LruUpdate(*cih); + if (renameFlag) { + if (! IsRenameNeeded(cih)) { + int64_t res = 0; + cih->HandleEvent(EVENT_DISK_RENAME_DONE, &res); + return 0; + } + if (! DiskIo::Rename( + gChunkManager.MakeChunkPathname(cih).c_str(), + gChunkManager.MakeChunkPathname( + cih, stableFlag, targetVersion).c_str(), + cih, + &statusMsg)) { + status = -EAGAIN; + KFS_LOG_STREAM_ERROR << + Show() << " failed: " << statusMsg << + KFS_LOG_EOM; + } + } else { + assert(diskIo); + status = diskIo->Write(0, dataBuf.BytesConsumable(), &dataBuf); + } + return status; +} + +int +ChunkInfoHandle::WriteChunkMetadata( + KfsCallbackObj* cb, + bool renameFlag, + bool stableFlag, + kfsSeq_t targetVersion) +{ + if (renameFlag && (int)mRenamesInFlight + 1 <= 0) { + // Overflow: too many renames in flight. + return -ESERVERBUSY; + } + // If chunk is not stable and is not transitioning into stable, and there + // are no pending ops, just assign the version and mark meta dirty. + if (targetVersion > 0 && chunkInfo.chunkVersion != targetVersion && + mWritesInFlight <= 0 && + ! IsStable() && ! stableFlag && ! mWriteMetaOpsTail && + ! mInDoneHandlerFlag && IsFileOpen() && + ! mDeleteFlag && ! 
IsStale()) { + mMetaDirtyFlag = true; + chunkInfo.chunkVersion = targetVersion; + if (cb) { + int res = 0; + cb->HandleEvent(renameFlag ? + EVENT_DISK_RENAME_DONE : EVENT_DISK_WROTE, &res); + } + UpdateState(); + return 0; + } + if (renameFlag) { + // Queue the version update first, then immediately queue rename. + // Not stable chunks on disk always have version 0. + mMetaDirtyFlag = true; + const int ret = WriteChunkMetadata( + 0, false, stableFlag, stableFlag ? targetVersion : kfsSeq_t(0)); + if (ret != 0) { + return ret; + } + } + DiskIo* d = 0; + if (! renameFlag) { + if (! mMetaDirtyFlag) { + if (! cb) { + return 0; + } + if (! mWriteMetaOpsTail) { + assert(mRenamesInFlight <= 0); + int res = 0; + cb->HandleEvent(EVENT_DISK_WROTE, &res); + UpdateState(); + return 0; + } + } + if (mMetaDirtyFlag) { + d = gChunkManager.SetupDiskIo(this, this); + if (! d) { + return -ESERVERBUSY; + } + mMetaDirtyFlag = false; + } else { + // Add to pending meta op to completion queue. + assert(mWriteMetaOpsTail); + } + } + WriteChunkMetaOp* const wcm = new WriteChunkMetaOp(chunkInfo.chunkId, + cb, d, renameFlag, stableFlag, targetVersion); + if (d) { + const kfsSeq_t prevVersion = chunkInfo.chunkVersion; + chunkInfo.chunkVersion = targetVersion; + chunkInfo.Serialize(&wcm->dataBuf); + chunkInfo.chunkVersion = prevVersion; + const uint64_t checksum = + ComputeBlockChecksum(&wcm->dataBuf, wcm->dataBuf.BytesConsumable()); + wcm->dataBuf.CopyIn( + reinterpret_cast(&checksum), (int)sizeof(checksum)); + wcm->dataBuf.ZeroFillLast(); + if ((int)KFS_CHUNK_HEADER_SIZE < wcm->dataBuf.BytesConsumable()) { + die("invalid io buffer size"); + } + } + if (wcm->renameFlag) { + mRenamesInFlight++; + assert(mRenamesInFlight > 0); + } + if (mWriteMetaOpsTail) { + assert(mWriteMetaOpsHead); + while (mWriteMetaOpsTail->next) { + mWriteMetaOpsTail = mWriteMetaOpsTail->next; + } + mWriteMetaOpsTail->next = wcm; + mWriteMetaOpsTail = wcm; + return 0; + } + assert(! 
mWriteMetaOpsHead); + mWriteMetaOpsHead = wcm; + mWriteMetaOpsTail = wcm; + if (mWritesInFlight > 0) { + mWaitForWritesInFlightFlag = true; + return 0; + } + const int res = wcm->Start(this); + if (res < 0) { + mWriteMetaOpsHead = 0; + mWriteMetaOpsTail = 0; + delete wcm; + } + return (res >= 0 ? 0 : res); +} + +int +ChunkInfoHandle::HandleChunkMetaWriteDone(int codeIn, void *dataIn) +{ + const bool prevInDoneHandlerFlag = mInDoneHandlerFlag; + mInDoneHandlerFlag = true; + int64_t res; + int err; + int code = codeIn; + void* data = dataIn; + // Do not rely on compiler to unroll tail recursion, use loop. + for (; ;) { + assert(mWriteMetaOpsHead); + int status = data ? *reinterpret_cast(data) : -1; + if (code == EVENT_DISK_ERROR && status >= 0) { + status = -1; + } + if ((! mDeleteFlag && ! IsStale()) && status < 0) { + KFS_LOG_STREAM_ERROR << mWriteMetaOpsHead->Show() << + " failed: status: " << status << + " op: status: " << mWriteMetaOpsHead->status << + " msg: " << mWriteMetaOpsHead->statusMsg << + KFS_LOG_EOM; + if (! isBeingReplicated) { + gChunkManager.ChunkIOFailed(this, status); + } + } + if (mWriteMetaOpsHead->status >= 0) { + mWriteMetaOpsHead->status = status; + } + if (mWriteMetaOpsHead->renameFlag) { + assert(mRenamesInFlight > 0); + mRenamesInFlight--; + if (mWriteMetaOpsHead->status == 0) { + if (code != EVENT_DISK_RENAME_DONE) { + ostringstream os; + os << "chunk meta write completion:" + " unexpected event code: " << code; + die(os.str()); + } + mStableFlag = mWriteMetaOpsHead->stableFlag; + chunkInfo.chunkVersion = mWriteMetaOpsHead->targetVersion; + if (mStableFlag) { + mWriteAppenderOwnsFlag = false; + // LruUpdate below will add it back to the lru list. + } + } + } + WriteChunkMetaOp* const cur = mWriteMetaOpsHead; + mWriteMetaOpsHead = cur->next; + const bool doneFlag = ! 
mWriteMetaOpsHead; + if (doneFlag) { + mWriteMetaOpsTail = 0; + } + cur->HandleEvent(code, data); + if (doneFlag) { + break; + } + if (mWriteMetaOpsHead->IsWaiting()) { + // Call the completion, this op was waiting for the one that + // just completed. + continue; + } + if (mWritesInFlight > 0) { + mWaitForWritesInFlightFlag = true; + break; + } + if (mWriteMetaOpsHead->renameFlag && + ! mWriteMetaOpsHead->IsRenameNeeded(this)) { + res = 0; + data = &res; + code = EVENT_DISK_RENAME_DONE; + continue; + } + if (mDeleteFlag || IsStale()) { + err = -EBADF; + } else if ((err = mWriteMetaOpsHead->Start(this)) >= 0) { + break; + } + data = &err; + code = EVENT_DISK_ERROR; + } + mInDoneHandlerFlag = prevInDoneHandlerFlag; + UpdateState(); + return 0; +} + +static int +GetMaxOpenFds() +{ + struct rlimit rlim; + int maxOpenFds = 0; + + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { + maxOpenFds = rlim.rlim_cur; + // bump the soft limit to the hard limit + rlim.rlim_cur = rlim.rlim_max; + if (setrlimit(RLIMIT_NOFILE, &rlim) == 0) { + maxOpenFds = rlim.rlim_cur; + } + } + KFS_LOG_STREAM_INFO << + "max # of open files: " << maxOpenFds << + KFS_LOG_EOM; + return maxOpenFds; +} + +// Chunk manager implementation. +ChunkManager::ChunkManager() + : mMaxPendingWriteLruSecs(300), + mCheckpointIntervalSecs(120), + mTotalSpace(int64_t(1) << 62), + mUsedSpace(0), + mMinFsAvailableSpace((int64_t)(CHUNKSIZE + KFS_CHUNK_HEADER_SIZE)), + mMaxSpaceUtilizationThreshold(0.05), + mNextCheckpointTime(0), + mMaxOpenChunkFiles((64 << 10) - 8), + mMaxOpenFds(1 << 10), + mFdsPerChunk(1), + mChunkDirs(), + mWriteId(GetRandomSeq()), // Seed write id. 
+ mPendingWrites(), + mChunkTable(), + mMaxIORequestSize(4 << 20), + mNextChunkDirsCheckTime(globalNetManager().Now() - 1), + mChunkDirsCheckIntervalSecs(120), + mNextGetFsSpaceAvailableTime(globalNetManager().Now() - 1), + mGetFsSpaceAvailableIntervalSecs(25), + mInactiveFdsCleanupIntervalSecs(300), + mNextInactiveFdCleanupTime(0), + mReadChecksumMismatchMaxRetryCount(0), + mAbortOnChecksumMismatchFlag(false), + mRequireChunkHeaderChecksumFlag(false), + mForceDeleteStaleChunksFlag(false), + mKeepEvacuatedChunksFlag(false), + mStaleChunkCompletion(*this), + mStaleChunkOpsInFlight(0), + mMaxStaleChunkOpsInFlight(4), + mMaxDirCheckDiskTimeouts(4), + mChunkPlacementPendingReadWeight(0), + mChunkPlacementPendingWriteWeight(0), + mMaxPlacementSpaceRatio(0.2), + mMinPendingIoThreshold(8 << 20), + mAllowSparseChunksFlag(true), + mBufferedIoFlag(false), + mNullBlockChecksum(0), + mCounters(), + mDirChecker(), + mCleanupChunkDirsFlag(true), + mStaleChunksDir("lost+found"), + mDirtyChunksDir("dirty"), + mEvacuateFileName("evacuate"), + mEvacuateDoneFileName(mEvacuateFileName + ".done"), + mChunkDirLockName("lock"), + mEvacuationInactivityTimeout(300), + mMetaHeartbeatTime(globalNetManager().Now() - 365 * 24 * 60 * 60), + mMetaEvacuateCount(-1), + mMaxEvacuateIoErrors(2), + mChunkHeaderBuffer(reinterpret_cast(&mChunkHeaderBufferAlloc)) +{ + mDirChecker.SetInterval(180); + srand48((long)globalNetManager().Now()); + for (int i = 0; i < kChunkInfoListCount; i++) { + ChunkList::Init(mChunkInfoLists[i]); + } + globalNetManager().SetMaxAcceptsPerRead(4096); +} + +ChunkManager::~ChunkManager() +{ + assert(mChunkTable.IsEmpty()); + globalNetManager().UnRegisterTimeoutHandler(this); +} + +void +ChunkManager::Shutdown() +{ + mDirChecker.Stop(); + // Run delete queue before removing chunk table entries. 
+ RunStaleChunksQueue(); + for (int i = 0; ;) { + const bool completionFlag = DiskIo::RunIoCompletion(); + if (mStaleChunkOpsInFlight <= 0) { + break; + } + if (completionFlag) { + continue; + } + if (++i > 1000) { + KFS_LOG_STREAM_ERROR << + "ChunkManager::Shutdown pending delete timeout exceeded" << + KFS_LOG_EOM; + ChunkList::Iterator it(mChunkInfoLists[kChunkStaleList]); + ChunkInfoHandle* cih; + while ((cih = it.Next())) { + Delete(*cih); + } + break; + } + usleep(10000); + } + + ScavengePendingWrites(time(0) + 2 * mMaxPendingWriteLruSecs); + CMap tmp; + const CMapEntry* p; + mChunkTable.First(); + while ((p = mChunkTable.Next())) { + ChunkInfoHandle* const cih = p->GetVal(); + if (cih->IsFileInUse()) { + cih->SetWriteAppenderOwns(mChunkInfoLists, false); + bool newEntryFlag = true; + tmp.Insert(p->GetKey(), cih, newEntryFlag); + continue; + } + Release(*cih); + Delete(*cih); + } + mChunkTable.Clear(); + mChunkTable.Swap(tmp); + gAtomicRecordAppendManager.Shutdown(); + for (int i = 0; ;) { + mChunkTable.First(); + while ((p = mChunkTable.Next())) { + ChunkInfoHandle* const cih = p->GetVal(); + if (! cih) { + mChunkTable.Erase(p->GetKey()); + continue; + } + if (cih->IsFileInUse()) { + break; + } + mChunkTable.Erase(p->GetKey()); + Release(*cih); + Delete(*cih); + } + const bool completionFlag = DiskIo::RunIoCompletion(); + if (mChunkTable.IsEmpty()) { + break; + } + if (completionFlag) { + continue; + } + if (++i > 1000) { + KFS_LOG_STREAM_ERROR << + "ChunkManager::Shutdown timeout exceeded" << + KFS_LOG_EOM; + break; + } + usleep(10000); + } + globalNetManager().UnRegisterTimeoutHandler(this); + string errMsg; + if (! 
DiskIo::Shutdown(&errMsg)) { + KFS_LOG_STREAM_INFO << + "DiskIo::Shutdown failure: " << errMsg << + KFS_LOG_EOM; + } +} + +bool +ChunkManager::IsWriteAppenderOwns(kfsChunkId_t chunkId) const +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + return (ci && (*ci)->IsWriteAppenderOwns()); +} + +void +ChunkManager::SetParameters(const Properties& prop) +{ + mInactiveFdsCleanupIntervalSecs = prop.getValue( + "chunkServer.inactiveFdsCleanupIntervalSecs", + mInactiveFdsCleanupIntervalSecs); + mMaxPendingWriteLruSecs = max(1, prop.getValue( + "chunkServer.maxPendingWriteLruSecs", + mMaxPendingWriteLruSecs)); + mCheckpointIntervalSecs = max(1, prop.getValue( + "chunkServer.checkpointIntervalSecs", + mCheckpointIntervalSecs)); + mChunkDirsCheckIntervalSecs = max(1, prop.getValue( + "chunkServer.chunkDirsCheckIntervalSecs", + mChunkDirsCheckIntervalSecs)); + mGetFsSpaceAvailableIntervalSecs = max(1, prop.getValue( + "chunkServer.getFsSpaceAvailableIntervalSecs", + mGetFsSpaceAvailableIntervalSecs)); + mAbortOnChecksumMismatchFlag = prop.getValue( + "chunkServer.abortOnChecksumMismatchFlag", + mAbortOnChecksumMismatchFlag ? 1 : 0) != 0; + mReadChecksumMismatchMaxRetryCount = prop.getValue( + "chunkServer.readChecksumMismatchMaxRetryCount", + mReadChecksumMismatchMaxRetryCount); + mRequireChunkHeaderChecksumFlag = prop.getValue( + "chunkServer.requireChunkHeaderChecksum", + mRequireChunkHeaderChecksumFlag ? 1 : 0) != 0; + mForceDeleteStaleChunksFlag = prop.getValue( + "chunkServer.forceDeleteStaleChunks", + mForceDeleteStaleChunksFlag ? 1 : 0) != 0; + mKeepEvacuatedChunksFlag = prop.getValue( + "chunkServer.keepEvacuatedChunksFlag", + mKeepEvacuatedChunksFlag ? 
1 : 0) != 0; + mMaxStaleChunkOpsInFlight = prop.getValue( + "chunkServer.maxStaleChunkOpsInFlight", + mMaxStaleChunkOpsInFlight); + mMaxDirCheckDiskTimeouts = prop.getValue( + "chunkServer.maxDirCheckDiskTimeouts", + mMaxDirCheckDiskTimeouts); + mTotalSpace = prop.getValue( + "chunkServer.totalSpace", + mTotalSpace); + mMinFsAvailableSpace = max(int64_t(CHUNKSIZE + KFS_CHUNK_HEADER_SIZE), + prop.getValue( + "chunkServer.minFsAvailableSpace", + mMinFsAvailableSpace)); + mMaxSpaceUtilizationThreshold = prop.getValue( + "chunkServer.maxSpaceUtilizationThreshold", + mMaxSpaceUtilizationThreshold); + mChunkPlacementPendingReadWeight = prop.getValue( + "chunkServer.chunkPlacementPendingReadWeight", + mChunkPlacementPendingReadWeight); + mChunkPlacementPendingWriteWeight = prop.getValue( + "chunkServer.chunkPlacementPendingWriteWeight", + mChunkPlacementPendingWriteWeight); + mMinPendingIoThreshold = prop.getValue( + "chunkServer.minPendingIoThreshold", + mMinPendingIoThreshold); + mMaxPlacementSpaceRatio = prop.getValue( + "chunkServer.maxPlacementSpaceRatio", + mMaxPlacementSpaceRatio); + mAllowSparseChunksFlag = prop.getValue( + "chunkServer.allowSparseChunks", + mAllowSparseChunksFlag ? 1 : 0) != 0; + mBufferedIoFlag = prop.getValue( + "chunkServer.bufferedIo", + mBufferedIoFlag ? 
1 : 0) != 0; + mEvacuateFileName = prop.getValue( + "chunkServer.evacuateFileName", + mEvacuateFileName); + mEvacuateDoneFileName = prop.getValue( + "chunkServer.evacuateDoneFileName", + mEvacuateDoneFileName); + mEvacuationInactivityTimeout = prop.getValue( + "chunkServer.evacuationInactivityTimeout", + mEvacuationInactivityTimeout); + mDirChecker.SetInterval(prop.getValue( + "chunkServer.dirRecheckInterval", + mDirChecker.GetInterval() / 1000) * 1000); + mCleanupChunkDirsFlag = prop.getValue( + "chunkServer.cleanupChunkDirs", + mCleanupChunkDirsFlag); + mDirChecker.SetRemoveFilesFlag(mCleanupChunkDirsFlag); + + TcpSocket::SetDefaultRecvBufSize(prop.getValue( + "chunkServer.tcpSocket.recvBufSize", + TcpSocket::GetDefaultRecvBufSize())); + TcpSocket::SetDefaultSendBufSize(prop.getValue( + "chunkServer.tcpSocket.sendBufSize", + TcpSocket::GetDefaultSendBufSize())); + + globalNetManager().SetMaxAcceptsPerRead(prop.getValue( + "chunkServer.net.maxAcceptsPerRead", + globalNetManager().GetMaxAcceptsPerRead())); + + DiskIo::SetParameters(prop); + Replicator::SetParameters(prop); + + gClientManager.SetTimeouts( + prop.getValue("chunkServer.client.ioTimeoutSec", 5 * 60), + prop.getValue("chunkServer.client.idleTimeoutSec", 10 * 60) + ); + RemoteSyncSM::SetResponseTimeoutSec( + prop.getValue("chunkServer.remoteSync.responseTimeoutSec", + RemoteSyncSM::GetResponseTimeoutSec()) + ); + RemoteSyncSM::SetTraceRequestResponse( + prop.getValue("chunkServer.remoteSync.traceRequestResponse", false) + ); + mMaxEvacuateIoErrors = max(1, prop.getValue( + "chunkServer.maxEvacuateIoErrors", + mMaxEvacuateIoErrors + )); + + DirChecker::FileNames excludes; + excludes.insert(mEvacuateDoneFileName); + mDirChecker.SetDontUseIfExist(excludes); + gAtomicRecordAppendManager.SetParameters(prop); + + const time_t now = globalNetManager().Now(); + mNextGetFsSpaceAvailableTime = min(mNextGetFsSpaceAvailableTime, + now + mGetFsSpaceAvailableIntervalSecs); + mNextChunkDirsCheckTime = 
min(mNextChunkDirsCheckTime, + now + mChunkDirsCheckIntervalSecs); +} + +static string AddTrailingPathSeparator(const string& dir) +{ + return ((! dir.empty() && dir[dir.length() - 1] != '/') ? + dir + "/" : dir); +} + +struct EqualPrefixStr : public binary_function +{ + bool operator()(const string& x, const string& y) const + { + return x.compare(0, min(x.length(), y.length()), y) == 0; + } +}; + +bool +ChunkManager::Init(const vector& chunkDirs, const Properties& prop) +{ + if (chunkDirs.empty()) { + KFS_LOG_STREAM_ERROR << + "no chunk directories specified" << + KFS_LOG_EOM; + return false; + } + + // allow to change dir names only before io starts. + mStaleChunksDir = prop.getValue( + "chunkServer.staleChunksDir", + mStaleChunksDir); + mDirtyChunksDir = prop.getValue( + "chunkServer.dirtyChunksDir", + mDirtyChunksDir); + mChunkDirLockName = prop.getValue( + "chunkServer.dirLockFileName", + mChunkDirLockName); + if (mStaleChunksDir.empty()) { + KFS_LOG_STREAM_ERROR << + "invalid stale chunks dir name: " << mStaleChunksDir << + KFS_LOG_EOM; + return false; + } + if (mDirtyChunksDir.empty()) { + KFS_LOG_STREAM_ERROR << + "invalid stale chunks dir name: " << mDirtyChunksDir << + KFS_LOG_EOM; + return false; + } + mStaleChunksDir = AddTrailingPathSeparator(mStaleChunksDir); + mDirtyChunksDir = AddTrailingPathSeparator(mDirtyChunksDir); + + SetParameters(prop); + + // Normalize tailing /, and keep only longest prefixes: + // only leave leaf directories. 
vector<string> dirs;
ChunkInfoHandle** const cie = mChunkTable.Find(chunkId); + if (cie) { + if (isBeingReplicated) { + return -EINVAL; + } + ChunkInfoHandle* const cih = *cie; + if (cih->isBeingReplicated || cih->IsStable() || + cih->IsWriteAppenderOwns() || + cih->chunkInfo.chunkVersion != chunkVersion) { + return -EINVAL; + } + if (outCih) { + *outCih = cih; + } + return 0; + } else if (mustExistFlag) { + return -EBADF; + } + + // Find the directory to use + ChunkDirInfo* const chunkdir = GetDirForChunk(); + if (! chunkdir) { + KFS_LOG_STREAM_INFO << + "no directory has space to host chunk " << chunkId << + KFS_LOG_EOM; + return -ENOSPC; + } + + // Chunks are dirty until they are made stable: A chunk becomes + // stable when the write lease on the chunk expires and the + // metaserver says the chunk is now stable. Dirty chunks are + // stored in a "dirty" dir; chunks in this dir will get nuked + // on a chunkserver restart. This provides a very simple failure + // handling model. + + CleanupInactiveFds(); + + const bool stableFlag = false; + ChunkInfoHandle* const cih = new ChunkInfoHandle(*chunkdir, stableFlag); + cih->chunkInfo.Init(fileId, chunkId, chunkVersion); + cih->isBeingReplicated = isBeingReplicated; + cih->SetMetaDirty(); + bool newEntryFlag = false; + if (! mChunkTable.Insert(chunkId, cih, newEntryFlag) || ! newEntryFlag) { + die("chunk insertion failure"); + cih->Delete(mChunkInfoLists); + return -EFAULT; + } + KFS_LOG_STREAM_INFO << "Creating chunk: " << MakeChunkPathname(cih) << + KFS_LOG_EOM; + int ret = OpenChunk(cih, O_RDWR | O_CREAT); + if (ret < 0) { + // open chunk failed: the entry in the chunk table is cleared and + // Delete(*cih) is also called in OpenChunk(). 
Return the + // error code + return ret; + } + if (outCih) { + *outCih = cih; + } + return ret; +} + +void +ChunkManager::AllocChunkForAppend( + AllocChunkOp* op, int replicationPos, ServerLocation peerLoc) +{ + if (IsWritePending(op->chunkId)) { + op->statusMsg = "random write in progress"; + op->status = -EINVAL; + } + ChunkInfoHandle *cih = 0; + op->status = AllocChunk( + op->fileId, op->chunkId, op->chunkVersion, false, &cih, + op->mustExistFlag); + if (op->status != 0) { + return; + } + assert(cih); + gAtomicRecordAppendManager.AllocateChunk( + op, replicationPos, peerLoc, cih->dataFH); + if (op->status == 0) { + cih->SetWriteAppenderOwns(mChunkInfoLists, true); + } +} + +bool +ChunkManager::IsChunkStable(const ChunkInfoHandle* cih) const +{ + return ( + cih->IsStable() && + (! cih->IsWriteAppenderOwns() || + gAtomicRecordAppendManager.IsChunkStable(cih->chunkInfo.chunkId)) && + ! IsWritePending(cih->chunkInfo.chunkId) && + ! cih->isBeingReplicated + ); +} + +bool +ChunkManager::IsChunkStable(kfsChunkId_t chunkId) const +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + return (! ci || IsChunkStable(*ci)); +} + +bool +ChunkManager::IsChunkReadable(kfsChunkId_t chunkId) const +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + return (! ci || (IsChunkStable(*ci) && (*ci)->IsChunkReadable())); +} + +bool +ChunkManager::IsChunkStable(MakeChunkStableOp* op) +{ + if (op->hasChecksum) { + return false; // Have to run make stable to compare the checksum. + } + ChunkInfoHandle** const ci = mChunkTable.Find(op->chunkId); + if (! ci) { + op->statusMsg = "no such chunk"; + op->status = -EBADF; + return true; + } + // See if it have to wait until the chunk becomes readable. 
+ ChunkInfoHandle* const cih = *ci; + return (op->chunkVersion == cih->chunkInfo.chunkVersion && + IsChunkStable(cih) && cih->IsChunkReadable()); +} + +int +ChunkManager::MakeChunkStable(kfsChunkId_t chunkId, kfsSeq_t chunkVersion, + bool appendFlag, KfsCallbackObj* cb, string& statusMsg) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + statusMsg = "no such chunk"; + return -EBADF; + } + ChunkInfoHandle* const cih = *ci; + assert(cih); + bool stableFlag = false; + if (cih->IsRenameInFlight()) { + if (chunkVersion != cih->GetTargetStateAndVersion(stableFlag)) { + statusMsg = (stableFlag ? "" : "not "); + statusMsg += "stable target version mismatch"; + return -EINVAL; + } + } else if (chunkVersion != cih->chunkInfo.chunkVersion) { + statusMsg = "version mismatch"; + return -EINVAL; + } + if (cih->isBeingReplicated) { + statusMsg = "chunk replication is in progress"; + return -EINVAL; + } + if (! cih->chunkInfo.chunkBlockChecksum) { + statusMsg = "checksum are not loaded"; + return -EAGAIN; + } + if ((appendFlag ? + ! cih->IsWriteAppenderOwns() : + (cih->IsWriteAppenderOwns() && + ! gAtomicRecordAppendManager.IsChunkStable(chunkId)))) { + ostringstream os; + os << "make stable invalid state: " + " chunk: " << chunkId << + " version: " << cih->chunkInfo.chunkVersion << + "/" << chunkVersion << + " append: " << appendFlag << + " appender owns:" << cih->IsWriteAppenderOwns() + ; + die(os.str()); + } + if (! 
mPendingWrites.Delete(chunkId, cih->chunkInfo.chunkVersion)) { + ostringstream os; + os << "make stable failed to cleanup pending writes: " + " chunk: " << chunkId << + " version: " << cih->chunkInfo.chunkVersion + ; + die(os.str()); + } + stableFlag = true; + const bool renameFlag = true; + const int res = cih->WriteChunkMetadata( + cb, renameFlag, stableFlag, cih->chunkInfo.chunkVersion); + if (res < 0) { + statusMsg = "failed to start chunk meta data write"; + } + return res; +} + +int +ChunkManager::DeleteChunk(kfsChunkId_t chunkId) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + KFS_LOG_STREAM_INFO << "deleting chunk: " << chunkId << + KFS_LOG_EOM; + const bool forceDeleteFlag = true; + return StaleChunk(*ci, forceDeleteFlag); +} + +void +ChunkManager::DumpChunkMap() +{ + ofstream ofs; + ofs.open("chunkdump.txt"); + if (ofs) { + DumpChunkMap(ofs); + } + ofs.flush(); + ofs.close(); +} + +void +ChunkManager::DumpChunkMap(ostream &ofs) +{ + // Dump chunk map in the format of + // chunkID fileID chunkSize + mChunkTable.First(); + const CMapEntry* p; + while ((p = mChunkTable.Next())) { + ChunkInfoHandle* const cih = p->GetVal(); + ofs << cih->chunkInfo.chunkId << + " " << cih->chunkInfo.fileId << + " " << cih->chunkInfo.chunkSize << + "\n"; + } +} + +int +ChunkManager::WriteChunkMetadata( + kfsChunkId_t chunkId, KfsCallbackObj* cb, bool forceFlag /* = false */) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + if (forceFlag) { + (*ci)->SetMetaDirty(); + } + return (*ci)->WriteChunkMetadata(cb); +} + +int +ChunkManager::ReadChunkMetadata(kfsChunkId_t chunkId, KfsOp* cb) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! 
ci) { + return -EBADF; + } + ChunkInfoHandle* const cih = *ci; + if (cih->isBeingReplicated) { + KFS_LOG_STREAM_ERROR << + "denied meta data read for chunk: " << chunkId << + " replication is in flight" << + KFS_LOG_EOM; + return -EBADF; + } + + LruUpdate(*cih); + if (cih->chunkInfo.AreChecksumsLoaded()) { + int res = 0; + cb->HandleEvent(EVENT_CMD_DONE, &res); + return 0; + } + + if (cih->readChunkMetaOp) { + // if we have issued a read request for this chunk's metadata, + // don't submit another one; otherwise, we will simply drive + // up memory usage for useless IO's + cih->readChunkMetaOp->AddWaiter(cb); + return 0; + } + + ReadChunkMetaOp* const rcm = new ReadChunkMetaOp(chunkId, cb); + DiskIo* const d = SetupDiskIo(cih, rcm); + if (! d) { + delete rcm; + return -ESERVERBUSY; + } + rcm->diskIo.reset(d); + + const int res = rcm->diskIo->Read(0, KFS_CHUNK_HEADER_SIZE); + if (res < 0) { + ReportIOFailure(cih, res); + delete rcm; + return res; + } + cih->readChunkMetaOp = rcm; + return 0; +} + +void +ChunkManager::ReadChunkMetadataDone(ReadChunkMetaOp* op, IOBuffer* dataBuf) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(op->chunkId); + if (! ci) { + if (op->status == 0) { + op->status = -EBADF; + op->statusMsg = "no such chunk"; + KFS_LOG_STREAM_ERROR << + "chunk meta data read completion: " << + op->statusMsg << " " << op->Show() << + KFS_LOG_EOM; + } + return; + } + ChunkInfoHandle* const cih = *ci; + if (op != cih->readChunkMetaOp) { + if (op->status >= 0) { + op->status = -EAGAIN; + op->statusMsg = "stale meta data read"; + } + KFS_LOG_STREAM_ERROR << + "chunk meta data read completion: " << + op->statusMsg << " " << op->Show() << + KFS_LOG_EOM; + return; + } + int res; + if (! 
dataBuf || + dataBuf->BytesConsumable() < (int)KFS_CHUNK_HEADER_SIZE || + dataBuf->CopyOut(mChunkHeaderBuffer, kChunkHeaderBufferSize) != + kChunkHeaderBufferSize) { + if (op->status != -ETIMEDOUT) { + op->status = -EIO; + op->statusMsg = "short chunk meta data read"; + } else { + op->statusMsg = "read timed out"; + } + KFS_LOG_STREAM_ERROR << + "chunk meta data read completion: " << op->statusMsg << + " " << (dataBuf ? dataBuf->BytesConsumable() : 0) << + " " << op->Show() << + KFS_LOG_EOM; + } else { + const DiskChunkInfo_t& dci = + *reinterpret_cast(mChunkHeaderBuffer); + const uint64_t& checksum = + *reinterpret_cast(&dci + 1); + uint32_t headerChecksum = 0; + if ((checksum != 0 || mRequireChunkHeaderChecksumFlag) && + (headerChecksum = ComputeBlockChecksum( + mChunkHeaderBuffer, sizeof(dci))) != checksum) { + op->status = -EBADCKSUM; + op->statusMsg = "chunk header checksum mismatch"; + ostringstream os; + os << "chunk meta data read completion: " << op->statusMsg << + " expected: " << checksum << + " computed: " << headerChecksum << + " " << op->Show() + ; + const string str = os.str(); + KFS_LOG_STREAM_ERROR << str << KFS_LOG_EOM; + if (mAbortOnChecksumMismatchFlag) { + die(str); + } + } else if ((res = dci.Validate(op->chunkId, cih->IsStable() ? 
+ cih->chunkInfo.chunkVersion : kfsSeq_t(0))) < 0) { + op->status = res; + op->statusMsg = "chunk metadata validation mismatch"; + KFS_LOG_STREAM_ERROR << + "chunk meta data read completion: " << op->statusMsg << + " " << op->Show() << + KFS_LOG_EOM; + } else { + cih->chunkInfo.SetChecksums(dci.chunkBlockChecksum); + if (cih->chunkInfo.chunkSize > (int64_t)dci.chunkSize) { + const int64_t extra = cih->chunkInfo.chunkSize - dci.chunkSize; + mUsedSpace -= extra; + UpdateDirSpace(cih, -extra); + cih->chunkInfo.chunkSize = dci.chunkSize; + } else if (cih->chunkInfo.chunkSize != (int64_t)dci.chunkSize) { + op->status = res; + op->statusMsg = "chunk metadata size mismatch"; + KFS_LOG_STREAM_ERROR << + "chunk meta data read completion: " << op->statusMsg << + " file: " << cih->chunkInfo.chunkSize << + " meta: " << dci.chunkSize << + " " << op->Show() << + KFS_LOG_EOM; + } + } + } + LruUpdate(*cih); + cih->readChunkMetaOp = 0; + if (op->status < 0 && op->status != -ETIMEDOUT) { + mCounters.mBadChunkHeaderErrorCount++; + ChunkIOFailed(cih, op->status); + } +} + +bool +ChunkManager::IsChunkMetadataLoaded(kfsChunkId_t chunkId) +{ + ChunkInfoHandle *cih = 0; + return ( + GetChunkInfoHandle(chunkId, &cih) >= 0 && + cih->chunkInfo.AreChecksumsLoaded() + ); +} + +ChunkInfo_t* +ChunkManager::GetChunkInfo(kfsChunkId_t chunkId) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + return (ci ? &((*ci)->chunkInfo) : 0); +} + +int +ChunkManager::MarkChunkStale(ChunkInfoHandle* cih, KfsCallbackObj* cb) +{ + const string s = MakeChunkPathname(cih); + const string staleChunkPathname = MakeStaleChunkPathname(cih); + string err; + const int ret = DiskIo::Rename( + s.c_str(), staleChunkPathname.c_str(), cb, &err) ? 0 : -1; + KFS_LOG_STREAM_INFO << + "Moving chunk " << cih->chunkInfo.chunkId << + " to staleChunks dir " << staleChunkPathname << + (ret == 0 ? 
" ok" : " error:") << err << + KFS_LOG_EOM; + return ret; +} + +int +ChunkManager::StaleChunk(kfsChunkId_t chunkId, + bool forceDeleteFlag, bool evacuatedFlag) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + return StaleChunk(*ci, forceDeleteFlag, evacuatedFlag); +} + +int +ChunkManager::StaleChunk(ChunkInfoHandle* cih, + bool forceDeleteFlag, bool evacuatedFlag) +{ + assert(cih); + if (mChunkTable.Erase(cih->chunkInfo.chunkId) <= 0) { + return -EBADF; + } + gLeaseClerk.UnRegisterLease(cih->chunkInfo.chunkId); + if (! cih->IsStale() && ! mPendingWrites.Delete( + cih->chunkInfo.chunkId, cih->chunkInfo.chunkVersion)) { + ostringstream os; + os << "make stale failed to cleanup pending writes: " + " chunk: " << cih->chunkInfo.chunkId << + " version: " << cih->chunkInfo.chunkVersion + ; + die(os.str()); + } + + cih->MakeStale(mChunkInfoLists, + (! forceDeleteFlag && ! mForceDeleteStaleChunksFlag) || + (evacuatedFlag && mKeepEvacuatedChunksFlag) + ); + assert(! cih->HasWritesInFlight()); + RunStaleChunksQueue(); + return 0; +} + +int +ChunkManager::TruncateChunk(kfsChunkId_t chunkId, int64_t chunkSize) +{ + // the truncated size should not exceed chunk size. + if (chunkSize > (int64_t)CHUNKSIZE) { + return -EINVAL; + } + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! 
ci) { + return -EBADF; + } + ChunkInfoHandle* const cih = *ci; + string const chunkPathname = MakeChunkPathname(cih); + + // Cnunk close will truncate it to the cih->chunkInfo.chunkSize + + UpdateDirSpace(cih, -cih->chunkInfo.chunkSize); + + mUsedSpace -= cih->chunkInfo.chunkSize; + mUsedSpace += chunkSize; + cih->chunkInfo.chunkSize = chunkSize; + + UpdateDirSpace(cih, cih->chunkInfo.chunkSize); + + uint32_t const lastChecksumBlock = OffsetToChecksumBlockNum(chunkSize); + + // XXX: Could do better; recompute the checksum for this last block + cih->chunkInfo.chunkBlockChecksum[lastChecksumBlock] = 0; + cih->SetMetaDirty(); + + return 0; +} + +int +ChunkManager::ChangeChunkVers(ChangeChunkVersOp* op) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(op->chunkId); + if (! ci) { + return -EBADF; + } + ChunkInfoHandle* const cih = *ci; + bool stableFlag = cih->IsStable(); + if (cih->IsRenameInFlight()) { + if (op->fromChunkVersion != cih->GetTargetStateAndVersion(stableFlag)) { + op->statusMsg = (stableFlag ? "" : "not "); + op->statusMsg += "stable target version mismatch"; + op->status = -EINVAL; + return op->status; + } + } else if (op->fromChunkVersion != cih->chunkInfo.chunkVersion) { + op->statusMsg = "version mismatch"; + op->status = -EINVAL; + return op->status; + } + if (cih->HasWritesInFlight()) { + op->statusMsg = "writes in flight"; + op->status = -EINVAL; + return op->status; + } + const int ret = ChangeChunkVers( + cih, op->chunkVersion, op->makeStableFlag || stableFlag, op); + if (ret < 0) { + op->status = ret; + } + return ret; +} + +int +ChunkManager::ChangeChunkVers( + kfsChunkId_t chunkId, + int64_t chunkVersion, + bool stableFlag, + KfsCallbackObj* cb) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + return ChangeChunkVers(*ci, chunkVersion, stableFlag, cb); +} + +int +ChunkManager::ChangeChunkVers( + ChunkInfoHandle* cih, + int64_t chunkVersion, + bool stableFlag, + KfsCallbackObj* cb) +{ + if (! 
cih->chunkInfo.chunkBlockChecksum) { + KFS_LOG_STREAM_ERROR << + "attempt to change version on chunk: " << + cih->chunkInfo.chunkId << " denied: checksums are not loaded" << + KFS_LOG_EOM; + return -EINVAL; + } + if (cih->IsWriteAppenderOwns() && ! IsChunkStable(cih)) { + KFS_LOG_STREAM_WARN << + "attempt to change version on unstable chunk: " << + cih->chunkInfo.chunkId << " owned by write appender denied" << + KFS_LOG_EOM; + return -EINVAL; + } + + KFS_LOG_STREAM_INFO << + "Chunk " << MakeChunkPathname(cih) << + " already exists; changing version #" << + " from " << cih->chunkInfo.chunkVersion << " to " << chunkVersion << + " stable: " << cih->IsStable() << "=>" << stableFlag << + KFS_LOG_EOM; + + if (! mPendingWrites.Delete( + cih->chunkInfo.chunkId, cih->chunkInfo.chunkVersion)) { + ostringstream os; + os << "change version failed to cleanup pending writes: " + " chunk: " << cih->chunkInfo.chunkId << + " version: " << cih->chunkInfo.chunkVersion + ; + die(os.str()); + } + const bool renameFlag = true; + return cih->WriteChunkMetadata(cb, renameFlag, stableFlag, chunkVersion); +} + +void +ChunkManager::ReplicationDone(kfsChunkId_t chunkId, int status) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return; + } + ChunkInfoHandle* const cih = *ci; + if (! cih->isBeingReplicated) { + KFS_LOG_STREAM_DEBUG << + "irnored stale replication completion for" + " chunk: " << chunkId << + " status: " << status << + KFS_LOG_EOM; + return; + } + + KFS_LOG_STREAM_DEBUG << + "Replication for chunk: " << chunkId << + " status: " << status << + " " << MakeChunkPathname(cih) << + KFS_LOG_EOM; + if (status < 0) { + const bool forceDeleteFlag = true; + StaleChunk(cih, forceDeleteFlag); + return; + } + + cih->isBeingReplicated = false; + LruUpdate(*cih); // Add it to lru. + if (cih->IsFileOpen() && cih->IsStable() && + ! cih->IsFileInUse() && ! 
cih->SyncMeta()) { + Release(*cih); + } +} + +void +ChunkManager::Start() +{ + globalNetManager().RegisterTimeoutHandler(this); +} + +void +ChunkManager::UpdateDirSpace(ChunkInfoHandle* cih, int64_t nbytes) +{ + ChunkDirInfo& dir = cih->GetDirInfo(); + dir.usedSpace += nbytes; + if (dir.usedSpace < 0) { + dir.usedSpace = 0; + } +} + +ChunkManager::ChunkDirInfo* +ChunkManager::GetDirForChunk() +{ + // do weighted random, so that we can fill all drives + ChunkDirs::iterator dirToUse = mChunkDirs.end(); + int64_t totalFreeSpace = 0; + int64_t totalPendingRead = 0; + int64_t totalPendingWrite = 0; + int64_t maxFreeSpace = 0; + int dirCount = 0; + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); + ++it) { + it->placementSkipFlag = true; + if (it->evacuateStartedFlag) { + continue; + } + const int64_t space = it->availableSpace; + if (space < mMinFsAvailableSpace || + space <= it->totalSpace * mMaxSpaceUtilizationThreshold) { + continue; + } + dirCount++; + totalFreeSpace += space; + if (dirToUse == mChunkDirs.end()) { + dirToUse = it; + } + if (maxFreeSpace < space) { + maxFreeSpace = space; + } + it->placementSkipFlag = false; + if (mChunkPlacementPendingReadWeight <= 0 && + mChunkPlacementPendingWriteWeight <= 0) { + it->pendingReadBytes = 0; + it->pendingWriteBytes = 0; + continue; + } + int freeRequestCount; + int requestCount; + int64_t readBlockCount; + int64_t writeBlockCount; + int blockSize; + if (! 
DiskIo::GetDiskQueuePendingCount( + it->diskQueue, + freeRequestCount, + requestCount, + readBlockCount, + writeBlockCount, + blockSize)) { + die(it->dirname + ": get pending io count failed"); + } + it->pendingReadBytes = readBlockCount * blockSize; + it->pendingWriteBytes = writeBlockCount * blockSize; + totalPendingRead += it->pendingReadBytes; + totalPendingWrite += it->pendingWriteBytes; + } + if (dirCount <= 0 || totalFreeSpace <= 0) { + return 0; + } + if (dirCount == 1) { + return &(*dirToUse); + } + if (mChunkPlacementPendingReadWeight > 0 || + mChunkPlacementPendingWriteWeight > 0) { + // Exclude directories / drives that exceed "max io pending". + const int64_t maxPendingIo = max(mMinPendingIoThreshold, (int64_t) + (totalPendingRead * mChunkPlacementPendingReadWeight + + totalPendingWrite * mChunkPlacementPendingReadWeight) / dirCount); + ChunkDirs::iterator minIoPendingDir = mChunkDirs.end(); + for (ChunkDirs::iterator it = dirToUse; + it < mChunkDirs.end(); + ++it) { + if (it->placementSkipFlag) { + continue; + } + if (it->pendingReadBytes + it->pendingWriteBytes > + maxPendingIo) { + if (minIoPendingDir == mChunkDirs.end() || + it->pendingReadBytes + it->pendingWriteBytes < + minIoPendingDir->pendingReadBytes + + minIoPendingDir->pendingWriteBytes) { + minIoPendingDir = it; + } + if (--dirCount <= 0) { + return &(*minIoPendingDir); + } + it->placementSkipFlag = true; + if (it->availableSpace == maxFreeSpace) { + maxFreeSpace = -1; // Force update. 
+ } + totalFreeSpace -= it->availableSpace; + if (it == dirToUse) { + dirToUse = mChunkDirs.end(); + } + } else if (dirToUse == mChunkDirs.end()) { + dirToUse = it; + } + } + } + assert(totalFreeSpace > 0); + int64_t minAvail = 0; + if (mMaxPlacementSpaceRatio > 0) { + if (maxFreeSpace < 0) { + maxFreeSpace = 0; + for (ChunkDirs::iterator it = dirToUse; + it < mChunkDirs.end(); + ++it) { + if (it->placementSkipFlag) { + continue; + } + if (maxFreeSpace < it->availableSpace) { + maxFreeSpace = it->availableSpace; + } + } + } + minAvail = (int64_t)(maxFreeSpace * mMaxPlacementSpaceRatio); + for (ChunkDirs::iterator it = dirToUse; + it < mChunkDirs.end(); + ++it) { + if (it->placementSkipFlag) { + continue; + } + if (minAvail <= it->availableSpace) { + continue; + } + totalFreeSpace += minAvail - it->availableSpace; + } + } + const double spaceWeight = double(1) / totalFreeSpace; + const double randVal = drand48(); + double curVal = 0; + for (ChunkDirs::iterator it = dirToUse; + it < mChunkDirs.end(); + ++it) { + if (it->placementSkipFlag) { + continue; + } + curVal += max(minAvail, it->availableSpace) * spaceWeight; + if (randVal < curVal) { + dirToUse = it; + break; + } + } + return (dirToUse == mChunkDirs.end() ? 0 : &(*dirToUse)); +} + +string +ChunkManager::MakeChunkPathname(ChunkInfoHandle *cih) +{ + return MakeChunkPathname(cih, cih->IsStable(), cih->chunkInfo.chunkVersion); +} + +string +ChunkManager::MakeChunkPathname(ChunkInfoHandle *cih, bool stableFlag, kfsSeq_t targetVersion) +{ + return MakeChunkPathname( + stableFlag ? + cih->GetDirname() : + cih->GetDirname() + mDirtyChunksDir, + cih->chunkInfo.fileId, + cih->chunkInfo.chunkId, + stableFlag ? targetVersion : 0 + ); +} + +string +ChunkManager::MakeChunkPathname(const string &chunkdir, kfsFileId_t fid, kfsChunkId_t chunkId, kfsSeq_t chunkVersion) +{ + ostringstream os; + + os << chunkdir << fid << '.' << chunkId << '.' 
<< chunkVersion; + return os.str(); +} + +string +ChunkManager::MakeStaleChunkPathname(ChunkInfoHandle *cih) +{ + return MakeChunkPathname( + cih->GetDirname() + mStaleChunksDir, + cih->chunkInfo.fileId, + cih->chunkInfo.chunkId, + cih->chunkInfo.chunkVersion + ); +} + +void +ChunkManager::AddMapping(ChunkManager::ChunkDirInfo& dir, const char* filename, + int64_t infilesz) +{ + const int kNumComponents = 3; + long long components[kNumComponents]; + const char* ptr = filename; + char* end = 0; + int64_t filesz = infilesz; + int i; + + for (i = 0; i < kNumComponents; i++) { + components[i] = strtoll(ptr, &end, 10); + if (components[i] < 0) { + break; + } + if ((*end & 0xFF) != '.') { + if (*end == 0) { + i++; + } + break; + } + ptr = end + 1; + } + if (i != kNumComponents || *end) { + KFS_LOG_STREAM_INFO << + "ignoring malformed chunk file name: " << + dir.dirname << filename << + KFS_LOG_EOM; + return; + } + // Allow files bigger than chunk size. If file wasn't properly closed, + // but was in the stable directory, its header needs to be read, + // validated and proper size must be set. + // The file might be bigger by one io buffer size, and io buffer size is + // guaranteed to be less or equal to the KFS_CHUNK_HEADER_SIZE. + const int64_t kMaxChunkFileSize = (int64_t)(KFS_CHUNK_HEADER_SIZE + CHUNKSIZE); + if (filesz < (int64_t)KFS_CHUNK_HEADER_SIZE || + filesz > (int64_t)(kMaxChunkFileSize + KFS_CHUNK_HEADER_SIZE)) { + KFS_LOG_STREAM_INFO << + "ignoring invalid chunk file: " << dir.dirname << filename << + " size: " << filesz << + KFS_LOG_EOM; + return; + } + const chunkId_t chunkId = components[1]; + const kfsSeq_t chunkVers = components[2]; + if (filesz > kMaxChunkFileSize) { + // Load and validate chunk header, and set proper file size. 
+ const string cf(dir.dirname + filename); + const int fd = open(cf.c_str(), O_RDONLY); + if (fd < 0) { + const int err = errno; + KFS_LOG_STREAM_INFO << + "ignoring invalid chunk file: " << cf << + " size: " << filesz << + " :" << QCUtils::SysError(err) << + KFS_LOG_EOM; + return; + } + const ssize_t rd = read(fd, mChunkHeaderBuffer, kChunkHeaderBufferSize); + close(fd); + if (rd != kChunkHeaderBufferSize) { + const int err = rd < 0 ? errno : EINVAL; + KFS_LOG_STREAM_INFO << + "ignoring invalid chunk file: " << cf << + " size: " << filesz << + " read: " << rd << + " :" << QCUtils::SysError(err) << + KFS_LOG_EOM; + return; + } + const DiskChunkInfo_t& dci = + *reinterpret_cast(mChunkHeaderBuffer); + const uint64_t checksum = + *reinterpret_cast(&dci + 1); + const int res = dci.Validate(chunkId, chunkVers); + if (res < 0) { + KFS_LOG_STREAM_INFO << + "ignoring invalid chunk file: " << cf << + " size: " << filesz << + " invalid chunk header" + " status: " << res << + KFS_LOG_EOM; + return; + } + uint32_t hdrChecksum = 0; + if ((checksum != 0 || mRequireChunkHeaderChecksumFlag) && + ((hdrChecksum = ComputeBlockChecksum( + mChunkHeaderBuffer, sizeof(dci))) != checksum)) { + KFS_LOG_STREAM_INFO << + "ignoring invalid chunk file: " << cf << + " invalid header:" + " size: " << filesz << + " chunk size: " << dci.chunkSize << + " checksum: " << checksum << + " expect: " << hdrChecksum << + KFS_LOG_EOM; + return; + } + filesz = dci.chunkSize + KFS_CHUNK_HEADER_SIZE; + if (truncate(cf.c_str(), filesz)) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "failed truncate chunk file: " << cf << + " size: " << infilesz << + " to: " << filesz << + " :" << QCUtils::SysError(err) << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_INFO << + "truncated chunk file: " << cf << + " size: " << infilesz << + " to: " << filesz << + KFS_LOG_EOM; + } + } + ChunkInfoHandle* cih = 0; + if (GetChunkInfoHandle(chunkId, &cih) == 0) { + string const name(dir.dirname + filename); + 
KFS_LOG_STREAM_INFO << + (mForceDeleteStaleChunksFlag ? "deleting" : "moving") << + " duplicate chunk: " << chunkId << + " file name: " << name << + " keeping: " << MakeChunkPathname(cih) << + KFS_LOG_EOM; + if (mForceDeleteStaleChunksFlag) { + if (unlink(name.c_str())) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "failed to remove " << name << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + } + } else { + string const staleName( + dir.dirname + mStaleChunksDir + filename); + if (rename(name.c_str(), staleName.c_str())) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "failed to rename " << name << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + } + } + return; + } + cih = new ChunkInfoHandle(dir); + cih->chunkInfo.fileId = components[0]; + cih->chunkInfo.chunkId = chunkId; + cih->chunkInfo.chunkVersion = chunkVers; + cih->chunkInfo.chunkSize = filesz - KFS_CHUNK_HEADER_SIZE; + AddMapping(cih); +} + +int +ChunkManager::OpenChunk(kfsChunkId_t chunkId, int openFlags) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + KFS_LOG_STREAM_DEBUG << "no such chunk: " << chunkId << KFS_LOG_EOM; + return -EBADF; + } + return OpenChunk(*ci, openFlags); +} + +int +ChunkManager::OpenChunk(ChunkInfoHandle* cih, int openFlags) +{ + if (cih->IsFileOpen()) { + return 0; + } + if (! cih->dataFH) { + cih->dataFH.reset(new DiskIo::File()); + } + string errMsg; + const bool kReserveFileSpace = true; + const string fn = MakeChunkPathname(cih); + bool tempFailureFlag = false; + // Set reservation size larger than max chunk size in order to detect files + // that weren't properly closed. + 1 here will make file one io block bigger + // QCDiskQueue::OpenFile() makes EOF block size aligned. + if (! 
cih->dataFH->Open( + fn.c_str(), + CHUNKSIZE + KFS_CHUNK_HEADER_SIZE + 1, + (openFlags & (O_WRONLY | O_RDWR)) == 0, + kReserveFileSpace, + (openFlags & O_CREAT) != 0, + &errMsg, + &tempFailureFlag, + mBufferedIoFlag)) { + mCounters.mOpenErrorCount++; + if ((openFlags & O_CREAT) != 0 || ! tempFailureFlag) { + // + // we are unable to open/create a file. notify the metaserver + // of lost data so that it can re-replicate if needed. + // + NotifyMetaCorruptedChunk(cih, -EBADF); + if (mChunkTable.Erase(cih->chunkInfo.chunkId) > 0) { + const int64_t size = min(mUsedSpace, cih->chunkInfo.chunkSize); + UpdateDirSpace(cih, -size); + mUsedSpace -= size; + } + Delete(*cih); + } + KFS_LOG_STREAM_ERROR << + "failed to " << (((openFlags & O_CREAT) == 0) ? "open" : "create") << + " chunk file: " << fn << " :" << errMsg << + KFS_LOG_EOM; + return (tempFailureFlag ? -EAGAIN : -EBADF); + } + globals().ctrOpenDiskFds.Update(1); + LruUpdate(*cih); + + // the checksums will be loaded async + return 0; +} + +int +ChunkManager::CloseChunk(kfsChunkId_t chunkId) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + return CloseChunk(*ci); +} + +bool +ChunkManager::CloseChunkIfReadable(kfsChunkId_t chunkId) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + return -EBADF; + } + return ( + IsChunkStable(*ci) && + (*ci)->IsChunkReadable() && + CloseChunk(*ci) == 0 + ); +} + +int +ChunkManager::CloseChunk(ChunkInfoHandle* cih) +{ + if (cih->IsWriteAppenderOwns()) { + KFS_LOG_STREAM_INFO << + "Ignoring close chunk on chunk: " << cih->chunkInfo.chunkId << + " open for append " << + KFS_LOG_EOM; + return -EINVAL; + } + + // Close file if not in use. + if (cih->IsFileOpen() && ! cih->IsFileInUse() && + ! cih->isBeingReplicated && ! 
cih->SyncMeta()) { + Release(*cih); + } else { + KFS_LOG_STREAM_INFO << + "Didn't release chunk " << cih->chunkInfo.chunkId << + " on close; might give up lease" << + KFS_LOG_EOM; + gLeaseClerk.RelinquishLease( + cih->chunkInfo.chunkId, cih->chunkInfo.chunkSize); + } + return 0; +} + +void +ChunkManager::ChunkSize(SizeOp* op) +{ + ChunkInfoHandle* cih; + if (GetChunkInfoHandle(op->chunkId, &cih) < 0) { + op->status = -EBADF; + op->statusMsg = "no such chunk"; + return; + } + if (cih->isBeingReplicated) { + op->status = -EAGAIN; + op->statusMsg = "chunk replication in progress"; + return; + } + if (op->chunkVersion >= 0 && + op->chunkVersion != cih->chunkInfo.chunkVersion) { + op->status = -EBADVERS; + op->statusMsg = "chunk version mismatch"; + return; + } + if (cih->IsWriteAppenderOwns() && + ! gAtomicRecordAppendManager.IsChunkStable(op->chunkId)) { + op->statusMsg = "write append in progress, returning max chunk size"; + op->size = CHUNKSIZE; + KFS_LOG_STREAM_DEBUG << + op->statusMsg << + " chunk: " << op->chunkId << + " file: " << op->fileId << + " size: " << op->size << + KFS_LOG_EOM; + return; + } + op->size = cih->chunkInfo.chunkSize; +} + +void +ChunkManager::GetDriveName(ReadOp *op) +{ + ChunkInfoHandle *cih; + + if (GetChunkInfoHandle(op->chunkId, &cih) < 0) { + return; + } + // provide the path to the client for telemetry + op->driveName = cih->GetDirname(); +} + +int +ChunkManager::ReadChunk(ReadOp *op) +{ + ChunkInfoHandle* cih = 0; + if (GetChunkInfoHandle(op->chunkId, &cih) < 0) { + return -EBADF; + } + // provide the path to the client for telemetry + op->driveName = cih->GetDirname(); + + // the checksums should be loaded... 
+ cih->chunkInfo.VerifyChecksumsLoaded(); + + if (op->chunkVersion != cih->chunkInfo.chunkVersion) { + KFS_LOG_STREAM_INFO << "Version # mismatch (have=" << + cih->chunkInfo.chunkVersion << " vs asked=" << op->chunkVersion << + ")...failing a read" << + KFS_LOG_EOM; + return -EBADVERS; + } + DiskIo* const d = SetupDiskIo(cih, op); + if (! d) { + return -ESERVERBUSY; + } + + op->diskIo.reset(d); + + // schedule a read based on the chunk size + if (op->offset >= cih->chunkInfo.chunkSize) { + op->numBytesIO = 0; + } else if ((int64_t) (op->offset + op->numBytes) > cih->chunkInfo.chunkSize) { + op->numBytesIO = cih->chunkInfo.chunkSize - op->offset; + } else { + op->numBytesIO = op->numBytes; + } + + if (op->numBytesIO == 0) { + return -EIO; + } + // for checksumming to work right, reads should be in terms of + // checksum-blocks. + const int64_t offset = OffsetToChecksumBlockStart(op->offset); + + size_t numBytesIO = OffsetToChecksumBlockEnd(op->offset + op->numBytesIO - 1) - offset; + + // Make sure we don't try to read past EOF; the checksumming will + // do the necessary zero-padding. + if ((int64_t) (offset + numBytesIO) > cih->chunkInfo.chunkSize) + numBytesIO = cih->chunkInfo.chunkSize - offset; + + const int ret = op->diskIo->Read(offset + KFS_CHUNK_HEADER_SIZE, numBytesIO); + if (ret < 0) { + ReportIOFailure(cih, ret); + return ret; + } + // read was successfully scheduled + return 0; +} + +int +ChunkManager::WriteChunk(WriteOp *op) +{ + ChunkInfoHandle* cih = 0; + if (GetChunkInfoHandle(op->chunkId, &cih) < 0) { + return -EBADF; + } + // the checksums should be loaded... + cih->chunkInfo.VerifyChecksumsLoaded(); + + // schedule a write based on the chunk size. Make sure that a + // write doesn't overflow the size of a chunk. 
+ op->numBytesIO = min((size_t) (CHUNKSIZE - op->offset), op->numBytes); + + if (op->numBytesIO <= 0 || op->offset < 0) + return -EINVAL; + + const int64_t addedBytes(op->offset + op->numBytesIO - cih->chunkInfo.chunkSize); + if (addedBytes > 0 && mUsedSpace + addedBytes >= mTotalSpace) { + KFS_LOG_STREAM_ERROR << + "out of disk space: " << mUsedSpace << " + " << addedBytes << + " = " << (mUsedSpace + addedBytes) << " >= " << mTotalSpace << + KFS_LOG_EOM; + return -ENOSPC; + } + + int64_t offset = op->offset; + ssize_t numBytesIO = op->numBytesIO; + if ((OffsetToChecksumBlockStart(offset) == offset) && + ((size_t) numBytesIO >= (size_t) CHECKSUM_BLOCKSIZE)) { + if (numBytesIO % CHECKSUM_BLOCKSIZE != 0) { + return -EINVAL; + } + if (op->wpop && !op->isFromReReplication && + op->checksums.size() == size_t(numBytesIO / CHECKSUM_BLOCKSIZE)) { + assert(op->checksums[0] == op->wpop->checksum || op->checksums.size() > 1); + } else { + op->checksums = ComputeChecksums(op->dataBuf, numBytesIO); + } + } else { + if ((size_t) numBytesIO >= (size_t) CHECKSUM_BLOCKSIZE) { + assert((size_t) numBytesIO < (size_t) CHECKSUM_BLOCKSIZE); + return -EINVAL; + } + int off = (int)(offset % CHECKSUM_BLOCKSIZE); + const uint32_t blkSize = (size_t(off + numBytesIO) > CHECKSUM_BLOCKSIZE) ? + 2 * CHECKSUM_BLOCKSIZE : CHECKSUM_BLOCKSIZE; + + op->checksums.clear(); + // The checksum block we are after is beyond the current + // end-of-chunk. So, treat that as a 0-block and splice in. + if (offset - off >= cih->chunkInfo.chunkSize) { + IOBuffer data; + data.ReplaceKeepBuffersFull(op->dataBuf, off, numBytesIO); + data.ZeroFill(blkSize - (off + numBytesIO)); + op->dataBuf->Move(&data); + } else { + // Need to read the data block over which the checksum is + // computed. 
+ if (op->rop == NULL) { + // issue a read + ReadOp *rop = new ReadOp(op, offset - off, blkSize); + KFS_LOG_STREAM_DEBUG << + "write triggered a read for offset=" << offset << + KFS_LOG_EOM; + op->rop = rop; + rop->Execute(); + // It is possible that the both read and write ops are complete + // at this point. This normally happens in the case of errors. + // In such cases all error handlers are already invoked. + // If not then the write op will be restarted once read op + // completes. + // Return now. + return 0; + } + // If the read failed, cleanup and bail + if (op->rop->status < 0) { + op->status = op->rop->status; + op->rop->wop = NULL; + delete op->rop; + op->rop = NULL; + return op->HandleDone(EVENT_DISK_ERROR, NULL); + } + + // All is good. So, get on with checksumming + op->rop->dataBuf->ReplaceKeepBuffersFull(op->dataBuf, off, numBytesIO); + + delete op->dataBuf; + op->dataBuf = op->rop->dataBuf; + op->rop->dataBuf = NULL; + // If the buffer doesn't have a full CHECKSUM_BLOCKSIZE worth + // of data, zero-pad the end. We don't need to zero-pad the + // front because the underlying filesystem will zero-fill when + // we read a hole. + ZeroPad(op->dataBuf); + } + + assert(op->dataBuf->BytesConsumable() == (int) blkSize); + op->checksums = ComputeChecksums(op->dataBuf, blkSize); + + // Trim data at the buffer boundary from the beginning, to make write + // offset close to where we were asked from. + int numBytes(numBytesIO); + offset -= off; + op->dataBuf->TrimAtBufferBoundaryLeaveOnly(off, numBytes); + offset += off; + numBytesIO = numBytes; + } + + DiskIo* const d = SetupDiskIo(cih, op); + if (! 
d) { + return -ESERVERBUSY; + } + op->diskIo.reset(d); + + /* + KFS_LOG_STREAM_DEBUG << + "Checksum for chunk: " << op->chunkId << ", offset=" << op->offset << + ", bytes=" << op->numBytesIO << ", # of cksums=" << op->checksums.size() << + KFS_LOG_EOM; + */ + + int res = op->diskIo->Write( + offset + KFS_CHUNK_HEADER_SIZE, numBytesIO, op->dataBuf); + if (res >= 0) { + UpdateChecksums(cih, op); + assert(res <= numBytesIO); + res = min(res, int(op->numBytesIO)); + op->numBytesIO = numBytesIO; + cih->StartWrite(op); + } else { + op->diskIo.reset(); + ReportIOFailure(cih, res); + } + return res; +} + +void +ChunkManager::UpdateChecksums(ChunkInfoHandle *cih, WriteOp *op) +{ + int64_t endOffset = op->offset + op->numBytesIO; + + // the checksums should be loaded... + cih->chunkInfo.VerifyChecksumsLoaded(); + + for (vector::size_type i = 0; i < op->checksums.size(); i++) { + int64_t offset = op->offset + i * CHECKSUM_BLOCKSIZE; + uint32_t checksumBlock = OffsetToChecksumBlockNum(offset); + + cih->chunkInfo.chunkBlockChecksum[checksumBlock] = op->checksums[i]; + } + + if (cih->chunkInfo.chunkSize < endOffset) { + + UpdateDirSpace(cih, endOffset - cih->chunkInfo.chunkSize); + + mUsedSpace += endOffset - cih->chunkInfo.chunkSize; + cih->chunkInfo.chunkSize = endOffset; + + } + assert(0 <= mUsedSpace && mUsedSpace <= mTotalSpace); +} + +void +ChunkManager::WriteDone(WriteOp* op) +{ + ChunkInfoHandle* cih = 0; + if (GetChunkInfoHandle(op->chunkId, &cih) < 0) { + return; + } + if (! cih->IsFileEquals(op->diskIo)) { + KFS_LOG_STREAM_DEBUG << + "ignoring stale write completion: " << op->Show() << + " disk io: " << reinterpret_cast(op->diskIo.get()) << + KFS_LOG_EOM; + return; + } + cih->WriteDone(op); +} + +bool +ChunkManager::ReadChunkDone(ReadOp *op) +{ + ChunkInfoHandle *cih = NULL; + + bool staleRead = false; + if ((GetChunkInfoHandle(op->chunkId, &cih) < 0) || + (op->chunkVersion != cih->chunkInfo.chunkVersion) || + (staleRead = ! 
cih->IsFileEquals(op->diskIo))) { + if (op->dataBuf) { + op->dataBuf->Clear(); + } + if (cih) { + KFS_LOG_STREAM_INFO << "Version # mismatch (have=" << + cih->chunkInfo.chunkVersion << + " vs asked=" << op->chunkVersion << ")" << + (staleRead ? " stale read" : "") << + KFS_LOG_EOM; + } + op->status = -EBADVERS; + return true; + } + + const int readLen = op->dataBuf->BytesConsumable(); + if (readLen <= 0) { + KFS_LOG_STREAM_ERROR << "Short read for" << + " chunk: " << cih->chunkInfo.chunkId << + " size: " << cih->chunkInfo.chunkSize << + " read:" + " offset: " << op->offset << + " len: " << readLen << + KFS_LOG_EOM; + if (cih->chunkInfo.chunkSize > op->offset + readLen) { + op->status = -EIO; + ChunkIOFailed(cih, op->status); + } else { + // Size has decreased while read was in flight. + // Possible race with truncation, which could be considered valid. + // Another possibility that read and write completed out of order, + // which is really a bug, especially if this really is read modify + // write. + assert(! op->wop); + op->status = -EAGAIN; + } + return true; + } + + ZeroPad(op->dataBuf); + + assert(op->dataBuf->BytesConsumable() >= (int) CHECKSUM_BLOCKSIZE); + + // either nothing to verify or it better match + + bool mismatch = false; + + // figure out the block we are starting from and grab all the checksums + vector::size_type i, checksumBlock = OffsetToChecksumBlockNum(op->offset); + op->checksum = ComputeChecksums(op->dataBuf, op->dataBuf->BytesConsumable()); + + // the checksums should be loaded... + if (!cih->chunkInfo.AreChecksumsLoaded()) { + // the read took too long; the checksums got paged out. 
ask the client to retry + KFS_LOG_STREAM_INFO << "Checksums for chunk " << + cih->chunkInfo.chunkId << + " got paged out; returning EAGAIN to client" << + KFS_LOG_EOM; + op->status = -EAGAIN; + return true; + } + + cih->chunkInfo.VerifyChecksumsLoaded(); + + for (i = 0; + i < op->checksum.size() && + checksumBlock < MAX_CHUNK_CHECKSUM_BLOCKS; + checksumBlock++, i++) { + const uint32_t checksum = + cih->chunkInfo.chunkBlockChecksum[checksumBlock]; + if (checksum == 0 && op->checksum[i] == mNullBlockChecksum && + mAllowSparseChunksFlag) { + KFS_LOG_STREAM_INFO << + " chunk: " << cih->chunkInfo.chunkId << + " block: " << checksumBlock << + " no checksum " << + " read: " << op->checksum[i] << + KFS_LOG_EOM; + continue; + } + if (op->checksum[i] != checksum) { + mismatch = true; + break; + } + } + + if (!mismatch) { + // for checksums to verify, we did reads in multiples of + // checksum block sizes. so, get rid of the extra + AdjustDataRead(op); + return true; + } + const bool retry = op->retryCnt++ < mReadChecksumMismatchMaxRetryCount; + op->status = -EBADCKSUM; + + ostringstream os; + os << + "Checksum mismatch for chunk=" << op->chunkId << + " offset=" << op->offset << + " bytes=" << op->numBytesIO << + ": expect: " << cih->chunkInfo.chunkBlockChecksum[checksumBlock] << + " computed: " << op->checksum[i] << + " try: " << op->retryCnt << + ((mAbortOnChecksumMismatchFlag && ! retry) ? " abort" : "") + ; + const string str = os.str(); + KFS_LOG_STREAM_ERROR << str << KFS_LOG_EOM; + if (retry) { + op->dataBuf->Clear(); + if (ReadChunk(op) == 0) { + return false; + } + } + if (mAbortOnChecksumMismatchFlag) { + die(str); + } + op->dataBuf->Clear(); + + // Notify the metaserver that the chunk we have is "bad"; the + // metaserver will re-replicate this chunk. 
+ mCounters.mReadChecksumErrorCount++; + ChunkIOFailed(cih, op->status); + return true; +} + +void +ChunkManager::NotifyMetaCorruptedChunk(ChunkInfoHandle* cih, int err) +{ + assert(cih); + if (err == 0) { + mCounters.mLostChunksCount++; + cih->GetDirInfo().corruptedChunksCount++; + } else { + mCounters.mCorruptedChunksCount++; + } + + KFS_LOG_STREAM_ERROR << + (err == 0 ? "lost" : "corrupted") << + " chunk: " << cih->chunkInfo.chunkId << + " file: " << cih->chunkInfo.fileId << + " error: " << err << + (err ? string() : QCUtils::SysError(-err, " ")) << + " dir: " << cih->GetDirname() << + " total:" + " lost: " << mCounters.mLostChunksCount << + " corrupted: " << mCounters.mCorruptedChunksCount << + KFS_LOG_EOM; + + // This op will get deleted when we get an ack from the metaserver + CorruptChunkOp* const op = new CorruptChunkOp( + 0, cih->chunkInfo.fileId, cih->chunkInfo.chunkId); + op->isChunkLost = err == 0; + gMetaServerSM.EnqueueOp(op); + // Meta server automatically cleans up leases for corrupted chunks. + gLeaseClerk.UnRegisterLease(cih->chunkInfo.chunkId); +} + +void +ChunkManager::ChunkIOFailed(kfsChunkId_t chunkId, int err, const DiskIo::File* file) +{ + ChunkInfoHandle* cih; + if (GetChunkInfoHandle(chunkId, &cih) < 0) { + KFS_LOG_STREAM_ERROR << + "corrupt chunk: " << chunkId << " not in table" << + KFS_LOG_EOM; + return; + } + if (! 
cih->IsFileEquals(file)) { + KFS_LOG_STREAM_DEBUG << + "ignoring stale io failure notification: " << chunkId << + " file: " << reinterpret_cast(file) << + KFS_LOG_EOM; + return; + } + ChunkIOFailed(cih, err); +} + +void +ChunkManager::ReportIOFailure(ChunkInfoHandle* cih, int err) +{ + if (err == -EAGAIN || err == -ENOMEM || err == -ETIMEDOUT) { + KFS_LOG_STREAM_ERROR << + "assuming temporary io failure chunk: " << cih->chunkInfo.chunkId << + " dir: " << cih->GetDirname() << + " " << QCUtils::SysError(-err) << + KFS_LOG_EOM; + return; + } + ChunkIOFailed(cih, err); +} + +void +ChunkManager::ChunkIOFailed(ChunkInfoHandle* cih, int err) +{ + NotifyMetaCorruptedChunk(cih, err); + StaleChunk(cih); +} + +void +ChunkManager::ChunkIOFailed(kfsChunkId_t chunkId, int err, const DiskIo* diskIo) +{ + ChunkIOFailed(chunkId, err, diskIo ? diskIo->GetFilePtr().get() : 0); +} + +// +// directory with dirname is unaccessable; maybe drive failed. so, +// notify metaserver of lost blocks. the metaserver will then +// re-replicate. +// +void +ChunkManager::NotifyMetaChunksLost(ChunkManager::ChunkDirInfo& dir) +{ + KFS_LOG_STREAM(dir.evacuateDoneFlag ? + MsgLogger::kLogLevelWARN : MsgLogger::kLogLevelERROR) << + (dir.evacuateDoneFlag ? 
"evacuate done: " : "lost") << + " chunk directory: " << dir.dirname << + KFS_LOG_EOM; + CorruptChunkOp* op = 0; + const string* dname = &(dir.dirname); + for (int i = 0; i < ChunkDirInfo::kChunkDirListCount; i++) { + ChunkDirInfo::ChunkLists& list = dir.chunkLists[i]; + ChunkInfoHandle* cih; + while ((cih = ChunkDirList::Front(list))) { + const kfsChunkId_t chunkId = cih->chunkInfo.chunkId; + const kfsFileId_t fileId = cih->chunkInfo.fileId; + // get rid of chunkid from our list + const bool staleFlag = cih->IsStale(); + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (ci && *ci == cih) { + if (mChunkTable.Erase(chunkId) <= 0) { + die("corrupted chunk table"); + } + } + const int64_t size = min(mUsedSpace, cih->chunkInfo.chunkSize); + UpdateDirSpace(cih, -size); + mUsedSpace -= size; + Delete(*cih); + if (staleFlag) { + continue; + } + KFS_LOG_STREAM_INFO << + "lost chunk: " << chunkId << + " file: " << fileId << + KFS_LOG_EOM; + mCounters.mDirLostChunkCount++; + if (! gMetaServerSM.IsConnected()) { + // If no connection exists then the meta server assumes that + // the chunks are lost anyway, and the inventory synchronization + // in the meta hello is sufficient on re-connect. + continue; + } + if (! op) { + op = new CorruptChunkOp(0, fileId, chunkId, dname); + // Do not count as corrupt. + op->isChunkLost = true; + dname = 0; + } else { + op->fid = fileId; + op->chunkId = chunkId; + op->chunkDir.clear(); + } + const int ref = op->Ref(); + gMetaServerSM.EnqueueOp(op); + assert(op->GetRef() >= ref); + if (op->GetRef() > ref) { + // Op in flight / queued allocate a new one. + op->UnRef(); + op = 0; + } + } + } + if (op) { + op->UnRef(); + } + if (! 
dir.evacuateDoneFlag) { + mCounters.mChunkDirLostCount++; + } + const bool updateFlag = dir.countFsSpaceAvailableFlag; + dir.Stop(); + if (updateFlag) { + UpdateCountFsSpaceAvailableFlags(); + } + mDirChecker.Add(dir.dirname, dir.dirLock); +} + +int +ChunkManager::UpdateCountFsSpaceAvailableFlags() +{ + int ret = 0; + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it != mChunkDirs.end(); + ++it) { + if (it->availableSpace < 0 || it->evacuateStartedFlag) { + it->countFsSpaceAvailableFlag = false; + continue; + } + ChunkDirs::const_iterator cit; + for (cit = mChunkDirs.begin(); + cit != it && + (cit->availableSpace < 0 || + ! cit->countFsSpaceAvailableFlag || + cit->deviceId != it->deviceId); + ++cit) + {} + it->countFsSpaceAvailableFlag = cit == it; + if (it->countFsSpaceAvailableFlag) { + ret++; + } + } + return ret; +} + +void +ChunkManager::ZeroPad(IOBuffer *buffer) +{ + const int bytesFilled = buffer->BytesConsumable(); + if ((bytesFilled % CHECKSUM_BLOCKSIZE) == 0) { + return; + } + const int numToZero = CHECKSUM_BLOCKSIZE - (bytesFilled % CHECKSUM_BLOCKSIZE); + if (numToZero > 0) { + // pad with 0's + buffer->ZeroFill(numToZero); + } +} + +void +ChunkManager::AdjustDataRead(ReadOp *op) +{ + op->dataBuf->Consume( + op->offset - OffsetToChecksumBlockStart(op->offset)); + op->dataBuf->Trim(op->numBytesIO); +} + +uint32_t +ChunkManager::GetChecksum(kfsChunkId_t chunkId, int64_t offset) +{ + ChunkInfoHandle *cih; + + if (offset < 0 || GetChunkInfoHandle(chunkId, &cih) < 0) + return 0; + + const uint32_t checksumBlock = OffsetToChecksumBlockNum(offset); + // the checksums should be loaded... 
+ cih->chunkInfo.VerifyChecksumsLoaded(); + + assert(checksumBlock < MAX_CHUNK_CHECKSUM_BLOCKS); + + return cih->chunkInfo.chunkBlockChecksum[ + min(MAX_CHUNK_CHECKSUM_BLOCKS - 1, checksumBlock)]; +} + +vector +ChunkManager::GetChecksums(kfsChunkId_t chunkId, int64_t offset, size_t numBytes) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + + if (offset < 0 || ! ci) { + return vector(); + } + + const ChunkInfoHandle * const cih = *ci; + // the checksums should be loaded... + cih->chunkInfo.VerifyChecksumsLoaded(); + + return (vector( + cih->chunkInfo.chunkBlockChecksum + + OffsetToChecksumBlockNum(offset), + cih->chunkInfo.chunkBlockChecksum + + min(MAX_CHUNK_CHECKSUM_BLOCKS, + OffsetToChecksumBlockNum( + offset + numBytes + CHECKSUM_BLOCKSIZE - 1)) + )); +} + +DiskIo* +ChunkManager::SetupDiskIo(ChunkInfoHandle *cih, KfsCallbackObj *op) +{ + if (! cih->IsFileOpen()) { + CleanupInactiveFds(); + if (OpenChunk(cih, O_RDWR) < 0) { + return 0; + } + } + LruUpdate(*cih); + return new DiskIo(cih->dataFH, op); +} + +int +ChunkManager::Restart() +{ + if (gLogger.GetVersionFromCkpt() != gLogger.GetLoggerVersionNum()) { + KFS_LOG_STREAM_FATAL << + "Unsupported log version. Copy out the data and copy it back in." << + KFS_LOG_EOM; + return -1; + } + Restore(); + return 0; +} + +// +// On a restart, whatever chunks were dirty need to be nuked: we may +// have had writes pending to them and we never flushed them to disk. +// +void +ChunkManager::RemoveDirtyChunks() +{ + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it != mChunkDirs.end(); + ++it) { + if (it->availableSpace < 0) { + continue; + } + const string dir = it->dirname + mDirtyChunksDir; + DIR* const dirStream = opendir(dir.c_str()); + if (! 
dirStream) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "unable to open " << dir << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + continue; + } + struct dirent const* dent; + while ((dent = readdir(dirStream))) { + const string name = dir + dent->d_name; + struct stat buf; + if (stat(name.c_str(), &buf) || ! S_ISREG(buf.st_mode)) { + continue; + } + KFS_LOG_STREAM_INFO << + "Cleaning out dirty chunk: " << name << + KFS_LOG_EOM; + if (unlink(name.c_str())) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "unable to remove " << name << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + } + } + closedir(dirStream); + } +} + +void +ChunkManager::Restore() +{ + RemoveDirtyChunks(); + bool scheduleEvacuateFlag = false; + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it != mChunkDirs.end(); + ++it) { + if (it->availableSpace < 0) { + continue; + } + const string& dir = it->dirname; + if (! mEvacuateDoneFileName.empty()) { + const string name(dir + mEvacuateDoneFileName); + struct stat buf; + if (stat(name.c_str(), &buf) == 0) { + KFS_LOG_STREAM_INFO << + "ignoring directory: " << dir << + " file: " << mEvacuateDoneFileName << " exists" << + KFS_LOG_EOM; + it->availableSpace = -1; + continue; + } + const int err = errno; + if (err != ENOENT) { + KFS_LOG_STREAM_INFO << + "ignoring directory: " << dir << + " file: " << mEvacuateDoneFileName << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + it->availableSpace = -1; + continue; + } + } + DIR* const dirStream = opendir(dir.c_str()); + if (! 
dirStream) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + "unable to open directory: " << dir << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + it->availableSpace = -1; + continue; + } + struct dirent const* dent; + while ((dent = readdir(dirStream))) { + if (dent->d_name == mEvacuateFileName) { + KFS_LOG_STREAM_INFO << + "evacuate directory: " << dir << + " file: " << mEvacuateFileName << " exists" << + KFS_LOG_EOM; + it->evacuateFlag = true; + scheduleEvacuateFlag = true; + } + if (dent->d_name == mChunkDirLockName) { + continue; + } + string const name(dir + dent->d_name); + struct stat buf; + if (stat(name.c_str(), &buf)) { + const int err = errno; + KFS_LOG_STREAM_INFO << + "ignoring directory entry: " << name << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + } else if (S_ISREG(buf.st_mode)) { + AddMapping(*it, dent->d_name, buf.st_size); + } + } + closedir(dirStream); + } + if (scheduleEvacuateFlag) { + UpdateCountFsSpaceAvailableFlags(); + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it != mChunkDirs.end(); ++it) { + if (it->evacuateFlag) { + it->ScheduleEvacuate(); + } + } + } + mDirChecker.SetRemoveFilesFlag(mCleanupChunkDirsFlag); +} + +void +ChunkManager::AddMapping(ChunkInfoHandle *cih) +{ + bool newEntryFlag = false; + ChunkInfoHandle** const ci = mChunkTable.Insert( + cih->chunkInfo.chunkId, cih, newEntryFlag); + if (! ci) { + die("add mapping failure"); + } + if (! newEntryFlag) { + *ci = cih; + } + mUsedSpace += cih->chunkInfo.chunkSize; + UpdateDirSpace(cih, cih->chunkInfo.chunkSize); +} + +void +ChunkManager::GetHostedChunks( + vector &stable, + vector ¬Stable, + vector ¬StableAppend) +{ + // walk thru the table and pick up the chunk-ids + mChunkTable.First(); + const CMapEntry* p; + while ((p = mChunkTable.Next())) { + const ChunkInfoHandle* const cih = p->GetVal(); + if (cih->isBeingReplicated) { + // Do not report replicated chunks, replications should be canceled + // on reconnect. 
+ continue; + } + if (cih->IsRenameInFlight()) { + // Tell meta server the target version. It comes here when the + // meta server connection breaks while make stable or version change + // is in flight. + // Report the target version and status, otherwise meta server might + // think that this is stale chunk copy, and delete it. + // This creates time gap with the client: the chunk still might be + // transitioning when the read comes. In such case the chunk will + // not be "readable" and the client will be asked to come back later. + bool stableFlag = false; + const kfsSeq_t vers = cih->GetTargetStateAndVersion(stableFlag); + vector& dest = stableFlag ? stable : + (cih->IsWriteAppenderOwns() ? notStableAppend : notStable); + dest.push_back(cih->chunkInfo); + dest.back().chunkVersion = vers; + } else { + (IsChunkStable(cih) ? stable : + (cih->IsWriteAppenderOwns() ? + notStableAppend : notStable + )).push_back(cih->chunkInfo); + } + } +} + +int +ChunkManager::GetChunkInfoHandle(kfsChunkId_t chunkId, ChunkInfoHandle **cih) +{ + ChunkInfoHandle** const ci = mChunkTable.Find(chunkId); + if (! ci) { + *cih = 0; + return -EBADF; + } + *cih = *ci; + return 0; +} + +int +ChunkManager::AllocateWriteId(WriteIdAllocOp *wi, int replicationPos, ServerLocation peerLoc) +{ + ChunkInfoHandle *cih = 0; + + if (GetChunkInfoHandle(wi->chunkId, &cih) < 0) { + wi->statusMsg = "no such chunk"; + wi->status = -EBADF; + } else if (wi->chunkVersion != cih->chunkInfo.chunkVersion) { + wi->statusMsg = "chunk version mismatch"; + wi->status = -EINVAL; + } else if (wi->isForRecordAppend && IsWritePending(wi->chunkId)) { + wi->statusMsg = "random write in progress"; + wi->status = -EINVAL; + } else if (wi->isForRecordAppend && ! IsWriteAppenderOwns(wi->chunkId)) { + wi->statusMsg = "not open for append"; + wi->status = -EINVAL; + } else if (! 
wi->isForRecordAppend && cih->IsWriteAppenderOwns()) { + wi->statusMsg = "write append in progress"; + wi->status = -EINVAL; + } else { + mWriteId++; + wi->writeId = mWriteId; + if (wi->isForRecordAppend) { + gAtomicRecordAppendManager.AllocateWriteId( + wi, replicationPos, peerLoc, cih->dataFH); + } else if (cih->IsStable()) { + wi->statusMsg = "chunk stable"; + wi->status = -EINVAL; + } else if (cih->IsRenameInFlight()) { + wi->statusMsg = "chunk state transition is in progress"; + wi->status = -EAGAIN; + } else { + WriteOp* const op = new WriteOp( + wi->seq, wi->chunkId, wi->chunkVersion, + wi->offset, wi->numBytes, NULL, mWriteId + ); + op->enqueueTime = globalNetManager().Now(); + op->isWriteIdHolder = true; + mPendingWrites.push_back(op); + } + } + if (wi->status != 0) { + KFS_LOG_STREAM_ERROR << + "failed: " << wi->Show() << + KFS_LOG_EOM; + } + return wi->status; +} + +int64_t +ChunkManager::GetChunkVersion(kfsChunkId_t c) +{ + ChunkInfoHandle *cih; + + if (GetChunkInfoHandle(c, &cih) < 0) + return -1; + + return cih->chunkInfo.chunkVersion; +} + +WriteOp * +ChunkManager::CloneWriteOp(int64_t writeId) +{ + WriteOp* const other = mPendingWrites.find(writeId); + if (! other || other->status < 0) { + // if the write is "bad" already, don't add more data to it + if (other) { + KFS_LOG_STREAM_ERROR << + "clone write op failed due to status: " << other->status << + KFS_LOG_EOM; + } + return 0; + } + + // Since we are cloning, "touch" the time + other->enqueueTime = globalNetManager().Now(); + // offset/size/buffer are to be filled in + return new WriteOp(other->seq, other->chunkId, other->chunkVersion, + 0, 0, NULL, other->writeId); +} + +void +ChunkManager::SetWriteStatus(int64_t writeId, int status) +{ + WriteOp* const op = mPendingWrites.find(writeId); + if (! 
op) { + return; + } + op->status = status; + + KFS_LOG_STREAM_INFO << + "setting the status of writeid: " << writeId << " to " << status << + KFS_LOG_EOM; +} + +int +ChunkManager::GetWriteStatus(int64_t writeId) +{ + const WriteOp* const op = mPendingWrites.find(writeId); + return (op ? op->status : -EINVAL); +} + +void +ChunkManager::RunStaleChunksQueue(bool completionFlag) +{ + if (completionFlag) { + assert(mStaleChunkOpsInFlight > 0); + mStaleChunkOpsInFlight--; + } + ChunkList::Iterator it(mChunkInfoLists[kChunkStaleList]); + ChunkInfoHandle* cih; + while (mStaleChunkOpsInFlight < mMaxStaleChunkOpsInFlight && + (cih = it.Next())) { + // If the chunk with target version already exists, then do not issue + // delete. + // If the existing chunk is already stable but the chunk to delete has + // the same version but it is not stable, then the file is likely have + // already been deleted , when the existing chunk transitioned into + // stable version. If not then unstable chunk will be cleaned up on the + // next restart. + ChunkInfoHandle** const ci = mChunkTable.Find(cih->chunkInfo.chunkId); + if (! ci || + ! (*ci)->CanHaveVersion(cih->chunkInfo.chunkVersion)) { + if (cih->IsKeep()) { + if (MarkChunkStale(cih, &mStaleChunkCompletion) == 0) { + mStaleChunkOpsInFlight++; + } + } else { + const string fileName = MakeChunkPathname(cih); + string err; + const bool ok = DiskIo::Delete( + fileName.c_str(), &mStaleChunkCompletion, &err); + if (ok) { + mStaleChunkOpsInFlight++; + } + KFS_LOG_STREAM(ok ? + MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelERROR) << + "deleting stale chunk: " << fileName << + (ok ? 
" ok" : " error: ") << err << + " in flight: " << mStaleChunkOpsInFlight << + KFS_LOG_EOM; + } + } + const int64_t size = min(mUsedSpace, cih->chunkInfo.chunkSize); + UpdateDirSpace(cih, -size); + mUsedSpace -= size; + Delete(*cih); + } +} + +void +ChunkManager::Timeout() +{ + const time_t now = globalNetManager().Now(); + + if (now >= mNextCheckpointTime) { + mNextCheckpointTime = globalNetManager().Now() + mCheckpointIntervalSecs; + // if any writes have been around for "too" long, remove them + // and reclaim memory + ScavengePendingWrites(now); + // cleanup inactive fd's and thereby free up fd's + CleanupInactiveFds(now); + } + if (mNextChunkDirsCheckTime < now) { + // once in a while check that the drives hosting the chunks are good. + CheckChunkDirs(); + mNextChunkDirsCheckTime = now + mChunkDirsCheckIntervalSecs; + } + if (mNextGetFsSpaceAvailableTime < now) { + GetFsSpaceAvailable(); + mNextGetFsSpaceAvailableTime = now + mGetFsSpaceAvailableIntervalSecs; + } + gLeaseClerk.Timeout(); + gAtomicRecordAppendManager.Timeout(); +} + +void +ChunkManager::ScavengePendingWrites(time_t now) +{ + const time_t opExpireTime = now - mMaxPendingWriteLruSecs; + + while (! mPendingWrites.empty()) { + WriteOp* const op = mPendingWrites.front(); + // The list is sorted by enqueue time + if (opExpireTime < op->enqueueTime) { + break; + } + // if it exceeds 5 mins, retire the op + KFS_LOG_STREAM_DEBUG << + "Retiring write with id=" << op->writeId << + " as it has been too long" << + KFS_LOG_EOM; + mPendingWrites.pop_front(); + + ChunkInfoHandle *cih; + if (GetChunkInfoHandle(op->chunkId, &cih) == 0) { + if (now - cih->lastIOTime >= mInactiveFdsCleanupIntervalSecs) { + // close the chunk only if it is inactive + CloseChunk(cih); + // CloseChunk never deletes cih + } + if (cih->IsFileOpen() && + ! 
ChunkLru::IsInList(mChunkInfoLists[kChunkLruList], *cih)) { + LruUpdate(*cih); + } + } + delete op; + } +} + +int +ChunkManager::Sync(WriteOp *op) +{ + if (!op->diskIo) { + return -1; + } + return op->diskIo->Sync(op->waitForSyncDone); +} + +void +ChunkManager::CleanupInactiveFds(time_t now) +{ + const bool periodic = now > 0; + // if we haven't cleaned up in 5 mins or if we too many fd's that + // are open, clean up. + if (periodic) { + if (now < mNextInactiveFdCleanupTime) { + return; + } + } else { + const uint64_t openChunkCnt = globals().ctrOpenDiskFds.GetValue(); + if (openChunkCnt < (uint64_t)mMaxOpenChunkFiles && + openChunkCnt * mFdsPerChunk + + globals().ctrOpenNetFds.GetValue() < + (uint64_t)mMaxOpenFds) { + return; + } + } + + const time_t cur = periodic ? now : globalNetManager().Now(); + // either we are periodic cleaning or we have too many FDs open + // shorten the interval if we're out of fd. + const time_t expireTime = cur - (periodic ? + mInactiveFdsCleanupIntervalSecs : + (mInactiveFdsCleanupIntervalSecs + 2) / 3); + ChunkLru::Iterator it(mChunkInfoLists[kChunkLruList]); + ChunkInfoHandle* cih; + while ((cih = it.Next()) && cih->lastIOTime < expireTime) { + if (! cih->IsFileOpen() || cih->isBeingReplicated) { + // Doesn't belong here, if / when io completes it will be added back. + ChunkLru::Remove(mChunkInfoLists[kChunkLruList], *cih); + continue; + } + bool inUse; + bool hasLease = false; + if ((inUse = cih->IsFileInUse()) || + (hasLease = gLeaseClerk.IsLeaseValid(cih->chunkInfo.chunkId)) || + IsWritePending(cih->chunkInfo.chunkId)) { + KFS_LOG_STREAM_DEBUG << "cleanup: stale entry in chunk lru:" + " fileid: " << (const void*)cih->dataFH.get() << + " chunk: " << cih->chunkInfo.chunkId << + " last io: " << (now - cih->lastIOTime) << " sec. ago" << + (inUse ? " file in use" : "") << + (hasLease ? 
" has lease" : "") << + KFS_LOG_EOM; + continue; + } + if (cih->SyncMeta()) { + continue; + } + // we have a valid file-id and it has been over 5 mins since we last did + // I/O on it. + KFS_LOG_STREAM_DEBUG << "cleanup: closing" + " fileid: " << (const void*)cih->dataFH.get() << + " chunk: " << cih->chunkInfo.chunkId << + " last io: " << (now - cih->lastIOTime) << " sec. ago" << + KFS_LOG_EOM; + Release(*cih); + } + cih = ChunkLru::Front(mChunkInfoLists[kChunkLruList]); + mNextInactiveFdCleanupTime = mInactiveFdsCleanupIntervalSecs + + ((cih && cih->lastIOTime > expireTime) ? cih->lastIOTime : cur); +} + +bool +ChunkManager::StartDiskIo() +{ + if ((int)KFS_CHUNK_HEADER_SIZE < IOBufferData::GetDefaultBufferSize()) { + KFS_LOG_STREAM_INFO << + "invalid io buffer size: " << + IOBufferData::GetDefaultBufferSize() << + " exceeds chunk header size: " << KFS_CHUNK_HEADER_SIZE << + KFS_LOG_EOM; + return false; + } + mDirChecker.SetLockFileName(mChunkDirLockName); + mDirChecker.SetRemoveFilesFlag(false); + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); ++it) { + mDirChecker.Add(it->dirname); + } + mDirChecker.SetInterval(mChunkDirsCheckIntervalSecs * 1000); + mDirChecker.AddSubDir(mStaleChunksDir); + mDirChecker.AddSubDir(mDirtyChunksDir); + DirChecker::DirsAvailable dirs; + mDirChecker.Start(dirs); + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it != mChunkDirs.end(); + ++it) { + DirChecker::DirsAvailable::const_iterator const dit = + dirs.find(it->dirname); + if (dit == dirs.end()) { + KFS_LOG_STREAM_INFO << it->dirname << + ": not using" << + KFS_LOG_EOM; + it->availableSpace = -1; + NotifyMetaChunksLost(*it); + continue; + } + // UpdateCountFsSpaceAvailableFlags() below will set the following flag. + it->countFsSpaceAvailableFlag = false; + it->deviceId = dit->second.first; + it->dirLock = dit->second.second; + it->availableSpace = 0; + it->totalSpace = it->usedSpace; + string errMsg; + if (! 
DiskIo::StartIoQueue( + it->dirname.c_str(), + it->deviceId, + mMaxOpenChunkFiles, + &errMsg)) { + KFS_LOG_STREAM_ERROR << + "Failed to start disk queue for: " << it->dirname << + " dev: << " << it->deviceId << " :" << errMsg << + KFS_LOG_EOM; + DiskIo::Shutdown(); + return false; + } + if (! (it->diskQueue = DiskIo::FindDiskQueue(it->dirname.c_str()))) { + die(it->dirname + ": failed to find disk queue"); + } + KFS_LOG_STREAM_INFO << + "chunk directory: " << it->dirname << + " devId: " << it->deviceId << + " space:" + " available: " << it->availableSpace << + " used: " << it->usedSpace << + KFS_LOG_EOM; + } + mMaxIORequestSize = min(CHUNKSIZE, DiskIo::GetMaxRequestSize()); + UpdateCountFsSpaceAvailableFlags(); + GetFsSpaceAvailable(); + return true; +} + +int64_t +ChunkManager::GetTotalSpace(int64_t& totalFsSpace, int& chunkDirs, + int& evacuateInFlightCount, int& writableDirs, + int& evacuateChunks, int64_t& evacuateByteCount, + int* evacuateDoneChunkCount, int64_t* evacuateDoneByteCount, + HelloMetaOp::LostChunkDirs* lostChunkDirs) +{ + totalFsSpace = 0; + chunkDirs = 0; + writableDirs = 0; + evacuateInFlightCount = 0; + evacuateChunks = 0; + evacuateByteCount = 0; + int evacuateDoneChunks = 0; + int64_t evacuateDoneBytes = 0; + int64_t totalFsAvailableSpace = 0; + int64_t usedSpace = 0; + for (ChunkDirs::const_iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); ++it) { + if (it->availableSpace < 0) { + if (lostChunkDirs) { + lostChunkDirs->insert(lostChunkDirs->end(), it->dirname); + } + continue; + } + if (it->evacuateFlag) { + // Never send evacuate count to the meta server <= 0 while + // evacuation is in progress -- the meta server clears evacuation + // queue when counter is 0. + // The counter can be sent on heartbeat, while evacuation response + // in flight, so the two can potentially get out of sync. 
+ evacuateInFlightCount += max(1, it->evacuateInFlightCount); + evacuateChunks += it->chunkCount; + evacuateByteCount += it->usedSpace; + evacuateDoneChunks += it->GetEvacuateDoneChunkCount(); + evacuateDoneBytes += it->GetEvacuateDoneByteCount(); + } else { + if (it->availableSpace > mMinFsAvailableSpace && + it->availableSpace > + it->totalSpace * mMaxSpaceUtilizationThreshold) { + writableDirs++; + } + } + chunkDirs++; + if (it->countFsSpaceAvailableFlag) { + totalFsSpace += it->totalSpace; + if (it->availableSpace > mMinFsAvailableSpace) { + totalFsAvailableSpace += + it->availableSpace - mMinFsAvailableSpace; + } + } + usedSpace += it->usedSpace; + KFS_LOG_STREAM_DEBUG << + "chunk directory: " << it->dirname << + " has space " << it->availableSpace << + " total: " << totalFsAvailableSpace << + " used: " << usedSpace << + " limit: " << mTotalSpace << + KFS_LOG_EOM; + } + if (evacuateDoneChunkCount) { + *evacuateDoneChunkCount = evacuateDoneChunks; + } + if (evacuateDoneByteCount) { + *evacuateDoneByteCount = evacuateDoneBytes; + } + return (min(totalFsAvailableSpace, mTotalSpace) + mUsedSpace); +} + +int +ChunkManager::ChunkDirInfo::CheckDirReadableDone(int code, void* data) +{ + if ((code != EVENT_DISK_CHECK_DIR_READABLE_DONE && + code != EVENT_DISK_ERROR) || ! checkDirReadableFlightFlag) { + die("CheckDirReadableDone invalid completion"); + } + + checkDirReadableFlightFlag = false; + if (availableSpace < 0) { + return 0; // Ignore, already marked not in use. 
+ } + + if (code == EVENT_DISK_ERROR) { + DiskError(*reinterpret_cast(data)); + } else { + KFS_LOG_STREAM_DEBUG << + "chunk directory: " << dirname << " is readable" + " space: " << availableSpace << + " used: " << usedSpace << + " dev: " << deviceId << + " queue: " << (const void*)diskQueue << + KFS_LOG_EOM; + diskTimeoutCount = 0; + } + return 0; +} + +int +ChunkManager::ChunkDirInfo::FsSpaceAvailDone(int code, void* data) +{ + if ((code != EVENT_DISK_GET_FS_SPACE_AVAIL_DONE && + code != EVENT_DISK_ERROR) || ! fsSpaceAvailInFlightFlag) { + die("FsSpaceAvailDone invalid completion"); + } + + fsSpaceAvailInFlightFlag = false; + if (availableSpace < 0) { + return 0; // Ignore, already marked not in use. + } + + if (code == EVENT_DISK_ERROR) { + DiskError(*reinterpret_cast(data)); + } else { + if (availableSpace >= 0) { + const int64_t* const ret = + reinterpret_cast(data); + const int64_t fsAvail = ret[0]; + const int64_t fsTotal = ret[1]; + KFS_LOG_STREAM_DEBUG << + "chunk directory: " << dirname << + " available: " << availableSpace << + " => " << fsAvail << + " total: " << totalSpace << + " => " << fsTotal << + " used: " << usedSpace << + KFS_LOG_EOM; + availableSpace = max(int64_t(0), fsAvail); + totalSpace = max(int64_t(0), fsTotal); + } + diskTimeoutCount = 0; + } + return 0; +} + +void +ChunkManager::ChunkDirInfo::DiskError(int sysErr) +{ + if (availableSpace < 0) { + return; // Ignore, already marked not in use. 
+ } + KFS_LOG_STREAM_ERROR << + "chunk directory: " << dirname << + " error: " << QCUtils::SysError(-sysErr) << + " space:" + " available: " << availableSpace << + " used: " << usedSpace << + KFS_LOG_EOM; + if ((sysErr != -EMFILE && sysErr != -ENFILE) && + (sysErr != -ETIMEDOUT || ++diskTimeoutCount > + gChunkManager.GetMaxDirCheckDiskTimeouts())) { + gChunkManager.NotifyMetaChunksLost(*this); + } +} + +int +ChunkManager::ChunkDirInfo::CheckEvacuateFileDone(int code, void* data) +{ + if ((code != EVENT_DISK_GET_FS_SPACE_AVAIL_DONE && + code != EVENT_DISK_ERROR) || ! checkEvacuateFileInFlightFlag) { + die("CheckEvacuateFileDone invalid completion"); + } + + checkEvacuateFileInFlightFlag = false; + if (availableSpace < 0) { + return 0; // Ignore, already marked not in use. + } + + if (code == EVENT_DISK_ERROR) { + const int sysErr = *reinterpret_cast(data); + KFS_LOG_STREAM(sysErr == -ENOENT ? + MsgLogger::kLogLevelDEBUG : + MsgLogger::kLogLevelERROR) << + "chunk directory: " << dirname << + " \"evacuate\"" + " error: " << QCUtils::SysError(-sysErr) << + " space: " << availableSpace << + " used: " << usedSpace << + " dev: " << deviceId << + " queue: " << (const void*)diskQueue << + KFS_LOG_EOM; + if (sysErr == -EIO) { + if (++evacuateCheckIoErrorsCount >= + gChunkManager.GetMaxEvacuateIoErrors()) { + DiskError(sysErr); + } + } else { + evacuateCheckIoErrorsCount = 0; + } + } else if (! evacuateFlag) { + KFS_LOG_STREAM_INFO << + "chunk directory: " << dirname << + " \"evacuate\"" + " space: " << availableSpace << + " used: " << usedSpace << + " dev: " << deviceId << + " queue: " << (const void*)diskQueue << + KFS_LOG_EOM; + diskTimeoutCount = 0; + evacuateFlag = true; + ScheduleEvacuate(); + } + return 0; +} + +int +ChunkManager::ChunkDirInfo::EvacuateChunksDone(int code, void* data) +{ + if (code != EVENT_CMD_DONE || data != &evacuateChunksOp || + ! 
evacuateChunksOpInFlightFlag) { + die("EvacuateChunksDone invalid completion"); + } + + evacuateChunksOpInFlightFlag = false; + if (availableSpace < 0) { + return 0; // Ignore, already marked not in use. + } + + if (! evacuateFlag) { + return 0; + } + UpdateLastEvacuationActivityTime(); + if (evacuateChunksOp.status != 0) { + if (! evacuateStartedFlag && evacuateChunksOp.status == -EAGAIN) { + SetEvacuateStarted(); + } + if (! evacuateStartedFlag || (evacuateInFlightCount <= 0 && + (evacuateChunksOp.status != -EAGAIN || + evacuateChunksOp.numChunks <= 1))) { + // Restart from the evacuate file check, in order to try again with + // a delay. + if (! ChunkDirList::IsEmpty(chunkLists[kChunkDirEvacuateList])) { + die("non empty evacuate list"); + } + evacuateStartedFlag = false; + evacuateFlag = false; + KFS_LOG_STREAM_WARN << + "evacuate: " << dirname << + " status: " << evacuateChunksOp.status << + " restarting from evacuation file check" << + KFS_LOG_EOM; + } + if (evacuateStartedFlag == countFsSpaceAvailableFlag) { + gChunkManager.UpdateCountFsSpaceAvailableFlags(); + } + rescheduleEvacuateThreshold = max(0, + evacuateInFlightCount - max(0, evacuateChunksOp.numChunks)); + if (evacuateInFlightCount <= 0 && evacuateStartedFlag) { + // Do one chunk at a time if we get -EAGAIN and no + // evacuations are in flight at the moment. + ScheduleEvacuate(1); + } + return 0; + } + + SetEvacuateStarted(); + if (countFsSpaceAvailableFlag) { + gChunkManager.UpdateCountFsSpaceAvailableFlags(); + } + // Minor optimization: try to traverse the chunk list first, it likely + // that all chunks that were scheduled for evacuation are still in the list + // in the same order that they were scheduled. + ChunkDirList::Iterator it(chunkLists[kChunkDirList]); + int i; + for (i = 0; i < evacuateChunksOp.numChunks; i++) { + ChunkInfoHandle* const cih = it.Next(); + if (! 
cih || cih->chunkInfo.chunkId != evacuateChunksOp.chunkIds[i]) { + break; + } + cih->SetEvacuate(true); + } + for ( ; i < evacuateChunksOp.numChunks; i++) { + ChunkInfoHandle* cih; + if (gChunkManager.GetChunkInfoHandle( + evacuateChunksOp.chunkIds[i], &cih) == 0 && + &(cih->GetDirInfo()) == this) { + cih->SetEvacuate(true); + } + } + ScheduleEvacuate(); + return 0; +} + +int +ChunkManager::ChunkDirInfo::RenameEvacuateFileDone(int code, void* data) +{ + if ((code != EVENT_DISK_RENAME_DONE && + code != EVENT_DISK_ERROR) || ! evacuateFileRenameInFlightFlag) { + die("RenameEvacuateFileDone invalid completion"); + } + + evacuateFileRenameInFlightFlag = false; + if (availableSpace < 0) { + return 0; // Ignore, already marked not in use. + } + + if (code == EVENT_DISK_ERROR) { + DiskError(*reinterpret_cast(data)); + } else { + KFS_LOG_STREAM_DEBUG << + "chunk directory: " << dirname << " evacuation done" + " space: " << availableSpace << + " used: " << usedSpace << + " dev: " << deviceId << + " queue: " << (const void*)diskQueue << + KFS_LOG_EOM; + diskTimeoutCount = 0; + evacuateDoneFlag = true; + gChunkManager.NotifyMetaChunksLost(*this); + } + return 0; +} + +void +ChunkManager::ChunkDirInfo::ScheduleEvacuate( + int maxChunkCount) +{ + if (availableSpace < 0) { + return; // Ignore, already marked not in use. + } + + if (evacuateChunksOpInFlightFlag || ! evacuateFlag || + ! globalNetManager().IsRunning()) { + return; + } + if (evacuateStartedFlag && + ChunkDirList::IsEmpty(chunkLists[kChunkDirList])) { + if (evacuateInFlightCount > 0 || + ! 
ChunkDirList::IsEmpty(chunkLists[kChunkDirEvacuateList])) { + return; + } + if (evacuateDoneFlag || evacuateFileRenameInFlightFlag) { + return; + } + if (gChunkManager.GetEvacuateFileName().empty() || + gChunkManager.GetEvacuateDoneFileName().empty()) { + evacuateDoneFlag = true; + return; + } + const string src = dirname + gChunkManager.GetEvacuateFileName(); + const string dst = dirname + gChunkManager.GetEvacuateDoneFileName(); + string statusMsg; + evacuateFileRenameInFlightFlag = true; + if (! DiskIo::Rename( + src.c_str(), + dst.c_str(), + &renameEvacuateFileCb, + &statusMsg)) { + KFS_LOG_STREAM_ERROR << + "evacuate done rename " << + src << " to " << dst << + " " << statusMsg << + KFS_LOG_EOM; + evacuateFileRenameInFlightFlag = false; // Retry later + } + return; + } + if (evacuateStartedFlag) { + evacuateChunksOp.totalSpace = -1; + evacuateChunksOp.totalFsSpace = -1; + evacuateChunksOp.usedSpace = -1; + evacuateChunksOp.chunkDirs = -1; + evacuateChunksOp.writableChunkDirs = -1; + evacuateChunksOp.evacuateInFlightCount = -1; + evacuateChunksOp.numChunks = 0; + evacuateChunksOp.evacuateChunks = -1; + evacuateChunksOp.evacuateByteCount = -1; + const int maxCnt = maxChunkCount > 0 ? + min(int(EvacuateChunksOp::kMaxChunkIds), maxChunkCount) : + EvacuateChunksOp::kMaxChunkIds; + ChunkDirList::Iterator it(chunkLists[kChunkDirList]); + ChunkInfoHandle* cih; + while (evacuateChunksOp.numChunks < maxCnt && (cih = it.Next())) { + evacuateChunksOp.chunkIds[evacuateChunksOp.numChunks++] = + cih->chunkInfo.chunkId; + } + } else { + KFS_LOG_STREAM_WARN << + "evacuate: " << dirname << + " starting" << + KFS_LOG_EOM; + // On the first evacuate update the meta server space, in order to + // to prevent chunk allocation failures. + // When the response comes back the evacuate started flag is set to + // true. 
+ const bool updateFlag = countFsSpaceAvailableFlag; + SetEvacuateStarted(); + if (updateFlag) { + gChunkManager.UpdateCountFsSpaceAvailableFlags(); + } + evacuateChunksOp.totalSpace = gChunkManager.GetTotalSpace( + evacuateChunksOp.totalFsSpace, + evacuateChunksOp.chunkDirs, + evacuateChunksOp.evacuateInFlightCount, + evacuateChunksOp.writableChunkDirs, + evacuateChunksOp.evacuateChunks, + evacuateChunksOp.evacuateByteCount + ); + evacuateChunksOp.usedSpace = gChunkManager.GetUsedSpace(); + evacuateStartedFlag = false; + if (updateFlag) { + gChunkManager.UpdateCountFsSpaceAvailableFlags(); + } + } + UpdateLastEvacuationActivityTime(); + // Submit op even if the chunk list is empty in order to update meta + // server's free space counters. + evacuateChunksOpInFlightFlag = true; + evacuateChunksOp.status = 0; + gMetaServerSM.EnqueueOp(&evacuateChunksOp); +} + +void +ChunkManager::ChunkDirInfo::RestartEvacuation() +{ + if (availableSpace < 0) { + return; // Ignore, already marked not in use. + } + if (! evacuateStartedFlag) { + return; + } + KFS_LOG_STREAM_WARN << + "evacuate: " << dirname << + " restarting" + " in flight: " << evacuateInFlightCount << + KFS_LOG_EOM; + ChunkDirInfo::ChunkLists& list = chunkLists[kChunkDirEvacuateList]; + ChunkInfoHandle* cih; + while ((cih = ChunkDirList::Front(list))) { + cih->SetEvacuate(false); + } + ScheduleEvacuate(); +} + +void +ChunkManager::MetaServerConnectionLost() +{ + mMetaEvacuateCount = -1; + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); ++it) { + if (it->availableSpace < 0 || ! it->evacuateFlag) { + continue; + } + // Take directory out of allocation now. Hello will update the + // meta server's free space parameters used in chunk placement. 
+ it->SetEvacuateStarted(); + if (it->countFsSpaceAvailableFlag) { + UpdateCountFsSpaceAvailableFlags(); + } + it->RestartEvacuation(); + } +} + +long +ChunkManager::GetNumWritableChunks() const +{ + return (long)mPendingWrites.GetChunkIdCount(); +} + +void +ChunkManager::CheckChunkDirs() +{ + KFS_LOG_STREAM_DEBUG << "Checking chunk dirs" << KFS_LOG_EOM; + + DirChecker::DirsAvailable dirs; + mDirChecker.GetNewlyAvailable(dirs); + bool getFsSpaceAvailFlag = false; + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); ++it) { + if (it->availableSpace < 0 || it->checkDirReadableFlightFlag) { + DirChecker::DirsAvailable::const_iterator const dit = + dirs.find(it->dirname); + if (dit == dirs.end()) { + continue; + } + if (it->checkDirReadableFlightFlag) { + // Add it back, and wait in flight op completion. + mDirChecker.Add(it->dirname); + continue; + } + string errMsg; + if (DiskIo::StartIoQueue( + it->dirname.c_str(), + dit->second.first, + mMaxOpenChunkFiles, + &errMsg)) { + if (! (it->diskQueue = DiskIo::FindDiskQueue( + it->dirname.c_str()))) { + die(it->dirname + ": failed to find disk queue"); + } + it->availableSpace = 0; + it->deviceId = dit->second.first; + it->dirLock = dit->second.second; + it->corruptedChunksCount = 0; + it->evacuateCheckIoErrorsCount = 0; + ChunkDirs::const_iterator cit; + for (cit = mChunkDirs.begin(); cit != mChunkDirs.end(); ++cit) { + if (cit == it || cit->availableSpace < 0) { + continue; + } + if (it->deviceId == cit->deviceId && + it->countFsSpaceAvailableFlag) { + break; + } + } + it->countFsSpaceAvailableFlag = cit == mChunkDirs.end(); + KFS_LOG_STREAM_INFO << + "chunk directory: " << it->dirname << + " devId: " << it->deviceId << + " space:" + " used: " << it->usedSpace << + " countAvail: " << it->countFsSpaceAvailableFlag << + KFS_LOG_EOM; + getFsSpaceAvailFlag = true; + // Notify meta serve that directory is now in use. 
+ gMetaServerSM.EnqueueOp( + new CorruptChunkOp(0, -1, -1, &(it->dirname), true)); + continue; + } + KFS_LOG_STREAM_ERROR << + "failed to start disk queue for: " << it->dirname << + " dev: << " << it->deviceId << " :" << errMsg << + KFS_LOG_EOM; + // For now do not keep trying. + // mDirChecker.Add(it->dirname); + continue; + } + string err; + it->checkDirReadableFlightFlag = true; + if (! DiskIo::CheckDirReadable( + it->dirname.c_str(), &(it->checkDirReadableCb), &err)) { + it->checkDirReadableFlightFlag = false; + KFS_LOG_STREAM_ERROR << "failed to queue" + " check dir readable request for: " << it->dirname << + " : " << err << + KFS_LOG_EOM; + // Do not declare directory unusable on req. queueing failure. + // DiskIo can be temp. out of requests. + } + } + if (getFsSpaceAvailFlag) { + GetFsSpaceAvailable(); + } +} + +void +ChunkManager::GetFsSpaceAvailable() +{ + for (ChunkDirs::iterator it = mChunkDirs.begin(); + it < mChunkDirs.end(); ++it) { + if (it->availableSpace < 0) { + continue; + } + string err; + if (! it->evacuateFlag && ! it->checkEvacuateFileInFlightFlag) { + const string fn = it->dirname + mEvacuateFileName; + it->checkEvacuateFileInFlightFlag = true; + if (! DiskIo::GetFsSpaceAvailable( + fn.c_str(), &(it->checkEvacuateFileCb), &err)) { + it->checkEvacuateFileInFlightFlag = false; + KFS_LOG_STREAM_ERROR << "failed to queue " + "fs space available request for: " << fn << + " : " << err << + KFS_LOG_EOM; + // Do not declare directory unusable on req. queueing failure. + // DiskIo can be temp. out of requests. + continue; + } + } + if (it->evacuateStartedFlag && + mEvacuationInactivityTimeout > 0 && + mMetaEvacuateCount == 0 && + ! it->evacuateChunksOpInFlightFlag && + it->evacuateInFlightCount > 0 && + it->lastEvacuationActivityTime + mEvacuationInactivityTimeout < + mMetaHeartbeatTime) { + it->RestartEvacuation(); + } + if (it->fsSpaceAvailInFlightFlag) { + continue; + } + it->fsSpaceAvailInFlightFlag = true; + if (! 
DiskIo::GetFsSpaceAvailable( + it->dirname.c_str(), &(it->fsSpaceAvailCb), &err)) { + it->fsSpaceAvailInFlightFlag = 0; + KFS_LOG_STREAM_ERROR << "failed to queue " + "fs space available request for: " << it->dirname << + " : " << err << + KFS_LOG_EOM; + // Do not declare directory unusable on req. queueing failure. + // DiskIo can be temp. out of requests. + } + } +} + +void +ChunkManager::MetaHeartbeat(HeartbeatOp& op) +{ + mMetaHeartbeatTime = globalNetManager().Now(); + mMetaEvacuateCount = op.metaEvacuateCount; +} + +} // namespace KFS diff --git a/src/cc/chunk/ChunkManager.h b/src/cc/chunk/ChunkManager.h new file mode 100644 index 000000000..423c9abbd --- /dev/null +++ b/src/cc/chunk/ChunkManager.h @@ -0,0 +1,824 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/28 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file ChunkManager.h +// \brief Handles all chunk related ops. 
+// +//---------------------------------------------------------------------------- + +#ifndef _CHUNKMANAGER_H +#define _CHUNKMANAGER_H + +#include "Chunk.h" +#include "KfsOps.h" +#include "DiskIo.h" +#include "DirChecker.h" + +#include "kfsio/ITimeout.h" +#include "common/LinearHash.h" +#include "common/StdAllocator.h" + +#include +#include +#include +#include +#include + +namespace KFS +{ + +using std::string; +using std::vector; +using std::ostream; +using std::list; +using std::map; +using std::pair; +using std::make_pair; +using std::less; + +/// We allow a chunk header upto 16K in size +const size_t KFS_CHUNK_HEADER_SIZE = 16 << 10; + +class ChunkInfoHandle; + +class Properties; + +/// The chunk manager writes out chunks as individual files on disk. +/// The location of the chunk directory is defined by chunkBaseDir. +/// The file names of chunks is a string representation of the chunk +/// id. The chunk manager performs disk I/O asynchronously -- it never blocks. +/// All disk io related requests, including host file system meta operations +/// (create, delete, stat etc) added to disk io queues. The specified request +/// completion handler invoked upon completion of the request. 
+/// +class ChunkManager : private ITimeout { +public: + struct Counters + { + typedef int64_t Counter; + + Counter mBadChunkHeaderErrorCount; + Counter mReadChecksumErrorCount; + Counter mReadErrorCount; + Counter mWriteErrorCount; + Counter mOpenErrorCount; + Counter mCorruptedChunksCount; + Counter mLostChunksCount; + Counter mDirLostChunkCount; + Counter mChunkDirLostCount; + + void Clear() + { + mBadChunkHeaderErrorCount = 0; + mReadChecksumErrorCount = 0; + mReadErrorCount = 0; + mWriteErrorCount = 0; + mOpenErrorCount = 0; + mCorruptedChunksCount = 0; + mLostChunksCount = 0; + mDirLostChunkCount = 0; + mChunkDirLostCount = 0; + } + }; + + ChunkManager(); + ~ChunkManager(); + + void SetParameters(const Properties& prop); + /// Init function to configure the chunk manager object. + bool Init(const vector& chunkDirs, const Properties& prop); + + /// Allocate a file to hold a chunk on disk. The filename is the + /// chunk id itself. + /// @param[in] fileId id of the file that has chunk chunkId + /// @param[in] chunkId id of the chunk being allocated. + /// @param[in] chunkVersion the version assigned by the metaserver to this chunk + /// @param[in] isBeingReplicated is the allocation for replicating a chunk? + /// @retval status code + int AllocChunk(kfsFileId_t fileId, kfsChunkId_t chunkId, + int64_t chunkVersion, + bool isBeingReplicated = false, + ChunkInfoHandle **cih = 0, + bool mustExistFlag = false); + void AllocChunkForAppend( + AllocChunkOp* op, int replicationPos, ServerLocation peerLoc); + /// Delete a previously allocated chunk file. + /// @param[in] chunkId id of the chunk being deleted. + /// @retval status code + int DeleteChunk(kfsChunkId_t chunkId); + + /// Dump chunk map with information about chunkID and chunkSize + void DumpChunkMap(); + + /// Dump chunk map with information about chunkID and chunkSize + /// to a string stream + void DumpChunkMap(ostream& ofs); + + /// A previously created dirty chunk should now be made "stable". 
+ /// Move that chunk out of the dirty dir. + int MakeChunkStable(kfsChunkId_t chunkId, kfsSeq_t chunkVersion, + bool appendFlag, KfsCallbackObj* cb, string& statusMsg); + bool IsChunkStable(kfsChunkId_t chunkId) const; + bool IsChunkReadable(kfsChunkId_t chunkId) const; + bool IsChunkStable(MakeChunkStableOp* op); + + /// A previously created chunk is stale; move it to stale chunks + /// dir only if we want to preserve it; otherwise, delete + /// + /// @param[in] chunkId id of the chunk being moved + /// @retval status code + int StaleChunk(kfsChunkId_t chunkId, + bool forceDeleteFlag = false, bool evacuatedFlag = false); + + /// Truncate a chunk to the specified size + /// @param[in] chunkId id of the chunk being truncated. + /// @param[in] chunkSize size to which chunk should be truncated. + /// @retval status code + int TruncateChunk(kfsChunkId_t chunkId, int64_t chunkSize); + + /// Change a chunk's version # to what the server says it should be. + /// @param[in] fileId id of the file that has chunk chunkId + /// @param[in] chunkId id of the chunk being allocated. + /// @param[in] chunkVersion the version assigned by the metaserver to this chunk + /// @retval status code + int ChangeChunkVers(kfsChunkId_t chunkId, + int64_t chunkVersion, bool stableFlag, KfsCallbackObj* cb); + int ChangeChunkVers(ChunkInfoHandle *cih, + int64_t chunkVersion, bool stableFlag, KfsCallbackObj* cb); + int ChangeChunkVers(ChangeChunkVersOp* op); + + /// Open a chunk for I/O. + /// @param[in] chunkId id of the chunk being opened. + /// @param[in] openFlags O_RDONLY, O_WRONLY + /// @retval status code + int OpenChunk(kfsChunkId_t chunkId, int openFlags); + + /// Close a previously opened chunk and release resources. + /// @param[in] chunkId id of the chunk being closed. 
+ /// @retval 0 if the close was accepted; -1 otherwise + int CloseChunk(kfsChunkId_t chunkId); + int CloseChunk(ChunkInfoHandle* cih); + bool CloseChunkIfReadable(kfsChunkId_t chunkId); + + /// Utility function that returns a pointer to mChunkTable[chunkId]. + /// @param[in] chunkId the chunk id for which we want info + /// @param[out] cih the resulting pointer from mChunkTable[chunkId] + /// @retval 0 on success; -EBADF if we can't find mChunkTable[chunkId] + int GetChunkInfoHandle(kfsChunkId_t chunkId, ChunkInfoHandle **cih); + + /// Given a byte range, return the checksums for that range. + vector GetChecksums(kfsChunkId_t chunkId, int64_t offset, size_t numBytes); + + /// For telemetry purposes, provide the driveName where the chunk + /// is stored and pass that back to the client. + void GetDriveName(ReadOp *op); + + /// Schedule a read on a chunk. + /// @param[in] op The read operation being scheduled. + /// @retval 0 if op was successfully scheduled; -1 otherwise + int ReadChunk(ReadOp *op); + + /// Schedule a write on a chunk. + /// @param[in] op The write operation being scheduled. + /// @retval 0 if op was successfully scheduled; -1 otherwise + int WriteChunk(WriteOp *op); + + /// Write/read out/in the chunk meta-data and notify the cb when the op + /// is done. + /// @retval 0 if op was successfully scheduled; -errno otherwise + int WriteChunkMetadata(kfsChunkId_t chunkId, KfsCallbackObj *cb, bool forceFlag = false); + int ReadChunkMetadata(kfsChunkId_t chunkId, KfsOp *cb); + + /// Notification that read is finished + void ReadChunkMetadataDone(ReadChunkMetaOp* op, IOBuffer* dataBuf); + bool IsChunkMetadataLoaded(kfsChunkId_t chunkId); + + /// A previously scheduled write op just finished. Update chunk + /// size and the amount of used space. + /// @param[in] op The write op that just finished + /// + bool ReadChunkDone(ReadOp *op); + void ReplicationDone(kfsChunkId_t chunkId, int status); + /// Determine the size of a chunk. 
+ /// @param[in] chunkId The chunk whose size is needed + /// @param[out] fid Return the file-id that owns the chunk + /// @param[out] chunkSize The size of the chunk + /// @retval status code + void ChunkSize(SizeOp* op); + + /// Register a timeout handler with the net manager for taking + /// checkpoints. Also, get the logger going + void Start(); + + /// Read the chunk table from disk following a restart. See + /// comments in the method for issues relating to validation (such + /// as, checkpoint contains a chunk name, but the associated file + /// is not there on disk, etc.). + int Restart(); + + /// Retrieve the chunks hosted on this chunk server. + /// @param[out] result A vector containing info of all chunks + /// hosted on this server. + void GetHostedChunks( + vector &stable, + vector ¬Stable, + vector ¬StableAppend); + + /// Return the total space that is exported by this server. If + /// chunks are stored in a single directory, we use statvfs to + /// determine the total space avail; we report the min of statvfs + /// value and the configured mTotalSpace. + int64_t GetTotalSpace(int64_t& totalFsSpace, int& chunkDirs, + int& evacuateInFlightCount, int& writableDirs, + int& evacuateChunks, int64_t& evacuteByteCount, + int* evacuateDoneChunkCount = 0, int64_t* evacuateDoneByteCount = 0, + HelloMetaOp::LostChunkDirs* lostChunkDirs = 0); + int64_t GetUsedSpace() const { return mUsedSpace; }; + long GetNumChunks() const { return mChunkTable.GetSize(); }; + long GetNumWritableChunks() const; + + /// For a write, the client is defining a write operation. The op + /// is queued and the client pushes data for it subsequently. + /// @param[in] wi The op that defines the write + /// @retval status code + int AllocateWriteId(WriteIdAllocOp *wi, int replicationPos, ServerLocation peerLoc); + + /// Check if a write is pending to a chunk. + /// @param[in] chunkId The chunkid for which we are checking for + /// pending write(s). 
+ /// @retval True if a write is pending; false otherwise + bool IsWritePending(kfsChunkId_t chunkId) const { + return mPendingWrites.HasChunkId(chunkId); + } + + /// Given a chunk id, return its version + int64_t GetChunkVersion(kfsChunkId_t c); + + /// Retrieve the write op given a write id. + /// @param[in] writeId The id corresponding to a previously + /// enqueued write. + /// @retval WriteOp if one exists; NULL otherwise + WriteOp *GetWriteOp(int64_t writeId); + + /// The model with writes: allocate a write id (this causes a + /// write-op to be created); then, push data for writes (which + /// retrieves the write-op and then sends writes down to disk). + /// The "clone" method makes a copy of a previously created + /// write-op. + /// @param[in] writeId the write id that was previously assigned + /// @retval WriteOp if one exists; NULL otherwise + WriteOp *CloneWriteOp(int64_t writeId); + + /// Set the status for a given write id + void SetWriteStatus(int64_t writeId, int status); + int GetWriteStatus(int64_t writeId); + + /// Is the write id a valid one + bool IsValidWriteId(int64_t writeId) { + return mPendingWrites.find(writeId); + } + + virtual void Timeout(); + + /// Push the changes from the write out to disk + int Sync(WriteOp *op); + + ChunkInfo_t* GetChunkInfo(kfsChunkId_t chunkId); + + + void ChunkIOFailed(kfsChunkId_t chunkId, int err, const DiskIo::File* file); + void ChunkIOFailed(kfsChunkId_t chunkId, int err, const DiskIo* diskIo); + void ChunkIOFailed(ChunkInfoHandle* cih, int err); + void ReportIOFailure(ChunkInfoHandle* cih, int err); + size_t GetMaxIORequestSize() const { + return mMaxIORequestSize; + } + void Shutdown(); + bool IsWriteAppenderOwns(kfsChunkId_t chunkId) const; + + inline void LruUpdate(ChunkInfoHandle& cih); + inline bool IsInLru(const ChunkInfoHandle& cih) const; + inline void UpdateStale(ChunkInfoHandle& cih); + + void GetCounters(Counters& counters) + { counters = mCounters; } + + /// Utility function that sets up a 
disk connection for an + /// I/O operation on a chunk. + /// @param[in] cih chunk handle on which we are doing I/O + /// @param[in] op The KfsCallbackObj that is being on the chunk + /// @retval A disk connection pointer allocated via a call to new; + /// it is the caller's responsibility to free the memory + DiskIo *SetupDiskIo(ChunkInfoHandle *cih, KfsCallbackObj *op); + /// Notify the metaserver that chunk chunkId is corrupted; the + /// metaserver will re-replicate this chunk and for now, won't + /// send us traffic for this chunk. + void NotifyMetaCorruptedChunk(ChunkInfoHandle *cih, int err); + int StaleChunk(ChunkInfoHandle *cih, + bool forceDeleteFlag = false, bool evacuatedFlag = false); + /// Utility function that given a chunkId, returns the full path + /// to the chunk filename. + string MakeChunkPathname(ChunkInfoHandle *cih); + string MakeChunkPathname(ChunkInfoHandle *cih, bool stableFlag, kfsSeq_t targetVersion); + void WriteDone(WriteOp* op); + int GetMaxDirCheckDiskTimeouts() const + { return mMaxDirCheckDiskTimeouts; } + void MetaServerConnectionLost(); + void SetChunkSize(ChunkInfo_t& ci, int64_t chunkSize) + { + if (ci.chunkSize > 0) { + mUsedSpace = mUsedSpace >= ci.chunkSize ? + mUsedSpace - ci.chunkSize : 0; + } + ci.chunkSize = chunkSize > 0 ? 
chunkSize : 0; + mUsedSpace += ci.chunkSize; + } + + enum { kChunkInfoHandleListCount = 1 }; + enum ChunkListType + { + kChunkLruList = 0, + kChunkStaleList = 1, + kChunkPendingStaleList = 2, + kChunkInfoListCount + }; + typedef ChunkInfoHandle* ChunkLists[kChunkInfoHandleListCount]; + struct ChunkDirInfo; + + const string& GetEvacuateFileName() const { return mEvacuateFileName; } + const string& GetEvacuateDoneFileName() const { return mEvacuateDoneFileName; } + int UpdateCountFsSpaceAvailableFlags(); + void MetaHeartbeat(HeartbeatOp& op); + int GetMaxEvacuateIoErrors() const { return mMaxEvacuateIoErrors; } + +private: + class PendingWrites + { + public: + PendingWrites() + : mWriteIds(), mChunkIds(), mLru(), mKeyOp(0, 0) + {} + bool empty() const + { return (mWriteIds.empty()); } + bool push_front(WriteOp* op) + { return Insert(op, true); } + bool push_back(WriteOp* op) + { return Insert(op, false); } + bool pop_front() + { return Remove(true); } + bool pop_back() + { return Remove(false); } + size_t size() const + { return mWriteIds.size(); } + WriteOp* front() const + { return mLru.front().mWriteIdIt->mOp; } + WriteOp* back() const + { return mLru.back().mWriteIdIt->mOp; } + WriteOp* find(int64_t writeId) const + { + WriteOp& op = GetKeyOp(); + op.writeId = writeId; + WriteIdSet::const_iterator const i = + mWriteIds.find(WriteIdEntry(&op)); + return (i == mWriteIds.end() ? 
0 : i->mOp); + } + bool HasChunkId(kfsChunkId_t chunkId) const + { return (mChunkIds.find(chunkId) != mChunkIds.end()); } + bool erase(WriteOp* op) + { + const WriteIdSet::iterator i = mWriteIds.find(WriteIdEntry(op)); + return (i != mWriteIds.end() && op == i->mOp && Erase(i)); + } + bool erase(int64_t writeId) + { + WriteOp& op = GetKeyOp(); + op.writeId = writeId; + WriteIdSet::const_iterator const i = + mWriteIds.find(WriteIdEntry(&op)); + return (i != mWriteIds.end() && Erase(i)); + } + bool Delete(kfsChunkId_t chunkId, kfsSeq_t chunkVersion) + { + ChunkIdMap::iterator i = mChunkIds.find(chunkId); + if (i == mChunkIds.end()) { + return true; + } + ChunkWrites& wr = i->second; + for (ChunkWrites::iterator w = wr.begin(); w != wr.end(); ) { + Lru::iterator const c = w->GetLruIterator(); + if (c->mWriteIdIt->mOp->chunkVersion == chunkVersion) { + WriteOp* const op = c->mWriteIdIt->mOp; + mWriteIds.erase(c->mWriteIdIt); + mLru.erase(c); + w = wr.erase(w); + delete op; + } else { + ++w; + } + } + if (wr.empty()) { + mChunkIds.erase(i); + return true; + } + return false; + } + WriteOp* FindAndMoveBack(int64_t writeId) + { + mKeyOp.writeId = writeId; + const WriteIdSet::iterator i = + mWriteIds.find(WriteIdEntry(&mKeyOp)); + if (i == mWriteIds.end()) { + return 0; + } + // splice: "All iterators remain valid including iterators that + // point to elements of x." x == mLru + mLru.splice(mLru.end(), mLru, i->GetLruIterator()); + return i->mOp; + } + size_t GetChunkIdCount() const + { return mChunkIds.size(); } + private: + class LruIterator; + class OpListEntry + { + private: + struct { // Make it struct aligned. + char mArray[sizeof(list::iterator)]; + } mLruIteratorStorage; + public: + inline OpListEntry(); + inline ~OpListEntry(); + // Set iterator prohibit node mutation, because the node is the + // key, and changing the key can potentially change the order. + // In this particular case order only depends on mOp->writeId. 
+ // The following hack is also needed to get around type dependency + // cycle with Lru::iterator, and WriteIdEntry. + LruIterator& GetLruIterator() const + { + return *reinterpret_cast( + &const_cast(this)->mLruIteratorStorage); + } + }; + struct WriteIdEntry : public OpListEntry + { + public: + inline WriteIdEntry(WriteOp* op = 0); + WriteOp* mOp; + }; + struct WriteIdCmp + { + bool operator()(const WriteIdEntry& x, const WriteIdEntry& y) const + { return (x.mOp->writeId < y.mOp->writeId); } + }; + typedef set + > WriteIdSet; + typedef list > ChunkWrites; + typedef map, + StdFastAllocator< + pair > + > ChunkIdMap; + struct LruEntry + { + LruEntry() + : mWriteIdIt(), mChunkIdIt(), mChunkWritesIt() + {} + LruEntry( + WriteIdSet::iterator writeIdIt, + ChunkIdMap::iterator chunkIdIt, + ChunkWrites::iterator chunkWritesIt) + : mWriteIdIt(writeIdIt), + mChunkIdIt(chunkIdIt), + mChunkWritesIt(chunkWritesIt) + {} + WriteIdSet::iterator mWriteIdIt; + ChunkIdMap::iterator mChunkIdIt; + ChunkWrites::iterator mChunkWritesIt; + }; + typedef list > Lru; + class LruIterator : public Lru::iterator + { + public: + LruIterator& operator=(const Lru::iterator& it) + { + Lru::iterator::operator=(it); + return *this; + } + }; + + WriteIdSet mWriteIds; + ChunkIdMap mChunkIds; + Lru mLru; + WriteOp mKeyOp; + + bool Insert(WriteOp* op, bool front) + { + if (! op) { + return false; + } + pair const w = + mWriteIds.insert(WriteIdEntry(op)); + if (! w.second) { + return false; + } + ChunkIdMap::iterator const c = mChunkIds.insert( + make_pair(op->chunkId, ChunkWrites())).first; + ChunkWrites::iterator const cw = + c->second.insert(c->second.end(), OpListEntry()); + w.first->GetLruIterator() = mLru.insert( + front ? mLru.begin() : mLru.end(), + LruEntry(w.first, c, cw)); + cw->GetLruIterator() = w.first->GetLruIterator(); + return true; + } + bool Remove(bool front) + { + if (mLru.empty()) { + return false; + } + LruEntry& c = front ? 
mLru.front() : mLru.back(); + mWriteIds.erase(c.mWriteIdIt); + c.mChunkIdIt->second.erase(c.mChunkWritesIt); + if (c.mChunkIdIt->second.empty()) { + mChunkIds.erase(c.mChunkIdIt); + } + if (front) { + mLru.pop_front(); + } else { + mLru.pop_back(); + } + return true; + } + bool Erase(WriteIdSet::iterator i) + { + const Lru::iterator c = i->GetLruIterator(); + c->mChunkIdIt->second.erase(c->mChunkWritesIt); + if (c->mChunkIdIt->second.empty()) { + mChunkIds.erase(c->mChunkIdIt); + } + mLru.erase(c); + mWriteIds.erase(i); + return true; + } + WriteOp& GetKeyOp() const + { return *const_cast(&mKeyOp); } + private: + PendingWrites(const PendingWrites&); + PendingWrites& operator=(const PendingWrites&); + }; + + class ChunkDirs + { + public: + typedef ChunkDirInfo* iterator; + typedef const ChunkDirInfo* const_iterator; + ChunkDirs() + : mChunkDirs(0), + mSize(0) + {} + inline ~ChunkDirs(); + inline ChunkDirInfo& operator[](size_t i); + inline const ChunkDirInfo& operator[](size_t i) const; + inline iterator begin() { return mChunkDirs; } + inline iterator end(); + inline const_iterator begin() const { return mChunkDirs; }; + inline const_iterator end() const; + void Allocate(size_t size); + size_t size() const { return mSize; } + private: + ChunkDirInfo* mChunkDirs; + size_t mSize; + + ChunkDirs(const ChunkDirs&); + ChunkDirs& operator=(const ChunkDirs&); + }; + + struct StaleChunkCompletion : public KfsCallbackObj + { + StaleChunkCompletion( + ChunkManager& m) + : KfsCallbackObj(), + mMgr(m) + { SET_HANDLER(this, &StaleChunkCompletion::Done); } + int Done(int /* code */, void* /* data */) { + const bool completionFlag = true; + mMgr.RunStaleChunksQueue(completionFlag); + return 0; + } + ChunkManager& mMgr; + }; + + bool StartDiskIo(); + + /// Map from a chunk id to a chunk handle + /// + typedef KVPair CMapEntry; + typedef LinearHash< + CMapEntry, + KeyCompare, + DynamicArray< + SingleLinkedList*, + 20 // 2 * sizeof(size_t) = 16 MB initial + >, + StdFastAllocator + > 
CMap; + + /// How long should a pending write be held in LRU + int mMaxPendingWriteLruSecs; + /// take a checkpoint once every 2 mins + int mCheckpointIntervalSecs; + + /// space available for allocation + int64_t mTotalSpace; + /// how much is used up by chunks + int64_t mUsedSpace; + int64_t mMinFsAvailableSpace; + double mMaxSpaceUtilizationThreshold; + + time_t mNextCheckpointTime; + int mMaxOpenChunkFiles; + int mMaxOpenFds; + int mFdsPerChunk; + + /// directories for storing the chunks + ChunkDirs mChunkDirs; + + /// See the comments in KfsOps.cc near WritePrepareOp related to write handling + int64_t mWriteId; + PendingWrites mPendingWrites; + + /// table that maps chunkIds to their associated state + CMap mChunkTable; + size_t mMaxIORequestSize; + /// Chunk lru, and stale chunks list heads. + ChunkLists mChunkInfoLists[kChunkInfoListCount]; + + /// Periodically do an IO and check the chunk dirs and identify failed drives + time_t mNextChunkDirsCheckTime; + int mChunkDirsCheckIntervalSecs; + time_t mNextGetFsSpaceAvailableTime; + int mGetFsSpaceAvailableIntervalSecs; + + // Cleanup fds on which no I/O has been done for the past N secs + int mInactiveFdsCleanupIntervalSecs; + time_t mNextInactiveFdCleanupTime; + + int mReadChecksumMismatchMaxRetryCount; + bool mAbortOnChecksumMismatchFlag; // For debugging + bool mRequireChunkHeaderChecksumFlag; + bool mForceDeleteStaleChunksFlag; + bool mKeepEvacuatedChunksFlag; + StaleChunkCompletion mStaleChunkCompletion; + int mStaleChunkOpsInFlight; + int mMaxStaleChunkOpsInFlight; + int mMaxDirCheckDiskTimeouts; + double mChunkPlacementPendingReadWeight; + double mChunkPlacementPendingWriteWeight; + double mMaxPlacementSpaceRatio; + int64_t mMinPendingIoThreshold; + bool mAllowSparseChunksFlag; + bool mBufferedIoFlag; + + uint32_t mNullBlockChecksum; + + Counters mCounters; + DirChecker mDirChecker; + bool mCleanupChunkDirsFlag; + string mStaleChunksDir; + string mDirtyChunksDir; + string mEvacuateFileName; + string 
mEvacuateDoneFileName; + string mChunkDirLockName; + int mEvacuationInactivityTimeout; + time_t mMetaHeartbeatTime; + int64_t mMetaEvacuateCount; + int mMaxEvacuateIoErrors; + + enum + { + kChunkHeaderBufferSize = + (int)(sizeof(DiskChunkInfo_t) + sizeof(uint64_t)) + }; + struct + { + char buf[kChunkHeaderBufferSize]; + } mChunkHeaderBufferAlloc; + char* const mChunkHeaderBuffer; + + inline void Delete(ChunkInfoHandle& cih); + inline void Release(ChunkInfoHandle& cih); + + /// When a checkpoint file is read, update the mChunkTable[] to + /// include a mapping for cih->chunkInfo.chunkId. + void AddMapping(ChunkDirInfo& dir, const char* filename, int64_t filesz); + void AddMapping(ChunkInfoHandle *cih); + + /// Of the various directories this chunkserver is configured with, find the directory to store a chunk file. + /// This method does a "directory allocation". + ChunkDirInfo* GetDirForChunk(); + + void CheckChunkDirs(); + void GetFsSpaceAvailable(); + + string MakeChunkPathname(const string &chunkdir, kfsFileId_t fid, kfsChunkId_t chunkId, kfsSeq_t chunkVersion); + + /// Utility function that given a chunkId, returns the full path + /// to the chunk filename in the "stalechunks" dir + string MakeStaleChunkPathname(ChunkInfoHandle *cih); + + /// update the used space in the directory where the chunk resides by nbytes. + void UpdateDirSpace(ChunkInfoHandle *cih, int64_t nbytes); + + /// Checksums are computed on 64K blocks. To verify checksums on + /// reads, reads are aligned at 64K boundaries and data is read in + /// 64K blocks. So, for reads that are un-aligned/read less data, + /// adjust appropriately. + void AdjustDataRead(ReadOp *op); + + /// Pad the buffer with sufficient 0's so that checksumming works + /// out. + /// @param[in/out] buffer The buffer to be padded with 0's + void ZeroPad(IOBuffer *buffer); + + /// Given a chunkId and offset, return the checksum of corresponding + /// "checksum block"---i.e., the 64K block that contains offset. 
+ uint32_t GetChecksum(kfsChunkId_t chunkId, int64_t offset); + + /// For any writes that have been held for more than 2 mins, + /// scavenge them and reclaim memory. + void ScavengePendingWrites(time_t now); + + /// If we have too many open fd's close out whatever we can. When + /// periodic is set, we do a scan and clean up. + void CleanupInactiveFds(time_t now = 0); + + /// For some reason, dirname is not accessable (for instance, the + /// drive may have failed); in this case, notify metaserver that + /// all the blocks on that dir are lost and the metaserver can + /// then re-replicate. + void NotifyMetaChunksLost(ChunkDirInfo& dir); + + /// Helper function to move a chunk to the stale dir + int MarkChunkStale(ChunkInfoHandle *cih, KfsCallbackObj* cb); + + /// On a restart, nuke out all the dirty chunks + void RemoveDirtyChunks(); + + /// Scan the chunk dirs and rebuild the list of chunks that are hosted on this server + void Restore(); + /// Restore the chunk meta-data from the specified file name. + void RestoreChunkMeta(const string &chunkMetaFn); + + /// Update the checksums in the chunk metadata based on the op. + void UpdateChecksums(ChunkInfoHandle *cih, WriteOp *op); + bool IsChunkStable(const ChunkInfoHandle* cih) const; + void RunStaleChunksQueue(bool completionFlag = false); + int OpenChunk(ChunkInfoHandle* cih, int openFlags); +private: + // No copy. 
+ ChunkManager(const ChunkManager&); + ChunkManager& operator=(const ChunkManager&); +}; + +inline ChunkManager::PendingWrites::OpListEntry::OpListEntry() +{ + BOOST_STATIC_ASSERT(sizeof(mLruIteratorStorage) >= sizeof(LruIterator)); + LruIterator* const i = + ::new (static_cast(&mLruIteratorStorage)) LruIterator(); + assert(i == &GetLruIterator()); + (void)i; +} + +inline ChunkManager::PendingWrites::OpListEntry::~OpListEntry() +{ GetLruIterator().~LruIterator(); } + +inline ChunkManager::PendingWrites::WriteIdEntry::WriteIdEntry(WriteOp* op) + : OpListEntry(), mOp(op) +{} + +extern ChunkManager gChunkManager; + +} + +#endif // _CHUNKMANAGER_H diff --git a/src/cc/chunk/ChunkServer.cc b/src/cc/chunk/ChunkServer.cc new file mode 100644 index 000000000..4e67e60c9 --- /dev/null +++ b/src/cc/chunk/ChunkServer.cc @@ -0,0 +1,125 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/23 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "kfsio/Globals.h" + +#include "ChunkServer.h" +#include "Logger.h" +#include "utils.h" + +#include +#include +#include + +namespace KFS { + +using std::string; +using libkfsio::globalNetManager; + + +ChunkServer gChunkServer; + +void +ChunkServer::Init() +{ +} + +void +ChunkServer::SendTelemetryReport(KfsOp_t /* op */, double /* timeSpent */) +{ +} + +bool +ChunkServer::MainLoop(int clientAcceptPort, const string& serverIp) +{ + if (clientAcceptPort <= 0) { + KFS_LOG_STREAM_FATAL << + "invalid client port: " << clientAcceptPort << + KFS_LOG_EOM; + return false; + } + mUpdateServerIpFlag = serverIp.empty(); + if (! mUpdateServerIpFlag) { + // For now support only ipv4 addresses. + // The ip does not have to be assigned to any local NICs. + // The ip is valid as long as the clients can reach this particular + // process using this ip. + // + // In the case when the chunk server is on the same host as the meta + // server, but the clients aren't, the server ip must be specified. + // Setting cnchunkServer.metaServer.hostname to the client "visible" ip + // might also work. + // + // This also allows to work with NAT between the clients, and chunk and + // meta servers. + // The server ip can also be used for the testing purposes, so that the + // clients always fail to connect to the chunk server, but the meta + // server considers this server operational. + struct in_addr addr; + if (! inet_aton(serverIp.c_str(), &addr)) { + KFS_LOG_STREAM_FATAL << + "invalid server ip: " << serverIp << + KFS_LOG_EOM; + return false; + } + } + mLocation.Reset(serverIp.c_str(), clientAcceptPort); + if (! 
gClientManager.StartAcceptor(clientAcceptPort)) { + KFS_LOG_STREAM_FATAL << + "Unable to start acceptor on port: " << clientAcceptPort << + KFS_LOG_EOM; + return false; + } + if (gChunkManager.Restart() != 0) { + return false; + } + gLogger.Start(); + gChunkManager.Start(); + gMetaServerSM.Init(); + + globalNetManager().MainLoop(); + return true; +} + +void +StopNetProcessor(int /* status */) +{ + globalNetManager().Shutdown(); +} + +RemoteSyncSMPtr +ChunkServer::FindServer(const ServerLocation &location, bool connect) +{ + return KFS::FindServer(mRemoteSyncers, location, connect); +} + +void +ChunkServer::RemoveServer(RemoteSyncSM *target) +{ + KFS::RemoveServer(mRemoteSyncers, target); +} + +} diff --git a/src/cc/chunk/ChunkServer.h b/src/cc/chunk/ChunkServer.h new file mode 100644 index 000000000..c6fc189f7 --- /dev/null +++ b/src/cc/chunk/ChunkServer.h @@ -0,0 +1,98 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/16 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef _CHUNKSERVER_H +#define _CHUNKSERVER_H + +#include "ChunkManager.h" +#include "ClientManager.h" +#include "ClientSM.h" +#include "MetaServerSM.h" +#include "RemoteSyncSM.h" + +namespace KFS +{ +using std::string; +using std::list; + +// Chunk server globals and main event loop. +class ChunkServer +{ +public: + ChunkServer() : + mOpCount(0), + mUpdateServerIpFlag(false), + mLocation(), + mRemoteSyncers() + {} + + void Init(); + bool MainLoop(int clientAcceptPort, const string& serverIp); + bool IsLocalServer(const ServerLocation& location) const { + return mLocation == location; + } + RemoteSyncSMPtr FindServer(const ServerLocation& location, + bool connect = true); + void RemoveServer(RemoteSyncSM* target); + string GetMyLocation() const { + return mLocation.ToString(); + } + const ServerLocation& GetLocation() const { + return mLocation; + } + void OpInserted() { + mOpCount++; + } + void OpFinished() { + mOpCount--; + if (mOpCount < 0) { + mOpCount = 0; + } + } + int GetNumOps() const { + return mOpCount; + } + void SendTelemetryReport(KfsOp_t op, double timeSpent); + bool CanUpdateServerIp() const { + return mUpdateServerIpFlag; + } + inline void SetLocation(const ServerLocation& loc); +private: + // # of ops in the system + int mOpCount; + bool mUpdateServerIpFlag; + ServerLocation mLocation; + list mRemoteSyncers; +private: + // No copy. + ChunkServer(const ChunkServer&); + ChunkServer& operator=(const ChunkServer&); +}; + +extern ChunkServer gChunkServer; +} + +#endif // _CHUNKSERVER_H diff --git a/src/cc/chunk/ClientManager.cc b/src/cc/chunk/ClientManager.cc new file mode 100644 index 000000000..9e5340da4 --- /dev/null +++ b/src/cc/chunk/ClientManager.cc @@ -0,0 +1,41 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/28 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. 
+// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "ClientManager.h" + +namespace KFS +{ + +ClientManager gClientManager; + +bool +ClientManager::StartAcceptor(int port) +{ + mAcceptor = new Acceptor(port, this); + return mAcceptor->IsAcceptorStarted(); +} + +} diff --git a/src/cc/chunk/ClientManager.h b/src/cc/chunk/ClientManager.h new file mode 100644 index 000000000..370a20e60 --- /dev/null +++ b/src/cc/chunk/ClientManager.h @@ -0,0 +1,202 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/28 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef _CLIENTMANAGER_H +#define _CLIENTMANAGER_H + +#include +#include +#include "kfsio/Acceptor.h" +#include "ClientSM.h" + +namespace KFS +{ + +// Client connection listener. +class ClientManager : public IAcceptorOwner { +public: + struct Counters + { + typedef int64_t Counter; + + Counter mAcceptCount; + Counter mClientCount; + Counter mBadRequestCount; + Counter mBadRequestHeaderCount; + Counter mRequestLengthExceededCount; + Counter mIdleTimeoutCount; + Counter mOtherRequestCount; + Counter mOtherRequestTimeMicroSecs; + Counter mOtherRequestErrors; + Counter mReadRequestCount; + Counter mReadRequestTimeMicroSecs; + Counter mReadRequestBytes; + Counter mReadRequestErrors; + Counter mWriteRequestCount; + Counter mWriteRequestTimeMicroSecs; + Counter mWriteRequestBytes; + Counter mWriteRequestErrors; + Counter mAppendRequestCount; + Counter mAppendRequestTimeMicroSecs; + Counter mAppendRequestBytes; + Counter mAppendRequestErrors; + + void Clear() { + mAcceptCount = 0; + mClientCount = 0; + mBadRequestCount = 0; + mBadRequestHeaderCount = 0; + mRequestLengthExceededCount = 0; + mIdleTimeoutCount = 0; + mOtherRequestCount = 0; + mOtherRequestTimeMicroSecs = 0; + mOtherRequestErrors = 0; + mReadRequestCount = 0; + mReadRequestTimeMicroSecs = 0; + mReadRequestBytes = 0; + mReadRequestErrors = 0; + mWriteRequestCount = 0; + mWriteRequestTimeMicroSecs = 0; + mWriteRequestBytes = 0; + mWriteRequestErrors = 0; + mAppendRequestCount = 0; + mAppendRequestTimeMicroSecs = 0; + mAppendRequestBytes = 0; + mAppendRequestErrors = 0; + } + }; + ClientManager() + : mAcceptor(0), mIoTimeoutSec(-1), mIdleTimeoutSec(-1), mCounters() { + mCounters.Clear(); + } + void SetTimeouts(int ioTimeoutSec, int idleTimeoutSec) { + mIoTimeoutSec = ioTimeoutSec; + mIdleTimeoutSec = idleTimeoutSec; + } + virtual ~ClientManager() { + assert(mCounters.mClientCount == 0); + delete mAcceptor; + }; + bool 
StartAcceptor(int port); + KfsCallbackObj *CreateKfsCallbackObj(NetConnectionPtr &conn) { + ClientSM *clnt = new ClientSM(conn); + assert(mCounters.mClientCount >= 0); + mCounters.mAcceptCount++; + mCounters.mClientCount++; + return clnt; + } + void Remove(ClientSM * /* clnt */) { + assert(mCounters.mClientCount > 0); + mCounters.mClientCount--; + } + int GetIdleTimeoutSec() const { + return mIdleTimeoutSec; + } + int GetIoTimeoutSec() const { + return mIoTimeoutSec; + } + void BadRequest() { + mCounters.mBadRequestCount++; + } + void BadRequestHeader() { + mCounters.mBadRequestHeaderCount++; + } + void RequestLengthExceeded() { + mCounters.mRequestLengthExceededCount++; + } + void IdleTimeout() { + mCounters.mIdleTimeoutCount++; + } + void RequestDone(int64_t requestTimeMicroSecs, const KfsOp& op) { + const int64_t tm = requestTimeMicroSecs > 0 ? requestTimeMicroSecs : 0; + switch (op.op) { + case CMD_READ: + mCounters.mReadRequestCount++; + mCounters.mReadRequestTimeMicroSecs += tm; + if (op.status >= 0) { + const int64_t len = + static_cast(op).numBytesIO; + if (len > 0) { + mCounters.mReadRequestBytes += len; + } + } else { + mCounters.mReadRequestErrors++; + } + break; + case CMD_WRITE_PREPARE: + mCounters.mWriteRequestCount++; + mCounters.mWriteRequestTimeMicroSecs += tm; + if (op.status >= 0) { + const int64_t len = + static_cast(op).numBytes; + if (len > 0) { + mCounters.mWriteRequestBytes += len; + } + } else { + mCounters.mWriteRequestErrors++; + } + break; + case CMD_RECORD_APPEND: + mCounters.mAppendRequestCount++; + mCounters.mAppendRequestTimeMicroSecs += tm; + if (op.status >= 0) { + const int64_t len = + static_cast(op).numBytes; + if (len > 0) { + mCounters.mAppendRequestBytes += len; + } + } else { + mCounters.mAppendRequestErrors++; + } + break; + default: + mCounters.mOtherRequestCount++; + mCounters.mOtherRequestTimeMicroSecs += tm; + if (op.status < 0) { + mCounters.mOtherRequestErrors++; + } + break; + } + } + void GetCounters(Counters& 
counters) { + counters = mCounters; + } +private: + Acceptor* mAcceptor; + int mIoTimeoutSec; + int mIdleTimeoutSec; + Counters mCounters; +private: + // No copy. + ClientManager(const ClientManager&); + ClientManager& operator=(const ClientManager&); +}; + +extern ClientManager gClientManager; + +} + +#endif // _CLIENTMANAGER_H diff --git a/src/cc/chunk/ClientSM.cc b/src/cc/chunk/ClientSM.cc new file mode 100644 index 000000000..e2df80253 --- /dev/null +++ b/src/cc/chunk/ClientSM.cc @@ -0,0 +1,790 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/23 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "ClientSM.h" + +#include "ChunkManager.h" +#include "ChunkServer.h" +#include "utils.h" +#include "KfsOps.h" +#include "AtomicRecordAppender.h" +#include "DiskIo.h" + +#include "common/MsgLogger.h" +#include "common/time.h" +#include "kfsio/Globals.h" +#include "kfsio/NetManager.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include + +#define CLIENT_SM_LOG_STREAM_PREFIX << "I" << mInstanceNum << "I " << GetPeerName() << " " +#define CLIENT_SM_LOG_STREAM(pri) KFS_LOG_STREAM(pri) CLIENT_SM_LOG_STREAM_PREFIX +#define CLIENT_SM_LOG_STREAM_DEBUG KFS_LOG_STREAM_DEBUG CLIENT_SM_LOG_STREAM_PREFIX +#define CLIENT_SM_LOG_STREAM_WARN KFS_LOG_STREAM_WARN CLIENT_SM_LOG_STREAM_PREFIX +#define CLIENT_SM_LOG_STREAM_INFO KFS_LOG_STREAM_INFO CLIENT_SM_LOG_STREAM_PREFIX +#define CLIENT_SM_LOG_STREAM_ERROR KFS_LOG_STREAM_ERROR CLIENT_SM_LOG_STREAM_PREFIX +#define CLIENT_SM_LOG_STREAM_FATAL KFS_LOG_STREAM_FATAL CLIENT_SM_LOG_STREAM_PREFIX + +namespace KFS +{ +using std::string; +using std::max; +using std::make_pair; +using std::list; +using KFS::libkfsio::globalNetManager; + +// KFS client protocol state machine implementation. + +const int kMaxCmdHeaderLength = 1 << 10; + +bool ClientSM::sTraceRequestResponse = false; +uint64_t ClientSM::sInstanceNum = 10000; + +inline string +ClientSM::GetPeerName() +{ + return (mNetConnection ? 
+ mNetConnection->GetPeerName() : + string("not connected") + ); +} + +inline BufferManager& +ClientSM::GetBufferManager() +{ + return DiskIo::GetBufferManager(); +} + +inline void +ClientSM::SendResponse(KfsOp* op, ClientSM::ByteCount opBytes) +{ + ByteCount respBytes = mNetConnection->GetNumBytesToWrite(); + SendResponse(op); + respBytes = max(ByteCount(0), + mNetConnection->GetNumBytesToWrite() - respBytes); + mPrevNumToWrite = mNetConnection->GetNumBytesToWrite(); + GetBufferManager().Put(*this, opBytes - respBytes); +} + +inline static bool +IsDependingOpType(const KfsOp* op) +{ + const KfsOp_t type = op->op; + return ( + (type == CMD_WRITE_PREPARE && + ! static_cast(op)->replyRequestedFlag) || + (type == CMD_WRITE_PREPARE_FWD && + ! static_cast( + op)->owner.replyRequestedFlag) || + type == CMD_WRITE + ); +} + +ClientSM::ClientSM(NetConnectionPtr &conn) + : mNetConnection(conn), + mCurOp(0), + mOps(), + mReservations(), + mPendingOps(), + mPendingSubmitQueue(), + mRemoteSyncers(), + mPrevNumToWrite(0), + mRecursionCnt(0), + mInstanceNum(sInstanceNum++), + mWOStream() +{ + SET_HANDLER(this, &ClientSM::HandleRequest); + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + mNetConnection->SetInactivityTimeout(gClientManager.GetIdleTimeoutSec()); +} + +ClientSM::~ClientSM() +{ + KfsOp* op; + + assert(mOps.empty() && mPendingOps.empty() && mPendingSubmitQueue.empty()); + while (!mOps.empty()) { + op = mOps.front().first; + mOps.pop_front(); + delete op; + } + while (!mPendingOps.empty()) { + op = mPendingOps.front().dependentOp; + mPendingOps.pop_front(); + delete op; + } + while (!mPendingSubmitQueue.empty()) { + op = mPendingSubmitQueue.front().dependentOp; + mPendingSubmitQueue.pop_front(); + delete op; + } + delete mCurOp; + mCurOp = 0; + gClientManager.Remove(this); +} + +/// +/// Send out the response to the client request. The response is +/// generated by MetaRequest as per the protocol. +/// @param[in] op The request for which we finished execution. 
+/// +void +ClientSM::SendResponse(KfsOp* op) +{ + assert(mNetConnection && op); + + const int64_t timespent = + globalNetManager().Now() - op->startTime / 1000000; + const bool tooLong = timespent > 5; + CLIENT_SM_LOG_STREAM( + (op->status >= 0 || + (op->op == CMD_SPC_RESERVE && op->status == -ENOSPC)) ? + (tooLong ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelDEBUG) : + MsgLogger::kLogLevelERROR) << + "seq: " << op->seq << + " status: " << op->status << + " buffers: " << GetByteCount() << + " " << op->Show() << + (op->statusMsg.empty() ? "" : " msg: ") << op->statusMsg << + (tooLong ? " RPC too long " : " took: ") << + timespent << " sec." << + KFS_LOG_EOM; + + op->Response(mWOStream.Set(mNetConnection->GetOutBuffer())); + mWOStream.Reset(); + + IOBuffer* iobuf = 0; + int len = 0; + op->ResponseContent(iobuf, len); + mNetConnection->Write(iobuf, len); + gClientManager.RequestDone(timespent, *op); +} + +/// +/// Generic event handler. Decode the event that occurred and +/// appropriately extract out the data and deal with the event. +/// @param[in] code: The type of event that occurred +/// @param[in] data: Data being passed in relative to the event that +/// occurred. +/// @retval 0 to indicate successful event handling; -1 otherwise. +/// +int +ClientSM::HandleRequest(int code, void* data) +{ + assert(mRecursionCnt >= 0 && mNetConnection); + mRecursionCnt++; + + switch (code) { + case EVENT_NET_READ: { + if (IsWaiting()) { + CLIENT_SM_LOG_STREAM_DEBUG << + "spurious read: " << (mCurOp ? mCurOp->Show() : "cmd") << + " waiting for: " << GetByteCount() << + " bytes of io buffers" << + KFS_LOG_EOM; + mNetConnection->SetMaxReadAhead(0); + break; + } + // We read something from the network. Run the RPC that + // came in. 
+ int cmdLen = 0; + bool gotCmd = false; + IOBuffer& iobuf = mNetConnection->GetInBuffer(); + assert(&iobuf == data); + while ((mCurOp || IsMsgAvail(&iobuf, &cmdLen)) && + (gotCmd = HandleClientCmd(&iobuf, cmdLen))) { + cmdLen = 0; + gotCmd = false; + } + if (! mCurOp) { + int hdrsz; + if (cmdLen > 0 && ! gotCmd) { + CLIENT_SM_LOG_STREAM_ERROR << + " failed to parse request, closing connection;" + " header size: " << cmdLen << + " read available: " << iobuf.BytesConsumable() << + KFS_LOG_EOM; + gClientManager.BadRequest(); + } else if ((hdrsz = iobuf.BytesConsumable()) > MAX_RPC_HEADER_LEN) { + CLIENT_SM_LOG_STREAM_ERROR << + " exceeded max request header size: " << hdrsz << + " limit: " << MAX_RPC_HEADER_LEN << + ", closing connection" << + KFS_LOG_EOM; + gClientManager.BadRequestHeader(); + } else { + break; + } + iobuf.Clear(); + mNetConnection->Close(); + } + break; + } + + case EVENT_NET_WROTE: { + const int rem = mNetConnection->GetNumBytesToWrite(); + GetBufferManager().Put(*this, mPrevNumToWrite - rem); + mPrevNumToWrite = rem; + break; + } + + case EVENT_CMD_DONE: { + // An op finished execution. Send response back in FIFO + assert(data); + KfsOp* op = reinterpret_cast(data); + gChunkServer.OpFinished(); + op->done = true; + assert(!mOps.empty()); + if (sTraceRequestResponse) { + IOBuffer::OStream os; + op->Response(os); + IOBuffer::IStream is(os); + string line; + while (getline(is, line)) { + CLIENT_SM_LOG_STREAM_DEBUG << + "response: " << line << + KFS_LOG_EOM; + } + } + while (!mOps.empty()) { + KfsOp* qop = mOps.front().first; + if (!qop->done) { + if (! op) { + break; + } + if (! 
IsDependingOpType(op)) { + OpsQueue::iterator i; + for (i = mOps.begin(); i != mOps.end() && op != i->first; ++i) + {} + assert(i != mOps.end() && op == i->first); + assert(mPendingOps.empty() || op != mPendingOps.front().op); + if (i != mOps.end()) { + SendResponse(op, i->second); + } + if (i != mOps.end()) { + mOps.erase(i); + OpFinished(op); + } + delete op; + } else { + CLIENT_SM_LOG_STREAM_DEBUG << + "previous op still pending: " << + qop->Show() << "; deferring reply to: " << + op->Show() << + KFS_LOG_EOM; + } + break; + } + if (qop == op) { + op = 0; + } + SendResponse(qop, mOps.front().second); + mOps.pop_front(); + OpFinished(qop); + delete qop; + } + break; + } + + case EVENT_INACTIVITY_TIMEOUT: + case EVENT_NET_ERROR: + CLIENT_SM_LOG_STREAM_DEBUG << + "closing connection" + " due to " << (code == EVENT_INACTIVITY_TIMEOUT ? + "inactivity timeout" : "network error") << + ", socket error: " << + QCUtils::SysError(mNetConnection->GetSocketError()) << + ", pending read: " << mNetConnection->GetNumBytesToRead() << + " write: " << mNetConnection->GetNumBytesToWrite() << + KFS_LOG_EOM; + mNetConnection->Close(); + if (mCurOp) { + delete mCurOp; + mCurOp = 0; + CancelRequest(); + } + break; + + default: + assert(!"Unknown event"); + break; + } + + assert(mRecursionCnt > 0); + if (mRecursionCnt == 1) { + mNetConnection->StartFlush(); + if (mNetConnection->IsGood()) { + // Enforce 5 min timeout if connection has pending read and write. + mNetConnection->SetInactivityTimeout( + (mNetConnection->HasPendingRead() || + mNetConnection->IsWriteReady()) ? + gClientManager.GetIoTimeoutSec() : + gClientManager.GetIdleTimeoutSec()); + } else { + list serversToRelease; + + mRemoteSyncers.swap(serversToRelease); + // get rid of the connection to all the peers in daisy chain; + // if there were any outstanding ops, they will all come back + // to this method as EVENT_CMD_DONE and we clean them up above. 
+ ReleaseAllServers(serversToRelease); + ReleaseChunkSpaceReservations(); + mRecursionCnt--; + // if there are any disk ops, wait for the ops to finish + SET_HANDLER(this, &ClientSM::HandleTerminate); + HandleTerminate(EVENT_NET_ERROR, NULL); + // this can be deleted, return now. + return 0; + } + } + mRecursionCnt--; + return 0; +} + +/// +/// Termination handler. For the client state machine, we could have +/// ops queued at the logger. So, for cleanup wait for all the +/// outstanding ops to finish and then delete this. In this state, +/// the only event that gets raised is that an op finished; anything +/// else is bad. +/// +int +ClientSM::HandleTerminate(int code, void* data) +{ + switch (code) { + case EVENT_CMD_DONE: { + assert(data); + KfsOp* op = reinterpret_cast(data); + gChunkServer.OpFinished(); + // An op finished execution. Send a response back + op->done = true; + if (op != mOps.front().first) + break; + while (!mOps.empty()) { + op = mOps.front().first; + if (!op->done) + break; + GetBufferManager().Put(*this, mOps.front().second); + OpFinished(op); + // we are done with the op + mOps.pop_front(); + delete op; + } + break; + } + + case EVENT_INACTIVITY_TIMEOUT: + case EVENT_NET_ERROR: + // clean things up + break; + + default: + assert(!"Unknown event"); + break; + } + + if (mOps.empty()) { + // all ops are done...so, now, we can nuke ourself. 
+ assert(mPendingOps.empty()); + if (mNetConnection) { + mNetConnection->SetOwningKfsCallbackObj(0); + } + delete this; + return 1; + } + return 0; +} + +inline static BufferManager::ByteCount +IoRequestBytes(BufferManager::ByteCount numBytes, bool forwardFlag = false) +{ + BufferManager::ByteCount ret = IOBufferData::GetDefaultBufferSize(); + if (forwardFlag) { + // ret += (numBytes + ret - 1) / ret * ret; + } + if (numBytes > 0) { + ret += ((numBytes + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + } + return ret; +} + +bool +ClientSM::GetWriteOp(KfsOp* wop, int align, int numBytes, + IOBuffer* iobuf, IOBuffer*& ioOpBuf, bool forwardFlag) +{ + const int nAvail = iobuf->BytesConsumable(); + if (! mCurOp) { + const ByteCount bufferBytes = IoRequestBytes(numBytes, forwardFlag); + BufferManager& bufMgr = GetBufferManager(); + bool overQuota = false; + if (numBytes < 0 || + (size_t)numBytes > gChunkManager.GetMaxIORequestSize() || + (overQuota = bufMgr.IsOverQuota(*this, bufferBytes))) { + CLIENT_SM_LOG_STREAM_ERROR << + "seq: " << wop->seq << + " invalid write request size: " << bufferBytes << + " buffers: " << GetByteCount() << + (overQuota ? " over quota" : "") << + ", closing connection" << + KFS_LOG_EOM; + delete wop; + return false; + } + if (nAvail <= numBytes) { + // Move write data to the start of the buffers, to make it + // aligned. Normally only one buffer will be created. + const int off(align % IOBufferData::GetDefaultBufferSize()); + if (off > 0) { + IOBuffer buf; + buf.ReplaceKeepBuffersFull(iobuf, off, nAvail); + iobuf->Move(&buf); + iobuf->Consume(off); + } else { + iobuf->MakeBuffersFull(); + } + } + mCurOp = wop; + if (! 
bufMgr.GetForDiskIo(*this, bufferBytes)) { + CLIENT_SM_LOG_STREAM_DEBUG << + "seq: " << wop->seq << + " request for: " << bufferBytes << " bytes denied" << + " cur: " << GetByteCount() << + " total: " << bufMgr.GetTotalByteCount() << + " used: " << bufMgr.GetUsedByteCount() << + " bufs: " << bufMgr.GetFreeBufferCount() << + " op: " << wop->Show() << + " waiting for buffers" << + KFS_LOG_EOM; + mNetConnection->SetMaxReadAhead(0); + return false; + } + } + if (nAvail < numBytes) { + mNetConnection->SetMaxReadAhead(numBytes - nAvail); + // we couldn't process the command...so, wait + return false; + } + if (ioOpBuf) { + ioOpBuf->Clear(); + } else { + ioOpBuf = new IOBuffer(); + } + if (nAvail != numBytes) { + assert(nAvail > numBytes); + const int off(align % IOBufferData::GetDefaultBufferSize()); + ioOpBuf->ReplaceKeepBuffersFull(iobuf, off, numBytes); + if (off > 0) { + ioOpBuf->Consume(off); + } + } else { + ioOpBuf->Move(iobuf); + } + mCurOp = 0; + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + return true; +} + +/// +/// We have a command in a buffer. It is possible that we don't have +/// everything we need to execute it (for example, for a write we may +/// not have received all the data the client promised). So, parse +/// out the command and if we have everything execute it. +/// +bool +ClientSM::HandleClientCmd(IOBuffer* iobuf, int cmdLen) +{ + KfsOp* op = mCurOp; + + assert(op ? cmdLen == 0 : cmdLen > 0); + if (! op) { + if (sTraceRequestResponse) { + IOBuffer::IStream is(*iobuf, cmdLen); + string line; + while (getline(is, line)) { + CLIENT_SM_LOG_STREAM_DEBUG << + "request: " << line << + KFS_LOG_EOM; + } + } + if (ParseCommand(*iobuf, cmdLen, &op) != 0) { + assert(! 
op); + IOBuffer::IStream is(*iobuf, cmdLen); + string line; + int maxLines = 64; + while (--maxLines >= 0 && getline(is, line)) { + CLIENT_SM_LOG_STREAM_ERROR << + "invalid request: " << line << + KFS_LOG_EOM; + } + iobuf->Consume(cmdLen); + // got a bogus command + return false; + } + } + + iobuf->Consume(cmdLen); + ByteCount bufferBytes = -1; + if (op->op == CMD_WRITE_PREPARE) { + WritePrepareOp* const wop = static_cast(op); + assert(! wop->dataBuf); + const bool kForwardFlag = false; // The forward always share the buffers. + if (! GetWriteOp(wop, wop->offset, (int)wop->numBytes, + iobuf, wop->dataBuf, kForwardFlag)) { + return false; + } + bufferBytes = IoRequestBytes(wop->numBytes); + } else if (op->op == CMD_RECORD_APPEND) { + RecordAppendOp* const waop = static_cast(op); + IOBuffer* opBuf = &waop->dataBuf; + bool forwardFlag = false; + const int align = mCurOp ? 0 : + gAtomicRecordAppendManager.GetAlignmentAndFwdFlag( + waop->chunkId, forwardFlag); + if (! GetWriteOp( + waop, + align, + (int)waop->numBytes, + iobuf, + opBuf, + forwardFlag + )) { + return false; + } + assert(opBuf == &waop->dataBuf); + bufferBytes = IoRequestBytes(waop->numBytes); + } + CLIENT_SM_LOG_STREAM_DEBUG << + "got: seq: " << op->seq << " " << op->Show() << + KFS_LOG_EOM; + + bool submitResponseFlag = false; + kfsChunkId_t chunkId = 0; + int64_t reqBytes = 0; + if (bufferBytes < 0 && op->IsChunkReadOp(reqBytes, chunkId) && reqBytes >= 0) { + bufferBytes = reqBytes + IoRequestBytes(0); // 1 buffer for reply header + if (! mCurOp) { + BufferManager& bufMgr = GetBufferManager(); + if (bufMgr.IsOverQuota(*this, bufferBytes)) { + CLIENT_SM_LOG_STREAM_ERROR << + " bad read request size: " << bufferBytes << + " need: " << bufferBytes << + " buffers: " << GetByteCount() << + " over quota, closing connection" << + " " << op->Show() << + KFS_LOG_EOM; + op->status = -EAGAIN; + op->statusMsg = "over io buffers quota"; + submitResponseFlag = true; + } else if (! 
bufMgr.GetForDiskIo(*this, bufferBytes)) { + mCurOp = op; + CLIENT_SM_LOG_STREAM_DEBUG << + "request for: " << bufferBytes << " bytes denied" << + " cur: " << GetByteCount() << + " total: " << bufMgr.GetTotalByteCount() << + " used: " << bufMgr.GetUsedByteCount() << + " bufs: " << bufMgr.GetFreeBufferCount() << + " op: " << op->Show() << + " waiting for buffers" << + KFS_LOG_EOM; + mNetConnection->SetMaxReadAhead(0); + return false; + } + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + } + mCurOp = 0; + if (! gChunkManager.IsChunkReadable(chunkId)) { + // Do not allow dirty reads. + op->statusMsg = "chunk not readable"; + op->status = -EAGAIN; + submitResponseFlag = true; + CLIENT_SM_LOG_STREAM_ERROR << + " read request for chunk: " << chunkId << + " denied: " << op->statusMsg << + KFS_LOG_EOM; + } + } + + if (bufferBytes < 0) { + assert( + op->op != CMD_WRITE_PREPARE && + op->op != CMD_RECORD_APPEND && + op->op != CMD_READ + ); + // This is needed to account for large number of small responses to + // prevent out of buffers in the case where the client queues requests + // but doesn't read replies. + // To speedup append status recovery give record append status inquiry a + // "free pass", if there are no ops pending and connection input and + // output buffers are empty. This should be the normal case as clients + // create new connection to do status inquiry. There is virtually + // no danger of running out of buffers: the reply size is small enough + // to fit into the socket buffer, and free up the io buffer immediately. + // Since the op is synchronous and doesn't involve disk io or forwarding + // the same io buffer that was just freed by IOBuffer::Consume() the + // the above should be re-used for send, and freed immediately as the + // kernel's socket buffer is expected to have at least around 1K + // available. + bufferBytes = (op->op == CMD_GET_RECORD_APPEND_STATUS && + ! mCurOp && + mOps.empty() && + GetByteCount() <= 0 && + ! 
IsWaiting() && + mNetConnection->GetOutBuffer().IsEmpty() && + mNetConnection->GetInBuffer().IsEmpty() + ) ? ByteCount(0) : IoRequestBytes(0); + if (! mCurOp) { + BufferManager& bufMgr = GetBufferManager(); + if (! bufMgr.Get(*this, bufferBytes)) { + mCurOp = op; + CLIENT_SM_LOG_STREAM_DEBUG << + "request for: " << bufferBytes << " bytes denied" << + " cur: " << GetByteCount() << + " total: " << bufMgr.GetTotalByteCount() << + " used: " << bufMgr.GetUsedByteCount() << + " bufs: " << bufMgr.GetFreeBufferCount() << + " op: " << op->Show() << + " waiting for buffers" << + KFS_LOG_EOM; + mNetConnection->SetMaxReadAhead(0); + return false; + } + } + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + mCurOp = 0; + } + + op->clientSMFlag = true; + if (op->op == CMD_WRITE_SYNC) { + // make the write sync depend on a previous write + KfsOp* w = 0; + for (OpsQueue::iterator i = mOps.begin(); i != mOps.end(); i++) { + if (IsDependingOpType(i->first)) { + w = i->first; + } + } + if (w) { + OpPair p; + + op->clnt = this; + p.op = w; + p.dependentOp = op; + mPendingOps.push_back(p); + + CLIENT_SM_LOG_STREAM_DEBUG << + "keeping write-sync (" << op->seq << + ") pending and depends on " << w->seq << + KFS_LOG_EOM; + return true; + } else { + CLIENT_SM_LOG_STREAM_DEBUG << + "write-sync is being pushed down; no writes left, " + << mOps.size() << " ops left" << + KFS_LOG_EOM; + } + } + + mOps.push_back(make_pair(op, bufferBytes)); + op->clnt = this; + gChunkServer.OpInserted(); + if (submitResponseFlag) { + HandleRequest(EVENT_CMD_DONE, op); + } else { + SubmitOp(op); + } + return true; +} + +void +ClientSM::OpFinished(KfsOp* doneOp) +{ + // Multiple ops could be waiting for a single op to finish. + // + // Do not run pending submit queue here, if it is not empty. + // If pending submit is not empty here, then this is recursive call. Just + // add the op to the pending submit queue and let the caller run the queue. 
+ // This is need to send responses in the request order, and to limit the + // recursion depth. + const bool runPendingSubmitQueueFlag = mPendingSubmitQueue.empty(); + while (! mPendingOps.empty()) { + OpPair& p = mPendingOps.front(); + if (p.op != doneOp) { + break; + } + CLIENT_SM_LOG_STREAM_DEBUG << + "submitting write-sync (" << p.dependentOp->seq << + ") since " << p.op->seq << " finished" << + KFS_LOG_EOM; + mPendingSubmitQueue.splice(mPendingSubmitQueue.end(), + mPendingOps, mPendingOps.begin()); + } + if (! runPendingSubmitQueueFlag) { + return; + } + while (! mPendingSubmitQueue.empty()) { + KfsOp* const op = mPendingSubmitQueue.front().dependentOp; + mPendingSubmitQueue.pop_front(); + gChunkServer.OpInserted(); + mOps.push_back(make_pair(op, 0)); + SubmitOp(op); + } +} + +void +ClientSM::ReleaseChunkSpaceReservations() +{ + for (ChunkSpaceResMap::iterator iter = mReservations.begin(); + iter != mReservations.end(); iter++) { + gAtomicRecordAppendManager.ChunkSpaceRelease( + iter->first.chunkId, iter->first.transactionId, iter->second); + } +} + +RemoteSyncSMPtr +ClientSM::FindServer(const ServerLocation &loc, bool connect) +{ + return KFS::FindServer(mRemoteSyncers, loc, connect); +} + +void +ClientSM::Granted(ClientSM::ByteCount byteCount) +{ + CLIENT_SM_LOG_STREAM_DEBUG << "granted: " << byteCount << " op: " << + (mCurOp ? mCurOp->Show() : string("null")) << + KFS_LOG_EOM; + if (! mNetConnection) { + return; + } + if (mCurOp) { + ClientSM::HandleClientCmd(&(mNetConnection->GetInBuffer()), 0); + } else { + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + } +} +} diff --git a/src/cc/chunk/ClientSM.h b/src/cc/chunk/ClientSM.h new file mode 100644 index 000000000..eb772985b --- /dev/null +++ b/src/cc/chunk/ClientSM.h @@ -0,0 +1,195 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/22 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. 
+// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _CLIENTSM_H +#define _CLIENTSM_H + +#include +#include +#include +#include +#include + +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/NetConnection.h" +#include "kfsio/IOBuffer.h" +#include "common/StdAllocator.h" +#include "Chunk.h" +#include "RemoteSyncSM.h" +#include "KfsOps.h" +#include "BufferManager.h" + +namespace KFS +{ + +// There is a dependency in waiting for a write-op to finish +// before we can execute a write-sync op. Use this struct to track +// such dependencies. +struct OpPair { + // once op is finished, we can then execute dependent op. + KfsOp *op; + KfsOp *dependentOp; +}; + +// For record appends when client reserves space within a chunk, +// we use a hash table to track the various reservation requests +// for a single client. 
The hash table is keyed by +struct ChunkSpaceReservationKey_t { + ChunkSpaceReservationKey_t(kfsChunkId_t c, int64_t t) : + chunkId(c), transactionId(t) { } + kfsChunkId_t chunkId; + int64_t transactionId; // unique for each chunkserver + bool operator==(const ChunkSpaceReservationKey_t &other) const { + return chunkId == other.chunkId && transactionId == other.transactionId; + } +}; +static inline std::size_t hash_value(ChunkSpaceReservationKey_t const &csr) { + boost::hash h; + return h(csr.transactionId); +} + +typedef std::tr1::unordered_map< + ChunkSpaceReservationKey_t, size_t, boost::hash +> ChunkSpaceResMap; + +// KFS client protocol state machine. +class ClientSM : public KfsCallbackObj, private BufferManager::Client { +public: + + ClientSM(NetConnectionPtr &conn); + + ~ClientSM(); + + // + // Sequence: + // Client connects. + // - A new client sm is born + // - reads a request out of the connection + // - client says READ chunkid + // - request handler calls the disk manager to get the size + // -- the request handler then runs in a loop: + // -- in READ START: schedule a read for 4k; transition to READ DONE + // -- in READ DONE: data that was read arrives; + // schedule that data to be sent out and transition back to READ START + // + int HandleRequest(int code, void *data); + + // This is a terminal state handler. In this state, we wait for + // all outstanding ops to finish and then destroy this. + int HandleTerminate(int code, void *data); + + // For daisy-chain writes, retrieve the server object for the + // chunkserver running at the specified location. 
+ // + RemoteSyncSMPtr FindServer(const ServerLocation &loc, bool connect = true); + + void ChunkSpaceReserve(kfsChunkId_t chunkId, int64_t writeId, int nbytes); + + void ReleaseChunkSpaceReservations(); + + void ReleaseReservedSpace(kfsChunkId_t chunkId, int64_t writeId) { + mReservations.erase(ChunkSpaceReservationKey_t(chunkId, writeId)); + } + + size_t UseReservedSpace(kfsChunkId_t chunkId, int64_t writeId, size_t nbytes) { + ChunkSpaceResMap::iterator const iter = mReservations.find( + ChunkSpaceReservationKey_t(chunkId, writeId)); + size_t ret = 0; + if (iter != mReservations.end()) { + ret = std::min(iter->second, nbytes); + iter->second -= ret; + } + return ret; + } + size_t GetReservedSpace(kfsChunkId_t chunkId, int64_t writeId) const { + // Cast until mac std::tr1::unordered_map gets "find() const" + ChunkSpaceResMap::const_iterator const iter = + const_cast(mReservations).find( + ChunkSpaceReservationKey_t(chunkId, writeId)); + return (iter == mReservations.end() ? 0 : iter->second); + } + void ChunkSpaceReserve(kfsChunkId_t chunkId, int64_t writeId, size_t nbytes) { + mReservations.insert( + std::make_pair(ChunkSpaceReservationKey_t(chunkId, writeId), 0) + ).first->second += nbytes; + } + + static void SetTraceRequestResponse(bool flag) { + sTraceRequestResponse = flag; + } + + virtual void Granted(ByteCount byteCount); +private: + typedef std::deque > OpsQueue; + typedef std::list > PendingOpsList; + + NetConnectionPtr mNetConnection; + KfsOp* mCurOp; + /// Queue of outstanding ops from the client. We reply to ops in FIFO + OpsQueue mOps; + + /// chunks for which the client has space reserved + ChunkSpaceResMap mReservations; + + /// Queue of pending ops: ops that depend on other ops to finish before we can execute them. + PendingOpsList mPendingOps; + PendingOpsList mPendingSubmitQueue; + + /// for writes, we daisy-chain the chunkservers in the forwarding path. this list + /// maintains the set of servers to which we have a connection. 
+ std::list mRemoteSyncers; + ByteCount mPrevNumToWrite; + int mRecursionCnt; + const uint64_t mInstanceNum; + IOBuffer::WOStream mWOStream; + static bool sTraceRequestResponse; + static uint64_t sInstanceNum; + + /// Given a (possibly) complete op in a buffer, run it. + /// @retval True if the command was handled (i.e., we have all the + /// data and we could execute it); false otherwise. + bool HandleClientCmd(IOBuffer *iobuf, int cmdLen); + + /// Op has finished execution. Send a response to the client. + void SendResponse(KfsOp *op); + + /// Submit ops that have been held waiting for doneOp to finish. + void OpFinished(KfsOp *doneOp); + bool GetWriteOp(KfsOp* wop, int align, int numBytes, IOBuffer* iobuf, + IOBuffer*& ioOpBuf, bool forwardFlag); + std::string GetPeerName(); + inline void SendResponse(KfsOp* op, ByteCount opBytes); + inline static BufferManager& GetBufferManager(); +private: + // No copy. + ClientSM(const ClientSM&); + ClientSM& operator=(const ClientSM&); +}; + +} + +#endif // _CLIENTSM_H diff --git a/src/cc/chunk/DirChecker.cc b/src/cc/chunk/DirChecker.cc new file mode 100644 index 000000000..67797ec63 --- /dev/null +++ b/src/cc/chunk/DirChecker.cc @@ -0,0 +1,743 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/7/10 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \file DirChecker.cc +// \brief thread periodically checks if directories / drives re-appear. +// +//---------------------------------------------------------------------------- + +#include "DirChecker.h" +#include "common/MsgLogger.h" +#include "common/StBuffer.h" +#include "common/time.h" +#include "qcdio/QCThread.h" +#include "qcdio/QCMutex.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/qcdebug.h" +#include "utils.h" + +#include +#include +#include +#include +#include +#ifndef KFS_DONT_USE_FLOCK +#include +#endif + +#include +#include +#include +#include + +namespace KFS +{ + +using std::pair; +using std::make_pair; +using std::ostringstream; + +class DirChecker::Impl : public QCRunnable +{ +public: + typedef DirChecker::LockFd LockFd; + typedef DirChecker::FileNames FileNames; + typedef DirChecker::DirNames DirNames; + typedef QCMutex::Time Time; + + Impl() + : QCRunnable(), + mDeviceIds(), + mNextDevId(1), + mDirNames(), + mSubDirNames(), + mDontUseIfExistFileNames(), + mAvailableDirs(), + mThread(), + mMutex(), + mCond(), + mDoneCond(), + mCheckIntervalNanoSec(Time(60) * 1000 * 1000 * 1000), + mLockFileName(), + mDirLocks(), + mRemoveFilesFlag(false), + mRunFlag(false), + mDoneFlag(false), + mSleepFlag(true), + mUpdateDirNamesFlag(false) + {} + virtual ~Impl() + { Impl::Stop(); } + virtual void Run() + { + const string theLockToken = CreateLockToken(); + + QCStMutexLocker theLocker(mMutex); + DirNames theDirNames = mDirNames; + DirNames theSubDirNames = mSubDirNames; + FileNames theDontUseIfExistFileNames = mDontUseIfExistFileNames; + string theLockFileName; + DirLocks theDirLocks; + mUpdateDirNamesFlag = false; + while (mRunFlag) { + if (mSleepFlag) { + mCond.Wait(mMutex, mCheckIntervalNanoSec); + } + if (! 
mRunFlag) { + break; + } + mSleepFlag = true; + if (mUpdateDirNamesFlag) { + theDirNames = mDirNames; + theSubDirNames = mSubDirNames; + theDontUseIfExistFileNames = mDontUseIfExistFileNames; + mUpdateDirNamesFlag = false; + } + const bool theRemoveFilesFlag = mRemoveFilesFlag; + theLockFileName = mLockFileName; + DirsAvailable theAvailableDirs; + theDirLocks.swap(mDirLocks); + QCASSERT(mDirLocks.empty()); + { + QCStMutexUnlocker theUnlocker(mMutex); + theDirLocks.clear(); + CheckDirs( + theDirNames, + theSubDirNames, + theDontUseIfExistFileNames, + mDeviceIds, + mNextDevId, + theAvailableDirs, + theRemoveFilesFlag, + theLockFileName, + theLockToken + ); + } + bool theUpdateDirNamesFlag = false; + for (DirsAvailable::iterator theIt = theAvailableDirs.begin(); + theIt != theAvailableDirs.end(); + ) { + if (mDirNames.erase(theIt->first) <= 0) { + if (mAvailableDirs.empty()) { + theAvailableDirs.erase(theIt++); + } else { + ++theIt; + } + } else { + if (! mAvailableDirs.empty()) { + mAvailableDirs.insert(*theIt); + } + ++theIt; + theUpdateDirNamesFlag = true; + } + } + if (theUpdateDirNamesFlag) { + theDirNames = mDirNames; + } + if (mAvailableDirs.empty()) { + mAvailableDirs.swap(theAvailableDirs); + } else { + theAvailableDirs.clear(); + } + mDoneFlag = true; + mDoneCond.Notify(); + } + } + void SetInterval( + int inMilliSeconds) + { + QCStMutexLocker theLocker(mMutex); + const Time theInterval = Time(inMilliSeconds) * 1000 * 1000; + if (theInterval == mCheckIntervalNanoSec) { + return; + } + mCheckIntervalNanoSec = theInterval; + if (theInterval > mCheckIntervalNanoSec / 4) { + return; + } + mCond.Notify(); + } + int GetInterval() + { + QCStMutexLocker theLocker(mMutex); + return (int)(mCheckIntervalNanoSec / (1000 * 1000)); + } + void Clear() + { + QCStMutexLocker theLocker(mMutex); + mUpdateDirNamesFlag = true; + mDirNames.clear(); + mAvailableDirs.clear(); + } + bool Add( + const string& inDirName, + LockFdPtr* inLockPtr) + { + QCStMutexLocker theLocker(mMutex); 
+ if (inLockPtr) { + LockFdPtr& theLockPtr = *inLockPtr; + if (theLockPtr) { + mDirLocks.push_back(theLockPtr); + theLockPtr.reset(); + } + } + if (inDirName.empty()) { + return false; + } + const string theDirName = Normalize(inDirName); + mUpdateDirNamesFlag = true; + mAvailableDirs.erase(theDirName); + return mDirNames.insert(theDirName).second; + } + bool Remove( + const string& inDirName) + { + if (inDirName.empty()) { + return false; + } + const string theDirName = Normalize(inDirName); + QCStMutexLocker theLocker(mMutex); + mUpdateDirNamesFlag = true; + mAvailableDirs.erase(theDirName); + return (mDirNames.erase(theDirName) != 0); + } + bool Add( + const DirNames& inDirNames) + { + QCStMutexLocker theLocker(mMutex); + mUpdateDirNamesFlag = true; + const size_t theSize = mDirNames.size(); + for (DirNames::const_iterator theIt = inDirNames.begin(); + theIt != inDirNames.end(); + ++theIt) { + if (theIt->empty()) { + continue; + } + const string theDirName = Normalize(*theIt); + mAvailableDirs.erase(theDirName); + mDirNames.insert(theDirName); + } + return (theSize < mDirNames.size()); + } + bool Remove( + const DirNames& inDirNames) + { + QCStMutexLocker theLocker(mMutex); + mUpdateDirNamesFlag = true; + const size_t theSize = mDirNames.size(); + for (DirNames::const_iterator theIt = inDirNames.begin(); + theIt != inDirNames.end(); + ++theIt) { + if (theIt->empty()) { + continue; + } + const string theDirName = Normalize(*theIt); + mAvailableDirs.erase(theDirName); + mDirNames.erase(theDirName); + } + return (theSize > mDirNames.size()); + } + void GetNewlyAvailable( + DirsAvailable& outDirs, + bool inSyncFlag) + { + QCStMutexLocker theLocker(mMutex); + if (inSyncFlag && mRunFlag) { + mDoneFlag = false; + mSleepFlag = false; + mCond.Notify(); + while (! 
mDoneFlag && mRunFlag) { + mDoneCond.Wait(mMutex); + } + } + if (mAvailableDirs.empty()) { + return; + } + for (DirsAvailable::const_iterator theIt = mAvailableDirs.begin(); + theIt != mAvailableDirs.end(); + ++theIt) { + mDirNames.erase(theIt->first); + } + mUpdateDirNamesFlag = true; + if (outDirs.empty()) { + outDirs.swap(mAvailableDirs); + return; + } + outDirs.insert(mAvailableDirs.begin(), mAvailableDirs.end()); + mAvailableDirs.clear(); + } + void Start( + DirsAvailable& outDirs) + { + { + QCStMutexLocker theLocker(mMutex); + if (! mRunFlag) { + mRunFlag = true; + const int kStackSize = 32 << 10; + mThread.Start(this, kStackSize); + } + } + GetNewlyAvailable(outDirs, true); + } + void Stop() + { + { + QCStMutexLocker theLocker(mMutex); + mRunFlag = false; + mCond.Notify(); + } + mThread.Join(); + } + void AddSubDir( + const string& inDirName) + { + const size_t theLen = inDirName.length(); + size_t i = 0; + while (i < theLen && inDirName[i] == '/') { + i++; + } + if (theLen <= i) { + return; + } + QCStMutexLocker theLocker(mMutex); + mSubDirNames.insert(Normalize(inDirName.substr(i))); + mUpdateDirNamesFlag = true; + } + void SetDontUseIfExist( + const FileNames& inFileNames) + { + QCStMutexLocker theLocker(mMutex); + mDontUseIfExistFileNames.clear(); + for (FileNames::const_iterator it = inFileNames.begin(); + it != inFileNames.end(); + ++it) { + if (it->empty()) { + continue; + } + mDontUseIfExistFileNames.insert(*it); + } + mUpdateDirNamesFlag = true; + } + void SetRemoveFilesFlag( + bool inFlag) + { + QCStMutexLocker theLocker(mMutex); + mRemoveFilesFlag = inFlag; + } + void SetLockFileName( + const string& inName) + { + QCStMutexLocker theLocker(mMutex); + mLockFileName = inName; + } + +private: + typedef std::map DeviceIds; + typedef std::deque DirLocks; + + DeviceIds mDeviceIds; + DeviceId mNextDevId; + DirNames mDirNames; + DirNames mSubDirNames; + FileNames mDontUseIfExistFileNames; + DirsAvailable mAvailableDirs; + QCThread mThread; + QCMutex 
mMutex; + QCCondVar mCond; + QCCondVar mDoneCond; + Time mCheckIntervalNanoSec; + string mLockFileName; + DirLocks mDirLocks; + bool mRemoveFilesFlag; + bool mRunFlag; + bool mDoneFlag; + bool mSleepFlag; + bool mUpdateDirNamesFlag; + + static void CheckDirs( + const DirNames& inDirNames, + const DirNames& inSubDirNames, + const FileNames& inDontUseIfExistFileNames, + DeviceIds& inDeviceIds, + DeviceId& ioNextDevId, + DirsAvailable& outDirsAvailable, + bool inRemoveFilesFlag, + const string& inLockName, + const string& inLockToken) + { + for (DirNames::const_iterator theIt = inDirNames.begin(); + theIt != inDirNames.end(); + ++theIt) { + struct stat theStat = {0}; + if (stat(theIt->c_str(), &theStat) != 0 || + ! S_ISDIR(theStat.st_mode)) { + continue; + } + FileNames::const_iterator theEit = + inDontUseIfExistFileNames.begin(); + for (theEit = inDontUseIfExistFileNames.begin(); + theEit != inDontUseIfExistFileNames.end(); + ++theEit) { + string theFileName = *theIt + *theEit; + if (stat(theFileName.c_str(), &theStat) == 0) { + break; + } + const int theSysErr = errno; + if (theSysErr != ENOENT) { + KFS_LOG_STREAM_ERROR << + "stat " << theFileName << ": " << + QCUtils::SysError(errno) << + KFS_LOG_EOM; + break; + } + } + if (theEit != inDontUseIfExistFileNames.end()) { + continue; + } + LockFdPtr theLockFdPtr; + if (! 
inLockName.empty()) { + const string theLockName = *theIt + inLockName; + const int theLockFd = TryLock(theLockName, inLockToken); + if (theLockFd < 0) { + KFS_LOG_STREAM_ERROR << + theLockName << ": " << + QCUtils::SysError(-theLockFd) << + KFS_LOG_EOM; + continue; + } + theLockFdPtr.reset(new LockFd(theLockFd)); + } + DirNames::const_iterator theSit; + for (theSit = inSubDirNames.begin(); + theSit != inSubDirNames.end(); + ++theSit) { + string theDirName = *theIt + *theSit; + if (mkdir(theDirName.c_str(), 0755)) { + if (errno != EEXIST) { + KFS_LOG_STREAM_ERROR << + "mkdir " << theDirName << ": " << + QCUtils::SysError(errno) << + KFS_LOG_EOM; + break; + } + if (stat(theDirName.c_str(), &theStat) != 0) { + KFS_LOG_STREAM_ERROR << + theDirName << ": " << + QCUtils::SysError(errno) << + KFS_LOG_EOM; + break; + } + if (! S_ISDIR(theStat.st_mode)) { + KFS_LOG_STREAM_ERROR << + theDirName << ": " << + " not a directory" << + KFS_LOG_EOM; + break; + } + if (inRemoveFilesFlag && Remove(theDirName, true) != 0) { + break; + } + } + } + if (theSit != inSubDirNames.end()) { + continue; + } + if (inRemoveFilesFlag && + Remove(*theIt, false, inLockName.c_str()) != 0) { + continue; + } + pair const theRes = + inDeviceIds.insert(make_pair(theStat.st_dev, ioNextDevId)); + if (theRes.second) { + ioNextDevId++; + } + outDirsAvailable.insert( + make_pair(*theIt, make_pair( + theRes.first->second, theLockFdPtr))); + } + } + static int Remove( + const string& inDirName, + bool inRecursiveFlag, + const char* inExcludeNamePtr = "") + { + QCASSERT(! inDirName.empty() && *(inDirName.rbegin()) == '/'); + if (inDirName == "/") { + KFS_LOG_STREAM_ERROR << + "attempt to delete " << inDirName << " denied" << + KFS_LOG_EOM; + return -EPERM; + } + int theErr = 0; + DIR* const theDirStream = opendir(inDirName.c_str()); + if (! 
theDirStream) { + theErr = errno; + KFS_LOG_STREAM_ERROR << + "unable to open " << inDirName << + " error: " << QCUtils::SysError(theErr) << + KFS_LOG_EOM; + return theErr; + } + struct dirent const* theEntryPtr; + while ((theEntryPtr = readdir(theDirStream))) { + if (strcmp(theEntryPtr->d_name, ".") == 0 || + strcmp(theEntryPtr->d_name, "..") == 0 || + strcmp(theEntryPtr->d_name, inExcludeNamePtr) == 0) { + continue; + } + const string theName = inDirName + theEntryPtr->d_name; + struct stat theBuf = { 0 }; + if (stat(theName.c_str(), &theBuf) == 0 && + S_ISDIR(theBuf.st_mode)) { + if (! inRecursiveFlag) { + continue; + } + Remove(theName, inRecursiveFlag); + } + KFS_LOG_STREAM_DEBUG << + "removing: " << theName << + KFS_LOG_EOM; + if (unlink(theName.c_str()) && errno != ENOENT) { + theErr = errno; + KFS_LOG_STREAM_ERROR << + "unable to remove " << theName << + " error: " << QCUtils::SysError(theErr) << + KFS_LOG_EOM; + break; + } + } + closedir(theDirStream); + return theErr; + } + static int TryLock( + const string& inFileName, + const string& inLockToken) + { + const int theFd = open(inFileName.c_str(), O_CREAT|O_RDWR, 0644); + if (theFd < 0) { + return (errno > 0 ? -errno : -1); + } + if (fcntl(theFd, FD_CLOEXEC, 1)) { + const int theErr = errno; + KFS_LOG_STREAM_ERROR << + inFileName << + ": " << QCUtils::SysError(theErr) << + " enabling FD_CLOEXEC" << + KFS_LOG_EOM; + } +#ifdef KFS_DONT_USE_FLOCK + struct flock theLock = { 0 }; + theLock.l_type = F_WRLCK; + theLock.l_whence = SEEK_SET; + if (fcntl(theFd, F_SETLK, &theLock)) { + const int theErr = errno; + close(theFd); + return (theErr > 0 ? -theErr : -1); + } + const size_t theLen = inLockToken.length(); + StBufferT theBuf; + char* const theBufPtr = theBuf.Resize(theLen + 1); + const ssize_t theNRd = read(theFd, theBufPtr, theLen + 1); + if (theNRd < 0) { + const int theErr = errno; + close(theFd); + return (theErr > 0 ? 
-theErr : -1); + } + if ((size_t)theNRd == theLen && + memcmp(inLockToken.data(), theBufPtr, theLen) == 0) { + close(theFd); + return -EACCES; + } + if (lseek(theFd, 0, SEEK_SET) != 0 || + write(theFd, inLockToken.data(), theLen) != (ssize_t)theLen || + ((size_t)theNRd > theLen && ftruncate(theFd, theLen) != 0)) { + const int theErr = errno; + close(theFd); + return (theErr > 0 ? -theErr : -1); + } +#else + if (flock(theFd, LOCK_EX | LOCK_NB)) { + const int theErr = errno; + close(theFd); + return (theErr > 0 ? -theErr : -1); + } +#endif + return theFd; + } + static string Normalize( + const string& inDirName) + { + const size_t theInitialLen = inDirName.length(); + size_t theLen = theInitialLen; + while (theLen > 0 && inDirName[theLen - 1] == '/') { + --theLen; + } + if (theInitialLen == theLen + 1) { + return inDirName; + } else if (theInitialLen > theLen) { + return inDirName.substr(0, theLen + 1); + } + return (inDirName + "/"); + } + static string CreateLockToken() + { + ostringstream theStream; + theStream << getpid() << + " " << microseconds() << + " " << GetRandomSeq() << + "\n"; + return theStream.str(); + } + +private: + Impl( + const Impl& inImpl); + Impl& operator=( + const Impl& inImpl); +}; + +DirChecker::LockFd::~LockFd() +{ + if (mFd >= 0) { +#ifdef KFS_DONT_USE_FLOCK + ftruncate(mFd, 0); +#endif + close(mFd); + } +} + +DirChecker::DirChecker() + : mImpl(*(new Impl())) +{ +} + +DirChecker::~DirChecker() +{ + delete &mImpl; +} + + void +DirChecker::Clear() +{ + return mImpl.Clear(); +} + + bool +DirChecker::Add( + const DirNames& inDirNames) +{ + return mImpl.Add(inDirNames); +} + + bool +DirChecker::Remove( + const DirNames& inDirNames) +{ + return mImpl.Remove(inDirNames); +} + + bool +DirChecker::Add( + const string& inDirName) +{ + return mImpl.Add(inDirName, 0); +} + + bool +DirChecker::Add( + const string& inDirName, + DirChecker::LockFdPtr& ioLockFdPtr) +{ + return mImpl.Add(inDirName, &ioLockFdPtr); +} + + bool +DirChecker::Remove( + const 
string& inDirName) +{ + return mImpl.Remove(inDirName); +} + + void +DirChecker::GetNewlyAvailable( + DirsAvailable& outDirs, + bool inSyncFlag /* = false */) +{ + mImpl.GetNewlyAvailable(outDirs, inSyncFlag); +} + + void +DirChecker::Start( + DirsAvailable& outDirs) +{ + mImpl.Start(outDirs); +} + + void +DirChecker::Stop() +{ + mImpl.Stop(); +} + + void +DirChecker::SetInterval( + int inTimeMilliSec) +{ + mImpl.SetInterval(inTimeMilliSec); +} + + int +DirChecker::GetInterval() +{ + return mImpl.GetInterval(); +} + + void +DirChecker::AddSubDir( + const string& inDirName) +{ + mImpl.AddSubDir(inDirName); +} + + void +DirChecker::SetDontUseIfExist( + const DirChecker::FileNames& inFileNames) +{ + mImpl.SetDontUseIfExist(inFileNames); +} + + void +DirChecker::SetLockFileName( + const string& inName) +{ + mImpl.SetLockFileName(inName); +} + + void +DirChecker::SetRemoveFilesFlag( + bool inFlag) +{ + mImpl.SetRemoveFilesFlag(inFlag); +} + +} diff --git a/src/cc/chunk/DirChecker.h b/src/cc/chunk/DirChecker.h new file mode 100644 index 000000000..fad21203a --- /dev/null +++ b/src/cc/chunk/DirChecker.h @@ -0,0 +1,124 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/7/10 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \file DirChecker.h +// \brief Checks if directories / drives to re-appears. +// +//---------------------------------------------------------------------------- + +#ifndef DIR_CHECKER_H +#define DIR_CHECKER_H + +#include +#include +#include +#include + +#include + +namespace KFS +{ + +using std::set; +using std::string; +using std::map; +using std::pair; +using boost::shared_ptr; + +// "Off line" chunk directory monitor. +// When chunk directory deemed to be "off line" / unusable chunk manager adds to +// the directory monitoring thread. Once chunk directory becomes "available" the +// monitoring thread acquires lock, deletes all files in this directory, and +// adds directory to "available list". Chunk manager periodically invokes +// GetNewlyAvailable() and puts newly available directories in use. +// Directories with files with names from the "black" / "don't use" list aren't +// considered available until such files are removed / renamed. Typically the +// "black" list contains "evacuate", and "evacuate.done". 
+class DirChecker +{ +public: + class LockFd + { + public: + LockFd( + int inFd = -1) + : mFd(inFd) + {} + ~LockFd(); + private: + const int mFd; + private: + LockFd( + const LockFd& inLockFd); + LockFd& operator=( + const LockFd& inLockFd); + }; + typedef shared_ptr LockFdPtr; + typedef int64_t DeviceId; + typedef set FileNames; + typedef FileNames DirNames; + typedef map > DirsAvailable; + + DirChecker(); + ~DirChecker(); + void Clear(); + bool Add( + const string& inDirName); + bool Add( + const string& inDirName, + LockFdPtr& ioLockFdPtr); + bool Remove( + const string& inDirName); + bool Add( + const DirNames& inDirNames); + bool Remove( + const DirNames& inDirNames); + void GetNewlyAvailable( + DirsAvailable& outDirs, + bool inSyncFlag = false); + void Start( + DirsAvailable& outDirs); + void Stop(); + void SetInterval( + int inTimeMilliSec); + int GetInterval(); + void AddSubDir( + const string& inDirName); + void SetDontUseIfExist( + const FileNames& inFileNames); + void SetLockFileName( + const string& inName); + void SetRemoveFilesFlag( + bool inFlag); +private: + class Impl; + Impl& mImpl; +private: + DirChecker( + const DirChecker& inChecker); + DirChecker& operator=( + const DirChecker& inChecker); +}; + +}; + +#endif /* DIR_CHECKER_H */ diff --git a/src/cc/chunk/DiskIo.cc b/src/cc/chunk/DiskIo.cc new file mode 100644 index 000000000..8ad812613 --- /dev/null +++ b/src/cc/chunk/DiskIo.cc @@ -0,0 +1,2104 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/01/17 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "DiskIo.h" +#include "BufferManager.h" + +#include "kfsio/IOBuffer.h" +#include "kfsio/Globals.h" +#include "common/Properties.h" +#include "common/MsgLogger.h" +#include "common/kfstypes.h" + +#include "qcdio/QCDLList.h" +#include "qcdio/QCMutex.h" +#include "qcdio/qcstutils.h" +#include "qcdio/QCUtils.h" +#include "qcdio/QCIoBufferPool.h" +#include "qcdio/qcdebug.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace KFS +{ +using std::max; +using std::min; +using std::string; +using std::set; +using std::numeric_limits; +using std::setw; +using std::setfill; + +using libkfsio::globalNetManager; +using libkfsio::SetIOBufferAllocator; +using libkfsio::IOBufferAllocator; +using libkfsio::globals; + +static void DiskIoReportError( + const char* inMsgPtr, + int inSysError = 0); +static void DiskIoReportError( + string inMsg, + int inSysError = 0) +{ DiskIoReportError(inMsg.c_str(), inSysError); } + +static int DiskQueueToSysError( + QCDiskQueue::Error inStatus) +{ + switch (inStatus) + { + case QCDiskQueue::kErrorNone: return 0; + case QCDiskQueue::kErrorRead: return EIO; + case QCDiskQueue::kErrorWrite: return EIO; + case QCDiskQueue::kErrorCancel: return ECANCELED; + case QCDiskQueue::kErrorSeek: return EIO; + case QCDiskQueue::kErrorEnqueue: return EAGAIN; + case QCDiskQueue::kErrorOutOfBuffers: return ENOMEM; + case QCDiskQueue::kErrorParameter: return EINVAL; + case QCDiskQueue::kErrorQueueStopped: return EINVAL; + 
case QCDiskQueue::kErrorFileIdxOutOfRange: return EINVAL; + case QCDiskQueue::kErrorBlockIdxOutOfRange: return EINVAL; + case QCDiskQueue::kErrorBlockCountOutOfRange: return EINVAL; + case QCDiskQueue::kErrorOutOfRequests: return EAGAIN; + case QCDiskQueue::kErrorOpen: return EIO; + case QCDiskQueue::kErrorClose: return EIO; + case QCDiskQueue::kErrorHasPendingRequests: return EINVAL; + case QCDiskQueue::kErrorSpaceAlloc: return EIO; + case QCDiskQueue::kErrorDelete: return EIO; + case QCDiskQueue::kErrorRename: return EIO; + case QCDiskQueue::kErrorGetFsAvailable: return EIO; + case QCDiskQueue::kErrorCheckDirReadable: return EIO; + default: break; + } + return EINVAL; +} + +// Disk error simulator. Used for testing of error handling, including +// "timing holes" and request queues "isolation". +class DiskErrorSimulator : public QCDiskQueue::IoStartObserver +{ +public: + class Config + { + public: + Config( + const Properties& inConfig) + : mMinPeriodReq(inConfig.getValue( + "chunkServer.diskErrorSimulator.minPeriod", int64_t(0) + )), + mMaxPeriodReq(inConfig.getValue( + "chunkServer.diskErrorSimulator.maxPeriod", int64_t(16) + )), + mMinTimeMicroSec(inConfig.getValue( + "chunkServer.diskErrorSimulator.minTimeMicroSec", int64_t(0) + )), + mMaxTimeMicroSec(inConfig.getValue( + "chunkServer.diskErrorSimulator.maxTimeMicroSec", int64_t(0) + )), + mPrefixes() + { + if (! IsEnabled(string())) { + return; + } + const string thePrefs = inConfig.getValue( + "chunkServer.diskErrorSimulator.chunkDirPrefixes", ""); + for (size_t theNextPos = 0; ;) { + const size_t theEndPos = thePrefs.find(';', theNextPos); + const string thePref = thePrefs.substr( + theNextPos, + theEndPos == string::npos ? + theEndPos : theEndPos - theNextPos + ); + if (! 
thePref.empty()) { + if (mPrefixes.insert(thePref).second) { + KFS_LOG_STREAM_INFO << + "disk error simulator: added prefix: " << + thePref << + KFS_LOG_EOM; + } + } + if (theEndPos == string::npos) { + break; + } + theNextPos = theEndPos + 1; + } + if (mPrefixes.empty()) { + KFS_LOG_STREAM_INFO << + "disk error simulator: enabled for all prefixes" << + KFS_LOG_EOM; + } + } + bool IsEnabled( + string inPrefix) const + { + return ( + mMinTimeMicroSec <= mMaxTimeMicroSec && mMaxTimeMicroSec > 0 && + (mPrefixes.empty() || + mPrefixes.find(inPrefix) != mPrefixes.end()) + ); + } + const int64_t mMinPeriodReq; + const int64_t mMaxPeriodReq; + const int64_t mMinTimeMicroSec; + const int64_t mMaxTimeMicroSec; + set mPrefixes; + }; + + DiskErrorSimulator( + const Config& inConfig) + : QCDiskQueue::IoStartObserver(), + mMutex(), + mSleepCond(), + mRandom(Seed()), + mRandMax(mRandom.max()), + mMinPeriodReq(inConfig.mMinPeriodReq), + mMaxPeriodReq(inConfig.mMaxPeriodReq), + mMinTimeMicroSec(inConfig.mMinTimeMicroSec), + mMaxTimeMicroSec(inConfig.mMaxTimeMicroSec), + mSleepingFlag(false), + mReqCount(0) + { mReqCount = Rand(mMinPeriodReq, mMaxPeriodReq); } + virtual ~DiskErrorSimulator() + { DiskErrorSimulator::Shutdown(); } + void Shutdown() + { + QCStMutexLocker theLocker(mMutex); + mReqCount = numeric_limits::max(); + mSleepingFlag = false; + mSleepCond.NotifyAll(); + } + virtual void Notify( + QCDiskQueue::ReqType inReqType, + QCDiskQueue::RequestId inRequestId, + QCDiskQueue::FileIdx inFileIdx, + QCDiskQueue::BlockIdx inStartBlockIdx, + int inBufferCount) + { + // The idea is stall all disk io threads servicing io queue. 
+ QCStMutexLocker theLocker(mMutex); + while (mSleepingFlag) { + mSleepCond.Wait(mMutex); + } + if (--mReqCount > 0) { + return; + } + mReqCount = Rand(mMinPeriodReq, mMaxPeriodReq); + const int64_t theSleepMicroSec = + Rand(mMinTimeMicroSec, mMaxTimeMicroSec); + KFS_LOG_STREAM_INFO << + "disk error simulator:" + " request: type: " << inReqType << + " id: " << inRequestId << + " file: " << inFileIdx << + " block: " << inStartBlockIdx << + " count: " << inBufferCount << + " sleeping for " << theSleepMicroSec * 1e-6 << " sec," + " next after " << mReqCount << " requests" << + KFS_LOG_EOM; + if (theSleepMicroSec > 0) { + mSleepingFlag = true; + mSleepCond.Wait(mMutex, QCCondVar::Time(theSleepMicroSec) * 1000); + mSleepingFlag = false; + mSleepCond.NotifyAll(); // Wakeup all other threads if sleeping. + } + } +private: + typedef boost::mt19937 Random; + + QCMutex mMutex; + QCCondVar mSleepCond; + Random mRandom; + const Random::result_type mRandMax; + const int64_t mMinPeriodReq; + const int64_t mMaxPeriodReq; + const int64_t mMinTimeMicroSec; + const int64_t mMaxTimeMicroSec; + bool mSleepingFlag; + int64_t mReqCount; + + int64_t Rand( + int64_t inFrom, + int64_t inTo) + { + if (inFrom >= inTo) { + return inTo; + } + // Don't use modulo, low order bits might be "less random". + // Though this shouldn't be a problem with Mersenne twister. + const int64_t theInterval = inTo - inFrom; + return (inFrom + mRandom() * theInterval / mRandMax); + } + static Random::result_type Seed() + { + Random::result_type theRet = 1; + RAND_pseudo_bytes( + reinterpret_cast(&theRet), + int(sizeof(theRet)) + ); + return theRet; + } +private: + DiskErrorSimulator( + const DiskErrorSimulator&); + DiskErrorSimulator operator=( + const DiskErrorSimulator&); +}; + +// Disk io queue. 
+class DiskQueue : public QCDiskQueue,
+    private QCDiskQueue::DebugTracer
+{
+public:
+    // NOTE(review): the QCDLList template arguments were stripped by the
+    // paste; reconstructed from the single-element mPrevPtr / mNextPtr
+    // arrays below -- confirm against the original source.
+    typedef QCDLList<DiskQueue, 0> DiskQueueList;
+    typedef DiskIo::DeviceId       DeviceId;
+
+    DiskQueue(
+        DiskQueue**                       inListPtr,
+        DeviceId                          inDeviceId,
+        const char*                       inFileNamePrefixPtr,
+        const DiskErrorSimulator::Config* inSimulatorConfigPtr)
+        : QCDiskQueue(),
+          QCDiskQueue::DebugTracer(),
+          mFileNamePrefixes(inFileNamePrefixPtr ? inFileNamePrefixPtr : ""),
+          mDeviceId(inDeviceId),
+          mDeleteNullFilePtr(new DiskIo::File()),
+          mRenameNullFilePtr(new DiskIo::File()),
+          mGetFsSpaceAvailableNullFilePtr(new DiskIo::File()),
+          mCheckDirReadableNullFilePtr(new DiskIo::File()),
+          mSimulatorPtr(inSimulatorConfigPtr ?
+            new DiskErrorSimulator(*inSimulatorConfigPtr) : 0)
+    {
+        // Prefixes are stored as a 0-separated concatenation.
+        mFileNamePrefixes.append(1, (char)0);
+        DiskQueueList::Init(*this);
+        DiskQueueList::PushBack(inListPtr, *this);
+        mDeleteNullFilePtr->mQueuePtr              = this;
+        mRenameNullFilePtr->mQueuePtr              = this;
+        mGetFsSpaceAvailableNullFilePtr->mQueuePtr = this;
+        mCheckDirReadableNullFilePtr->mQueuePtr    = this;
+    }
+    // Unlink from the queue list and destroy (the destructor is private).
+    void Delete(
+        DiskQueue** inListPtr)
+    {
+        DiskQueueList::Remove(inListPtr, *this);
+        delete this;
+    }
+    int Start(
+        int             inThreadCount,
+        int             inMaxQueueDepth,
+        int             inMaxBuffersPerRequestCount,
+        int             inFileCount,
+        const char**    inFileNamesPtr,
+        QCIoBufferPool& inBufferPool,
+        CpuAffinity     inCpuAffinity,
+        bool            inTraceFlag)
+    {
+        return QCDiskQueue::Start(
+            inThreadCount,
+            inMaxQueueDepth,
+            inMaxBuffersPerRequestCount,
+            inFileCount,
+            inFileNamesPtr,
+            inBufferPool,
+            mSimulatorPtr,
+            inCpuAffinity,
+            inTraceFlag ? this : 0
+        );
+    }
+    EnqueueStatus DeleteFile(
+        const char*   inFileNamePtr,
+        IoCompletion* inIoCompletionPtr,
+        Time          inTimeWaitNanoSec)
+    {
+        return QCDiskQueue::Delete(
+            inFileNamePtr, inIoCompletionPtr, inTimeWaitNanoSec);
+    }
+    // True when inFileNamePtr starts with any of the stored prefixes.
+    bool IsFileNamePrefixMatches(
+        const char* inFileNamePtr) const
+    {
+        const char* const theFileNamePtr =
+            inFileNamePtr ? inFileNamePtr : "";
+        const char*       thePtr         = theFileNamePtr;
+        const char*       thePrefPtr     = mFileNamePrefixes.data();
+        const char* const thePrefsEndPtr = thePrefPtr +
+            mFileNamePrefixes.length();
+        while (thePrefPtr < thePrefsEndPtr) {
+            while (*thePtr && *thePrefPtr && *thePtr == *thePrefPtr) {
+                thePtr++;
+                thePrefPtr++;
+            }
+            if (*thePrefPtr == 0) {
+                return true; // Whole prefix matched.
+            }
+            // Skip to the next 0-terminated prefix and restart.
+            while (*thePrefPtr++)
+                {}
+            thePtr = theFileNamePtr;
+        }
+        return false;
+    }
+    DeviceId GetDeviceId() const
+        { return mDeviceId; }
+    DeviceId SetDeviceId(
+        DeviceId inDeviceId)
+        { return mDeviceId = inDeviceId; }
+    void AddFileNamePrefix(
+        const char* inFileNamePtr)
+    {
+        mFileNamePrefixes.append(inFileNamePtr ? inFileNamePtr : "");
+        mFileNamePrefixes.append(1, (char)0);
+    }
+    // Remove an exactly matching prefix; returns true when found.
+    bool RemoveFileNamePrefix(
+        const char* inPrefixPtr)
+    {
+        if (! inPrefixPtr || ! *inPrefixPtr) {
+            return false;
+        }
+        const char*       thePtr         = inPrefixPtr;
+        const char*       thePrefPtr     = mFileNamePrefixes.data();
+        const char* const thePrefsEndPtr = thePrefPtr +
+            mFileNamePrefixes.length();
+        while (thePrefPtr < thePrefsEndPtr) {
+            while (*thePtr && *thePrefPtr && *thePtr == *thePrefPtr) {
+                thePtr++;
+                thePrefPtr++;
+            }
+            if (*thePrefPtr == 0 && *thePtr == 0) {
+                // Exact match: erase the prefix and its 0 separator.
+                const size_t theLen = thePtr - inPrefixPtr;
+                mFileNamePrefixes.erase(
+                    thePrefPtr - mFileNamePrefixes.data() - theLen,
+                    theLen + 1);
+                return true;
+            }
+            while (*thePrefPtr++)
+                {}
+            thePtr = inPrefixPtr;
+        }
+        return false;
+    }
+    bool IsInUse() const
+        { return (! mFileNamePrefixes.empty()); }
+    DiskIo::FilePtr GetDeleteNullFile()
+        { return mDeleteNullFilePtr; }
+    DiskIo::FilePtr GetRenameNullFile()
+        { return mRenameNullFilePtr; }
+    DiskIo::FilePtr GetGetFsSpaceAvailableNullFile()
+        { return mGetFsSpaceAvailableNullFilePtr; }
+    DiskIo::FilePtr GetCheckDirReadableNullFile()
+        { return mCheckDirReadableNullFilePtr; }
+    virtual void TraceMsg(
+        const char* inMsgPtr,
+        int         inLength)
+    {
+        KFS_LOG_STREAM_START(MsgLogger::kLogLevelDEBUG, theLogStream);
+            ostream& theStream = theLogStream.GetStream();
+            theStream << "QCDQ[" <<
+                setfill('0') << setw(2) << mDeviceId << "]";
+            theStream.write(inMsgPtr, inLength);
+        KFS_LOG_STREAM_END;
+    }
+    void Stop()
+    {
+        if (mSimulatorPtr) {
+            // Make sure that io threads can proceed.
+            mSimulatorPtr->Shutdown();
+        }
+        QCDiskQueue::Stop();
+    }
+private:
+    string                    mFileNamePrefixes;
+    DeviceId                  mDeviceId;
+    DiskIo::FilePtr           mDeleteNullFilePtr; // Pseudo files.
+    DiskIo::FilePtr           mRenameNullFilePtr;
+    DiskIo::FilePtr           mGetFsSpaceAvailableNullFilePtr;
+    DiskIo::FilePtr           mCheckDirReadableNullFilePtr;
+    DiskErrorSimulator* const mSimulatorPtr;
+    DiskQueue*                mPrevPtr[1];
+    DiskQueue*                mNextPtr[1];
+
+    ~DiskQueue()
+    {
+        DiskQueue::Stop();
+        mDeleteNullFilePtr->mQueuePtr              = 0;
+        mRenameNullFilePtr->mQueuePtr              = 0;
+        mGetFsSpaceAvailableNullFilePtr->mQueuePtr = 0;
+        mCheckDirReadableNullFilePtr->mQueuePtr    = 0;
+        delete mSimulatorPtr;
+    }
+    // NOTE(review): template arguments stripped by the paste -- confirm.
+    friend class QCDLListOp<DiskQueue, 0>;
+private:
+    DiskQueue(
+        const DiskQueue& inQueue);
+    DiskQueue& operator=(
+        const DiskQueue& inQueue);
+};
+
+// Disk io globals, including io completion queue, accounting, and
+class DiskIoQueues : private ITimeout +{ +private: + typedef DiskQueue::DiskQueueList DiskQueueList; + +public: + enum { kDiskQueueIdNone = -1 }; + + typedef QCDLList IoQueue; + typedef DiskIo::Counters Counters; + + DiskIoQueues( + const Properties& inConfig) + : ITimeout(), + mDiskQueueThreadCount(inConfig.getValue( + "chunkServer.diskQueue.threadCount", 2)), + mDiskQueueMaxQueueDepth(inConfig.getValue( + "chunkServer.diskQueue.maxDepth", 4 << 10)), + mDiskQueueMaxBuffersPerRequest(inConfig.getValue( + "chunkServer.diskQueue.maxBuffersPerRequest", 1 << 8)), + mDiskQueueMaxEnqueueWaitNanoSec(inConfig.getValue( + "chunkServer.diskQueue.maxEnqueueWaitTimeMilliSec", 0) * 1000000), + mBufferPoolPartitionCount(inConfig.getValue( + "chunkServer.ioBufferPool.partitionCount", 1)), + mBufferPoolPartitionBufferCount(inConfig.getValue( + "chunkServer.ioBufferPool.partitionBufferCount", + (sizeof(size_t) < 8 ? 64 : 192) << 10)), + mBufferPoolBufferSize(inConfig.getValue( + "chunkServer.ioBufferPool.bufferSize", 4 << 10)), + mBufferPoolLockMemoryFlag(inConfig.getValue( + "chunkServer.ioBufferPool.lockMemory", false)), + mDiskOverloadedPendingRequestCount(inConfig.getValue( + "chunkServer.diskIo.overloadedPendingRequestCount", + mDiskQueueMaxQueueDepth * 3 / 4)), + mDiskClearOverloadedPendingRequestCount(inConfig.getValue( + "chunkServer.diskIo.clearOverloadedPendingRequestCount", + mDiskOverloadedPendingRequestCount * 3 / 4)), + mDiskOverloadedMinFreeBufferCount(inConfig.getValue( + "chunkServer.diskIo.overloadedMinFreeBufferCount", + int(int64_t(mBufferPoolPartitionCount) * + mBufferPoolPartitionBufferCount / 16))), + mDiskClearOverloadedMinFreeBufferCount(inConfig.getValue( + "chunkServer.diskIo.overloadedClearMinFreeBufferCount", + mDiskOverloadedMinFreeBufferCount * 2 / 3)), + mDiskOverloadedPendingWriteByteCount(inConfig.getValue( + "chunkServer.diskIo.overloadedPendingWriteByteCount", + int64_t(mBufferPoolPartitionBufferCount) * + mBufferPoolBufferSize * 
mBufferPoolPartitionCount / 4)), + mDiskClearOverloadedPendingWriteByteCount(inConfig.getValue( + "chunkServer.diskIo.clearOverloadedPendingWriteByteCount", + mDiskOverloadedPendingWriteByteCount * 2 / 3)), + mCrashOnErrorFlag(inConfig.getValue( + "chunkServer.diskIo.crashOnError", false)), + mBufferManagerMaxRatio(inConfig.getValue( + "chunkServer.bufferManager.maxRatio", 0.4)), + mMaxClientQuota(inConfig.getValue( + "chunkServer.bufferManager.maxClientQuota", + int64_t(CHUNKSIZE + (4 << 20)))), + mMaxIoTime(inConfig.getValue( + "chunkServer.diskIo.maxIoTimeSec", 4 * 60 + 30)), + mOverloadedFlag(false), + mMaxRequestSize(0), + mNextIoTimeout(Now()), + mWriteCancelWaiterPtr(0), + mReadPendingBytes(0), + mWritePendingBytes(0), + mReadReqCount(0), + mWriteReqCount(0), + mMutex(), + mPutCond(), + mBufferAllocator(), + mBufferManager(inConfig.getValue( + "chunkServer.bufferManager.enabled", true)), + mNullCallback(), + mCounters(), + mDiskErrorSimulatorConfig(inConfig), + mCpuAffinity(inConfig.getValue( + "chunkServer.diskQueue.cpuAffinity", 0)), + mDiskQueueTraceFlag(inConfig.getValue( + "chunkServer.diskQueue.trace", 0) != 0) + { + mCounters.Clear(); + IoQueue::Init(mIoInFlightQueuePtr); + IoQueue::Init(mIoDoneQueuePtr); + DiskQueueList::Init(mDiskQueuesPtr); + // Call Timeout() every time NetManager goes trough its work loop. + ITimeout::SetTimeoutInterval(0); + } + ~DiskIoQueues() + { + DiskIoQueues::Shutdown(0, false); + globalNetManager().UnRegisterTimeoutHandler(this); + } + bool Start( + string* inErrMessagePtr) + { + int theSysError = GetBufferPool().Create( + mBufferPoolPartitionCount, + mBufferPoolPartitionBufferCount, + mBufferPoolBufferSize, + mBufferPoolLockMemoryFlag + ); + if (theSysError) { + if (inErrMessagePtr) { + *inErrMessagePtr = QCUtils::SysError(theSysError); + } + } else { + if (! 
SetIOBufferAllocator(&GetBufferAllocator())) { + DiskIoReportError("failed to set buffer allocator"); + if (inErrMessagePtr) { + *inErrMessagePtr = "failed to set buffer allocator"; + theSysError = -1; + } + } else { + // Make sure that allocator works, and it isn't possible to + // change it: + IOBufferData theAllocatorTest; + } + } + if (theSysError) { + GetBufferPool().Destroy(); + } else { + int64_t const theMaxReqSize = + int64_t(mBufferAllocator.GetBufferSize()) * + mDiskQueueMaxBuffersPerRequest * mDiskQueueMaxQueueDepth / 2; + if (theMaxReqSize > 0 && + int64_t(mMaxRequestSize = size_t(theMaxReqSize)) < + theMaxReqSize) { + mMaxRequestSize = numeric_limits::max(); + } + globalNetManager().RegisterTimeoutHandler(this); + mBufferManager.Init( + &GetBufferPool(), + int64_t(mBufferManagerMaxRatio * mBufferAllocator.GetBufferSize() * + mBufferPoolPartitionCount * mBufferPoolPartitionBufferCount), + mMaxClientQuota, + mDiskOverloadedPendingWriteByteCount / + mBufferAllocator.GetBufferSize() + ); + } + return (! theSysError); + } + bool Shutdown( + string* inErrMsgPtr, + bool inRunIoCompletionFlag) + { + DiskQueueList::Iterator theIt(mDiskQueuesPtr); + DiskQueue* thePtr; + while ((thePtr = theIt.Next())) { + thePtr->Stop(); + } + QCRTASSERT(IoQueue::IsEmpty(mIoInFlightQueuePtr)); + delete mWriteCancelWaiterPtr; + mWriteCancelWaiterPtr = 0; + mMaxRequestSize = 0; + if (inRunIoCompletionFlag && ! IoQueue::IsEmpty(mIoDoneQueuePtr)) { + RunCompletion(); + } + if (IoQueue::IsEmpty(mIoDoneQueuePtr)) { + while (! 
DiskQueueList::IsEmpty(mDiskQueuesPtr)) { + DiskQueueList::Front(mDiskQueuesPtr)->Delete(mDiskQueuesPtr); + } + globalNetManager().UnRegisterTimeoutHandler(this); + return true; + } + DiskIoReportError("io completion queue is not empty"); + if (inErrMsgPtr) { + *inErrMsgPtr = "io completion queue is not empty: " + "call RunIoCompletion()"; + } + return false; + } + bool RunCompletion() + { + bool theRet = false; + DiskIo* thePtr; + while ((thePtr = Get())) { + thePtr->RunCompletion(); + theRet = true; + } + return theRet; + } + void Put( + DiskIo& inIo, + QCDiskQueue::RequestId inRequestId, + QCDiskQueue::Error inCompletionCode) + { + { + QCStMutexLocker theLocker(mMutex); + IoQueue::Remove(mIoInFlightQueuePtr, inIo); + IoQueue::PushBack(mIoDoneQueuePtr, inIo); + inIo.mCompletionRequestId = inRequestId; + inIo.mCompletionCode = inCompletionCode; + mPutCond.Notify(); + } + globalNetManager().Wakeup(); + } + DiskIo* Get() + { + QCStMutexLocker theLocker(mMutex); + return IoQueue::PopFront(mIoDoneQueuePtr); + } + bool CancelOrExpire( + DiskIo& inIo, + bool inExpireFlag) + { + if (! QCDiskQueue::IsValidRequestId(inIo.mRequestId)) { + return false; + } + DiskQueue* const theQueuePtr = inIo.mFilePtr->GetDiskQueuePtr(); + QCRTASSERT(theQueuePtr); + QCDiskQueue::IoCompletion* theComplPtr = 0; + if (inIo.mReadLength <= 0 && ! inIo.mIoBuffers.empty()) { + // Hold on to the write buffers, while waiting for write to + // complete. + if (! inExpireFlag) { + WritePending(-int64_t(inIo.mIoBuffers.size() * + GetBufferAllocator().GetBufferSize())); + } + if (! 
mWriteCancelWaiterPtr) { + mWriteCancelWaiterPtr = new WriteCancelWaiter(); + } + mWriteCancelWaiterPtr->mIoBuffers = inIo.mIoBuffers; + theComplPtr = theQueuePtr->CancelOrSetCompletionIfInFlight( + inIo.mRequestId, mWriteCancelWaiterPtr); + if (theComplPtr == mWriteCancelWaiterPtr) { + mWriteCancelWaiterPtr = 0; + theComplPtr = 0; + } else { + mWriteCancelWaiterPtr->mIoBuffers.clear(); + } + } else { + if (inIo.mReadLength > 0 && ! inExpireFlag) { + ReadPending(-int64_t(inIo.mReadLength)); + } + // When read completes it can just discard buffers. + // Sync doesn't have any buffers attached. + theComplPtr = theQueuePtr->CancelOrSetCompletionIfInFlight( + inIo.mRequestId, 0); + } + QCStMutexLocker theLocker(mMutex); + if (theComplPtr == &inIo) { + while (inIo.mCompletionRequestId != inIo.mRequestId) { + mPutCond.Wait(mMutex); + } + } + if (inIo.mCompletionRequestId == inIo.mRequestId) { + QCASSERT(IoQueue::IsInList(mIoDoneQueuePtr, inIo)); + if (! inExpireFlag) { + IoQueue::Remove(mIoDoneQueuePtr, inIo); + } else if (inIo.mCompletionCode == QCDiskQueue::kErrorCancel) { + inIo.mIoRetCode = -ETIMEDOUT; + } + } else { + QCASSERT(IoQueue::IsInList(mIoInFlightQueuePtr, inIo)); + IoQueue::Remove(mIoInFlightQueuePtr, inIo); + if (inExpireFlag) { + inIo.mCompletionRequestId = inIo.mRequestId; + inIo.mCompletionCode = QCDiskQueue::kErrorCancel; + inIo.mIoRetCode = -ETIMEDOUT; + IoQueue::PushBack(mIoDoneQueuePtr, inIo); + } + } + if (! inExpireFlag) { + inIo.mRequestId = QCDiskQueue::kRequestIdNone; + } + return true; + } + bool Cancel( + DiskIo& inIo) + { return CancelOrExpire(inIo, false); } + bool Expire( + DiskIo& inIo) + { return CancelOrExpire(inIo, true); } + DiskQueue* FindDiskQueue( + const char* inFileNamePtr) + { + DiskQueueList::Iterator theItr(mDiskQueuesPtr); + DiskQueue* thePtr; + while ((thePtr = theItr.Next()) && + ! 
thePtr->IsFileNamePrefixMatches(inFileNamePtr)) + {} + return thePtr; + } + DiskQueue* FindDiskQueue( + DiskIo::DeviceId inDeviceId) + { + DiskQueueList::Iterator theItr(mDiskQueuesPtr); + DiskQueue* thePtr; + while ((thePtr = theItr.Next()) && thePtr->GetDeviceId() != inDeviceId) + {} + return thePtr; + } + DiskQueue* FindDiskQueueNotInUse() + { + DiskQueueList::Iterator theItr(mDiskQueuesPtr); + DiskQueue* thePtr; + while ((thePtr = theItr.Next()) && thePtr->IsInUse()) + {} + return thePtr; + } + bool AddDiskQueue( + const char* inDirNamePtr, + DiskIo::DeviceId inDeviceId, + int inMaxOpenFiles, + string* inErrMessagePtr) + { + DiskQueue* theQueuePtr = FindDiskQueue(inDirNamePtr); + if (theQueuePtr) { + return (inDeviceId == theQueuePtr->GetDeviceId()); + } + if ((theQueuePtr = FindDiskQueue(inDeviceId))) { + theQueuePtr->AddFileNamePrefix(inDirNamePtr); + return true; + } + theQueuePtr = FindDiskQueueNotInUse(); + if (theQueuePtr) { + theQueuePtr->AddFileNamePrefix(inDirNamePtr); + theQueuePtr->SetDeviceId(inDeviceId); + return true; + } + theQueuePtr = new DiskQueue( + mDiskQueuesPtr, + inDeviceId, + inDirNamePtr, + mDiskErrorSimulatorConfig.IsEnabled(inDirNamePtr) ? 
+ &mDiskErrorSimulatorConfig : 0 + ); + const int theSysErr = theQueuePtr->Start( + mDiskQueueThreadCount, + mDiskQueueMaxQueueDepth, + mDiskQueueMaxBuffersPerRequest, + inMaxOpenFiles, + 0, // FileNamesPtr + GetBufferPool(), + mCpuAffinity, + mDiskQueueTraceFlag + ); + if (theSysErr) { + theQueuePtr->Delete(mDiskQueuesPtr); + const string theErrMsg = QCUtils::SysError(theSysErr); + DiskIoReportError("failed to start queue" + theErrMsg, theSysErr); + if (inErrMessagePtr) { + *inErrMessagePtr = theErrMsg; + } + return false; + } + return true; + } + DiskQueue::Time GetMaxEnqueueWaitTimeNanoSec() const + { return mDiskQueueMaxEnqueueWaitNanoSec; } + IOBufferAllocator& GetBufferAllocator() + { return mBufferAllocator; } + BufferManager& GetBufferManager() + { return mBufferManager; } + void ReportError( + const char* inMsgPtr, + int inErr) + { + if (mCrashOnErrorFlag) { + QCUtils::FatalError(inMsgPtr, inErr); + } + } + size_t GetMaxRequestSize() const + { return mMaxRequestSize; } + void ReadPending( + int64_t inReqBytes, + ssize_t inRetCode = 0) + { + if (inReqBytes == 0) { + return; + } + if (inReqBytes < 0) { + mCounters.mReadCount++; + if (inRetCode >= 0) { + mCounters.mReadByteCount += inRetCode; + } else { + mCounters.mReadErrorCount++; + } + } + mReadPendingBytes += inReqBytes; + mReadReqCount += inReqBytes > 0 ? 1 : -1; + QCASSERT(mReadPendingBytes >= 0 && mReadReqCount >= 0); + CheckIfOverloaded(); + } + void WritePending( + int64_t inReqBytes, + int64_t inRetCode = 0) + { + if (inReqBytes == 0) { + return; + } + if (inReqBytes < 0) { + mCounters.mWriteCount++; + if (inRetCode >= 0) { + mCounters.mWriteByteCount += inRetCode; + } else { + mCounters.mWriteErrorCount++; + } + } + mWritePendingBytes += inReqBytes; + mWriteReqCount += inReqBytes > 0 ? 
1 : -1; + QCASSERT(mWritePendingBytes >= 0 && mWriteReqCount >= 0); + CheckIfOverloaded(); + } + void SyncDone( + int64_t inRetCode) + { + if (inRetCode >= 0) { + mCounters.mSyncCount++; + } else { + mCounters.mSyncErrorCount++; + } + } + void DeleteDone( + int64_t inRetCode) + { + if (inRetCode >= 0) { + mCounters.mDeleteCount++; + } else { + mCounters.mDeleteErrorCount++; + } + } + void RenameDone( + int64_t inRetCode) + { + if (inRetCode >= 0) { + mCounters.mRenameCount++; + } else { + mCounters.mRenameErrorCount++; + } + } + void GetGetFsSpaceAvailableDone( + int64_t inRetCode) + { + if (inRetCode >= 0) { + mCounters.mGetFsSpaceAvailableCount++; + } else { + mCounters.mGetFsSpaceAvailableErrorCount++; + } + } + void CheckDirReadableDone( + int64_t inRetCode) + { + if (inRetCode >= 0) { + mCounters.mCheckDirReadableCount++; + } else { + mCounters.mCheckDirReadableErrorCount++; + } + } + int GetFdCountPerFile() const + { return mDiskQueueThreadCount; } + void GetCounters( + Counters& outCounters) + { outCounters = mCounters; } + void SetInFlight( + DiskIo* inIoPtr) + { + if (! inIoPtr) { + return; + } + inIoPtr->mEnqueueTime = Now(); + QCStMutexLocker theLocker(mMutex); + IoQueue::PushBack(mIoInFlightQueuePtr, *inIoPtr); + } + void ResetInFlight( + DiskIo* inIoPtr) + { + if (! 
inIoPtr) { + return; + } + QCStMutexLocker theLocker(mMutex); + IoQueue::Remove(mIoInFlightQueuePtr, *inIoPtr); + } + KfsCallbackObj* GetNullCallbackPtr() + { return &mNullCallback; } + static time_t Now() + { return globalNetManager().Now(); } + void UpdateOpenFilesCount( + int inDelta) + { + mCounters.mOpenFilesCount += inDelta; + } + void SetParameters( + const Properties& inProperties) + { + mBufferManager.SetWaitingAvgInterval(inProperties.getValue( + "chunkServer.bufferManager.waitingAvgInterval", + mBufferManager.GetWaitingAvgInterval())); + mMaxIoTime = max(1, inProperties.getValue( + "chunkServer.diskIo.maxIoTimeSec", mMaxIoTime)); + } +private: + typedef DiskIo::IoBuffers IoBuffers; + class WriteCancelWaiter : public QCDiskQueue::IoCompletion + { + public: + WriteCancelWaiter() + : QCDiskQueue::IoCompletion(), + mIoBuffers() + {} + virtual bool Done( + QCDiskQueue::RequestId /* inRequestId */, + QCDiskQueue::FileIdx /* inFileIdx */, + QCDiskQueue::BlockIdx /* inStartBlockIdx */, + QCDiskQueue::InputIterator& /* inBufferItr */, + int /* inBufferCount */, + QCDiskQueue::Error /* inCompletionCode */, + int /* inSysErrorCode */, + int64_t /* inIoByteCount */) + { + delete this; // This might release buffers. + return true; // Tell the caller not to release buffers. + } + IoBuffers mIoBuffers; + }; + class BufferAllocator : public IOBufferAllocator + { + public: + BufferAllocator() + : mBufferPool() + {} + virtual size_t GetBufferSize() const + { return mBufferPool.GetBufferSize(); } + virtual char* Allocate() + { + char* const theBufPtr = mBufferPool.Get(); + if (! 
theBufPtr) { + QCUtils::FatalError("out of io buffers", 0); + } + return theBufPtr; + } + virtual void Deallocate( + char* inBufferPtr) + { mBufferPool.Put(inBufferPtr); } + QCIoBufferPool& GetBufferPool() + { return mBufferPool; } + private: + QCIoBufferPool mBufferPool; + + private: + BufferAllocator( + const BufferAllocator& inAllocator); + BufferAllocator& operator=( + const BufferAllocator& inAllocator); + }; + class NullCallback : public KfsCallbackObj + { + public: + NullCallback() + : KfsCallbackObj() + { SET_HANDLER(this, &NullCallback::Done); } + int Done(int /* inCode */, void* /* inDataPtr */) + { return 0; } + }; + + const int mDiskQueueThreadCount; + const int mDiskQueueMaxQueueDepth; + const int mDiskQueueMaxBuffersPerRequest; + const DiskQueue::Time mDiskQueueMaxEnqueueWaitNanoSec; + const int mBufferPoolPartitionCount; + const int mBufferPoolPartitionBufferCount; + const int mBufferPoolBufferSize; + const int mBufferPoolLockMemoryFlag; + const int mDiskOverloadedPendingRequestCount; + const int mDiskClearOverloadedPendingRequestCount; + const int mDiskOverloadedMinFreeBufferCount; + const int mDiskClearOverloadedMinFreeBufferCount; + const int64_t mDiskOverloadedPendingWriteByteCount; + const int64_t mDiskClearOverloadedPendingWriteByteCount; + const bool mCrashOnErrorFlag; + const double mBufferManagerMaxRatio; + const BufferManager::ByteCount mMaxClientQuota; + int mMaxIoTime; + bool mOverloadedFlag; + size_t mMaxRequestSize; + time_t mNextIoTimeout; + WriteCancelWaiter* mWriteCancelWaiterPtr; + int64_t mReadPendingBytes; + int64_t mWritePendingBytes; + int mReadReqCount; + int mWriteReqCount; + QCMutex mMutex; + QCCondVar mPutCond; + BufferAllocator mBufferAllocator; + BufferManager mBufferManager; + NullCallback mNullCallback; + DiskIo* mIoInFlightQueuePtr[1]; + DiskIo* mIoDoneQueuePtr[1]; + DiskQueue* mDiskQueuesPtr[1]; + Counters mCounters; + DiskErrorSimulator::Config mDiskErrorSimulatorConfig; + const QCDiskQueue::CpuAffinity mCpuAffinity; 
+ const int mDiskQueueTraceFlag; + + QCIoBufferPool& GetBufferPool() + { return mBufferAllocator.GetBufferPool(); } + + DiskIo* GetTimedOut( + time_t inMinTime) + { + QCStMutexLocker theLocker(mMutex); + DiskIo* thePtr = IoQueue::Front(mIoInFlightQueuePtr); + if (thePtr && + thePtr->mCompletionRequestId == QCDiskQueue::kRequestIdNone && + inMinTime <= thePtr->mEnqueueTime) { + thePtr = 0; + } + return thePtr; + } + virtual void Timeout() // ITimeout + { + const time_t theNow = Now(); + const int kMaxTimerOverrun = 60; + if (theNow > mNextIoTimeout + kMaxTimerOverrun) { + mNextIoTimeout = theNow + kMaxTimerOverrun / 8; // Reschedule + } + if (theNow >= mNextIoTimeout) { + // Timeout io requests. + const time_t theMinTime = theNow - mMaxIoTime; + DiskIo* thePtr; + while ((thePtr = GetTimedOut(theMinTime)) && Expire(*thePtr)) { + KFS_LOG_STREAM_ERROR << + "io request " << thePtr->mRequestId << + " timed out; wait: " << (theNow - thePtr->mEnqueueTime) << + " sec" << + KFS_LOG_EOM; + mCounters.mTimedOutErrorCount++; + mCounters.mTimedOutErrorReadByteCount += + max(size_t(0), thePtr->mReadLength); + mCounters.mTimedOutErrorWriteByteCount += + thePtr->mIoBuffers.size() * GetBufferPool().GetBufferSize(); + } + mNextIoTimeout = theNow + 1 + mMaxIoTime / 8; + } + RunCompletion(); + } + void CheckIfOverloaded() + { + const int theReqCount = mReadReqCount + mWriteReqCount; + SetOverloaded(mOverloadedFlag ? 
// Hysteresis + mWritePendingBytes > mDiskClearOverloadedPendingWriteByteCount || + theReqCount > mDiskClearOverloadedPendingRequestCount || + (mWritePendingBytes > 0 && + mBufferAllocator.GetBufferPool().GetFreeBufferCount() < + mDiskClearOverloadedMinFreeBufferCount + ) + : + mWritePendingBytes > mDiskOverloadedPendingWriteByteCount || + theReqCount > mDiskOverloadedPendingRequestCount || + (mWritePendingBytes > 0 && + mBufferAllocator.GetBufferPool().GetFreeBufferCount() < + mDiskOverloadedMinFreeBufferCount + ) + ); + } + void SetOverloaded( + bool inFlag) + { + if (mOverloadedFlag == inFlag) { + return; + } + mOverloadedFlag = inFlag; + KFS_LOG_STREAM_INFO << + (mOverloadedFlag ? "Setting" : "Clearing") << + " disk overloaded state: pending" + " read: " << mReadReqCount << + " bytes: " << mReadPendingBytes << + " write: " << mWriteReqCount << + " bytes: " << mWritePendingBytes << + KFS_LOG_EOM; + mBufferManager.SetDiskOverloaded(inFlag); + } +}; + +static DiskIoQueues* sDiskIoQueuesPtr; + + /* static */ bool +DiskIo::Init( + const Properties& inProperties, + string* inErrMessagePtr /* = 0 */) +{ + if (sDiskIoQueuesPtr) { + *inErrMessagePtr = "already initialized"; + return false; + } + sDiskIoQueuesPtr = new DiskIoQueues(inProperties); + if (! sDiskIoQueuesPtr->Start(inErrMessagePtr)) { + delete sDiskIoQueuesPtr; + sDiskIoQueuesPtr = 0; + return false; + } + return (sDiskIoQueuesPtr != 0); +} + +static void DiskIoReportError( + const char* inMsgPtr, + int inErr) +{ + if (sDiskIoQueuesPtr) { + sDiskIoQueuesPtr->ReportError(inMsgPtr, inErr); + } +} + + /* static */ bool +DiskIo::StartIoQueue( + const char* inDirNamePtr, + DiskIo::DeviceId inDeviceId, + int inMaxOpenFiles, + string* inErrMessagePtr /* = 0 */) +{ + if (! 
sDiskIoQueuesPtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = "not initialized"; + } + return false; + } + return sDiskIoQueuesPtr->AddDiskQueue( + inDirNamePtr, inDeviceId, inMaxOpenFiles, inErrMessagePtr); +} + + + /* static */ bool +DiskIo::StopIoQueue( + DiskQueue* inDiskQueuePtr, + const char* inDirNamePtr, + DiskIo::DeviceId inDeviceId, + string* inErrMessagePtr /* = 0 */) +{ + if (! inDiskQueuePtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = "disk queue parameter is null"; + } + return false; + } + if (! sDiskIoQueuesPtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = "not initialized"; + } + return false; + } + if (inDiskQueuePtr->GetDeviceId() != inDeviceId) { + if (inErrMessagePtr) { + *inErrMessagePtr = "device id mismatch"; + } + return false; + } + if (! inDiskQueuePtr->RemoveFileNamePrefix(inDirNamePtr)) { + if (inErrMessagePtr) { + *inErrMessagePtr = "no such prefix"; + } + return false; + } + return true; +} + + /* static */ bool +DiskIo::Shutdown( + string* inErrMessagePtr /* = 0 */) +{ + if (! sDiskIoQueuesPtr) { + return true; + } + const bool kRunIoCompletionFlag = true; + const bool theOkFlag = sDiskIoQueuesPtr->Shutdown( + inErrMessagePtr, kRunIoCompletionFlag); + delete sDiskIoQueuesPtr; + sDiskIoQueuesPtr = 0; + return theOkFlag; +} + + /* static */ int +DiskIo::GetFdCountPerFile() +{ + return (sDiskIoQueuesPtr ? sDiskIoQueuesPtr->GetFdCountPerFile() : -1); +} + + /* static */ bool +DiskIo::RunIoCompletion() +{ + return (sDiskIoQueuesPtr && sDiskIoQueuesPtr->RunCompletion()); +} + + /* static */ size_t +DiskIo::GetMaxRequestSize() +{ + return (sDiskIoQueuesPtr ? sDiskIoQueuesPtr->GetMaxRequestSize() : 0); +} + + /* static */ BufferManager& +DiskIo::GetBufferManager() +{ + QCRTASSERT(sDiskIoQueuesPtr); + return (sDiskIoQueuesPtr->GetBufferManager()); +} + + /* static */ void +DiskIo::GetCounters( + Counters& outCounters) +{ + if (! 
sDiskIoQueuesPtr) { + outCounters.Clear(); + return; + } + sDiskIoQueuesPtr->GetCounters(outCounters); +} + + /* static */ bool +DiskIo::Delete( + const char* inFileNamePtr, + KfsCallbackObj* inCallbackObjPtr /* = 0 */, + string* inErrMessagePtr /* = 0 */) +{ + return EnqueueMeta( + kMetaOpTypeDelete, + inFileNamePtr, + 0, + inCallbackObjPtr, + inErrMessagePtr + ); +} + + /* static */ bool +DiskIo::Rename( + const char* inSrcFileNamePtr, + const char* inDstFileNamePtr, + KfsCallbackObj* inCallbackObjPtr /* = 0 */, + string* inErrMessagePtr /* = 0 */) +{ + return EnqueueMeta( + kMetaOpTypeRename, + inSrcFileNamePtr, + inDstFileNamePtr, + inCallbackObjPtr, + inErrMessagePtr + ); +} + + /* static */ bool +DiskIo::GetFsSpaceAvailable( + const char* inPathNamePtr, + KfsCallbackObj* inCallbackObjPtr /* = 0 */, + string* inErrMessagePtr /* = 0 */) +{ + return EnqueueMeta( + kMetaOpTypeGetFsSpaceAvailable, + inPathNamePtr, + 0, + inCallbackObjPtr, + inErrMessagePtr + ); +} + + /* static */ bool +DiskIo::CheckDirReadable( + const char* inDirNamePtr, + KfsCallbackObj* inCallbackObjPtr /* = 0 */, + string* inErrMessagePtr /* = 0 */) +{ + return EnqueueMeta( + kMetaOpTypeCheckDirReadable, + inDirNamePtr, + 0, + inCallbackObjPtr, + inErrMessagePtr + ); +} + + /* static */ bool +DiskIo::GetDiskQueuePendingCount( + DiskQueue* inDiskQueuePtr, + int& outFreeRequestCount, + int& outRequestCount, + int64_t& outReadBlockCount, + int64_t& outWriteBlockCount, + int& outBlockSize) +{ + if (! inDiskQueuePtr) { + outFreeRequestCount = 0; + outRequestCount = 0; + outReadBlockCount = 0; + outWriteBlockCount = 0; + outBlockSize = 0; + return false; + } + inDiskQueuePtr->GetPendingCount( + outFreeRequestCount, + outRequestCount, + outReadBlockCount, + outWriteBlockCount); + outBlockSize = inDiskQueuePtr->GetBlockSize(); + return true; +} + + /* static */ DiskQueue* +DiskIo::FindDiskQueue( + const char* inDirNamePtr) +{ + return (sDiskIoQueuesPtr ? 
+ sDiskIoQueuesPtr->FindDiskQueue(inDirNamePtr) : 0); +} + + /* static */ void +DiskIo::SetParameters( + const Properties& inProperties) +{ + if (sDiskIoQueuesPtr) { + sDiskIoQueuesPtr->SetParameters(inProperties); + } +} + + /* static */ bool +DiskIo::EnqueueMeta( + DiskIo::MetaOpType inOpType, + const char* inNamePtr, + const char* inNextNamePtr, + KfsCallbackObj* inCallbackObjPtr, + string* inErrMessagePtr) +{ + const char* theErrMsgPtr = 0; + if (! inNamePtr) { + theErrMsgPtr = "file or directory name is null"; + } else if (inOpType == kMetaOpTypeRename && ! inNextNamePtr) { + theErrMsgPtr = "destination file name is null"; + } else if (! sDiskIoQueuesPtr) { + theErrMsgPtr = "disk queues are not initialized"; + } else { + DiskQueue* const theQueuePtr = + sDiskIoQueuesPtr->FindDiskQueue(inNamePtr); + KfsCallbackObj* const theCallbackPtr = inCallbackObjPtr ? + inCallbackObjPtr : sDiskIoQueuesPtr->GetNullCallbackPtr(); + if (theQueuePtr) { + DiskIo* theDiskIoPtr = 0; + DiskQueue::EnqueueStatus theStatus; + switch (inOpType) { + case kMetaOpTypeRename: + theDiskIoPtr = new DiskIo( + theQueuePtr->GetRenameNullFile(), + theCallbackPtr + ); + sDiskIoQueuesPtr->SetInFlight(theDiskIoPtr); + theStatus = theQueuePtr->Rename( + inNamePtr, + inNextNamePtr, + theDiskIoPtr, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + // Completion handler needed to count errors. + // For now assume that the request completed successfully if + // no completion handler specified. 
+ if (theStatus.IsError()) { + sDiskIoQueuesPtr->RenameDone(-1); + } + break; + case kMetaOpTypeDelete: + theDiskIoPtr = new DiskIo( + theQueuePtr->GetDeleteNullFile(), + theCallbackPtr + ); + sDiskIoQueuesPtr->SetInFlight(theDiskIoPtr); + theStatus = theQueuePtr->DeleteFile( + inNamePtr, + theDiskIoPtr, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + if (theStatus.IsError()) { + sDiskIoQueuesPtr->DeleteDone(-1); + } + break; + case kMetaOpTypeGetFsSpaceAvailable: + theDiskIoPtr = new DiskIo( + theQueuePtr->GetGetFsSpaceAvailableNullFile(), + theCallbackPtr + ); + sDiskIoQueuesPtr->SetInFlight(theDiskIoPtr); + theStatus = theQueuePtr->GetFsSpaceAvailable( + inNamePtr, + theDiskIoPtr, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + if (theStatus.IsError()) { + sDiskIoQueuesPtr->GetGetFsSpaceAvailableDone(-1); + } + break; + case kMetaOpTypeCheckDirReadable: + theDiskIoPtr = new DiskIo( + theQueuePtr->GetCheckDirReadableNullFile(), + theCallbackPtr + ); + sDiskIoQueuesPtr->SetInFlight(theDiskIoPtr); + theStatus = theQueuePtr->CheckDirReadable( + inNamePtr, + theDiskIoPtr, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + if (theStatus.IsError()) { + sDiskIoQueuesPtr->CheckDirReadableDone(-1); + } + break; + default: + QCRTASSERT(! 
"invalid op type"); + } + if (theStatus.IsGood()) { + if (theDiskIoPtr) { + theDiskIoPtr->mRequestId = theStatus.GetRequestId(); + QCRTASSERT( + theDiskIoPtr->mRequestId != QCDiskQueue::kRequestIdNone + ); + } + return true; + } + sDiskIoQueuesPtr->ResetInFlight(theDiskIoPtr); + delete theDiskIoPtr; + if (inErrMessagePtr) { + *inErrMessagePtr = QCDiskQueue::ToString(theStatus.GetError()); + } + DiskIoReportError(QCDiskQueue::ToString(theStatus.GetError()), 0); + return false; + } else { + theErrMsgPtr = "failed to find disk queue"; + } + } + if (theErrMsgPtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = theErrMsgPtr; + } + DiskIoReportError(theErrMsgPtr, EINVAL); + } + return false; +} + + bool +DiskIo::File::Open( + const char* inFileNamePtr, + DiskIo::Offset inMaxFileSize /* = -1 */, + bool inReadOnlyFlag /* = false */, + bool inReserveFileSpaceFlag /* = false */, + bool inCreateFlag /* = false */, + string* inErrMessagePtr /* = 0 */, + bool* inRetryFlagPtr /* = 0 */, + bool inBufferedIoFlag /* = false */) +{ + const char* theErrMsgPtr = 0; + if (IsOpen()) { + theErrMsgPtr = "file is already open"; + } else if (! inFileNamePtr) { + theErrMsgPtr = "file name is null"; + } else if (! sDiskIoQueuesPtr) { + theErrMsgPtr = "disk queues are not initialized"; + } else { + Reset(); + if (! (mQueuePtr = sDiskIoQueuesPtr->FindDiskQueue(inFileNamePtr))) { + theErrMsgPtr = "failed to find disk queue"; + } + } + if (theErrMsgPtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = theErrMsgPtr; + } + if (inRetryFlagPtr) { + *inRetryFlagPtr = false; + } + DiskIoReportError(theErrMsgPtr, EINVAL); + return false; + } + mReadOnlyFlag = inReadOnlyFlag; + mSpaceReservedFlag = ! mReadOnlyFlag && + inMaxFileSize > 0 && inReserveFileSpaceFlag; + QCDiskQueue::OpenFileStatus const theStatus = mQueuePtr->OpenFile( + inFileNamePtr, mReadOnlyFlag ? 
-1 : inMaxFileSize, mReadOnlyFlag, + mSpaceReservedFlag, inCreateFlag, inBufferedIoFlag); + if (theStatus.IsError()) { + if (inErrMessagePtr) { + *inErrMessagePtr = + QCUtils::SysError(theStatus.GetSysError()) + " " + + QCDiskQueue::ToString(theStatus.GetError()); + } + Reset(); + DiskIoReportError(QCUtils::SysError(theStatus.GetSysError()) + " " + + QCDiskQueue::ToString(theStatus.GetError()), 0); + if (inRetryFlagPtr) { + *inRetryFlagPtr = true; + } + return false; + } + mFileIdx = theStatus.GetFileIdx(); + sDiskIoQueuesPtr->UpdateOpenFilesCount(+1); + return true; +} + + bool +DiskIo::File::Close( + DiskIo::Offset inFileSize, /* = -1 */ + string* inErrMessagePtr /* = 0 */) +{ + if (mFileIdx < 0 || ! mQueuePtr) { + Reset(); + return true; + } + QCDiskQueue::CloseFileStatus theStatus = mQueuePtr->CloseFile( + mFileIdx, mReadOnlyFlag ? -1 : inFileSize); + if (theStatus.IsError()) { + if (inErrMessagePtr) { + *inErrMessagePtr = + QCUtils::SysError(theStatus.GetSysError()) + " " + + QCDiskQueue::ToString(theStatus.GetError()); + } + DiskIoReportError(QCUtils::SysError(theStatus.GetSysError()) + " " + + QCDiskQueue::ToString(theStatus.GetError()), + theStatus.GetSysError()); + } + Reset(); + if (sDiskIoQueuesPtr) { + sDiskIoQueuesPtr->UpdateOpenFilesCount(-1); + } + return (! theStatus.IsError()); +} + + void +DiskIo::File::GetDiskQueuePendingCount( + int& outFreeRequestCount, + int& outRequestCount, + int64_t& outReadBlockCount, + int64_t& outWriteBlockCount, + int& outBlockSize) +{ + if (mQueuePtr) { + mQueuePtr->GetPendingCount( + outFreeRequestCount, + outRequestCount, + outReadBlockCount, + outWriteBlockCount); + outBlockSize = mQueuePtr->GetBlockSize(); + } else { + outFreeRequestCount = 0; + outRequestCount = 0; + outReadBlockCount = 0; + outWriteBlockCount = 0; + outBlockSize = 0; + } +} + + bool +DiskIo::File::ReserveSpace( + string* inErrMessagePtr) +{ + if (mSpaceReservedFlag) { + return true; // Already done. + } + if (! 
IsOpen()) { + if (inErrMessagePtr) { + *inErrMessagePtr = "closed"; + } + return false; + } + if (IsReadOnly()) { + if (inErrMessagePtr) { + *inErrMessagePtr = "read only"; + } + return false; + } + if (! mQueuePtr) { + if (inErrMessagePtr) { + *inErrMessagePtr = "no queue"; + } + return false; + } + const DiskQueue::Status theStatus = mQueuePtr->AllocateFileSpace(mFileIdx); + if (theStatus.IsError() && inErrMessagePtr) { + if (theStatus.GetError() != QCDiskQueue::kErrorNone) { + *inErrMessagePtr = QCDiskQueue::ToString(theStatus.GetError()); + } else { + *inErrMessagePtr = QCUtils::SysError(theStatus.GetSysError()); + } + } + mSpaceReservedFlag = theStatus.IsGood(); + return mSpaceReservedFlag; +} + + void +DiskIo::File::Reset() +{ + mQueuePtr = 0; + mFileIdx = -1; + mReadOnlyFlag = false; + mSpaceReservedFlag = false; +} + +DiskIo::DiskIo( + DiskIo::FilePtr inFilePtr, + KfsCallbackObj* inCallBackObjPtr) + : mCallbackObjPtr(inCallBackObjPtr), + mFilePtr(inFilePtr), + mRequestId(QCDiskQueue::kRequestIdNone), + mIoBuffers(), + mReadBufOffset(0), + mReadLength(0), + mBlockIdx(0), + mIoRetCode(0), + mEnqueueTime(), + mCompletionRequestId(QCDiskQueue::kRequestIdNone), + mCompletionCode(QCDiskQueue::kErrorNone) +{ + QCRTASSERT(mCallbackObjPtr && mFilePtr.get()); + DiskIoQueues::IoQueue::Init(*this); +} + +DiskIo::~DiskIo() +{ + DiskIo::Close(); +} + + void +DiskIo::Close() +{ + if (sDiskIoQueuesPtr) { + sDiskIoQueuesPtr->Cancel(*this); + } +} + + ssize_t +DiskIo::Read( + DiskIo::Offset inOffset, + size_t inNumBytes) +{ + if (inOffset < 0 || + mRequestId != QCDiskQueue::kRequestIdNone || ! mFilePtr->IsOpen()) { + KFS_LOG_STREAM_ERROR << + "file: " << mFilePtr->GetFileIdx() << + " " << (mFilePtr->IsOpen() ? 
"open" : "closed") << + " read request: " << mRequestId << + " offset: " << inOffset << + KFS_LOG_EOM; + DiskIoReportError("DiskIo::Read: bad parameters", EINVAL); + return -EINVAL; + } + mIoBuffers.clear(); + DiskQueue* const theQueuePtr = mFilePtr->GetDiskQueuePtr(); + if (! theQueuePtr) { + KFS_LOG_STREAM_ERROR << "read: no queue" << KFS_LOG_EOM; + DiskIoReportError("DiskIo::Read: no queue", EINVAL); + return -EINVAL; + } + if (inNumBytes <= 0) { + return 0; // Io completion will not be called in this case. + } + const int theBlockSize = theQueuePtr->GetBlockSize(); + if (theBlockSize <= 0) { + KFS_LOG_STREAM_ERROR << + "bad block size " << theBlockSize << + KFS_LOG_EOM; + DiskIoReportError("DiskIo::Read: bad block size", EINVAL); + return -EINVAL; + } + mIoRetCode = 0; + mBlockIdx = -1; + mReadBufOffset = inOffset % theBlockSize; + mReadLength = inNumBytes; + const int theBufferCnt = + (mReadLength + mReadBufOffset + theBlockSize - 1) / theBlockSize; + mIoBuffers.reserve(theBufferCnt); + mCompletionRequestId = QCDiskQueue::kRequestIdNone; + mCompletionCode = QCDiskQueue::kErrorNone; + sDiskIoQueuesPtr->SetInFlight(this); + const DiskQueue::EnqueueStatus theStatus = theQueuePtr->Read( + mFilePtr->GetFileIdx(), + inOffset / theBlockSize, + 0, // inBufferIteratorPtr // allocate buffers just beofre read + theBufferCnt, + this, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + if (theStatus.IsGood()) { + sDiskIoQueuesPtr->ReadPending(inNumBytes); + mRequestId = theStatus.GetRequestId(); + QCRTASSERT(mRequestId != QCDiskQueue::kRequestIdNone); + return inNumBytes; + } + sDiskIoQueuesPtr->ResetInFlight(this); + const string theErrMsg(QCDiskQueue::ToString(theStatus.GetError())); + KFS_LOG_STREAM_ERROR << + "read queuing error: " << theErrMsg << + KFS_LOG_EOM; + + const int theErr = DiskQueueToSysError(theStatus.GetError()); + DiskIoReportError("DiskIo::Read: " + theErrMsg, theErr); + return -theErr; +} + + ssize_t +DiskIo::Write( + DiskIo::Offset inOffset, + 
size_t inNumBytes, + IOBuffer* inBufferPtr) +{ + if (inOffset < 0 || ! inBufferPtr || + mRequestId != QCDiskQueue::kRequestIdNone || ! mFilePtr->IsOpen()) { + KFS_LOG_STREAM_ERROR << + "file: " << mFilePtr->GetFileIdx() << + " " << (mFilePtr->IsOpen() ? "open" : "closed") << + " write request: " << mRequestId << + " offset: " << inOffset << "," + " buffer: " << (const void*)inBufferPtr << + KFS_LOG_EOM; + DiskIoReportError("DiskIo::Write: bad parameters", EINVAL); + return -EINVAL; + } + mReadLength = 0; + mIoRetCode = 0; + mBlockIdx = -1; + mReadBufOffset = 0; + mIoBuffers.clear(); + if (mFilePtr->IsReadOnly()) { + KFS_LOG_STREAM_ERROR << "write: read only mode" << KFS_LOG_EOM; + DiskIoReportError("DiskIo::Write: read only mode", EINVAL); + return -EINVAL; + } + DiskQueue* const theQueuePtr = mFilePtr->GetDiskQueuePtr(); + if (! theQueuePtr) { + KFS_LOG_STREAM_ERROR << "write: no queue" << KFS_LOG_EOM; + DiskIoReportError("DiskIo::Write: no queue", EINVAL); + return -EINVAL; + } + const int theBlockSize = theQueuePtr->GetBlockSize(); + if (inOffset % theBlockSize != 0) { + KFS_LOG_STREAM_ERROR << + "file: " << mFilePtr->GetFileIdx() << + " write: invalid offset: " << inOffset << + KFS_LOG_EOM; + DiskIoReportError("DiskIo::Write: invalid offset", EINVAL); + return -EINVAL; + } + const size_t kBlockAlignMask = (4 << 10) - 1; + size_t theNWr = inNumBytes; + for (IOBuffer::iterator + theIt = inBufferPtr->begin(); + theIt != inBufferPtr->end() && theNWr > 0; + ++theIt) { + const IOBufferData& theBuf = *theIt; + if (theBuf.IsEmpty()) { + continue; + } + if (theNWr < (size_t)theBlockSize || + theBlockSize != theBuf.BytesConsumable() || + (theBuf.Consumer() - (char*)0) & kBlockAlignMask) { + KFS_LOG_STREAM_ERROR << + "file: " << mFilePtr->GetFileIdx() << + " invalid io buffer: " << (theBuf.Consumer() - (char*)0) << + " size: " << min((int)theNWr, (int)theBuf.BytesConsumable()) << + KFS_LOG_EOM; + mIoBuffers.clear(); + DiskIoReportError("DiskIo::Write: invalid buffer", 
EINVAL); + return -EINVAL; + } + theNWr -= theBlockSize; + mIoBuffers.push_back(*theIt); + } + if (mIoBuffers.empty()) { + return 0; + } + struct BufIterator : public QCDiskQueue::InputIterator + { + BufIterator( + IoBuffers& inBufs) + : mCur(inBufs.begin()), + mEnd(inBufs.end()) + {} + virtual char* Get() + { return (mEnd == mCur ? 0 : (mCur++)->Consumer()); } + IoBuffers::iterator mCur; + IoBuffers::iterator const mEnd; + }; + BufIterator theBufItr(mIoBuffers); + mCompletionRequestId = QCDiskQueue::kRequestIdNone; + mCompletionCode = QCDiskQueue::kErrorNone; + sDiskIoQueuesPtr->SetInFlight(this); + const DiskQueue::EnqueueStatus theStatus = theQueuePtr->Write( + mFilePtr->GetFileIdx(), + inOffset / theBlockSize, + &theBufItr, + mIoBuffers.size(), + this, + sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec() + ); + if (theStatus.IsGood()) { + sDiskIoQueuesPtr->WritePending(inNumBytes - theNWr); + mRequestId = theStatus.GetRequestId(); + QCRTASSERT(mRequestId != QCDiskQueue::kRequestIdNone); + return (inNumBytes - theNWr); + } + sDiskIoQueuesPtr->ResetInFlight(this); + const string theErrMsg = QCDiskQueue::ToString(theStatus.GetError()); + KFS_LOG_STREAM_ERROR << "write queuing error: " << theErrMsg << + KFS_LOG_EOM; + const int theErr = DiskQueueToSysError(theStatus.GetError()); + DiskIoReportError("DiskIo::Write: " + theErrMsg, theErr); + return -theErr; +} + + int +DiskIo::Sync( + bool inNotifyDoneFlag) +{ + if (mRequestId != QCDiskQueue::kRequestIdNone || ! mFilePtr->IsOpen()) { + KFS_LOG_STREAM_ERROR << + "file: " << mFilePtr->GetFileIdx() << + " " << (mFilePtr->IsOpen() ? "open" : "closed") << + " sync request: " << mRequestId << + " notify: " << (inNotifyDoneFlag ? "yes" : "no") << + KFS_LOG_EOM; + DiskIoReportError("DiskIo::Sync: bad parameters", EINVAL); + return -EINVAL; + } + mIoBuffers.clear(); + DiskQueue* const theQueuePtr = mFilePtr->GetDiskQueuePtr(); + if (! 
theQueuePtr) {
+        KFS_LOG_STREAM_ERROR << "sync: no queue" << KFS_LOG_EOM;
+        DiskIoReportError("DiskIo::Sync: no queue", EINVAL);
+        return -EINVAL;
+    }
+    mCompletionRequestId = QCDiskQueue::kRequestIdNone;
+    mCompletionCode = QCDiskQueue::kErrorNone;
+    sDiskIoQueuesPtr->SetInFlight(this);
+    const DiskQueue::EnqueueStatus theStatus = theQueuePtr->Sync(
+        mFilePtr->GetFileIdx(),
+        inNotifyDoneFlag ? this : 0,
+        sDiskIoQueuesPtr->GetMaxEnqueueWaitTimeNanoSec()
+    );
+    if (theStatus.IsGood()) {
+        if (inNotifyDoneFlag) {
+            // Completion notification requested: remember the request id so
+            // the completion path can match it later.
+            mRequestId = theStatus.GetRequestId();
+            QCRTASSERT(mRequestId != QCDiskQueue::kRequestIdNone);
+        }
+        return 0;
+    }
+    // Enqueue failed: undo the in-flight accounting and report the error.
+    sDiskIoQueuesPtr->ResetInFlight(this);
+    const string theErrMsg(QCDiskQueue::ToString(theStatus.GetError()));
+    KFS_LOG_STREAM_ERROR << "sync queuing error: " << theErrMsg <<
+    KFS_LOG_EOM;
+    const int theErr = DiskQueueToSysError(theStatus.GetError());
+    DiskIoReportError("DiskIo::Sync: " + theErrMsg, theErr);
+    return -theErr;
+}
+
+// Disk queue i/o completion callback (the virtual
+// QCDiskQueue::IoCompletion::Done override declared in DiskIo.h).
+// Records the completion status into mIoRetCode / mBlockIdx and, on a
+// successful read, takes ownership of the queue's data buffers by
+// wrapping them into mIoBuffers. The return value tells the disk queue
+// whether this object now owns the buffers (true) or the queue should
+// free them (false).
+ /* virtual */ bool
+DiskIo::Done(
+    QCDiskQueue::RequestId      inRequestId,
+    QCDiskQueue::FileIdx        inFileIdx,
+    QCDiskQueue::BlockIdx       inBlockIdx,
+    QCDiskQueue::InputIterator& inBufferItr,
+    int                         inBufferCount,
+    QCDiskQueue::Error          inCompletionCode,
+    int                         inSysErrorCode,
+    int64_t                     inIoByteCount)
+{
+    QCASSERT(sDiskIoQueuesPtr);
+    bool theOwnBuffersFlag = false;
+    mBlockIdx = inBlockIdx;
+    if (inCompletionCode != QCDiskQueue::kErrorNone) {
+        // Prefer the real errno when the queue reports one; otherwise map
+        // queue-level errors to a plausible negative errno value.
+        if (inSysErrorCode != 0) {
+            mIoRetCode = -inSysErrorCode;
+        } else {
+            if (inCompletionCode == QCDiskQueue::kErrorOutOfBuffers) {
+                mIoRetCode = -ENOMEM;
+            } else {
+                mIoRetCode = -EIO;
+            }
+        }
+        // Normalize: failures must be reported as a negative value.
+        if (mIoRetCode > 0) {
+            mIoRetCode = -mIoRetCode;
+        } else if (mIoRetCode == 0) {
+            // NOTE(review): -1000 is a sentinel for "failed but no errno";
+            // presumably chosen to not collide with real errno values --
+            // confirm.
+            mIoRetCode = -1000;
+        }
+        // If this is read failure, then tell caller to free the buffers.
+        theOwnBuffersFlag = mReadLength <= 0;
+    } else {
+        mIoRetCode = inIoByteCount;
+        if (mReadLength <= 0) {
+            theOwnBuffersFlag = true; // Write sync or meta done.
+        } else if (inIoByteCount <= 0) {
+            theOwnBuffersFlag = false; // empty read, free buffers if any.
+        } else {
+            // Successful read: adopt the queue's buffers into mIoBuffers so
+            // RunCompletion() can assemble them into an IOBuffer.
+            const int theBufSize =
+                sDiskIoQueuesPtr->GetBufferAllocator().GetBufferSize();
+            QCRTASSERT(inBufferCount * theBufSize >= inIoByteCount);
+            int theCnt = inBufferCount;
+            char* thePtr;
+            while (theCnt-- > 0 && (thePtr = inBufferItr.Get())) {
+                mIoBuffers.push_back(IOBufferData(
+                    thePtr, 0, theBufSize,
+                    sDiskIoQueuesPtr->GetBufferAllocator()));
+            }
+            // Verify the buffers actually consumed cover the byte count.
+            QCRTASSERT(
+                (inBufferCount - (theCnt + 1)) * theBufSize >= inIoByteCount);
+            theOwnBuffersFlag = true;
+        }
+    }
+    // Hand the completed request back for completion dispatch.
+    sDiskIoQueuesPtr->Put(*this, inRequestId, inCompletionCode);
+    return theOwnBuffersFlag;
+}
+
+// Dispatch a completed request to its owner (mCallbackObjPtr).
+// Meta requests (delete / rename / fs-space-available / check-dir-readable)
+// are recognized by comparing mFilePtr against the queue's dedicated "null"
+// file objects; they invoke HandleEvent directly and destroy this DiskIo.
+// Reads re-assemble mIoBuffers into an IOBuffer, skipping the leading
+// mReadBufOffset bytes and trimming any tail beyond mReadLength.
+    void
+DiskIo::RunCompletion()
+{
+    QCASSERT(mCompletionRequestId == mRequestId && sDiskIoQueuesPtr);
+    mRequestId = QCDiskQueue::kRequestIdNone;
+    const char* theOpNamePtr = "";
+    int theCode = 0;
+    // NOTE(review): declared int but used strictly as a boolean flag.
+    int theMetaFlag = false;
+    int64_t theMetaRet = -1;
+
+    DiskQueue* const theQueuePtr = mFilePtr->GetDiskQueuePtr();
+    QCASSERT(theQueuePtr);
+    if (mFilePtr.get() == theQueuePtr->GetDeleteNullFile().get()) {
+        theOpNamePtr = "delete";
+        theMetaFlag = true;
+        theCode = EVENT_DISK_DELETE_DONE;
+        sDiskIoQueuesPtr->DeleteDone(mIoRetCode);
+    } else if (mFilePtr.get() == theQueuePtr->GetRenameNullFile().get()) {
+        theOpNamePtr = "rename";
+        theMetaFlag = true;
+        theCode = EVENT_DISK_RENAME_DONE;
+        sDiskIoQueuesPtr->RenameDone(mIoRetCode);
+    } else if (mFilePtr.get() ==
+            theQueuePtr->GetGetFsSpaceAvailableNullFile().get()) {
+        theOpNamePtr = "fs space available";
+        theMetaFlag = true;
+        if (mBlockIdx >= 0) {
+            // Available space is returned as a block count; convert to bytes.
+            theMetaRet = (int64_t)mBlockIdx *
+                sDiskIoQueuesPtr->GetBufferAllocator().GetBufferSize();
+        }
+        theCode = EVENT_DISK_GET_FS_SPACE_AVAIL_DONE;
+        sDiskIoQueuesPtr->GetGetFsSpaceAvailableDone(mIoRetCode);
+    } else if (mFilePtr.get() ==
+            theQueuePtr->GetCheckDirReadableNullFile().get()) {
+        theOpNamePtr = "check dir readable";
+        theMetaFlag = true;
+        theCode = EVENT_DISK_CHECK_DIR_READABLE_DONE;
+        sDiskIoQueuesPtr->CheckDirReadableDone(mIoRetCode);
+    } else if (mReadLength > 0) {
+        // Not a meta op: undo the pending read/write byte accounting.
+        sDiskIoQueuesPtr->ReadPending(-int64_t(mReadLength), mIoRetCode);
+        theOpNamePtr = "read";
+    } else if (! mIoBuffers.empty()) {
+        sDiskIoQueuesPtr->WritePending(-int64_t(mIoBuffers.size() *
+            sDiskIoQueuesPtr->GetBufferAllocator().GetBufferSize()),
+            mIoRetCode);
+        theOpNamePtr = "write";
+    } else {
+        theOpNamePtr = "sync";
+        sDiskIoQueuesPtr->SyncDone(mIoRetCode);
+    }
+    // NOTE(review): int64_t mIoRetCode narrowed to int here; the
+    // QCRTASSERT(theNumRead == mIoRetCode) below checks the narrowing is
+    // lossless on the non-meta path.
+    int theNumRead(mIoRetCode);
+    if (mIoRetCode < 0) {
+        string theErrMsg(QCDiskQueue::ToString(mCompletionCode));
+        theErrMsg += " ";
+        theErrMsg += QCUtils::SysError(-theNumRead);
+        KFS_LOG_STREAM(theMetaFlag ?
+                MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) <<
+            theOpNamePtr <<
+            " (" << mReadLength << " " << mIoBuffers.size() << ")"
+            " error: " << theNumRead <<
+            " " << theErrMsg <<
+        KFS_LOG_EOM;
+    }
+    if (theMetaFlag) {
+        // Meta op: results are copied into locals, then this object is
+        // destroyed BEFORE the callback runs -- the callback receives the
+        // data only through theDataPtr and must not touch this DiskIo.
+        KfsCallbackObj* const theCallbackObjPtr = mCallbackObjPtr;
+        int64_t theIoMetaResult[2];
+        void* theDataPtr;
+        if (mIoRetCode < 0) {
+            theCode = EVENT_DISK_ERROR;
+            theDataPtr = &theNumRead;
+        } else {
+            theIoMetaResult[0] = mIoRetCode;
+            theIoMetaResult[1] = theMetaRet;
+            theDataPtr = theIoMetaResult;
+        }
+        delete this;
+        theCallbackObjPtr->HandleEvent(theCode, theDataPtr);
+        return;
+    }
+    QCRTASSERT(theNumRead == mIoRetCode);
+    if (mIoRetCode < 0 || mReadLength <= 0) {
+        // Error, write, or sync completion: buffers (if any) are dropped.
+        const bool theSyncFlag = mReadLength == 0 && mIoBuffers.empty();
+        mIoBuffers.clear();
+        IoCompletion(0, theNumRead, theSyncFlag);
+        return;
+    }
+    // Read. Skip/trim first/last buffers if needed.
+    if (mIoBuffers.empty()) {
+        QCRTASSERT(theNumRead == 0);
+        theNumRead = 0;
+    } else {
+        const size_t theBufSize = mIoBuffers.front().BytesConsumable();
+        QCRTASSERT((ssize_t)(mIoBuffers.size() * theBufSize) >= theNumRead);
+        // Drop the leading bytes before the requested read offset.
+        const int theConsumed = mIoBuffers.front().Consume(mReadBufOffset);
+        QCRTASSERT(theConsumed == (int)mReadBufOffset);
+        theNumRead -= min(theNumRead, theConsumed);
+        if (theNumRead > (int)mReadLength) {
+            // Trim the tail of the last buffer past the requested length.
+            const int theToTrimTo(theBufSize - (theNumRead - mReadLength));
+            const int theTrimmedSize = mIoBuffers.back().Trim(theToTrimTo);
+            QCRTASSERT(theToTrimTo == theTrimmedSize);
+            theNumRead = mReadLength;
+        }
+    }
+    // Assemble the (possibly trimmed) buffers into a single IOBuffer.
+    IOBuffer theIoBuffer;
+    int theRem = theNumRead;
+    for (IoBuffers::iterator theItr = mIoBuffers.begin();
+            theItr != mIoBuffers.end() && theRem > 0;
+            ++theItr) {
+        const int theSize = theItr->BytesConsumable();
+        if (theSize <= 0) {
+            continue;
+        }
+        if (theSize > theRem) {
+            theItr->Trim(theRem);
+        }
+        theRem -= theSize;
+        theIoBuffer.Append(*theItr);
+    }
+    mIoBuffers.clear();
+    QCASSERT(theIoBuffer.BytesConsumable() <= theNumRead);
+    IoCompletion(&theIoBuffer, theNumRead);
+}
+
+// Translate the completion into the owner's event-handler call:
+// EVENT_DISK_ERROR for a negative code, EVENT_SYNC_DONE for sync,
+// EVENT_DISK_READ with the assembled buffer for reads, and
+// EVENT_DISK_WROTE otherwise. Also updates the global read/write byte
+// counters.
+    void
+DiskIo::IoCompletion(
+    IOBuffer* inBufferPtr,
+    int       inRetCode,
+    bool      inSyncFlag /* = false */)
+{
+    if (inRetCode < 0) {
+        mCallbackObjPtr->HandleEvent(EVENT_DISK_ERROR, &inRetCode);
+    } else if (inSyncFlag) {
+        mCallbackObjPtr->HandleEvent(EVENT_SYNC_DONE, 0);
+    } else if (inBufferPtr) {
+        globals().ctrDiskBytesRead.Update(int(mIoRetCode));
+        mCallbackObjPtr->HandleEvent(EVENT_DISK_READ, inBufferPtr);
+    } else {
+        globals().ctrDiskBytesWritten.Update(int(mIoRetCode));
+        mCallbackObjPtr->HandleEvent(EVENT_DISK_WROTE, &inRetCode);
+    }
+}
+
+} /* namespace KFS */
diff --git a/src/cc/chunk/DiskIo.h b/src/cc/chunk/DiskIo.h
new file mode 100644
index 000000000..d7588bcf8
--- /dev/null
+++ b/src/cc/chunk/DiskIo.h
@@ -0,0 +1,317 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Created 2009/01/17
+//
Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _DISKIO_H +#define _DISKIO_H + +#include + +#include +#include + +#include "kfsio/IOBuffer.h" +#include "qcdio/QCDiskQueue.h" +#include "qcdio/QCDLList.h" + +namespace KFS +{ +using std::string; +using std::vector; + +class KfsCallbackObj; +class IOBuffer; +class DiskQueue; +class Properties; +class BufferManager; + +// Asynchronous disk io shim. +// Creates and destroys low level disk queues. Runs io completion queue in the +// main event loop. 
+class DiskIo : private QCDiskQueue::IoCompletion +{ +public: + struct Counters + { + typedef int64_t Counter; + + Counter mReadCount; + Counter mReadByteCount; + Counter mReadErrorCount; + Counter mWriteCount; + Counter mWriteByteCount; + Counter mWriteErrorCount; + Counter mSyncCount; + Counter mSyncErrorCount; + Counter mDeleteCount; + Counter mDeleteErrorCount; + Counter mRenameCount; + Counter mRenameErrorCount; + Counter mGetFsSpaceAvailableCount; + Counter mGetFsSpaceAvailableErrorCount; + Counter mCheckDirReadableCount; + Counter mCheckDirReadableErrorCount; + Counter mTimedOutErrorCount; + Counter mTimedOutErrorReadByteCount; + Counter mTimedOutErrorWriteByteCount; + Counter mOpenFilesCount; + void Clear() + { + mReadCount = 0; + mReadByteCount = 0; + mReadErrorCount = 0; + mWriteCount = 0; + mWriteByteCount = 0; + mWriteErrorCount = 0; + mSyncCount = 0; + mSyncErrorCount = 0; + mDeleteCount = 0; + mDeleteErrorCount = 0; + mRenameCount = 0; + mRenameErrorCount = 0; + mGetFsSpaceAvailableCount = 0; + mGetFsSpaceAvailableErrorCount = 0; + mCheckDirReadableCount = 0; + mCheckDirReadableErrorCount = 0; + mTimedOutErrorCount = 0; + mTimedOutErrorReadByteCount = 0; + mTimedOutErrorWriteByteCount = 0; + mOpenFilesCount = 0; + } + }; + typedef int64_t Offset; + typedef int64_t DeviceId; + + static bool Init( + const Properties& inProperties, + string* inErrMessagePtr = 0); + static bool StartIoQueue( + const char* inDirNamePtr, + DeviceId inDeviceId, + int inMaxOpenFiles, + string* inErrMessagePtr = 0); + static bool StopIoQueue( + DiskQueue* inDiskQueuePtr, + const char* inDirNamePtr, + DeviceId inDeviceId, + string* inErrMessagePtr = 0); + static bool Shutdown( + string* inErrMessagePtr = 0); + static bool RunIoCompletion(); + static size_t GetMaxRequestSize(); + static int GetFdCountPerFile(); + static BufferManager& GetBufferManager(); + static void GetCounters( + Counters& outCounters); + static bool Delete( + const char* inFileNamePtr, + KfsCallbackObj* 
inCallbackObjPtr = 0, + string* inErrMessagePtr = 0); + static bool Rename( + const char* inSrcFileNamePtr, + const char* inDstFileNamePtr, + KfsCallbackObj* inCallbackObjPtr = 0, + string* inErrMessagePtr = 0); + static bool GetFsSpaceAvailable( + const char* inPathNamePtr, + KfsCallbackObj* inCallbackObjPtr = 0, + string* inErrMessagePtr = 0); + static bool CheckDirReadable( + const char* inDirNamePtr, + KfsCallbackObj* inCallbackObjPtr = 0, + string* inErrMessagePtr = 0); + static bool GetDiskQueuePendingCount( + DiskQueue* inDiskQueuePtr, + int& outFreeRequestCount, + int& outRequestCount, + int64_t& outReadBlockCount, + int64_t& outWriteBlockCount, + int& outBlockSize); + static DiskQueue* FindDiskQueue( + const char* inDirNamePtr); + static void SetParameters( + const Properties& inProperties); + + class File + { + public: + File() + : mQueuePtr(0), + mFileIdx(-1), + mReadOnlyFlag(false), + mSpaceReservedFlag(false) + {} + ~File() + { + if (File::IsOpen()) { + File::Close(); + } + } + bool Open( + const char* inFileNamePtr, + Offset inMaxFileSize = -1, + bool inReadOnlyFlag = false, + bool inReserveFileSpaceFlag = false, + bool inCreateFlag = false, + string* inErrMessagePtr = 0, + bool* inRetryFlagPtr = 0, + bool inBufferedIoFlag = false); + bool IsOpen() const + { return (mFileIdx >= 0); } + bool Close( + Offset inFileSize = -1, + string* inErrMessagePtr = 0); + DiskQueue* GetDiskQueuePtr() const + { return mQueuePtr; } + int GetFileIdx() const + { return mFileIdx; } + bool IsReadOnly() const + { return mReadOnlyFlag; } + bool ReserveSpace( + string* inErrMessagePtr = 0); + void GetDiskQueuePendingCount( + int& outFreeRequestCount, + int& outRequestCount, + int64_t& outReadBlockCount, + int64_t& outWriteBlockCount, + int& outBlockSize); + private: + DiskQueue* mQueuePtr; + int mFileIdx; + bool mReadOnlyFlag:1; + bool mSpaceReservedFlag:1; + + void Reset(); + friend class DiskQueue; + private: + // No copies. 
+ File(const File&); + File& operator=(const File&); + }; + typedef boost::shared_ptr FilePtr; + + DiskIo( + FilePtr inFilePtr, + KfsCallbackObj* inCallbackObjPtr); + + ~DiskIo(); + + /// Close disk queue. This will cause cancellation of all scheduled + /// requests. + void Close(); + + /// Schedule a read at the specified offset for numBytes. + /// @param[in] numBytes # of bytes that need to be read. + /// @param[in] offset offset in the file at which to start reading data from. + /// @retval # of bytes for which read was successfully scheduled; + /// -1 if there was an error. + ssize_t Read( + Offset inOffset, + size_t inNumBytes); + + /// Schedule a write. + /// @param[in] numBytes # of bytes that need to be written + /// @param[in] offset offset in the file at which to start writing data. + /// @param[in] buf IOBuffer which contains data that should be written + /// out to disk. + /// @retval # of bytes for which write was successfully scheduled; + /// -1 if there was an error. + ssize_t Write( + Offset inOffset, + size_t inNumBytes, + IOBuffer* inBufferPtr); + + /// Sync the previously written data to disk. + /// @param[in] inNotifyDoneFlag if set, notify upstream objects that the + /// sync operation has finished. + int Sync( + bool inNotifyDoneFlag); + + FilePtr GetFilePtr() const + { return mFilePtr; } +private: + typedef vector IoBuffers; + /// Owning KfsCallbackObj. 
+ KfsCallbackObj* const mCallbackObjPtr; + FilePtr mFilePtr; + QCDiskQueue::RequestId mRequestId; + IoBuffers mIoBuffers; + size_t mReadBufOffset; + size_t mReadLength; + int64_t mBlockIdx; + int64_t mIoRetCode; + time_t mEnqueueTime; + QCDiskQueue::RequestId mCompletionRequestId; + QCDiskQueue::Error mCompletionCode; + DiskIo* mPrevPtr[1]; + DiskIo* mNextPtr[1]; + + void RunCompletion(); + void IoCompletion( + IOBuffer* inBufferPtr, + int inRetCode, + bool inSyncFlag = false); + virtual bool Done( + QCDiskQueue::RequestId inRequestId, + QCDiskQueue::FileIdx inFileIdx, + QCDiskQueue::BlockIdx inStartBlockIdx, + QCDiskQueue::InputIterator& inBufferItr, + int inBufferCount, + QCDiskQueue::Error inCompletionCode, + int inSysErrorCode, + int64_t inIoByteCount); + + enum MetaOpType + { + kMetaOpTypeNone = 0, + kMetaOpTypeDelete = 1, + kMetaOpTypeRename = 2, + kMetaOpTypeGetFsSpaceAvailable = 3, + kMetaOpTypeCheckDirReadable = 4, + kMetaOpTypeNumOps + }; + + static bool EnqueueMeta( + MetaOpType inOpType, + const char* inSrcFileNamePtr, + const char* inDstFileNamePtr, + KfsCallbackObj* inCallbackObjPtr, + string* inErrMessagePtr); + + friend class QCDLListOp; + friend class DiskIoQueues; + +private: + // No copies. + DiskIo( + const DiskIo& inDiskIo); + DiskIo& operator=( + const DiskIo& inDiskIo); +}; + +typedef boost::shared_ptr DiskIoPtr; + +} + +#endif /* _DISKIO_H */ diff --git a/src/cc/chunk/KfsOps.cc b/src/cc/chunk/KfsOps.cc new file mode 100644 index 000000000..1696a3a85 --- /dev/null +++ b/src/cc/chunk/KfsOps.cc @@ -0,0 +1,2871 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/05/26 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Code for parsing commands sent to the Chunkserver and generating +// responses that summarize the result of their execution. +// +// +//---------------------------------------------------------------------------- + +#include "KfsOps.h" +#include "common/Version.h" +#include "common/kfstypes.h" +#include "common/time.h" +#include "common/RequestParser.h" +#include "kfsio/Globals.h" +#include "kfsio/checksum.h" + +#include "ChunkManager.h" +#include "Logger.h" +#include "ChunkServer.h" +#include "LeaseClerk.h" +#include "Replicator.h" +#include "AtomicRecordAppender.h" +#include "utils.h" + +#include +#include +#include +#include + +#ifdef KFS_OS_NAME_SUNOS +#include +#endif + +namespace KFS { + +using std::map; +using std::string; +using std::ofstream; +using std::ifstream; +using std::istringstream; +using std::ostringstream; +using std::istream; +using std::ostream; +using std::for_each; +using std::vector; +using std::min; +using std::make_pair; +using std::ostream_iterator; +using std::copy; +using std::hex; +using std::max; +using namespace KFS::libkfsio; + +// Counters for the various ops +struct OpCounters : private map +{ + static void Update(KfsOp_t opName, int64_t startTime) + { + Counter* const c = GetCounter(opName); + if (! c) { + return; + } + c->Update(1); + c->UpdateTime(microseconds() - startTime); + } + static void WriteMaster() + { + if (! sInstance) { + return; + } + sInstance->mWriteMaster.Update(1); + } + static void WriteDuration(int64_t time) + { + if (! 
sInstance) { + return; + } + sInstance->mWriteDuration.Update(1); + sInstance->mWriteDuration.UpdateTime(time); + } +private: + Counter mWriteMaster; + Counter mWriteDuration; + static OpCounters* sInstance; + + OpCounters() + : map(), + mWriteMaster("Write Master"), + mWriteDuration("Write Duration") + {} + ~OpCounters() + { + for (iterator i = begin(); i != end(); ++i) { + if (sInstance == this) { + globals().counterManager.RemoveCounter(i->second); + } + delete i->second; + } + if (sInstance == this) { + globals().counterManager.RemoveCounter(&mWriteMaster); + globals().counterManager.RemoveCounter(&mWriteDuration); + sInstance = 0; + } + } + void AddCounter(const char *name, KfsOp_t opName) + { + Counter* const c = new Counter(name); + if (! insert(make_pair(opName, c)).second) { + delete c; + return; + } + globals().counterManager.AddCounter(c); + } + static Counter* GetCounter(KfsOp_t opName) + { + if (! sInstance) { + return 0; + } + OpCounters::iterator iter = sInstance->find(opName); + if (iter == sInstance->end()) { + return 0; + } + return iter->second; + } + static OpCounters* MakeInstance() + { + // ensure that globals constructed first + globals(); + static OpCounters instance; + instance.AddCounter("Open", CMD_OPEN); + instance.AddCounter("Read", CMD_READ); + instance.AddCounter("Write Prepare", CMD_WRITE_PREPARE); + instance.AddCounter("Write Sync", CMD_WRITE_SYNC); + instance.AddCounter("Write (AIO)", CMD_WRITE); + instance.AddCounter("Size", CMD_SIZE); + instance.AddCounter("Record append", CMD_RECORD_APPEND); + instance.AddCounter("Space reserve", CMD_SPC_RESERVE); + instance.AddCounter("Space release", CMD_SPC_RELEASE); + instance.AddCounter("Get Chunk Metadata", CMD_GET_CHUNK_METADATA); + instance.AddCounter("Alloc", CMD_ALLOC_CHUNK); + instance.AddCounter("Delete", CMD_DELETE_CHUNK); + instance.AddCounter("Truncate", CMD_TRUNCATE_CHUNK); + instance.AddCounter("Replicate", CMD_REPLICATE_CHUNK); + instance.AddCounter("Heartbeat", CMD_HEARTBEAT); 
+ instance.AddCounter("Change Chunk Vers", CMD_CHANGE_CHUNK_VERS); + instance.AddCounter("Make Chunk Stable", CMD_MAKE_CHUNK_STABLE); + globals().counterManager.AddCounter(&instance.mWriteMaster); + globals().counterManager.AddCounter(&instance.mWriteDuration); + return &instance; + } +}* OpCounters::sInstance(OpCounters::MakeInstance()); + + +const char* const KFS_VERSION_STR = "KFS/1.0"; + +static bool +needToForwardToPeer(string &serverInfo, uint32_t numServers, int &myPos, + ServerLocation &peerLoc, + bool isWriteIdPresent, int64_t &writeId); + +static inline RemoteSyncSMPtr +FindPeer(KfsOp& op, const ServerLocation& loc) +{ + ClientSM* const csm = op.GetClientSM(); + return (csm ? csm->FindServer(loc) : RemoteSyncSMPtr()); +} + +void +SubmitOp(KfsOp *op) +{ + op->type = OP_REQUEST; + op->Execute(); +} + +void +SubmitOpResponse(KfsOp *op) +{ + op->type = OP_RESPONSE; + op->HandleEvent(EVENT_CMD_DONE, op); +} + +int64_t KfsOp::sOpsCount = 0; + +KfsOp::~KfsOp() +{ + OpCounters::Update(op, startTime); + assert(sOpsCount > 0); + sOpsCount--; +} + +/* static */ uint32_t +KfsOp::Checksum( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen) +{ + return ComputeBlockChecksum( + ComputeBlockChecksum(name, nameLen), header, headerLen); +} + +typedef RequestHandler ChunkRequestHandler; +static const ChunkRequestHandler& +MakeRequestHandler() +{ + static ChunkRequestHandler sHandler; + return sHandler + .MakeParser("OPEN") + .MakeParser("CLOSE") + .MakeParser("READ") + .MakeParser("WRITE_ID_ALLOC") + .MakeParser("WRITE_PREPARE") + .MakeParser("WRITE_SYNC") + .MakeParser("SIZE") + .MakeParser("RECORD_APPEND") + .MakeParser("GET_RECORD_APPEND_OP_STATUS") + .MakeParser("CHUNK_SPACE_RESERVE") + .MakeParser("CHUNK_SPACE_RELEASE") + .MakeParser("GET_CHUNK_METADATA") + .MakeParser("ALLOCATE") + .MakeParser("DELETE") + .MakeParser("TRUNCATE") + .MakeParser("REPLICATE") + .MakeParser("HEARTBEAT") + .MakeParser("STALE_CHUNKS") + 
.MakeParser("CHUNK_VERS_CHANGE") + .MakeParser("BEGIN_MAKE_CHUNK_STABLE") + .MakeParser("MAKE_CHUNK_STABLE") + .MakeParser("RETIRE") + .MakeParser("PING") + .MakeParser("DUMP_CHUNKMAP") + .MakeParser("STATS") + .MakeParser("CMD_SET_PROPERTIES") + .MakeParser("RESTART_CHUNK_SERVER") + ; +} +static const ChunkRequestHandler& sRequestHandler = MakeRequestHandler(); + +/// +/// Given a command in a buffer, parse it out and build a "Command" +/// structure which can then be executed. For parsing, we take the +/// string representation of a command and build a Properties object +/// out of it; we can then pull the various headers in whatever order +/// we choose. +/// Commands are of the form: +/// \r\n +/// {header: value \r\n}+\r\n +/// +/// The general model in parsing the client command: +/// 1. Each command has its own parser +/// 2. Extract out the command name and find the parser for that +/// command +/// 3. Dump the header/value pairs into a properties object, so that we +/// can extract the header/value fields in any order. +/// 4. Finally, call the parser for the command sent by the client. +/// +/// @param[in] cmdBuf: buffer containing the request sent by the client +/// @param[in] cmdLen: length of cmdBuf +/// @param[out] res: A piece of memory allocated by calling new that +/// contains the data for the request. It is the caller's +/// responsibility to delete the memory returned in res. +/// @retval 0 on success; -1 if there is an error +/// +int +ParseCommand(const IOBuffer& ioBuf, int len, KfsOp** res) +{ + // Main thread's buffer + static char tempBuf[MAX_RPC_HEADER_LEN]; + + *res = 0; + if (len <= 0 || len > MAX_RPC_HEADER_LEN) { + return -1; + } + // Copy if request header spans two or more buffers. + // Requests on average are over a magnitude shorter than single + // io buffer (4K page), thus the copy should be infrequent, and + // small enough. 
With modern cpu the copy should be take less + // cpu cycles than buffer boundary handling logic (or one symbol + // per call processing), besides the request headers are small + // enough to fit into cpu cache. + int reqLen = len; + const char* const buf = ioBuf.CopyOutOrGetBufPtr(tempBuf, reqLen); + assert(reqLen == len); + *res = reqLen == len ? sRequestHandler.Handle(buf, reqLen) : 0; + return (*res ? 0 : -1); +} + +ClientSM* +KfsOp::GetClientSM() +{ + return (clientSMFlag ? static_cast(clnt) : 0); +} + +bool +WriteIdAllocOp::Validate() +{ + ValueParser::SetValue( + clientSeqStr.GetPtr(), + clientSeqStr.GetSize(), + seq, + clientSeq + ); + return true; +} + +bool +RecordAppendOp::Validate() +{ + ValueParser::SetValue( + clientSeqStr.GetPtr(), + clientSeqStr.GetSize(), + seq, + clientSeq + ); + return true; +} + +bool +WriteSyncOp::Validate() +{ + if (checksumsCnt <= 0) { + return true; + } + const char* ptr = checksumsStr.GetPtr(); + const char* const end = ptr + checksumsStr.GetSize(); + checksums.clear(); + checksums.reserve(checksumsCnt); + for (int i = 0; i < checksumsCnt; i++) { + uint32_t cksum = 0; + if (! ValueParser::ParseInt(ptr, end - ptr, cksum)) { + return false; + } + checksums.push_back(cksum); + while (ptr < end && (*ptr & 0xFF) > ' ') { + ++ptr; + } + } + return true; +} + +bool MakeChunkStableOp::Validate() +{ + hasChecksum = ! checksumStr.empty(); + if (hasChecksum) { + ValueParser::SetValue( + checksumStr.GetPtr(), + checksumStr.GetSize(), + uint32_t(0), + chunkChecksum + ); + } + return true; +} + +/// +/// Generic event handler for tracking completion of an event +/// execution. Push the op to the logger and the net thread will pick +/// it up and dispatch it. +/// +int +KfsOp::HandleDone(int code, void *data) +{ + gLogger.Submit(this); + return 0; +} + +/// +/// A read op finished. Set the status and the # of bytes read +/// alongwith the data and notify the client. 
+/// +int +ReadOp::HandleDone(int code, void *data) +{ + if (code == EVENT_DISK_ERROR) { + status = -1; + if (data) { + status = *reinterpret_cast(data); + KFS_LOG_STREAM_INFO << + "disk error: errno: " << status << " chunkid: " << chunkId << + KFS_LOG_EOM; + } + if (status != -ETIMEDOUT) { + gChunkManager.ChunkIOFailed(chunkId, status, diskIo.get()); + } + } else if (code == EVENT_DISK_READ) { + if (! dataBuf) { + dataBuf = new IOBuffer(); + } + IOBuffer* const b = reinterpret_cast(data); + // Order matters...when we append b, we take the data from b + // and put it into our buffer. + dataBuf->Append(b); + // verify checksum + if (! gChunkManager.ReadChunkDone(this)) { + return 0; // Retry. + } + numBytesIO = dataBuf->BytesConsumable(); + if (status == 0) { + // checksum verified + status = numBytesIO; + } + } + + if (status >= 0) { + assert(numBytesIO >= 0); + if (offset % CHECKSUM_BLOCKSIZE != 0 || + numBytesIO % CHECKSUM_BLOCKSIZE != 0) { + checksum = ComputeChecksums(dataBuf, numBytesIO); + } + assert(size_t((numBytesIO + CHECKSUM_BLOCKSIZE - 1) / CHECKSUM_BLOCKSIZE) == + checksum.size()); + // send the disk IO time back to client for telemetry reporting + diskIOTime = microseconds() - startTime; + } + + if (wop) { + // if the read was triggered by a write, then resume execution of write + wop->Execute(); + return 0; + } + + const ChunkInfo_t* ci = gChunkManager.GetChunkInfo(chunkId); + if (ci && ci->chunkSize > 0 && offset + numBytesIO >= ci->chunkSize && + ! gLeaseClerk.IsLeaseValid(chunkId)) { + // If we have read the full chunk, close out the fd. The + // observation is that reads are sequential and when we + // finished a chunk, the client will move to the next one. + // + // Release disk io first for CloseChunk to have effect: normally + // this method is invoked from io completion routine, and diskIo has a + // reference to file dataFH. + // DiskIo completion path doesn't expect diskIo pointer to remain valid + // upon return. 
+ diskIo.reset(); + KFS_LOG_STREAM_INFO << "closing chunk: " << chunkId << KFS_LOG_EOM; + gChunkManager.CloseChunk(chunkId); + } + + gLogger.Submit(this); + return 0; +} + +int +ReadOp::HandleReplicatorDone(int code, void *data) +{ + if (status >= 0 && ! checksum.empty()) { + const vector datacksums = ComputeChecksums(dataBuf, numBytesIO); + if (datacksums.size() > checksum.size()) { + KFS_LOG_STREAM_INFO << + "Checksum number of entries mismatch in re-replication: " + " expect: " << datacksums.size() << + " got: " << checksum.size() << + KFS_LOG_EOM; + status = -EBADCKSUM; + } else { + for (uint32_t i = 0; i < datacksums.size(); i++) { + if (datacksums[i] != checksum[i]) { + KFS_LOG_STREAM_INFO << + "Checksum mismatch in re-replication: " + " expect: " << datacksums[i] << + " got: " << checksum[i] << + KFS_LOG_EOM; + status = -EBADCKSUM; + break; + } + } + } + } + // notify the replicator object that the read it had submitted to + // the peer has finished. + return clnt->HandleEvent(code, data); +} + +int +WriteOp::HandleRecordAppendDone(int code, void *data) +{ + gChunkManager.WriteDone(this); + if (code == EVENT_DISK_ERROR) { + // eat up everything that was sent + dataBuf->Consume(numBytes); + status = -1; + if (data) { + status = *(int *) data; + KFS_LOG_STREAM_INFO << + "Disk error: errno: " << status << " chunkid: " << chunkId << + KFS_LOG_EOM; + } + } else if (code == EVENT_DISK_WROTE) { + status = *(int *) data; + numBytesIO = status; + dataBuf->Consume(numBytesIO); + } else { + die("unexpected event code"); + } + return clnt->HandleEvent(EVENT_CMD_DONE, this); +} + +int +ReadOp::HandleScrubReadDone(int code, void *data) +{ + return scrubOp->HandleScrubReadDone(code, data); +} + +bool +ReadOp::IsChunkReadOp(int64_t& outNumBytes, kfsChunkId_t& outChunkId) +{ + outChunkId = chunkId; + if (numBytes > 0) { + outNumBytes = (int64_t)((numBytes + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + } else { + outNumBytes = numBytes; + } + return 
true; +} + +int +WriteOp::HandleWriteDone(int code, void *data) +{ + // DecrementCounter(CMD_WRITE); + + gChunkManager.WriteDone(this); + if (isFromReReplication) { + if (code == EVENT_DISK_WROTE) { + status = std::min(*(int *) data, int(numBytes)); + numBytesIO = status; + } + else { + status = -1; + } + return clnt->HandleEvent(code, this); + } + assert(wpop); + + if (code == EVENT_DISK_ERROR) { + // eat up everything that was sent + dataBuf->Consume(std::max(int(numBytesIO), int(numBytes))); + status = -1; + if (data) { + status = *(int *) data; + KFS_LOG_STREAM_INFO << + "Disk error: errno: " << status << " chunkid: " << chunkId << + KFS_LOG_EOM; + } + gChunkManager.ChunkIOFailed(chunkId, status, diskIo.get()); + + if (wpop->status >= 0) { + wpop->status = status; + } + wpop->HandleEvent(EVENT_CMD_DONE, this); + return 0; + } + else if (code == EVENT_DISK_WROTE) { + status = *(int *) data; + SET_HANDLER(this, &WriteOp::HandleSyncDone); + if (numBytesIO != status || status < (int)numBytes) { + // write didn't do everything that was asked; we need to retry + KFS_LOG_STREAM_INFO << + "Write on chunk did less: asked: " << numBytes << "/" << numBytesIO << + " did: " << status << "; asking clnt to retry" << + KFS_LOG_EOM; + status = -EAGAIN; + } else { + status = numBytes; // reply back the same # of bytes as in request. + } + if (numBytesIO > ssize_t(numBytes) && dataBuf) { + const int off(offset % IOBufferData::GetDefaultBufferSize()); + KFS_LOG_STREAM_DEBUG << + "chunk write: asked " << numBytes << "/" << numBytesIO << + " actual, buf offset: " << off << + KFS_LOG_EOM; + // restore original data in the buffer. 
+            assert(ssize_t(numBytes) <= numBytesIO - off);
+            dataBuf->Consume(off);
+            dataBuf->Trim(int(numBytes));
+        }
+        numBytesIO = numBytes;
+        // queue the sync op only if we are all done with writing to
+        // this chunk:
+        // NOTE(review): waitForSyncDone is unconditionally set to false
+        // here, so the "if (!waitForSyncDone)" test below always takes the
+        // HandleSyncDone() path -- dead condition; confirm this is the
+        // intended (always-synchronous) behavior.
+        waitForSyncDone = false;
+
+        if (!waitForSyncDone) {
+            // sync is queued; no need to wait for it to finish
+            return HandleSyncDone(EVENT_SYNC_DONE, 0);
+        }
+    }
+    return 0;
+}
+
+///
+/// A write op finished. Set the status and the # of bytes written
+/// and notify the owning write commit op.
+///
+int
+WriteOp::HandleSyncDone(int code, void *data)
+{
+    // eat up everything that was sent
+    dataBuf->Consume(numBytes);
+
+    // Any event other than EVENT_SYNC_DONE is treated as a failure.
+    if (code != EVENT_SYNC_DONE) {
+        status = -1;
+    }
+
+    if (status >= 0) {
+        // Success: log first, then notify the owning op from
+        // HandleLoggingDone.
+        SET_HANDLER(this, &WriteOp::HandleLoggingDone);
+        gLogger.Submit(this);
+    }
+    else {
+        // Failure: notify the owning write-prepare op directly.
+        wpop->HandleEvent(EVENT_CMD_DONE, this);
+    }
+
+    return 0;
+}
+
+// Logging finished: forward completion to the owning write-prepare op.
+int
+WriteOp::HandleLoggingDone(int code, void *data)
+{
+    assert(wpop);
+    return wpop->HandleEvent(EVENT_CMD_DONE, this);
+}
+
+///
+/// Handlers for executing the various ops. If the op execution is
+/// "in-line", that is the op doesn't block, then when the execution
+/// is finished, the op is handed off to the logger; the net thread
+/// will drain the logger and then notify the client. Otherwise, the op is queued
+/// for execution and the client gets notified whenever the op
+/// finishes execution.
+///
+void
+OpenOp::Execute()
+{
+    // Synchronous: open the chunk and hand the op to the logger.
+    status = gChunkManager.OpenChunk(chunkId, openFlags);
+    gLogger.Submit(this);
+}
+
+// Close a chunk, possibly forwarding the close to a replication peer
+// (derived from the Servers: field via needToForwardToPeer).
+void
+CloseOp::Execute()
+{
+    KFS_LOG_STREAM_INFO <<
+        "Closing chunk: " << chunkId << " and might give up lease" <<
+    KFS_LOG_EOM;
+
+    int myPos = -1;
+    int64_t writeId = -1;
+    ServerLocation peerLoc;
+    bool needToForward = needToForwardToPeer(
+        servers, numServers, myPos, peerLoc, hasWriteId, writeId);
+    if (! gAtomicRecordAppendManager.CloseChunk(
+            this, writeId, needToForward)) {
+        // forward the close only if it was accepted by the chunk
+        // manager.  the chunk manager can reject a close if the
+        // chunk is being written to by multiple record appenders
+        needToForward = gChunkManager.CloseChunk(chunkId) == 0 && needToForward;
+        status = 0;
+    }
+    if (needToForward) {
+        ForwardToPeer(peerLoc);
+    }
+    gLogger.Submit(this);
+}
+
+// Send a copy of this close to the peer at loc; the forwarded op is
+// self-owned (clnt points at itself) and frees itself on reply.
+void
+CloseOp::ForwardToPeer(const ServerLocation& loc)
+{
+    RemoteSyncSMPtr const peer = FindPeer(*this, loc);
+    if (! peer) {
+        KFS_LOG_STREAM_DEBUG <<
+            "unable to forward to peer: " << loc.ToString() <<
+            " cmd: " << Show() <<
+        KFS_LOG_EOM;
+        return;
+    }
+    CloseOp* const fwdedOp = new CloseOp(0, this);
+    // don't need an ack back
+    fwdedOp->needAck = false;
+    // this op goes to the remote-sync SM and after it is sent, comes right back to be nuked
+    // when this op comes, just nuke it
+    fwdedOp->clnt = fwdedOp;
+
+    SET_HANDLER(fwdedOp, &CloseOp::HandlePeerReply);
+    peer->Enqueue(fwdedOp);
+}
+
+// Forwarded close came back: just free it (see ForwardToPeer above).
+int
+CloseOp::HandlePeerReply(int code, void *data)
+{
+    delete this;
+    return 0;
+}
+
+// Allocate (or re-allocate) a chunk. chunkVersion > 1 implies the chunk
+// must already exist; otherwise any stale copy is deleted first.
+void
+AllocChunkOp::Execute()
+{
+    int myPos = -1;
+    int64_t writeId = -1;
+    ServerLocation peerLoc;
+    needToForwardToPeer(servers, numServers, myPos, peerLoc, false, writeId);
+    if (myPos < 0) {
+        statusMsg = "invalid or missing Servers: field";
+        status = -EINVAL;
+        gLogger.Submit(this);
+        return;
+    }
+
+    // Allocation implicitly invalidates all previously existed write leases.
+    gLeaseClerk.UnRegisterLease(chunkId);
+    mustExistFlag = chunkVersion > 1;
+    if (! mustExistFlag) {
+        // Fresh allocation: remove any stale copy; -EBADF means there was
+        // no such chunk, which is the expected case and not worth logging.
+        const int ret = gChunkManager.DeleteChunk(chunkId);
+        if (ret != -EBADF) {
+            KFS_LOG_STREAM_WARN <<
+                "allocate: delete existing"
+                " chunk: " << chunkId <<
+                " status: " << ret <<
+            KFS_LOG_EOM;
+        }
+    }
+    const bool failIfExistsFlag = ! mustExistFlag;
+    // Check if chunk exists, if it does then load chunk meta data.
+ SET_HANDLER(this, &AllocChunkOp::HandleChunkMetaReadDone); + int res = gChunkManager.ReadChunkMetadata(chunkId, this); + if (res == 0) { + if (failIfExistsFlag) { + die("chunk deletion failed"); + } + return; // The completion handler will be or already invoked. + } + if (! mustExistFlag && res == -EBADF) { + // Allocate new chunk. + res = 0; + HandleChunkAllocDone(EVENT_CMD_DONE, &res); + return; + } + KFS_LOG_STREAM_ERROR << + "allocate: read chunk metadata:" + " chunk: " << chunkId << + " error: " << res << + KFS_LOG_EOM; + status = res; + gLogger.Submit(this); +} + +int +AllocChunkOp::HandleChunkMetaReadDone(int code, void* data) +{ + if (status < 0) { + gLogger.Submit(this); + return 0; + } else if (data) { + status = *reinterpret_cast(data); + } + SET_HANDLER(this, &AllocChunkOp::HandleChunkAllocDone); + // When version change is done the chunk must exist. + // This is needed to detect chunk deletion while version version change is + // in progress. + // AllocChunk() does chunk version verification and other necessary checks + // in the case if chunk exists. + mustExistFlag = true; + const bool stableFlag = false; + const int ret = gChunkManager.ChangeChunkVers( + chunkId, chunkVersion, stableFlag, this); + if (ret < 0) { + statusMsg = "change version failure"; + status = ret; + gLogger.Submit(this); + } + return 0; +} + +int +AllocChunkOp::HandleChunkAllocDone(int code, void *data) +{ + if (status >= 0 && code == EVENT_DISK_ERROR) { + status = data ? 
*reinterpret_cast(data) : -1; + } + if (status >= 0) { + if (leaseId >= 0) { + OpCounters::WriteMaster(); + } + if (appendFlag) { + int myPos = -1; + int64_t writeId = -1; + ServerLocation peerLoc; + needToForwardToPeer(servers, numServers, myPos, peerLoc, false, writeId); + assert(myPos >= 0); + gChunkManager.AllocChunkForAppend(this, myPos, peerLoc); + } else { + bool beingReplicatedFlag = false; + status = gChunkManager.AllocChunk(fileId, chunkId, chunkVersion, + beingReplicatedFlag, 0, mustExistFlag); + } + if (status >= 0 && leaseId >= 0) { + gLeaseClerk.RegisterLease(chunkId, leaseId, appendFlag); + } + } + gLogger.Submit(this); + return 0; +} + +void +DeleteChunkOp::Execute() +{ + status = gChunkManager.DeleteChunk(chunkId); + gLogger.Submit(this); +} + +void +TruncateChunkOp::Execute() +{ + SET_HANDLER(this, &TruncateChunkOp::HandleChunkMetaReadDone); + if (gChunkManager.ReadChunkMetadata(chunkId, this) < 0) { + status = -EINVAL; + gLogger.Submit(this); + } +} + +int +TruncateChunkOp::HandleChunkMetaReadDone(int code, void *data) +{ + if (status >= 0 && data) { + status = *(int *) data; + } + if (status < 0) { + gLogger.Submit(this); + return 0; + } + + status = gChunkManager.TruncateChunk(chunkId, chunkSize); + if (status < 0) { + gLogger.Submit(this); + return 0; + } + SET_HANDLER(this, &TruncateChunkOp::HandleChunkMetaWriteDone); + const int ret = gChunkManager.WriteChunkMetadata(chunkId, this); + if (ret != 0) { + status = ret; + gLogger.Submit(this); + } + return 0; +} + +int +TruncateChunkOp::HandleChunkMetaWriteDone(int code, void* data) +{ + int res = data ? 
*reinterpret_cast(data) : -1; + if (res < 0) { + status = res; + } + gLogger.Submit(this); + return 0; +} + +void +ReplicateChunkOp::Execute() +{ + Replicator::Run(this); +} + +void +BeginMakeChunkStableOp::Execute() +{ + status = 0; + if (gAtomicRecordAppendManager.BeginMakeChunkStable(this)) { + return; + } + gLogger.Submit(this); +} + +void +MakeChunkStableOp::Execute() +{ + status = 0; + if (gChunkManager.IsChunkStable(this)) { + gLogger.Submit(this); + return; + } + SET_HANDLER(this, &MakeChunkStableOp::HandleChunkMetaReadDone); + const int ret = gChunkManager.ReadChunkMetadata(chunkId, this); + if (ret < 0) { + status = ret; + gLogger.Submit(this); + } +} + +int +MakeChunkStableOp::HandleChunkMetaReadDone(int code, void *data) +{ + if (status >= 0 && data) { + status = *reinterpret_cast(data); + } + if (status < 0) { + gLogger.Submit(this); + return 0; + } + SET_HANDLER(this, &MakeChunkStableOp::HandleMakeStableDone); + if (gAtomicRecordAppendManager.MakeChunkStable(this)) { + return 0; + } + HandleMakeStableDone(EVENT_CMD_DONE, this); + return 0; +} + +int +MakeChunkStableOp::HandleMakeStableDone(int code, void *data) +{ + if (code == EVENT_DISK_ERROR && status == 0) { + const int res = data ? *reinterpret_cast(data) : -1; + status = res < 0 ? res : -1; + } + if (status >= 0 && + ! 
gLeaseClerk.IsLeaseValid(chunkId) && + gChunkManager.CloseChunkIfReadable(chunkId)) { + KFS_LOG_STREAM_DEBUG << + Show() << " done, chunk closed" << + KFS_LOG_EOM; + } + gLogger.Submit(this); + return 0; +} + +void +ChangeChunkVersOp::Execute() +{ + SET_HANDLER(this, &ChangeChunkVersOp::HandleChunkMetaReadDone); + const int ret = gChunkManager.ReadChunkMetadata(chunkId, this); + if (ret < 0) { + status = -EINVAL; + gLogger.Submit(this); + } +} + +int +ChangeChunkVersOp::HandleChunkMetaReadDone(int code, void *data) +{ + if (status >= 0 && data) { + status = *(int *) data; + } + if (status < 0) { + gLogger.Submit(this); + return 0; + } + SET_HANDLER(this, &ChangeChunkVersOp::HandleChunkMetaWriteDone); + if (gChunkManager.ChangeChunkVers(this) < 0) { + gLogger.Submit(this); + } + return 0; +} + +int +ChangeChunkVersOp::HandleChunkMetaWriteDone(int code, void* data) +{ + const int res = data ? *reinterpret_cast(data) : -1; + if (res < 0) { + status = res; + } + gLogger.Submit(this); + return 0; +} + +template void +HeartbeatOp::Append(const char* key1, const char* key2, T val) +{ + if (key1 && *key1) { + response << key1 << ": " << val << "\r\n"; + } + if (key2 && *key2) { + cmdShow << " " << key2 << ": " << val; + } +} + +// This is the heartbeat sent by the meta server +void +HeartbeatOp::Execute() +{ + double loadavg[3] = {-1, -1, -1}; +#ifndef KFS_OS_NAME_CYGWIN + getloadavg(loadavg, 3); +#endif + gChunkManager.MetaHeartbeat(*this); + + const int64_t writeCount = gChunkManager.GetNumWritableChunks(); + const int64_t writeAppendCount = gAtomicRecordAppendManager.GetOpenAppendersCount(); + const int64_t replicationCount = Replicator::GetNumReplications(); + int64_t utime, stime; + + if (cputime(&utime, &stime) < 0) { + utime = stime = -1; + } + int64_t totalFsSpace = 0; + int chunkDirs = 0; + int evacuateInFlightCount = 0; + int writableDirs = 0; + int evacuateChunks = 0; + int64_t evacuateByteCount = 0; + int evacuateDoneChunkCount = 0; + int64_t 
evacuateDoneByteCount = 0; + cmdShow << " space:"; + Append("Total-space", "total", gChunkManager.GetTotalSpace( + totalFsSpace, chunkDirs, evacuateInFlightCount, writableDirs, + evacuateChunks, evacuateByteCount, + &evacuateDoneChunkCount, &evacuateDoneByteCount)); + Append("Total-fs-space", "tfs", totalFsSpace); + Append("Used-space", "used", gChunkManager.GetUsedSpace()); + Append("Num-drives", "drives", chunkDirs); + Append("Num-wr-drives", "wr-drv", writableDirs); + Append("Num-chunks", "chunks", gChunkManager.GetNumChunks()); + Append("Num-writable-chunks", "wrchunks", + writeCount + writeAppendCount + replicationCount + ); + Append("Evacuate", "evacuate", + max(evacuateChunks, evacuateInFlightCount)); + Append("Evacuate-bytes", "evac-b", evacuateByteCount); + Append("Evacuate-done", "evac-d", evacuateDoneChunkCount); + Append("Evacuate-done-bytes", "evac-d-b", evacuateDoneByteCount); + Append("Evacuate-in-flight", "evac-fl", evacuateInFlightCount); + Append("Num-random-writes", "rwr", writeCount); + Append("Num-appends", "awr", writeAppendCount); + Append("Num-re-replications", "rep", replicationCount); + Append("Num-appends-with-wids", "awid", + gAtomicRecordAppendManager.GetAppendersWithWidCount()); + Append("Uptime", "up", globalNetManager().UpTime()); + + Append("CPU-user", "ucpu", utime); + Append("CPU-sys", "scpu", stime); + Append("CPU-load-avg", "load", loadavg[0]); + + ChunkManager::Counters cm; + gChunkManager.GetCounters(cm); + cmdShow << " chunk: err:"; + Append("Chunk-corrupted", "cor", cm.mCorruptedChunksCount); + Append("Chunk-lost", "lost", cm.mLostChunksCount); + Append("Chunk-header-errors", "hdr", cm.mBadChunkHeaderErrorCount); + Append("Chunk-chksum-errors", "csum", cm.mReadChecksumErrorCount); + Append("Chunk-read-errors", "rd", cm.mReadErrorCount); + Append("Chunk-write-errors", "wr", cm.mWriteErrorCount); + Append("Chunk-open-errors", "open", cm.mOpenErrorCount); + Append("Dir-chunk-lost", "dce", cm.mDirLostChunkCount); + 
Append("Chunk-dir-lost", "cdl", cm.mChunkDirLostCount); + + MetaServerSM::Counters mc; + gMetaServerSM.GetCounters(mc); + cmdShow << " meta:"; + Append("Meta-connect", "conn", mc.mConnectCount); + cmdShow << " hello:"; + Append("Meta-hello-count", "cnt", mc.mHelloCount); + Append("Meta-hello-errors", "err", mc.mHelloErrorCount); + cmdShow << " alloc:"; + Append("Meta-alloc-count", "cnt", mc.mAllocCount); + Append("Meta-alloc-errors", "err", mc.mAllocErrorCount); + + ClientManager::Counters cli; + gClientManager.GetCounters(cli); + cmdShow << " cli:"; + Append("Client-accept", "accept", cli.mAcceptCount); + Append("Client-active", "cur", cli.mClientCount); + cmdShow << " req: err:"; + Append("Client-req-invalid", "inval", cli.mBadRequestCount); + Append("Client-req-invalid-header", "hdr", cli.mBadRequestHeaderCount); + Append("Client-req-invalid-length", "len", + cli.mRequestLengthExceededCount); + cmdShow << " read:"; + Append("Client-read-count", "cnt", cli.mReadRequestCount); + Append("Client-read-bytes", "bytes", cli.mReadRequestBytes); + Append("Client-read-micro-sec", "tm", cli.mReadRequestTimeMicroSecs); + Append("Client-read-errors", "err", cli.mReadRequestErrors); + cmdShow << " write:"; + Append("Client-write-count", "cnt", cli.mWriteRequestCount); + Append("Client-write-bytes", "bytes", cli.mWriteRequestBytes); + Append("Client-write-micro-sec", "tm", cli.mWriteRequestTimeMicroSecs); + Append("Client-write-errors", "err", cli.mWriteRequestErrors); + cmdShow << " append:"; + Append("Client-append-count", "cnt", cli.mAppendRequestCount); + Append("Client-append-bytes", "bytes", cli.mAppendRequestBytes); + Append("Client-append-micro-sec", "tm", cli.mAppendRequestTimeMicroSecs); + Append("Client-append-errors", "err", cli.mAppendRequestErrors); + cmdShow << " other:"; + Append("Client-other-count", "cnt", cli.mOtherRequestCount); + Append("Client-other-micro-sec", "tm", cli.mOtherRequestTimeMicroSecs); + Append("Client-other-errors", "err", 
cli.mOtherRequestErrors); + + cmdShow << " timer: ovr:"; + Append("Timer-overrun-count", "cnt", + globalNetManager().GetTimerOverrunCount()); + Append("Timer-overrun-sec", "sec", + globalNetManager().GetTimerOverrunSec()); + + cmdShow << " wappend:"; + Append("Write-appenders", "cur", + gAtomicRecordAppendManager.GetAppendersCount()); + AtomicRecordAppendManager::Counters wa; + gAtomicRecordAppendManager.GetCounters(wa); + Append("WAppend-count", "cnt", wa.mAppendCount); + Append("WAppend-bytes", "bytes", wa.mAppendByteCount); + Append("WAppend-errors","err", wa.mAppendErrorCount); + cmdShow << " repl:"; + Append("WAppend-replication-errors", "err", wa.mReplicationErrorCount); + Append("WAppend-replication-tiemouts", "tmo", wa.mReplicationTimeoutCount); + cmdShow << " alloc:"; + Append("WAppend-alloc-count", "cnt", wa.mAppenderAllocCount); + Append("WAppend-alloc-master-count", "mas", wa.mAppenderAllocMasterCount); + Append("WAppend-alloc-errors", "err", wa.mAppenderAllocErrorCount); + cmdShow << " wid:"; + Append("WAppend-wid-alloc-count", "cnt", wa.mWriteIdAllocCount); + Append("WAppend-wid-alloc-errors", "err", wa.mWriteIdAllocErrorCount); + Append("WAppend-wid-alloc-no-appender","nae", wa.mWriteIdAllocNoAppenderCount); + cmdShow << " srsrv:"; + Append("WAppend-sreserve-count", "cnt", wa.mSpaceReserveCount); + Append("WAppend-sreserve-bytes", "bytes", wa.mSpaceReserveByteCount); + Append("WAppend-sreserve-errors", "err", wa.mSpaceReserveErrorCount); + Append("WAppend-sreserve-denied", "den", wa.mSpaceReserveDeniedCount); + cmdShow << " bmcs:"; + Append("WAppend-bmcs-count", "cnt", wa.mBeginMakeStableCount); + Append("WAppend-bmcs-errors", "err", wa.mBeginMakeStableErrorCount); + cmdShow << " mcs:"; + Append("WAppend-mcs-count", "cnt", wa.mMakeStableCount); + Append("WAppend-mcs-errors", "err", wa.mMakeStableErrorCount); + Append("WAppend-mcs-length-errors", "eln", wa.mMakeStableLengthErrorCount); + Append("WAppend-mcs-chksum-errors", "ecs", 
wa.mMakeStableChecksumErrorCount); + cmdShow << " gos:"; + Append("WAppend-get-op-status-count", "cnt", wa.mGetOpStatusCount); + Append("WAppend-get-op-status-errors","err", wa.mGetOpStatusErrorCount); + Append("WAppend-get-op-status-known", "knw", wa.mGetOpStatusKnownCount); + cmdShow << " err:"; + Append("WAppend-chksum-erros", "csum", wa.mChecksumErrorCount); + Append("WAppend-read-erros", "rd", wa.mReadErrorCount); + Append("WAppend-write-errors", "wr", wa.mWriteErrorCount); + Append("WAppend-lease-ex-errors", "lease", wa.mLeaseExpiredCount); + cmdShow << " lost:"; + Append("WAppend-lost-timeouts", "tm", wa.mTimeoutLostCount); + Append("WAppend-lost-chunks", "csum", wa.mLostChunkCount); + + const BufferManager& bufMgr = DiskIo::GetBufferManager(); + cmdShow << " buffers: bytes:"; + Append("Buffer-bytes-total", "total", bufMgr.GetTotalByteCount()); + Append("Buffer-bytes-wait", "wait", bufMgr.GetWaitingByteCount()); + Append("Buffer-bytes-wait-avg", "wavg", bufMgr.GetWaitingAvgBytes()); + Append("Buffer-usec-wait-avg", "uavg", bufMgr.GetWaitingAvgUsecs()); + Append("Buffer-clients-wait-avg", "cavg", bufMgr.GetWaitingAvgCount()); + cmdShow << " cnt:"; + Append("Buffer-total-count", "total", bufMgr.GetTotalBufferCount()); + Append("Buffer-min-count", "min", bufMgr.GetMinBufferCount()); + Append("Buffer-free-count", "free", bufMgr.GetFreeBufferCount()); + cmdShow << " req:"; + Append("Buffer-clients", "cbuf", bufMgr.GetClientsWihtBuffersCount()); + Append("Buffer-clients-wait", "cwait", bufMgr.GetWaitingCount()); + BufferManager::Counters bmCnts; + bufMgr.GetCounters(bmCnts); + Append("Buffer-req-total", "cnt", bmCnts.mRequestCount); + Append("Buffer-req-bytes", "bytes", bmCnts.mRequestByteCount); + Append("Buffer-req-denied-total", "den", bmCnts.mRequestDeniedCount); + Append("Buffer-req-denied-bytes", "denb", bmCnts.mRequestDeniedByteCount); + Append("Buffer-req-granted-total", "grn", bmCnts.mRequestGrantedCount); + Append("Buffer-req-granted-bytes", "grnb", 
bmCnts.mRequestGrantedByteCount); + Append("Buffer-req-wait-usec", "rwu", bmCnts.mRequestWaitUsecs); + + DiskIo::Counters dio; + DiskIo::GetCounters(dio); + cmdShow << " disk: read:"; + Append("Disk-read-count", "cnt", dio.mReadCount); + Append("Disk-read-bytes", "bytes", dio.mReadByteCount); + Append("Disk-read-errors","err", dio.mReadErrorCount); + cmdShow << " write:"; + Append("Disk-write-count", "cnt", dio.mWriteCount); + Append("Disk-write-bytes", "bytes", dio.mWriteByteCount); + Append("Disk-write-errors","err", dio.mWriteErrorCount); + cmdShow << " sync:"; + Append("Disk-sync-count", "cnt", dio.mSyncCount); + Append("Disk-sync-errors","err", dio.mSyncErrorCount); + cmdShow << " del:"; + Append("Disk-delete-count", "cnt", dio.mDeleteCount); + Append("Disk-delete-errors","err", dio.mDeleteErrorCount); + cmdShow << " rnm:"; + Append("Disk-rename-count", "cnt", dio.mRenameCount); + Append("Disk-rename-errors","err", dio.mRenameErrorCount); + cmdShow << " fsavl:"; + Append("Disk-fs-get-free-count", "cnt", dio.mGetFsSpaceAvailableCount); + Append("Disk-fs-get-free-errors","err", dio.mGetFsSpaceAvailableErrorCount); + cmdShow << " dirchk:"; + Append("Disk-dir-readable-count", "cnt", dio.mCheckDirReadableCount); + Append("Disk-dir-readable-errors","err", dio.mCheckDirReadableErrorCount); + cmdShow << " timedout:"; + Append("Disk-timedout-count", "cnt", dio.mTimedOutErrorCount); + Append("Disk-timedout-read-bytes", "rbytes", dio.mTimedOutErrorReadByteCount); + Append("Disk-timedout-write-bytes","wbytes", dio.mTimedOutErrorWriteByteCount); + Append("Disk-open-files", "fopen", dio.mOpenFilesCount); + + cmdShow << " msglog:"; + MsgLogger::Counters msgLogCntrs; + MsgLogger::GetLogger()->GetCounters(msgLogCntrs); + Append("Msg-log-level", "level", MsgLogger::GetLogger()->GetLogLevel()); + Append("Msg-log-count", "cnt", msgLogCntrs.mAppendCount); + Append("Msg-log-drop", "drop", msgLogCntrs.mDroppedCount); + Append("Msg-log-write-errors", "werr", 
msgLogCntrs.mWriteErrorCount); + Append("Msg-log-wait", "wait", msgLogCntrs.mAppendWaitCount); + Append("Msg-log-waited-micro-sec", "waittm", msgLogCntrs.mAppendWaitMicroSecs); + + cmdShow << " repl:"; + Replicator::Counters replCntrs; + Replicator::GetCounters(replCntrs); + Append("Replication-count", "cnt", replCntrs.mReplicationCount); + Append("Replication-errors", "err", replCntrs.mReplicationErrorCount); + Append("Replication-cancel", "cancel", replCntrs.mReplicationCanceledCount); + Append("Replicator-count", "obj", replCntrs.mReplicatorCount); + cmdShow << " recov:"; + Append("Recovery-count", "cnt", replCntrs.mRecoveryCount); + Append("Recovery-errors", "err", replCntrs.mRecoveryErrorCount); + Append("Recovery-cancel", "cancel", replCntrs.mRecoveryCanceledCount); + + Append("Ops-in-flight-count", "opsf", gChunkServer.GetNumOps()); + cmdShow << " gcntrs:"; + Append("Socket-count", "socks", globals().ctrOpenNetFds.GetValue()); + Append("Disk-fd-count", "dfds", globals().ctrOpenDiskFds.GetValue()); + Append("Net-bytes-read", "nrd", globals().ctrNetBytesRead.GetValue()); + Append("Net-bytes-write", "nwr", globals().ctrNetBytesWritten.GetValue()); + Append("Disk-bytes-read", "drd", globals().ctrDiskBytesRead.GetValue()); + Append("Disk-bytes-write", "dwr", globals().ctrDiskBytesWritten.GetValue()); + Append("Total-ops-count", "ops", KfsOp::GetOpsCount()); + + status = 0; + gLogger.Submit(this); +} + +void +RetireOp::Execute() +{ + // we are told to retire...so, bow out + KFS_LOG_STREAM_INFO << "we have been asked to retire, bye" << KFS_LOG_EOM; + globalNetManager().Shutdown(); +} + +bool +StaleChunksOp::ParseContent(istream& is) +{ + if (status != 0) { + return false; + } + kfsChunkId_t c; + staleChunkIds.reserve(numStaleChunks); + const istream::fmtflags isFlags = is.flags(); + if (hexFormatFlag) { + is >> hex; + } + for(int i = 0; i < numStaleChunks; ++i) { + if (! 
(is >> c)) { + ostringstream os; + os << + "failed to parse stale chunks request:" + " expected: " << numStaleChunks << + " got: " << i << + " last chunk: " << c + ; + statusMsg = os.str(); + status = -EINVAL; + break; + } + staleChunkIds.push_back(c); + } + is.flags(isFlags); + return (status == 0); +} + +void +StaleChunksOp::Execute() +{ + status = 0; + const bool forceDeleteFlag = true; + for (StaleChunkIds::const_iterator it = staleChunkIds.begin(); + it != staleChunkIds.end(); + ++it) { + gChunkManager.StaleChunk(*it, forceDeleteFlag, evacuatedFlag); + } + KFS_LOG_STREAM_INFO << "stale chunks: " << + (staleChunkIds.empty() ? kfsChunkId_t(-1) : staleChunkIds.front()) << + " count: " << staleChunkIds.size() << + KFS_LOG_EOM; + gLogger.Submit(this); +} + +void +ReadOp::Execute() +{ + if (numBytes > CHUNKSIZE) { + KFS_LOG_STREAM_DEBUG << + "read request size exceeds chunk size: " << numBytes << + KFS_LOG_EOM; + status = -EINVAL; + gLogger.Submit(this); + return; + } + + gChunkManager.GetDriveName(this); + + SET_HANDLER(this, &ReadOp::HandleChunkMetaReadDone); + const int res = gChunkManager.ReadChunkMetadata(chunkId, this); + if (res < 0) { + KFS_LOG_STREAM_ERROR << + "failed read chunk meta data, status: " << res << + KFS_LOG_EOM; + status = res; + gLogger.Submit(this); + } +} + +int +ReadOp::HandleChunkMetaReadDone(int code, void *data) +{ + if (status >= 0 && data) { + status = *(int *) data; + } + if (status < 0) { + gLogger.Submit(this); + return 0; + } + + SET_HANDLER(this, &ReadOp::HandleDone); + status = gChunkManager.ReadChunk(this); + + if (status < 0) { + // clnt->HandleEvent(EVENT_CMD_DONE, this); + if (! wop) { + // we are done with this op; this needs draining + gLogger.Submit(this); + } else { + // resume execution of write + wop->Execute(); + } + } + return 0; +} + +// +// Handling of writes is done in multiple steps: +// 1. 
The client allocates a chunk from the metaserver; the metaserver +// picks a set of hosting chunkservers and nominates one of the +// server's as the "master" for the transaction. +// 2. The client pushes data for a write via a WritePrepareOp to each +// of the hosting chunkservers (in any order). +// 3. The chunkserver in turn enqueues the write with the ChunkManager +// object. The ChunkManager assigns an id to the write. NOTE: +// nothing is written out to disk at this point. +// 4. After the client has pushed out data to replica chunk-servers +// and gotten write-id's, the client does a WriteSync to the master. +// 5. The master retrieves the write corresponding to the write-id and +// commits the write to disk. +// 6. The master then sends out a WriteCommit to each of the replica +// chunkservers asking them to commit the write; this commit message +// is sent concurrently to all the replicas. +// 7. After the replicas reply, the master replies to the client with +// status from individual servers and how much got written on each. +// + +static bool +needToForwardToPeer(string &serverInfo, uint32_t numServers, int &myPos, + ServerLocation &peerLoc, + bool isWriteIdPresent, int64_t &writeId) +{ + istringstream ist(serverInfo); + ServerLocation loc; + bool foundLocal = false; + int64_t id; + bool needToForward = false; + + // the list of servers is ordered: we forward to the next one + // in the list. 
+ for (uint32_t i = 0; i < numServers; i++) { + ist >> loc.hostname; + ist >> loc.port; + if (isWriteIdPresent) + ist >> id; + + if (gChunkServer.IsLocalServer(loc)) { + // return the position of where this server is present in the list + myPos = i; + foundLocal = true; + if (isWriteIdPresent) + writeId = id; + continue; + } + // forward if we are not the last in the list + if (foundLocal) { + needToForward = true; + break; + } + } + peerLoc = loc; + return needToForward; +} + +void +WriteIdAllocOp::Execute() +{ + // check if we need to forward anywhere + writeId = -1; + int64_t dummyWriteId = -1; + int myPos = -1; + ServerLocation peerLoc; + const bool needToForward = needToForwardToPeer( + servers, numServers, myPos, peerLoc, false, dummyWriteId); + if (myPos < 0) { + statusMsg = "invalid or missing Servers: field"; + status = -EINVAL; + gLogger.Submit(this); + return; + } + const bool writeMaster = myPos == 0; + if (writeMaster && ! gLeaseClerk.IsLeaseValid(chunkId)) { + status = -ELEASEEXPIRED; + statusMsg = "no valid write lease exists"; + Done(EVENT_CMD_DONE, &status); + return; + } + const int res = gChunkManager.AllocateWriteId(this, myPos, peerLoc); + if (res != 0 && status == 0) { + status = res < 0 ? res : -res; + } + if (status != 0) { + Done(EVENT_CMD_DONE, &status); + return; + } + if (writeMaster) { + // Notify the lease clerk that we are doing write. This is to + // signal the lease clerk to renew the lease for the chunk when appropriate. + gLeaseClerk.DoingWrite(chunkId); + } + ostringstream os; + os << gChunkServer.GetMyLocation() << " " << writeId; + writeIdStr = os.str(); + if (needToForward) { + ForwardToPeer(peerLoc); + } else { + ReadChunkMetadata(); + } +} + +int +WriteIdAllocOp::ForwardToPeer(const ServerLocation& loc) +{ + assert(! fwdedOp && status == 0 && (clnt || isForRecordAppend)); + + RemoteSyncSMPtr const peer = isForRecordAppend ? + appendPeer : FindPeer(*this, loc); + if (! 
peer) { + status = -EHOSTUNREACH; + statusMsg = "unable to find peer " + loc.ToString(); + return Done(EVENT_CMD_DONE, &status); + } + fwdedOp = new WriteIdAllocOp(0, *this); + fwdedOp->writePrepareReplyFlag = false; // set by the next one in the chain. + // When forwarded op completes, call this op HandlePeerReply. + fwdedOp->clnt = this; + SET_HANDLER(this, &WriteIdAllocOp::HandlePeerReply); + + peer->Enqueue(fwdedOp); + return 0; +} + +int +WriteIdAllocOp::HandlePeerReply(int code, void *data) +{ + assert(code == EVENT_CMD_DONE && data == fwdedOp); + + if (status == 0 && fwdedOp->status < 0) { + status = fwdedOp->status; + statusMsg = fwdedOp->statusMsg.empty() ? + string("forwarding failed") : fwdedOp->statusMsg; + } + if (status != 0) { + return Done(EVENT_CMD_DONE, &status); + } + writeIdStr += " " + fwdedOp->writeIdStr; + writePrepareReplyFlag = + writePrepareReplyFlag && fwdedOp->writePrepareReplyFlag; + ReadChunkMetadata(); + return 0; +} + +void +WriteIdAllocOp::ReadChunkMetadata() +{ + assert(status == 0); + // Now, we are all done pending metadata read + // page in the chunk meta-data if needed + // if the read was successful, the call to read will callback handle-done + SET_HANDLER(this, &WriteIdAllocOp::Done); + int res = gChunkManager.ReadChunkMetadata(chunkId, this); + if (res < 0) { + Done(EVENT_CMD_DONE, &res); + } +} + +int +WriteIdAllocOp::Done(int code, void *data) +{ + if (status == 0) { + status = (code == EVENT_CMD_DONE && data) ? + *reinterpret_cast(data) : -1; + if (status != 0) { + statusMsg = "chunk meta data read failed"; + } + } + if (status != 0) { + if (isForRecordAppend) { + if (! writeIdStr.empty()) { + gAtomicRecordAppendManager.InvalidateWriteIdDeclareFailure( + chunkId, writeId); + } + } else { + gChunkManager.SetWriteStatus(writeId, status); + // The write id alloc has failed; we don't want to renew the lease. 
+ // Now, when the client forces a re-allocation, the + // metaserver will do a version bump; when the node that + // was dead comes back, we can detect it has missed a write + gLeaseClerk.InvalidateLease(chunkId); + } + } + KFS_LOG_STREAM( + status == 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + (status == 0 ? "done: " : "failed: ") << Show() << + KFS_LOG_EOM; + gLogger.Submit(this); + return 0; +} + +void +WritePrepareOp::Execute() +{ + ServerLocation peerLoc; + int myPos = -1; + + SET_HANDLER(this, &WritePrepareOp::Done); + + // check if we need to forward anywhere + bool needToForward = false, writeMaster; + + needToForward = needToForwardToPeer(servers, numServers, myPos, peerLoc, true, writeId); + if (myPos < 0) { + statusMsg = "invalid or missing Servers: field"; + status = -EINVAL; + gLogger.Submit(this); + return; + } + writeMaster = (myPos == 0); + + if (! gChunkManager.IsValidWriteId(writeId)) { + statusMsg = "invalid write id"; + status = -EINVAL; + gLogger.Submit(this); + return; + } + + if (!gChunkManager.IsChunkMetadataLoaded(chunkId)) { + statusMsg = "checksums are not loaded"; + status = -ELEASEEXPIRED; + Done(EVENT_CMD_DONE, this); + return; + } + + if (writeMaster) { + // if we are the master, check the lease... + if (! gLeaseClerk.IsLeaseValid(chunkId)) { + KFS_LOG_STREAM_ERROR << + "Write prepare failed, lease expired for " << chunkId << + KFS_LOG_EOM; + statusMsg = "no valid write lease exists"; + gLeaseClerk.InvalidateLease(chunkId); + status = -ELEASEEXPIRED; + Done(EVENT_CMD_DONE, this); + return; + } + // Notify the lease clerk that we are doing write. This is to + // signal the lease clerk to renew the lease for the chunk when appropriate. 
+ gLeaseClerk.DoingWrite(chunkId); + } + + uint32_t val = 0; + vector checksums = ComputeChecksums(dataBuf, numBytes, &val); + if (val != checksum) { + statusMsg = "checksum mismatch"; + KFS_LOG_STREAM_ERROR << + "checksum mismatch: sent: " << checksum << + ", computed: " << val << " for " << Show() << + KFS_LOG_EOM; + status = -EBADCKSUM; + Done(EVENT_CMD_DONE, this); + return; + } + + // will clone only when the op is good + writeOp = gChunkManager.CloneWriteOp(writeId); + + if (! writeOp) { + // the write has previously failed; so fail this op and move on + status = gChunkManager.GetWriteStatus(writeId); + if (status >= 0) { + status = -EINVAL; + } + Done(EVENT_CMD_DONE, this); + return; + } + + if (needToForward) { + status = ForwardToPeer(peerLoc); + if (status < 0) { + // can't forward to peer...so fail the write + Done(EVENT_CMD_DONE, this); + return; + } + } + + writeOp->offset = offset; + writeOp->numBytes = numBytes; + writeOp->dataBuf = dataBuf; + writeOp->wpop = this; + writeOp->checksums.swap(checksums); + dataBuf = 0; + + writeOp->enqueueTime = globalNetManager().Now(); + + KFS_LOG_STREAM_DEBUG << + "Writing to chunk: " << chunkId << + " @offset: " << offset << + " nbytes: " << numBytes << + " checksum: " << checksum << + KFS_LOG_EOM; + + status = gChunkManager.WriteChunk(writeOp); + if (status < 0) { + Done(EVENT_CMD_DONE, this); + } +} + +int +WritePrepareOp::ForwardToPeer(const ServerLocation& loc) +{ + assert(clnt); + RemoteSyncSMPtr const peer = FindPeer(*this, loc); + if (!peer) { + statusMsg = "no such peer " + loc.ToString(); + return -EHOSTUNREACH; + } + writeFwdOp = new WritePrepareFwdOp(*this); + writeFwdOp->clnt = this; + peer->Enqueue(writeFwdOp); + return 0; +} + +int +WritePrepareOp::Done(int code, void *data) +{ + if (status >= 0 && writeFwdOp && writeFwdOp->status < 0) { + status = writeFwdOp->status; + statusMsg = writeFwdOp->statusMsg; + } + if (status < 0) { + // so that the error goes out on a sync + 
gChunkManager.SetWriteStatus(writeId, status); + // The write has failed; we don't want to renew the lease. + // Now, when the client forces a re-allocation, the + // metaserver will do a version bump; when the node that + // was dead comes back, we can detect it has missed a write + gLeaseClerk.InvalidateLease(chunkId); + } + numDone++; + if (writeFwdOp && numDone < 2) { + return 0; + } + KFS_LOG_STREAM( + status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + (status >= 0 ? "done: " : "failed: ") << Show() << + " status: " << status << + (statusMsg.empty() ? "" : " msg: ") << statusMsg << + KFS_LOG_EOM; + gLogger.Submit(this); + return 0; +} + +void +WriteSyncOp::Execute() +{ + ServerLocation peerLoc; + int myPos = -1; + + KFS_LOG_STREAM_DEBUG << "executing: " << Show() << KFS_LOG_EOM; + // check if we need to forward anywhere + const bool needToForward = needToForwardToPeer(servers, numServers, myPos, peerLoc, true, writeId); + if (myPos < 0) { + statusMsg = "invalid or missing Servers: field"; + status = -EINVAL; + gLogger.Submit(this); + return; + } + writeMaster = myPos == 0; + + writeOp = gChunkManager.CloneWriteOp(writeId); + if (! writeOp) { + status = -EINVAL; + statusMsg = "no such write id"; + KFS_LOG_STREAM_ERROR << + "failed: " << statusMsg << " " << Show() << + KFS_LOG_EOM; + gLogger.Submit(this); + return; + } + + writeOp->enqueueTime = globalNetManager().Now(); + + if (writeOp->status < 0) { + // due to failures with data forwarding/checksum errors and such + status = writeOp->status; + statusMsg = "write error"; + gLogger.Submit(this); + return; + } + + if (! gChunkManager.IsChunkMetadataLoaded(chunkId)) { + // This should not normally happen, as valid write id would keep chunk + // loaded / writable. 
+ status = -ELEASEEXPIRED; + statusMsg = "meta data unloaded"; + KFS_LOG_STREAM_ERROR << + "failed: " << statusMsg << " " << Show() << + KFS_LOG_EOM; + gChunkManager.SetWriteStatus(writeId, status); + gLogger.Submit(this); + return; + } + + if (writeMaster) { + // if we are the master, check the lease... + if (! gLeaseClerk.IsLeaseValid(chunkId)) { + statusMsg = "no valid write lease exists"; + status = -ELEASEEXPIRED; + KFS_LOG_STREAM_ERROR << + "failed: " << statusMsg << " " << Show() << + KFS_LOG_EOM; + gChunkManager.SetWriteStatus(writeId, status); + gLogger.Submit(this); + return; + } + // Notify the lease clerk that we are doing write. This is to + // signal the lease clerk to renew the lease for the chunk when appropriate. + gLeaseClerk.DoingWrite(chunkId); + } + + SET_HANDLER(this, &WriteSyncOp::Done); + + if (needToForward) { + status = ForwardToPeer(peerLoc); + if (status < 0) { + // can't forward to peer...so fail the write + Done(EVENT_CMD_DONE, this); + return; + } + } + + // when things aren't aligned, we can't validate the checksums + // handed by the client. In such cases, make sure that the + // chunkservers agree on the checksum + bool validateChecksums = true; + bool mismatch = false; + if (writeMaster && + (((offset % CHECKSUM_BLOCKSIZE) != 0) || ((numBytes % CHECKSUM_BLOCKSIZE) != 0))) { + validateChecksums = false; + } + // in the non-writemaster case, our checksums should match what + // the write master sent us. + + vector myChecksums = gChunkManager.GetChecksums(chunkId, offset, numBytes); + if ((!validateChecksums) || (checksums.size() == 0)) { + // Either we can't validate checksums due to alignment OR the + // client didn't give us checksums. In either case: + // The sync covers a certain region for which the client + // sent data. 
The value for that region should be non-zero + for (uint32_t i = 0; (i < myChecksums.size()) && !mismatch; i++) { + if (myChecksums[i] == 0) { + KFS_LOG_STREAM_ERROR << + "Sync failed due to checksum mismatch: we have 0 in the range " << + offset << "->" << offset+numBytes << " ; but should be non-zero" << KFS_LOG_EOM; + mismatch = true; + } + } + if (!mismatch) + KFS_LOG_STREAM_DEBUG << "Validated checksums are non-zero for chunk = " << chunkId + << " offset = " << offset << " numbytes = " << numBytes << KFS_LOG_EOM; + } else { + if (myChecksums.size() != checksums.size()) { + KFS_LOG_STREAM_ERROR << + "Checksum mismatch: # of entries we have: " << myChecksums.size() << + " # of entries client sent: " << checksums.size() << KFS_LOG_EOM; + mismatch = true; + } + for (uint32_t i = 0; (i < myChecksums.size()) && !mismatch; i++) { + if (myChecksums[i] != checksums[i]) { + KFS_LOG_STREAM_ERROR << + "Sync failed due to checksum mismatch: we have = " << + myChecksums[i] << " but the value should be: " << checksums[i] << + KFS_LOG_EOM; + mismatch = true; + break; + } + // KFS_LOG_STREAM_DEBUG << "Got = " << checksums[i] << " and ours: " << myChecksums[i] << KFS_LOG_EOM; + } + // bit of testing code + // if ((rand() % 20) == 0) { + // if ((offset == 33554432) && (chunkVersion == 1)) { + // if ((2097152 <= offset) && (offset <= 4194304) && (chunkVersion == 1)) { + // KFS_LOG_STREAM_DEBUG << "Intentionally failing verify for chunk = " << chunkId << " offset = " << offset + // << KFS_LOG_EOM; + // mismatch = true; + //} + + if (!mismatch) + KFS_LOG_STREAM_DEBUG << "Checksum verified for chunk = " << chunkId << " offset = " << offset + << ": " << myChecksums.size() << " and got: " << checksums.size() << KFS_LOG_EOM; + } + if (mismatch) { + status = -EAGAIN; + statusMsg = "checksum mismatch"; + Done(EVENT_CMD_DONE, this); + return; + } + assert(status >= 0); + Done(EVENT_CMD_DONE, this); +} + +int +WriteSyncOp::ForwardToPeer(const ServerLocation& loc) +{ + assert(clnt); + 
RemoteSyncSMPtr const peer = FindPeer(*this, loc); + if (! peer) { + statusMsg = "no such peer " + loc.ToString(); + return -EHOSTUNREACH; + } + fwdedOp = new WriteSyncOp(0, chunkId, chunkVersion, offset, numBytes); + fwdedOp->numServers = numServers; + fwdedOp->servers = servers; + fwdedOp->clnt = this; + SET_HANDLER(fwdedOp, &KfsOp::HandleDone); + + if (writeMaster) { + fwdedOp->checksums = gChunkManager.GetChecksums(chunkId, offset, numBytes); + } else { + fwdedOp->checksums = this->checksums; + } + peer->Enqueue(fwdedOp); + return 0; +} + +int +WriteSyncOp::Done(int code, void *data) +{ + if (status >= 0 && fwdedOp && fwdedOp->status < 0) { + status = fwdedOp->status; + statusMsg = fwdedOp->statusMsg; + KFS_LOG_STREAM_ERROR << + "Peer: " << fwdedOp->Show() << " returned: " << fwdedOp->status << + KFS_LOG_EOM; + } + if (status < 0) { + gChunkManager.SetWriteStatus(writeId, status); + // The write has failed; we don't want to renew the lease. + // Now, when the client forces a re-allocation, the + // metaserver will do a version bump; when the node that + // was dead comes back, we can detect it has missed a write + gLeaseClerk.InvalidateLease(chunkId); + } + numDone++; + if (fwdedOp && numDone < 2) { + return 0; + } + KFS_LOG_STREAM( + status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + (status >= 0 ? "done: " : "failed: ") << Show() << + " status: " << status << + (statusMsg.empty() ? 
"" : " msg: ") << statusMsg << + KFS_LOG_EOM; + gLogger.Submit(this); + return 0; +} + +void +WriteOp::Execute() +{ + status = gChunkManager.WriteChunk(this); + + if (status < 0) { + if (isFromRecordAppend) { + HandleEvent(EVENT_CMD_DONE, this); + return; + } else { + assert(wpop); + wpop->HandleEvent(EVENT_CMD_DONE, this); + } + } +} + +void +RecordAppendOp::Execute() +{ + ServerLocation peerLoc; + int myPos = -1; + + needToForwardToPeer(servers, numServers, myPos, peerLoc, true, writeId); + gAtomicRecordAppendManager.AppendBegin(this, myPos, peerLoc); +} + +void +GetRecordAppendOpStatus::Execute() +{ + gAtomicRecordAppendManager.GetOpStatus(this); + gLogger.Submit(this); +} + +void +SizeOp::Execute() +{ + gChunkManager.ChunkSize(this); + gLogger.Submit(this); +} + +void +ChunkSpaceReserveOp::Execute() +{ + ServerLocation peerLoc; + int myPos = -1; + + needToForwardToPeer(servers, numServers, myPos, peerLoc, true, writeId); + if (myPos == 0) { + status = gAtomicRecordAppendManager.ChunkSpaceReserve( + chunkId, writeId, nbytes, &statusMsg); + } else { + status = -EINVAL; + statusMsg = "invalid or missing Servers: field"; + } + if (status == 0) { + // Only master keeps track of space reservations. + assert(myPos == 0); + ClientSM* const client = GetClientSM(); + assert((client != 0) == (clnt != 0)); + if (client) { + client->ChunkSpaceReserve(chunkId, writeId, nbytes); + } + } + KFS_LOG_STREAM((status >= 0 || status == -ENOSPC) ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "space reserve: " + " chunk: " << chunkId << + " writeId: " << writeId << + " bytes: " << nbytes << + " status: " << status << + KFS_LOG_EOM; + gLogger.Submit(this); +} + +void +ChunkSpaceReleaseOp::Execute() +{ + ServerLocation peerLoc; + int myPos = -1; + + needToForwardToPeer(servers, numServers, myPos, peerLoc, true, writeId); + size_t rsvd = 0; + if (myPos == 0) { + ClientSM* const client = GetClientSM(); + assert((client != 0) == (clnt != 0)); + rsvd = client ? 
+ std::min(client->GetReservedSpace(chunkId, writeId), nbytes) : nbytes; + status = gAtomicRecordAppendManager.ChunkSpaceRelease( + chunkId, writeId, rsvd, &statusMsg); + if (status == 0 && client) { + client->UseReservedSpace(chunkId, writeId, rsvd); + } + } else { + status = -EINVAL; + statusMsg = "invalid or missing Servers: field"; + } + KFS_LOG_STREAM(status >= 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + "space release: " + " chunk: " << chunkId << + " writeId: " << writeId << + " requested: " << nbytes << + " reserved: " << rsvd << + " status: " << status << + KFS_LOG_EOM; + gLogger.Submit(this); +} + +void +GetChunkMetadataOp::Execute() +{ + SET_HANDLER(this, &GetChunkMetadataOp::HandleChunkMetaReadDone); + if (gChunkManager.ReadChunkMetadata(chunkId, this) < 0) { + status = -EINVAL; + gLogger.Submit(this); + } +} + +int +GetChunkMetadataOp::HandleChunkMetaReadDone(int code, void *data) +{ + if (status >= 0 && data) { + status = *(int *) data; + } + if (status < 0) { + gLogger.Submit(this); + return 0; + } + const ChunkInfo_t * const info = gChunkManager.GetChunkInfo(chunkId); + if (info) { + if (info->chunkBlockChecksum || info->chunkSize == 0) { + chunkVersion = info->chunkVersion; + chunkSize = info->chunkSize; + if (info->chunkBlockChecksum) { + dataBuf = new IOBuffer(); + dataBuf->CopyIn((const char *)info->chunkBlockChecksum, + MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t)); + numBytesIO = dataBuf->BytesConsumable(); + } + } else { + assert(! "no checksums"); + status = -EIO; + } + } else { + status = -EBADF; + } + + if (status < 0 || ! 
readVerifyFlag) { + gLogger.Submit(this); + return 0; + } + + numBytesScrubbed = 0; + readOp.chunkId = chunkId; + readOp.chunkVersion = chunkVersion; + readOp.offset = 0; + readOp.numBytes = std::min((int64_t) 1 << 20, chunkSize); + + readOp.SetScrubOp(this); + SET_HANDLER(this, &GetChunkMetadataOp::HandleScrubReadDone); + status = gChunkManager.ReadChunk(&readOp); + if (status < 0) { + gLogger.Submit(this); + return 0; + } + return 0; +} + +int +GetChunkMetadataOp::HandleScrubReadDone(int code, void *data) +{ + if (code == EVENT_DISK_ERROR) { + status = -1; + if (data) { + status = *(int *) data; + KFS_LOG_STREAM_ERROR << "disk error:" + " chunkid: " << chunkId << + " status: " << status << + KFS_LOG_EOM; + } + gChunkManager.ChunkIOFailed(chunkId, status, readOp.diskIo.get()); + gLogger.Submit(this); + return 0; + } else if (code == EVENT_DISK_READ) { + if (! readOp.dataBuf) { + readOp.dataBuf = new IOBuffer(); + } + IOBuffer *b = (IOBuffer *) data; + // Order matters...when we append b, we take the data from b + // and put it into our buffer. + readOp.dataBuf->Append(b); + if (((size_t) (readOp.offset + readOp.dataBuf->BytesConsumable()) > (size_t) chunkSize) && + ((size_t) readOp.dataBuf->BytesConsumable() > (size_t) readOp.numBytes)) { + // trim the extra stuff off the end. 
+ readOp.dataBuf->Trim(readOp.numBytes); + } + // verify checksum + gChunkManager.ReadChunkDone(&readOp); + status = readOp.status; + if (status == 0) { + KFS_LOG_STREAM_DEBUG << "scrub read succeeded" + " chunk: " << chunkId << + " offset: " << readOp.offset << + KFS_LOG_EOM; + // checksum verified; setup the next read + numBytesScrubbed += readOp.dataBuf->BytesConsumable(); + readOp.offset += readOp.dataBuf->BytesConsumable(); + readOp.numBytes = std::min((int64_t)kChunkReadSize, chunkSize - numBytesScrubbed); + // throw away the data + readOp.dataBuf->Consume(readOp.dataBuf->BytesConsumable()); + if (numBytesScrubbed >= chunkSize) { + KFS_LOG_STREAM_DEBUG << "scrub succeeded" + " chunk: " << chunkId << + " bytes read: " << numBytesScrubbed << + KFS_LOG_EOM; + gLogger.Submit(this); + return 0; + } + status = gChunkManager.ReadChunk(&readOp); + } + } + if (status < 0) { + KFS_LOG_STREAM_INFO << "scrub read failed: " + " chunk: " << chunkId << + " status: " << status << + KFS_LOG_EOM; + gLogger.Submit(this); + return 0; + } + return 0; + +} + +void +PingOp::Execute() +{ + int chunkDirs = 0; + int writableDirs = 0; + int evacuateChunks = 0; + int64_t evacuateByteCount = 0; + totalFsSpace = 0; + totalSpace = gChunkManager.GetTotalSpace(totalFsSpace, chunkDirs, + evacuateInFlightCount, writableDirs, evacuateChunks, evacuateByteCount); + usedSpace = gChunkManager.GetUsedSpace(); + if (usedSpace < 0) + usedSpace = 0; + status = 0; + // clnt->HandleEvent(EVENT_CMD_DONE, this); + gLogger.Submit(this); +} + +void +DumpChunkMapOp::Execute() +{ + // Dump chunk map + gChunkManager.DumpChunkMap(); + status = 0; + gLogger.Submit(this); +} + +void +StatsOp::Execute() +{ + ostringstream os; + + os << "Num aios: " << 0 << "\r\n"; + os << "Num ops: " << gChunkServer.GetNumOps() << "\r\n"; + globals().counterManager.Show(os); + stats = os.str(); + status = 0; + // clnt->HandleEvent(EVENT_CMD_DONE, this); + gLogger.Submit(this); +} + +inline static bool +OkHeader(const KfsOp* op, 
ostream &os, bool checkStatus = true) +{ + os << "OK\r\n"; + os << "Cseq: " << op->seq << "\r\n"; + os << "Status: " << op->status << "\r\n"; + if (! op->statusMsg.empty()) { + const size_t p = op->statusMsg.find('\r'); + assert(string::npos == p && op->statusMsg.find('\n') == string::npos); + os << "Status-message: " << + (p == string::npos ? op->statusMsg : op->statusMsg.substr(0, p)) << + "\r\n"; + } + if (checkStatus && op->status < 0) { + os << "\r\n"; + } + return (op->status >= 0); +} + +inline static ostream& +PutHeader(const KfsOp* op, ostream &os) +{ + OkHeader(op, os, false); + return os; +} + +/// +/// Generate response for an op based on the KFS protocol. +/// +void +KfsOp::Response(ostream &os) +{ + PutHeader(this, os) << "\r\n"; +} + +void +SizeOp::Response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << "Size: " << size << "\r\n\r\n"; +} + +void +GetChunkMetadataOp::Response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << "Chunk-handle: " << chunkId << "\r\n"; + os << "Chunk-version: " << chunkVersion << "\r\n"; + os << "Size: " << chunkSize << "\r\n"; + os << "Content-length: " << numBytesIO << "\r\n\r\n"; +} + +void +ReadOp::Response(ostream &os) +{ + PutHeader(this, os); + os << "Drivename: " << driveName << "\r\n"; + if (status < 0) { + os << "\r\n"; + return; + } + + os << "DiskIOtime: " << (diskIOTime * 1e-6) << "\r\n"; + os << "Checksum-entries: " << checksum.size() << "\r\n"; + if (checksum.size() == 0) { + os << "Checksums: " << 0 << "\r\n"; + } else { + os << "Checksums: "; + for (uint32_t i = 0; i < checksum.size(); i++) + os << checksum[i] << ' '; + os << "\r\n"; + } + os << "Content-length: " << numBytesIO << "\r\n\r\n"; +} + +void +WriteIdAllocOp::Response(ostream &os) +{ + if (! 
OkHeader(this, os)) { + return; + } + if (writePrepareReplyFlag) { + os << "Write-prepare-reply: 1\r\n"; + } + os << "Write-id: " << writeIdStr << "\r\n" + "\r\n"; +} + +void +WritePrepareOp::Response(ostream &os) +{ + if (! replyRequestedFlag) { + // no reply for a prepare...the reply is covered by sync + return; + } + PutHeader(this, os) << "\r\n"; +} + +void +RecordAppendOp::Response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << "File-offset: " << fileOffset << "\r\n\r\n"; +} + +void +RecordAppendOp::Request(ostream &os) +{ + os << + "RECORD_APPEND \r\n" + "Cseq: " << seq << "\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "File-offset: " << fileOffset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "Checksum: " << checksum << "\r\n" + "Num-servers: " << numServers << "\r\n" + "Client-cseq: " << clientSeq << "\r\n" + "Servers: " << servers << "\r\n" + "Master-committed: " << masterCommittedOffset << "\r\n" + "\r\n"; +} + +void +GetRecordAppendOpStatus::Request(std::ostream &os) +{ + os << + "GET_RECORD_APPEND_OP_STATUS \r\n" + "Cseq: " << seq << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Write-id: " << writeId << "\r\n" + "\r\n"; +} + +void +GetRecordAppendOpStatus::Response(std::ostream &os) +{ + PutHeader(this, os); + os << + "Chunk-version: " << chunkVersion << "\r\n" + "Op-seq: " << opSeq << "\r\n" + "Op-status: " << opStatus << "\r\n" + "Op-offset: " << opOffset << "\r\n" + "Op-length: " << opLength << "\r\n" + "Wid-append-count: " << widAppendCount << "\r\n" + "Wid-bytes-reserved: " << widBytesReserved << "\r\n" + "Chunk-bytes-reserved: " << chunkBytesReserved << "\r\n" + "Remaining-lease-time: " << remainingLeaseTime << "\r\n" + "Master-commit-offset: " << masterCommitOffset << "\r\n" + "Next-commit-offset: " << nextCommitOffset << "\r\n" + "Wid-read-only: " << (widReadOnlyFlag ? 
1 : 0) << "\r\n" + "Wid-was-read-only: " << (widWasReadOnlyFlag ? 1 : 0) << "\r\n" + "Chunk-master: " << (masterFlag ? 1 : 0) << "\r\n" + "Stable-flag: " << (stableFlag ? 1 : 0) << "\r\n" + "Open-for-append-flag: " << (openForAppendFlag ? 1 : 0) << "\r\n" + "Appender-state: " << appenderState << "\r\n" + "Appender-state-string: " << appenderStateStr << "\r\n" + "\r\n"; +} + +void +CloseOp::Request(ostream &os) +{ + os << + "CLOSE \r\n" + "Cseq: " << seq << "\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Need-ack: " << (needAck ? 1 : 0) << "\r\n" + ; + if (numServers > 0) { + os << + "Num-servers: " << numServers << "\r\n" + "Servers: " << servers << "\r\n" + ; + } + os << "Chunk-handle: " << chunkId << "\r\n"; + if (hasWriteId) { + os << "Has-write-id: " << 1 << "\r\n"; + } + if (masterCommitted >= 0) { + os << "Master-committed: " << masterCommitted << "\r\n"; + } + os << "\r\n"; +} + +void +SizeOp::Request(ostream &os) +{ + os << "SIZE \r\n"; + os << "Cseq: " << seq << "\r\n"; + os << "Version: " << KFS_VERSION_STR << "\r\n"; + os << "Chunk-handle: " << chunkId << "\r\n"; + os << "Chunk-version: " << chunkVersion << "\r\n\r\n"; +} + +void +GetChunkMetadataOp::Request(ostream &os) +{ + os << "GET_CHUNK_METADATA \r\n" + "Cseq: " << seq << "\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Read-verify: " << (readVerifyFlag ? 
1 : 0) << "\r\n" + "\r\n"; +} + +void +ReadOp::Request(ostream &os) +{ + os << "READ \r\n"; + os << "Cseq: " << seq << "\r\n"; + os << "Version: " << KFS_VERSION_STR << "\r\n"; + os << "Chunk-handle: " << chunkId << "\r\n"; + os << "Chunk-version: " << chunkVersion << "\r\n"; + os << "Offset: " << offset << "\r\n"; + os << "Num-bytes: " << numBytes << "\r\n\r\n"; +} + +void +WriteIdAllocOp::Request(ostream &os) +{ + os << + "WRITE_ID_ALLOC\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "For-record-append: " << (isForRecordAppend ? 1 : 0) << "\r\n" + "Client-cseq: " << clientSeq << "\r\n" + "Num-servers: " << numServers << "\r\n" + "Servers: " << servers << "\r\n" + "\r\n"; +} + +void +WritePrepareFwdOp::Request(ostream &os) +{ + os << + "WRITE_PREPARE\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + "Chunk-handle: " << owner.chunkId << "\r\n" + "Chunk-version: " << owner.chunkVersion << "\r\n" + "Offset: " << owner.offset << "\r\n" + "Num-bytes: " << owner.numBytes << "\r\n" + "Checksum: " << owner.checksum << "\r\n" + "Num-servers: " << owner.numServers << "\r\n" + "Reply: " << (owner.replyRequestedFlag ? 
1 : 0) << "\r\n" + "Servers: " << owner.servers << "\r\n" + "\r\n"; +} + +void +WriteSyncOp::Request(ostream &os) +{ + os << "WRITE_SYNC\r\n"; + os << "Version: " << KFS_VERSION_STR << "\r\n"; + os << "Cseq: " << seq << "\r\n"; + os << "Chunk-handle: " << chunkId << "\r\n"; + os << "Chunk-version: " << chunkVersion << "\r\n"; + os << "Offset: " << offset << "\r\n"; + os << "Num-bytes: " << numBytes << "\r\n"; + os << "Checksum-entries: " << checksums.size() << "\r\n"; + if (checksums.size() == 0) { + os << "Checksums: " << 0 << "\r\n"; + } else { + os << "Checksums: "; + for (uint32_t i = 0; i < checksums.size(); i++) + os << checksums[i] << ' '; + os << "\r\n"; + } + os << "Num-servers: " << numServers << "\r\n"; + os << "Servers: " << servers << "\r\n\r\n"; +} + +void +HeartbeatOp::Response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << response.str() << "\r\n"; +} + +void +ReplicateChunkOp::Response(ostream &os) +{ + PutHeader(this, os) << + "File-handle: " << fid << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + ; + if (! invalidStripeIdx.empty()) { + os << "Invalid-stripes: " << invalidStripeIdx << "\r\n"; + } + os << "\r\n"; +} + +void +PingOp::Response(ostream &os) +{ + ServerLocation loc = gMetaServerSM.GetLocation(); + + PutHeader(this, os); + os << + "Meta-server-host: " << loc.hostname << "\r\n" + "Meta-server-port: " << loc.port << "\r\n" + "Total-space: " << totalSpace << "\r\n" + "Total-fs-space: " << totalFsSpace << "\r\n" + "Used-space: " << usedSpace << "\r\n" + "Num-evacuate: " << evacuateInFlightCount << "\r\n" + "\r\n"; +} + +void +BeginMakeChunkStableOp::Response(std::ostream& os) +{ + if (! 
OkHeader(this, os)) { + return; + } + os << + "Chunk-size: " << chunkSize << "\r\n" + "Chunk-checksum: " << chunkChecksum << "\r\n" + "\r\n"; +} + +void +DumpChunkMapOp::Response(ostream &os) +{ + ostringstream v; + gChunkManager.DumpChunkMap(v); + PutHeader(this, os) << + "Content-length: " << v.str().length() << "\r\n\r\n"; + if (v.str().length() > 0) { + os << v.str(); + } +} + +void +StatsOp::Response(ostream &os) +{ + PutHeader(this, os) << stats << "\r\n"; +} + +//////////////////////////////////////////////// +// Now the handle done's.... +//////////////////////////////////////////////// + +int +SizeOp::HandleDone(int code, void *data) +{ + // notify the owning object that the op finished + clnt->HandleEvent(EVENT_CMD_DONE, this); + return 0; +} + +int +GetChunkMetadataOp::HandleDone(int code, void *data) +{ + // notify the owning object that the op finished + clnt->HandleEvent(EVENT_CMD_DONE, this); + return 0; +} + +class ReadChunkMetaNotifier { + const int res; +public: + ReadChunkMetaNotifier(int r) : res(r) { } + void operator()(KfsOp *op) { + int r = res; + op->HandleEvent(EVENT_CMD_DONE, &r); + } +}; + +int +ReadChunkMetaOp::HandleDone(int code, void *data) +{ + IOBuffer* dataBuf = 0; + if (code == EVENT_DISK_ERROR) { + status = data ? 
*reinterpret_cast(data) : -EIO; + KFS_LOG_STREAM_ERROR << + "chunk: " << chunkId << + " read meta disk error: " << status << + KFS_LOG_EOM; + } else if (code == EVENT_DISK_READ) { + dataBuf = reinterpret_cast(data); + } else { + status = -EINVAL; + ostringstream os; + os << "read chunk meta data unexpected event: " + " code: " << code << " data: " << data; + die(os.str()); + } + gChunkManager.ReadChunkMetadataDone(this, dataBuf); + int res = status; + if (clnt) { + clnt->HandleEvent(EVENT_CMD_DONE, &res); + } + for_each(waiters.begin(), waiters.end(), ReadChunkMetaNotifier(res)); + + delete this; + return 0; +} + +WriteOp::~WriteOp() +{ + if (isWriteIdHolder) { + // track how long it took for the write to finish up: + // enqueueTime tracks when the last write was done to this + // writeid + const int64_t kMicroSecs = 1000 * 1000; + const int64_t timeSpent = int64_t(enqueueTime) * kMicroSecs - startTime; + if (timeSpent > 5 * kMicroSecs) { + gChunkServer.SendTelemetryReport(CMD_WRITE, timeSpent); + } + // we don't want write id's to pollute stats + startTime = microseconds(); + OpCounters::WriteDuration(timeSpent); + } + + delete dataBuf; + if (rop) { + rop->wop = 0; + // rop->dataBuf can be non null when read completes but WriteChunk + // fails, and returns before using this buff. + // Read op destructor deletes dataBuf. + delete rop; + } +} + +WriteIdAllocOp::~WriteIdAllocOp() +{ + delete fwdedOp; +} + +WritePrepareOp::~WritePrepareOp() +{ + // on a successful prepare, dataBuf should be moved to a write op. + assert(status != 0 || ! 
dataBuf); + + delete dataBuf; + delete writeFwdOp; + delete writeOp; +} + +WriteSyncOp::~WriteSyncOp() +{ + delete fwdedOp; + delete writeOp; +} + +void +LeaseRenewOp::Request(ostream &os) +{ + os << "LEASE_RENEW\r\n"; + os << "Version: " << KFS_VERSION_STR << "\r\n"; + os << "Cseq: " << seq << "\r\n"; + os << "Chunk-handle: " << chunkId << "\r\n"; + os << "Lease-id: " << leaseId << "\r\n"; + os << "Lease-type: " << leaseType << "\r\n\r\n"; +} + +int +LeaseRenewOp::HandleDone(int code, void *data) +{ + assert(data == this && clnt); + return clnt->HandleEvent(EVENT_CMD_DONE, data); +} + +void +LeaseRelinquishOp::Request(ostream &os) +{ + os << "LEASE_RELINQUISH\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Lease-id: " << leaseId << "\r\n" + "Lease-type: " << leaseType << "\r\n" + ; + if (chunkSize >= 0) { + os << "Chunk-size: " << chunkSize << "\r\n"; + } + if (hasChecksum) { + os << "Chunk-checksum: " << chunkChecksum << "\r\n"; + } + os << "\r\n"; +} + +int +LeaseRelinquishOp::HandleDone(int code, void *data) +{ + assert(code == EVENT_CMD_DONE && data == this); + delete this; + return 0; +} + +void +CorruptChunkOp::Request(ostream &os) +{ + os << + "CORRUPT_CHUNK\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + "File-handle: " << fid << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Is-chunk-lost: " << (isChunkLost ? 1 : 0) << "\r\n" + ; + if (noReply) { + os << "No-reply: 1\r\n"; + } + if (! chunkDir.empty()) { + os << + "Chunk-dir: " << chunkDir << "\r\n" + "Dir-ok: " << (dirOkFlag ? 
1 : 0) << "\r\n" + ; + } + os << "\r\n"; +} + +int +CorruptChunkOp::HandleDone(int code, void *data) +{ + UnRef(); + return 0; +} + +void +EvacuateChunksOp::Request(ostream &os) +{ + assert(numChunks <= kMaxChunkIds); + + os << + "EVACUATE_CHUNK\r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + ; + if (totalSpace >= 0) { + os << "Total-space: " << totalSpace << "\r\n"; + } + if (usedSpace >= 0) { + os << "Used-space: " << usedSpace << "\r\n"; + } + if (chunkDirs >= 0) { + os << "Num-drives: " << chunkDirs << "\r\n"; + } + if (writableChunkDirs >= 0) { + os << "Num-wr-drives: " << writableChunkDirs << "\r\n"; + } + if (evacuateInFlightCount >= 0) { + os << "Num-evacuate: " << evacuateInFlightCount << "\r\n"; + } + os << "Chunk-ids:"; + for (int i = 0; i < numChunks; i++) { + os << " " << chunkIds[i]; + } + os << "\r\n\r\n"; +} + +class PrintChunkInfo { + ostringstream &os; +public: + PrintChunkInfo(ostringstream &o) : os(o) { } + void operator() (ChunkInfo_t &c) { + os << c.fileId << ' '; + os << c.chunkId << ' '; + os << c.chunkVersion << ' '; + } +}; + +void +HelloMetaOp::Request(ostream &os) +{ + os << + "HELLO \r\n" + "Version: " << KFS_VERSION_STR << "\r\n" + "Cseq: " << seq << "\r\n" + "Chunk-server-name: " << myLocation.hostname << "\r\n" + "Chunk-server-port: " << myLocation.port << "\r\n" + "Cluster-key: " << clusterKey << "\r\n" + "MD5Sum: " << md5sum << "\r\n" + "Rack-id: " << rackId << "\r\n" + "Total-space: " << totalSpace << "\r\n" + "Total-fs-space: " << totalFsSpace << "\r\n" + "Used-space: " << usedSpace << "\r\n" + "Uptime: " << globalNetManager().UpTime() << "\r\n" + "Num-chunks: " << chunks.size() << "\r\n" + "Num-not-stable-append-chunks: " << notStableAppendChunks.size() << "\r\n" + "Num-not-stable-chunks: " << notStableChunks.size() << "\r\n" + "Num-appends-with-wids: " << + gAtomicRecordAppendManager.GetAppendersWithWidCount() << "\r\n" + "Num-re-replications: " << Replicator::GetNumReplications() << "\r\n" + 
"Stale-chunks-hex-format: 1\r\n" + "Content-int-base: 16\r\n" + ; + ostringstream chunkInfo; + chunkInfo << hex; + // figure out the content-length first... + for_each(chunks.begin(), chunks.end(), PrintChunkInfo(chunkInfo)); + for_each(notStableAppendChunks.begin(), notStableAppendChunks.end(), PrintChunkInfo(chunkInfo)); + for_each(notStableChunks.begin(), notStableChunks.end(), PrintChunkInfo(chunkInfo)); + + os << "Content-length: " << chunkInfo.str().length() << "\r\n\r\n"; + os << chunkInfo.str(); +} + +void +SetProperties::Request(std::ostream &os) +{ + string content; + properties.getList(content, ""); + contentLength = content.length(); + os << "CMD_SET_PROPERTIES \r\n"; + os << "Version: " << KFS_VERSION_STR << "\r\n"; + os << "Cseq: " << seq << "\r\n"; + os << "Content-length: " << contentLength << "\r\n\r\n"; + os << content; +} + +bool +SetProperties::ParseContent(istream& is) +{ + properties.clear(); + status = min(0, properties.loadProperties(is, '=', false)); + if (status != 0) { + statusMsg = "failed to parse properties"; + } + return (status == 0); +} + +void +SetProperties::Execute() +{ + if (status == 0) { + if (! MsgLogger::GetLogger()) { + status = -ENOENT; + statusMsg = "no logger"; + } else { + MsgLogger::GetLogger()->SetParameters( + properties, "chunkServer.msgLogWriter."); + gMetaServerSM.SetParameters(properties); + gChunkManager.SetParameters(properties); + } + } + gLogger.Submit(this); +} + +string RestartChunkServer(); + +void +RestartChunkServerOp::Execute() +{ + statusMsg = RestartChunkServer(); + status = statusMsg.empty() ? 
0 : -1; + gLogger.Submit(this); +} + +void +KillRemoteSyncOp::Execute() +{ + RemoteSyncSM* remoteSyncSM = static_cast(clnt); + assert(remoteSyncSM); + + remoteSyncSM->Finish(); +} + +void +HelloMetaOp::Execute() +{ + int chunkDirs = 0; + int numEvacuateInFlight = 0; + int numWritableChunkDirs = 0; + int evacuateChunks = 0; + int64_t evacuateByteCount = 0; + totalFsSpace = 0; + totalSpace = gChunkManager.GetTotalSpace( + totalFsSpace, chunkDirs, numEvacuateInFlight, numWritableChunkDirs, + evacuateChunks, evacuateByteCount, 0, 0, &lostChunkDirs); + usedSpace = gChunkManager.GetUsedSpace(); + gChunkManager.GetHostedChunks( + chunks, notStableChunks, notStableAppendChunks); + status = 0; + gLogger.Submit(this); +} + +} diff --git a/src/cc/chunk/KfsOps.h b/src/cc/chunk/KfsOps.h new file mode 100644 index 000000000..bcb84e694 --- /dev/null +++ b/src/cc/chunk/KfsOps.h @@ -0,0 +1,1849 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/05/26 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Declarations for the various Chunkserver ops and RPCs. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef _CHUNKSERVER_KFSOPS_H +#define _CHUNKSERVER_KFSOPS_H + +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/IOBuffer.h" +#include "kfsio/event.h" + +#include "common/Properties.h" +#include "common/kfsdecls.h" +#include "common/time.h" +#include "common/StBuffer.h" +#include "Chunk.h" +#include "DiskIo.h" +#include "RemoteSyncSM.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace KFS +{ + +using std::string; +using std::vector; +using std::set; +using std::list; +using std::ostream; +using std::istream; +using std::ostringstream; + +enum KfsOp_t { + CMD_UNKNOWN, + // Meta server->Chunk server ops + CMD_ALLOC_CHUNK, + CMD_DELETE_CHUNK, + CMD_TRUNCATE_CHUNK, + CMD_REPLICATE_CHUNK, + CMD_CHANGE_CHUNK_VERS, + CMD_BEGIN_MAKE_CHUNK_STABLE, + CMD_MAKE_CHUNK_STABLE, + CMD_COALESCE_BLOCK, + CMD_HEARTBEAT, + CMD_STALE_CHUNKS, + CMD_RETIRE, + // Chunk server->Meta server ops + CMD_META_HELLO, + CMD_CORRUPT_CHUNK, + CMD_LEASE_RENEW, + CMD_LEASE_RELINQUISH, + + // Client -> Chunkserver ops + CMD_SYNC, + CMD_OPEN, + CMD_CLOSE, + CMD_READ, + CMD_WRITE_ID_ALLOC, + CMD_WRITE_PREPARE, + CMD_WRITE_PREPARE_FWD, + CMD_WRITE_SYNC, + CMD_SIZE, + // RPCs support for record append: client reserves space and sends + // us records; the client can also free reserved space + CMD_RECORD_APPEND, + CMD_SPC_RESERVE, + CMD_SPC_RELEASE, + CMD_GET_RECORD_APPEND_STATUS, + // when data is loaded KFS, we need a way to verify that what was + // copied in matches the source. 
analogous to md5 model, client + // can issue this RPC and get the checksums stored for a chunk; + // the client can comptue checksum on input data and verify that + // they both match + CMD_GET_CHUNK_METADATA, + // Monitoring ops + CMD_PING, + CMD_STATS, + CMD_DUMP_CHUNKMAP, + // Internally generated ops + CMD_CHECKPOINT, + CMD_WRITE, + CMD_WRITE_CHUNKMETA, // write out the chunk meta-data + CMD_READ_CHUNKMETA, // read out the chunk meta-data + // op sent by the network thread to event thread to kill a + // "RemoteSyncSM". + CMD_KILL_REMOTE_SYNC, + // this op is to periodically "kick" the event processor thread + CMD_TIMEOUT, + // op to signal the disk manager that some disk I/O has finished + CMD_DISKIO_COMPLETION, + CMD_SET_PROPERTIES, + CMD_RESTART_CHUNK_SERVER, + CMD_EVACUATE_CHUNKS, + CMD_NCMDS +}; + +enum OpType_t { + OP_REQUEST, + OP_RESPONSE +}; + +class ClientSM; + +struct KfsOp : public KfsCallbackObj { + const KfsOp_t op; + OpType_t type; + kfsSeq_t seq; + int32_t status; + bool cancelled:1; + bool done:1; + bool noReply:1; + bool noRetry:1; + bool clientSMFlag:1; + string statusMsg; // output, optional, mostly for debugging + KfsCallbackObj* clnt; + // keep statistics + int64_t startTime; + + KfsOp(KfsOp_t o, kfsSeq_t s, KfsCallbackObj *c = 0) + : op(o), + type(OP_REQUEST), + seq(s), + status(0), + cancelled(false), + done(false), + noReply(false), + noRetry(false), + clientSMFlag(false), + statusMsg(), + clnt(c), + startTime(microseconds()) + { + SET_HANDLER(this, &KfsOp::HandleDone); + sOpsCount++; + } + void Cancel() { + cancelled = true; + } + // to allow dynamic-type-casting, make the destructor virtual + virtual ~KfsOp(); + virtual void Request(ostream& /* os */) { + // fill this method if the op requires a message to be sent to a server. + }; + // After an op finishes execution, this method generates the + // response that should be sent back to the client. The response + // string that is generated is based on the KFS protocol. 
+ virtual void Response(ostream& os); + virtual void ResponseContent(IOBuffer*& buf, int& size) { + buf = 0; + size = 0; + } + virtual void Execute() = 0; + // Return info. about op for debugging + virtual string Show() const = 0; + // If the execution of an op suspends and then resumes and + // finishes, this method should be invoked to signify completion. + virtual int HandleDone(int code, void *data); + virtual int GetContentLength() const { return 0; } + virtual bool ParseContent(istream& is) { return true; } + virtual bool IsChunkReadOp( + int64_t& /* numBytes */, kfsChunkId_t& /* chunkId */) { return false; } + static int64_t GetOpsCount() { return sOpsCount; } + bool ValidateRequestHeader( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen, + bool hasChecksum, + uint32_t checksum) + { + return (! hasChecksum || + Checksum(name, nameLen, header, headerLen) == checksum); + } + bool Validate() { return true; } + ClientSM* GetClientSM(); + static uint32_t Checksum( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen); + template static T& ParserDef(T& parser) + { + return parser + .Def("Cseq", &KfsOp::seq, kfsSeq_t(-1)) + ; + } +private: + static int64_t sOpsCount; +}; + +// +// Model used in all the c'tor's of the ops: we do minimal +// initialization and primarily init the fields that are used for +// output. The fields that are "input" are set when they are parsed +// from the input stream. 
+// +struct AllocChunkOp : public KfsOp { + kfsFileId_t fileId; // input + kfsChunkId_t chunkId; // input + int64_t chunkVersion; // input + int64_t leaseId; // input + bool appendFlag; // input + string servers; // input + uint32_t numServers; + bool mustExistFlag; + AllocChunkOp(kfsSeq_t s = 0) + : KfsOp(CMD_ALLOC_CHUNK, s), + fileId(-1), + chunkId(-1), + chunkVersion(-1), + leaseId(-1), + appendFlag(false), + servers(), + numServers(0), + mustExistFlag(false) + { + // All inputs will be parsed in + } + void Execute(); + // handlers for reading/writing out the chunk meta-data + int HandleChunkMetaReadDone(int code, void *data); + int HandleChunkAllocDone(int code, void *data); + string Show() const { + ostringstream os; + os << + "alloc-chunk:" + " seq: " << seq << + " fileid: " << fileId << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " leaseid: " << leaseId << + " append: " << (appendFlag ? 1 : 0) + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &AllocChunkOp::fileId, kfsFileId_t(-1)) + .Def("Chunk-handle", &AllocChunkOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &AllocChunkOp::chunkVersion, int64_t(-1)) + .Def("Lease-id", &AllocChunkOp::leaseId, int64_t(-1)) + .Def("Chunk-append", &AllocChunkOp::appendFlag, false) + .Def("Num-servers", &AllocChunkOp::numServers) + .Def("Servers", &AllocChunkOp::servers) + ; + } +}; + +struct BeginMakeChunkStableOp : public KfsOp { + kfsFileId_t fileId; // input + kfsChunkId_t chunkId; // input + int64_t chunkVersion; // input + int64_t chunkSize; // output + uint32_t chunkChecksum; // output + BeginMakeChunkStableOp* next; + BeginMakeChunkStableOp(kfsSeq_t s = 0) + : KfsOp(CMD_BEGIN_MAKE_CHUNK_STABLE, s), + fileId(-1), + chunkId(-1), + chunkVersion(-1), + chunkSize(-1), + chunkChecksum(0), + next(0) + {} + void Execute(); + void Response(ostream &os); + string Show() const { + ostringstream os; + os << 
"begin-make-chunk-stable:" + " seq: " << seq << + " fileid: " << fileId << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " size: " << chunkSize << + " checksum: " << chunkChecksum + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &BeginMakeChunkStableOp::fileId, kfsFileId_t(-1)) + .Def("Chunk-handle", &BeginMakeChunkStableOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &BeginMakeChunkStableOp::chunkVersion, int64_t(-1)) + ; + } +}; + +struct MakeChunkStableOp : public KfsOp { + kfsFileId_t fileId; // input + kfsChunkId_t chunkId; // input + int64_t chunkVersion; // input + int64_t chunkSize; // input + uint32_t chunkChecksum; // input + bool hasChecksum; + StringBufT<32> checksumStr; + MakeChunkStableOp* next; + MakeChunkStableOp(kfsSeq_t s = 0) + : KfsOp(CMD_MAKE_CHUNK_STABLE, s), + fileId(-1), + chunkId(-1), + chunkVersion(-1), + chunkSize(-1), + chunkChecksum(0), + hasChecksum(false), + checksumStr(), + next(0) + {} + void Execute(); + int HandleChunkMetaReadDone(int code, void *data); + int HandleMakeStableDone(int code, void *data); + string Show() const { + ostringstream os; + + os << "make-chunk-stable:" + " seq: " << seq << + " fileid: " << fileId << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " chunksize: " << chunkSize << + " checksum: " << chunkChecksum << + " has-checksum: " << (hasChecksum ? "yes" : "no") + ; + return os.str(); + } + // generic response from KfsOp works.. 
+ bool Validate(); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &MakeChunkStableOp::fileId, kfsFileId_t(-1)) + .Def("Chunk-handle", &MakeChunkStableOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &MakeChunkStableOp::chunkVersion, int64_t(-1)) + .Def("Chunk-size", &MakeChunkStableOp::chunkSize, int64_t(-1)) + .Def("Chunk-checksum", &MakeChunkStableOp::checksumStr) + ; + } +}; + +struct ChangeChunkVersOp : public KfsOp { + kfsFileId_t fileId; // input + kfsChunkId_t chunkId; // input + int64_t chunkVersion; // input + int64_t fromChunkVersion; // input + bool makeStableFlag; + ChangeChunkVersOp(kfsSeq_t s = 0) + : KfsOp(CMD_CHANGE_CHUNK_VERS, s), + fileId(-1), + chunkId(-1), + chunkVersion(-1), + fromChunkVersion(-1), + makeStableFlag(false) + {} + void Execute(); + // handler for reading in the chunk meta-data + int HandleChunkMetaReadDone(int code, void *data); + int HandleChunkMetaWriteDone(int code, void *data); + string Show() const { + ostringstream os; + + os << "change-chunk-vers:" + " fileid: " << fileId << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " make stable: " << makeStableFlag + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &ChangeChunkVersOp::fileId, kfsFileId_t(-1)) + .Def("Chunk-handle", &ChangeChunkVersOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &ChangeChunkVersOp::chunkVersion, int64_t(-1)) + .Def("From-chunk-version", &ChangeChunkVersOp::fromChunkVersion, int64_t(-1)) + .Def("Make-stable", &ChangeChunkVersOp::makeStableFlag, false) + ; + } +}; + +struct DeleteChunkOp : public KfsOp { + kfsChunkId_t chunkId; // input + DeleteChunkOp(kfsSeq_t s = 0) + : KfsOp(CMD_DELETE_CHUNK, s), + chunkId(-1) + {} + void Execute(); + string Show() const { + ostringstream os; + + os << "delete-chunk: chunkid: " << chunkId; + return os.str(); + } + template static T& ParserDef(T& parser) 
+ { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &DeleteChunkOp::chunkId, kfsChunkId_t(-1)) + ; + } +}; + +struct TruncateChunkOp : public KfsOp { + kfsChunkId_t chunkId; // input + size_t chunkSize; // size to which file should be truncated to + TruncateChunkOp(kfsSeq_t s = 0) + : KfsOp(CMD_TRUNCATE_CHUNK, s), + chunkId(-1), + chunkSize(0) + {} + void Execute(); + // handler for reading in the chunk meta-data + int HandleChunkMetaReadDone(int code, void *data); + int HandleChunkMetaWriteDone(int code, void *data); + string Show() const { + ostringstream os; + + os << "truncate-chunk:" + " chunkid: " << chunkId << + " chunksize: " << chunkSize + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &TruncateChunkOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-size", &TruncateChunkOp::chunkSize, size_t(0)) + ; + } +}; + +// Op for replicating the chunk. The metaserver is asking this +// chunkserver to create a copy of a chunk. We replicate the chunk +// and then notify the server upon completion. 
+// +struct ReplicateChunkOp : public KfsOp { + kfsFileId_t fid; // input + kfsChunkId_t chunkId; // input + ServerLocation location; // input: where to get the chunk from + int64_t chunkVersion; // io: we tell the metaserver what we replicated + int64_t fileSize; + int64_t chunkOffset; + int16_t striperType; + int16_t numStripes; + int16_t numRecoveryStripes; + int32_t stripeSize; + string pathName; + string invalidStripeIdx; + int metaPort; + StringBufT<64> locationStr; + ReplicateChunkOp(kfsSeq_t s = 0) : + KfsOp(CMD_REPLICATE_CHUNK, s), + fid(-1), + chunkId(-1), + location(), + chunkVersion(-1), + fileSize(-1), + chunkOffset(-1), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + numStripes(0), + numRecoveryStripes(0), + stripeSize(0), + pathName(), + invalidStripeIdx(), + metaPort(-1), + locationStr() + {} + void Execute(); + void Response(ostream &os); + string Show() const { + ostringstream os; + + os << "replicate-chunk:" << + " fid: " << fid << + " chunkid: " << chunkId << + " version: " << chunkVersion << + " offset: " << chunkOffset << + " stiper: " << striperType << + " dstripes: " << numStripes << + " rstripes: " << numRecoveryStripes << + " ssize: " << stripeSize << + " fsize: " << fileSize << + " fname: " << pathName << + " invals: " << invalidStripeIdx + ; + return os.str(); + } + bool Validate() + { + if (locationStr.empty()) { + location.port = metaPort; + } else { + location.FromString(locationStr.GetStr()); + } + return true; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &ReplicateChunkOp::fid, kfsFileId_t(-1)) + .Def("Chunk-handle", &ReplicateChunkOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &ReplicateChunkOp::chunkVersion, int64_t(-1)) + .Def("Chunk-location", &ReplicateChunkOp::locationStr) + .Def("Meta-port", &ReplicateChunkOp::metaPort, int(-1)) + .Def("Chunk-offset", &ReplicateChunkOp::chunkOffset, int64_t(-1)) + .Def("Striper-type", &ReplicateChunkOp::striperType, 
int16_t(KFS_STRIPED_FILE_TYPE_NONE)) + .Def("Num-stripes", &ReplicateChunkOp::numStripes) + .Def("Num-recovery-stripes", &ReplicateChunkOp::numRecoveryStripes) + .Def("Stripe-size", &ReplicateChunkOp::stripeSize) + .Def("Pathname", &ReplicateChunkOp::pathName) + .Def("File-size", &ReplicateChunkOp::fileSize, int64_t(-1)) + ; + } +}; + +struct HeartbeatOp : public KfsOp { + int64_t metaEvacuateCount; // input + ostringstream response; + ostringstream cmdShow; + HeartbeatOp(kfsSeq_t s = 0) + : KfsOp(CMD_HEARTBEAT, s), + metaEvacuateCount(-1), + response(), + cmdShow() + { cmdShow << "meta-heartbeat:"; } + void Execute(); + void Response(ostream &os); + string Show() const { + return cmdShow.str(); + } + template void Append(const char* key1, const char* key2, T val); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Num-evacuate", &HeartbeatOp::metaEvacuateCount, int64_t(-1)) + ; + } +}; + +struct StaleChunksOp : public KfsOp { + typedef vector StaleChunkIds; + int contentLength; /* length of data that identifies the stale chunks */ + int numStaleChunks; /* what the server tells us */ + bool evacuatedFlag; + bool hexFormatFlag; + StaleChunkIds staleChunkIds; /* data we parse out */ + StaleChunksOp(kfsSeq_t s = 0) + : KfsOp(CMD_STALE_CHUNKS, s), + contentLength(0), + numStaleChunks(0), + evacuatedFlag(false), + hexFormatFlag(false), + staleChunkIds() + {} + void Execute(); + string Show() const { + ostringstream os; + + os << "stale chunks: count: " << numStaleChunks << + " evacuated: " << evacuatedFlag; + return os.str(); + } + virtual int GetContentLength() const { return contentLength; } + virtual bool ParseContent(istream& is); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Content-length", &StaleChunksOp::contentLength) + .Def("Num-chunks", &StaleChunksOp::numStaleChunks) + .Def("Evacuated", &StaleChunksOp::evacuatedFlag, false) + .Def("HexFormat", &StaleChunksOp::hexFormatFlag, false) 
+ ; + } +}; + +struct RetireOp : public KfsOp { + RetireOp(kfsSeq_t s = 0) + : KfsOp(CMD_RETIRE, s) + {} + void Execute(); + string Show() const { + return "meta-server is telling us to retire"; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + ; + } +}; + +struct OpenOp : public KfsOp { + kfsChunkId_t chunkId; // input + int openFlags; // either O_RDONLY, O_WRONLY + StringBufT<64> intentStr; + OpenOp(kfsSeq_t s = 0) + : KfsOp(CMD_OPEN, s), + chunkId(-1), + openFlags(0), + intentStr() + {} + void Execute(); + string Show() const { + ostringstream os; + + os << "open: chunkId: " << chunkId; + return os.str(); + } + bool Valudate() + { + openFlags = O_RDWR; + return true; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &OpenOp::chunkId, kfsChunkId_t(-1)) + .Def("Intent", &OpenOp::intentStr) + ; + } +}; + +struct CloseOp : public KfsOp { + kfsChunkId_t chunkId; // input + uint32_t numServers; // input + bool needAck; // input: when set, this RPC is ack'ed + bool hasWriteId; // input + int64_t masterCommitted; // input + string servers; // input: set of servers on which to chunk is to be closed + CloseOp(kfsSeq_t s = 0, const CloseOp* op = 0) + : KfsOp(CMD_CLOSE, s), + chunkId (op ? op->chunkId : (kfsChunkId_t)-1), + numServers (op ? op->numServers : 0u), + needAck (op ? op->needAck : true), + hasWriteId (op ? op->hasWriteId : false), + masterCommitted(op ? op->masterCommitted : (int64_t)-1), + servers (op ? 
op->servers : string()) + {} + void Execute(); + string Show() const { + ostringstream os; + os << + "close:" + " chunkId: " << chunkId << + " num-servers: " << numServers << + " servers: " << servers << + " need-ack: " << needAck << + " has-write-id: " << hasWriteId << + " mater-committed: " << masterCommitted + ; + return os.str(); + } + // if there was a daisy chain for this chunk, forward the close down the chain + void Request(ostream &os); + void Response(ostream &os) { + if (needAck) { + KfsOp::Response(os); + } + } + void ForwardToPeer(const ServerLocation &loc); + int HandlePeerReply(int code, void *data); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &CloseOp::chunkId, kfsChunkId_t(-1)) + .Def("Num-servers", &CloseOp::numServers) + .Def("Servers", &CloseOp::servers) + .Def("Need-ack", &CloseOp::needAck, true) + .Def("Has-write-id", &CloseOp::hasWriteId, false) + .Def("Master-committed", &CloseOp::masterCommitted, int64_t(-1)) + ; + } +}; + +struct ReadOp; +struct WriteOp; +struct WriteSyncOp; +struct WritePrepareFwdOp; + +// support for record appends +struct RecordAppendOp : public KfsOp { + kfsSeq_t clientSeq; /* input */ + kfsChunkId_t chunkId; /* input */ + int64_t chunkVersion; /* input */ + size_t numBytes; /* input */ + int64_t writeId; /* value for the local parsed out of servers string */ + int64_t offset; /* input: offset as far as the transaction is concerned */ + int64_t fileOffset; /* value set by the head of the daisy chain */ + uint32_t numServers; /* input */ + uint32_t checksum; /* input: as computed by the sender; 0 means sender didn't send */ + string servers; /* input: set of servers on which to write */ + int64_t masterCommittedOffset; /* input piggy back master's ack to slave */ + StringBufT<32> clientSeqStr; + IOBuffer dataBuf; /* buffer with the data to be written */ + /* + * when a record append is to be fwd'ed along a daisy chain, + * this field stores the original op 
client. + */ + KfsCallbackObj* origClnt; + kfsSeq_t origSeq; + time_t replicationStartTime; + RecordAppendOp* mPrevPtr[1]; + RecordAppendOp* mNextPtr[1]; + + RecordAppendOp(kfsSeq_t s = 0); + virtual ~RecordAppendOp(); + + void Request(ostream &os); + void Response(ostream &os); + void Execute(); + string Show() const; + bool Validate(); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &RecordAppendOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &RecordAppendOp::chunkVersion, int64_t(-1)) + .Def("Offset", &RecordAppendOp::offset, int64_t(-1)) + .Def("File-offset", &RecordAppendOp::fileOffset, int64_t(-1)) + .Def("Num-bytes", &RecordAppendOp::numBytes) + .Def("Num-servers", &RecordAppendOp::numServers) + .Def("Servers", &RecordAppendOp::servers) + .Def("Checksum", &RecordAppendOp::checksum) + .Def("Client-cseq", &RecordAppendOp::clientSeqStr) + .Def("Master-committed", &RecordAppendOp::masterCommittedOffset, int64_t(-1)) + ; + } +}; + +struct GetRecordAppendOpStatus : public KfsOp +{ + kfsChunkId_t chunkId; // input + int64_t writeId; // input + kfsSeq_t opSeq; // output + int64_t chunkVersion; + int64_t opOffset; + size_t opLength; + int opStatus; + size_t widAppendCount; + size_t widBytesReserved; + size_t chunkBytesReserved; + int64_t remainingLeaseTime; + int64_t masterCommitOffset; + int64_t nextCommitOffset; + int appenderState; + const char* appenderStateStr; + bool masterFlag; + bool stableFlag; + bool openForAppendFlag; + bool widWasReadOnlyFlag; + bool widReadOnlyFlag; + + GetRecordAppendOpStatus(kfsSeq_t s = 0) + : KfsOp(CMD_GET_RECORD_APPEND_STATUS, s), + chunkId(-1), + writeId(-1), + opSeq(-1), + chunkVersion(-1), + opOffset(-1), + opLength(0), + opStatus(-1), + widAppendCount(0), + widBytesReserved(0), + chunkBytesReserved(0), + remainingLeaseTime(0), + masterCommitOffset(-1), + nextCommitOffset(-1), + appenderState(0), + appenderStateStr(""), + masterFlag(false), + stableFlag(false), + 
openForAppendFlag(false), + widWasReadOnlyFlag(false), + widReadOnlyFlag(false) + {} + void Request(ostream &os); + void Response(ostream &os); + void Execute(); + string Show() const + { + ostringstream os; + os << "get-record-append-op-status:" + " seq: " << seq << + " chunkId: " << chunkId << + " writeId: " << writeId << + " status: " << status << + " op-seq: " << opSeq << + " op-status: " << opStatus << + " wid: " << (widReadOnlyFlag ? "ro" : "w") + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &GetRecordAppendOpStatus::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &GetRecordAppendOpStatus::chunkVersion, int64_t(-1)) + .Def("Write-id", &GetRecordAppendOpStatus::writeId, int64_t(-1)) + ; + } +}; + +struct WriteIdAllocOp : public KfsOp { + kfsSeq_t clientSeq; /* input */ + kfsChunkId_t chunkId; + int64_t chunkVersion; + int64_t offset; /* input */ + size_t numBytes; /* input */ + int64_t writeId; /* output */ + string writeIdStr; /* output */ + uint32_t numServers; /* input */ + string servers; /* input: set of servers on which to write */ + WriteIdAllocOp* fwdedOp; /* if we did any fwd'ing, this is the op that tracks it */ + bool isForRecordAppend; /* set if the write-id-alloc is for a record append that will follow */ + bool writePrepareReplyFlag; /* write prepare reply supported */ + StringBufT<32> clientSeqStr; + RemoteSyncSMPtr appendPeer; + + WriteIdAllocOp(kfsSeq_t s = 0) + : KfsOp(CMD_WRITE_ID_ALLOC, s), + clientSeq(-1), + chunkId(-1), + chunkVersion(-1), + offset(0), + numBytes(0), + writeId(-1), + writeIdStr(), + numServers(0), + servers(), + fwdedOp(0), + isForRecordAppend(false), + writePrepareReplyFlag(true), + clientSeqStr(), + appendPeer() + { + SET_HANDLER(this, &WriteIdAllocOp::Done); + } + WriteIdAllocOp(kfsSeq_t s, const WriteIdAllocOp& other) + : KfsOp(CMD_WRITE_ID_ALLOC, s), + clientSeq(other.clientSeq), + chunkId(other.chunkId), + 
chunkVersion(other.chunkVersion), + offset(other.offset), + numBytes(other.numBytes), + writeId(-1), + numServers(other.numServers), + servers(other.servers), + fwdedOp(0), + isForRecordAppend(other.isForRecordAppend), + writePrepareReplyFlag(other.writePrepareReplyFlag), + clientSeqStr(), + appendPeer() + {} + ~WriteIdAllocOp(); + + void Request(ostream &os); + void Response(ostream &os); + void Execute(); + // should the chunk metadata get paged out, then we use the + // write-id alloc op as a hint to page the data back in---writes + // are coming. + void ReadChunkMetadata(); + + int ForwardToPeer(const ServerLocation &peer); + int HandlePeerReply(int code, void *data); + int Done(int code, void *data); + + string Show() const { + ostringstream os; + + os << "write-id-alloc:" + " seq: " << seq << + " client-seq: " << clientSeq << + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " servers: " << servers << + " status: " << status << + " msg: " << statusMsg + ; + return os.str(); + } + bool Validate(); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &WriteIdAllocOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &WriteIdAllocOp::chunkVersion, int64_t(-1)) + .Def("Offset", &WriteIdAllocOp::offset) + .Def("Num-bytes", &WriteIdAllocOp::numBytes) + .Def("Num-servers", &WriteIdAllocOp::numServers) + .Def("Servers", &WriteIdAllocOp::servers) + .Def("For-record-append", &WriteIdAllocOp::isForRecordAppend, false) + .Def("Client-cseq", &WriteIdAllocOp::clientSeqStr) + .Def("Write-prepare-reply", &WriteIdAllocOp::writePrepareReplyFlag) + ; + } +}; + +struct WritePrepareOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + int64_t offset; /* input */ + size_t numBytes; /* input */ + int64_t writeId; /* value for the local server */ + uint32_t numServers; /* input */ + uint32_t checksum; /* input: as computed by the sender; 0 means sender didn't send */ + string servers; /* input: 
set of servers on which to write */ + bool replyRequestedFlag; + IOBuffer* dataBuf; /* buffer with the data to be written */ + WritePrepareFwdOp* writeFwdOp; /* op that tracks the data we fwd'ed to a peer */ + WriteOp* writeOp; /* the underlying write that is queued up locally */ + uint32_t numDone; // if we did forwarding, we wait for + // local/remote to be done; otherwise, we only + // wait for local to be done + WritePrepareOp(kfsSeq_t s = 0) + : KfsOp(CMD_WRITE_PREPARE, s), + chunkId(-1), + chunkVersion(-1), + offset(0), + numBytes(0), + writeId(-1), + numServers(0), + checksum(0), + servers(), + replyRequestedFlag(false), + dataBuf(0), + writeFwdOp(0), + writeOp(0), + numDone(0) + { SET_HANDLER(this, &WritePrepareOp::Done); } + ~WritePrepareOp(); + + void Response(ostream &os); + void Execute(); + + int ForwardToPeer(const ServerLocation& peer); + int Done(int code, void *data); + + string Show() const { + ostringstream os; + + os << "write-prepare:" + " seq: " << seq << + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes + ; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &WritePrepareOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &WritePrepareOp::chunkVersion, int64_t(-1)) + .Def("Offset", &WritePrepareOp::offset) + .Def("Num-bytes", &WritePrepareOp::numBytes) + .Def("Num-servers", &WritePrepareOp::numServers) + .Def("Servers", &WritePrepareOp::servers) + .Def("Checksum", &WritePrepareOp::checksum) + .Def("Reply", &WritePrepareOp::replyRequestedFlag) + ; + } +}; + +struct WritePrepareFwdOp : public KfsOp { + const WritePrepareOp& owner; + WritePrepareFwdOp(WritePrepareOp& o) + : KfsOp(CMD_WRITE_PREPARE_FWD, 0), + owner(o) + {} + void Request(ostream &os); + // nothing to do...we send the data to peer and wait. have a + // decl. 
to keep compiler happy + void Execute() {} + + string Show() const { + return ("write-prepare-fwd: " + owner.Show()); + } +}; + +struct WriteOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + int64_t offset; /* input */ + size_t numBytes; /* input */ + ssize_t numBytesIO; /* output: # of bytes actually written */ + DiskIoPtr diskIo; /* disk connection used for writing data */ + IOBuffer* dataBuf; /* buffer with the data to be written */ + vector checksums; /* store the checksum for logging purposes */ + /* + * for writes that are smaller than a checksum block, we need to + * read the whole block in, compute the new checksum and then write + * out data. This buffer holds the data read in from disk. + */ + ReadOp *rop; + /* + * The owning write prepare op + */ + WritePrepareOp *wpop; + /* + * Should we wait for aio_sync() to finish before replying to + * upstream clients? By default, we don't + */ + bool waitForSyncDone; + /* Set if the write was triggered due to re-replication */ + bool isFromReReplication; + // Set if the write is from a record append + bool isFromRecordAppend; + // for statistics purposes, have a "holder" op that tracks how long it took a write to finish. 
+ bool isWriteIdHolder; + int64_t writeId; + // time at which the write was enqueued at the ChunkManager + time_t enqueueTime; + + WriteOp(kfsChunkId_t c, int64_t v) : + KfsOp(CMD_WRITE, 0), chunkId(c), chunkVersion(v), + offset(0), numBytes(0), numBytesIO(0), + dataBuf(NULL), rop(NULL), wpop(NULL), waitForSyncDone(false), + isFromReReplication(false), isFromRecordAppend(false), + isWriteIdHolder(false) + { + SET_HANDLER(this, &WriteOp::HandleWriteDone); + } + + WriteOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, int64_t o, size_t n, + IOBuffer *b, int64_t id) : + KfsOp(CMD_WRITE, s), chunkId(c), chunkVersion(v), + offset(o), numBytes(n), numBytesIO(0), + dataBuf(b), rop(NULL), wpop(NULL), + waitForSyncDone(false), isFromReReplication(false), + isFromRecordAppend(false), + isWriteIdHolder(false), writeId(id) + { + SET_HANDLER(this, &WriteOp::HandleWriteDone); + } + ~WriteOp(); + + void InitForRecordAppend() { + SET_HANDLER(this, &WriteOp::HandleRecordAppendDone); + dataBuf = new IOBuffer(); + isFromRecordAppend = true; + } + + void Reset() { + status = numBytesIO = 0; + SET_HANDLER(this, &WriteOp::HandleWriteDone); + } + void Response(ostream &os) { }; + void Execute(); + + // for record appends, this handler will be called back; on the + // callback, notify the atomic record appender of + // completion status + int HandleRecordAppendDone(int code, void *data); + + int HandleWriteDone(int code, void *data); + int HandleSyncDone(int code, void *data); + int HandleLoggingDone(int code, void *data); + + string Show() const { + ostringstream os; + + os << "write:" + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes + ; + return os.str(); + } +}; + +// sent by the client to force data to disk +struct WriteSyncOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + // what is the range of data we are sync'ing + int64_t offset; /* input */ + size_t numBytes; /* input */ + // sent by the 
chunkmaster to downstream replicas; if there is a + // mismatch, the sync will fail and the client will retry the write + vector checksums; + int64_t writeId; /* corresponds to the local write */ + uint32_t numServers; + string servers; + WriteSyncOp* fwdedOp; + WriteOp* writeOp; // the underlying write that needs to be pushed to disk + uint32_t numDone; // if we did forwarding, we wait for + // local/remote to be done; otherwise, we only + // wait for local to be done + bool writeMaster; // infer from the server list if we are the "master" for doing the writes + int checksumsCnt; + StringBufT<256> checksumsStr; + + WriteSyncOp(kfsSeq_t s = 0, kfsChunkId_t c = -1, + int64_t v = -1, int64_t o = 0, size_t n = 0) + : KfsOp(CMD_WRITE_SYNC, s), + chunkId(c), + chunkVersion(v), + offset(o), + numBytes(n), + writeId(-1), + numServers(0), + fwdedOp(NULL), + writeOp(NULL), + numDone(0), + writeMaster(false), + checksumsCnt(0), + checksumsStr() + { SET_HANDLER(this, &WriteSyncOp::Done); } + ~WriteSyncOp(); + + void Request(ostream &os); + void Execute(); + + int ForwardToPeer(const ServerLocation &peer); + int Done(int code, void *data); + + string Show() const { + ostringstream os; + + os << "write-sync:" + " seq: " << seq << + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes << + " write-ids: " << servers; + return os.str(); + } + bool Validate(); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &WriteSyncOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &WriteSyncOp::chunkVersion, int64_t(-1)) + .Def("Offset", &WriteSyncOp::offset) + .Def("Num-bytes", &WriteSyncOp::numBytes) + .Def("Num-servers", &WriteSyncOp::numServers) + .Def("Servers", &WriteSyncOp::servers) + .Def("Checksum-entries", &WriteSyncOp::checksumsCnt) + .Def("Checksums", &WriteSyncOp::checksumsStr) + ; + } +}; + +struct ReadChunkMetaOp : public KfsOp { + kfsChunkId_t chunkId; + 
DiskIoPtr diskIo; /* disk connection used for reading data */ + + // others ops that are also waiting for this particular meta-data + // read to finish; they'll get notified when the read is done + list waiters; + ReadChunkMetaOp(kfsChunkId_t c, KfsCallbackObj *o) + : KfsOp(CMD_READ_CHUNKMETA, 0, o), + chunkId(c), + diskIo(), + waiters() + { + SET_HANDLER(this, &ReadChunkMetaOp::HandleDone); + } + + void Execute() { } + string Show() const { + ostringstream os; + + os << "read-chunk-meta: chunkid: " << chunkId; + return os.str(); + } + + void AddWaiter(KfsOp *op) { + waiters.push_back(op); + } + // Update internal data structures and then notify the waiting op + // that read of meta-data is done. + int HandleDone(int code, void *data); +}; + +struct GetChunkMetadataOp; + +struct ReadOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + int64_t offset; /* input */ + size_t numBytes; /* input */ + ssize_t numBytesIO; /* output: # of bytes actually read */ + DiskIoPtr diskIo; /* disk connection used for reading data */ + IOBuffer* dataBuf; /* buffer with the data read */ + vector checksum; /* checksum over the data that is sent back to client */ + int64_t diskIOTime; /* how long did the AIOs take */ + string driveName; /* for telemetry, provide the drive info to the client */ + int retryCnt; + /* + * for writes that require the associated checksum block to be + * read in, store the pointer to the associated write op. + */ + WriteOp* wop; + // for getting chunk metadata, we do a data scrub. 
+ GetChunkMetadataOp* scrubOp; + ReadOp(kfsSeq_t s = 0) + : KfsOp(CMD_READ, s), + chunkId(-1), + chunkVersion(-1), + offset(0), + numBytes(0), + numBytesIO(0), + diskIo(), + dataBuf(0), + checksum(), + diskIOTime(0), + driveName(), + retryCnt(0), + wop(0), + scrubOp(0) + { SET_HANDLER(this, &ReadOp::HandleDone); } + ReadOp(WriteOp* w, int64_t o, size_t n) + : KfsOp(CMD_READ, w->seq), + chunkId(w->chunkId), + chunkVersion(w->chunkVersion), + offset(o), + numBytes(n), + numBytesIO(0), + diskIo(), + dataBuf(0), + checksum(), + diskIOTime(0), + driveName(), + retryCnt(0), + wop(w), + scrubOp(0) + { + clnt = w; + SET_HANDLER(this, &ReadOp::HandleDone); + } + ~ReadOp() { + assert(wop == NULL); + delete dataBuf; + } + + void SetScrubOp(GetChunkMetadataOp *sop) { + scrubOp = sop; + SET_HANDLER(this, &ReadOp::HandleScrubReadDone); + } + void Request(ostream &os); + void Response(ostream &os); + void ResponseContent(IOBuffer*& buf, int& size) { + buf = status >= 0 ? dataBuf : 0; + size = buf ? numBytesIO : 0; + } + void Execute(); + int HandleDone(int code, void *data); + // handler for reading in the chunk meta-data + int HandleChunkMetaReadDone(int code, void *data); + // handler for dealing with re-replication events + int HandleReplicatorDone(int code, void *data); + int HandleScrubReadDone(int code, void *data); + string Show() const { + ostringstream os; + + os << "read:" + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes + ; + return os.str(); + } + virtual bool IsChunkReadOp(int64_t& outNumBytes, kfsChunkId_t& outChunkId); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &ReadOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &ReadOp::chunkVersion, int64_t(-1)) + .Def("Offset", &ReadOp::offset) + .Def("Num-bytes", &ReadOp::numBytes) + ; + } +}; + +// used for retrieving a chunk's size +struct SizeOp : public KfsOp { + kfsFileId_t fileId; 
// optional + kfsChunkId_t chunkId; + int64_t chunkVersion; + int64_t size; /* result */ + SizeOp( + kfsSeq_t s = 0, + kfsFileId_t fid = -1, + kfsChunkId_t c = -1, + int64_t v = -1) + : KfsOp(CMD_SIZE, s), + fileId(fid), + chunkId(c), + chunkVersion(v), + size(-1) + {} + + void Request(ostream &os); + void Response(ostream &os); + void Execute(); + string Show() const { + ostringstream os; + + os << "size:" + " chunkId: " << chunkId << + " chunkversion: " << chunkVersion << + " size: " << size; + return os.str(); + } + int HandleDone(int code, void *data); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("File-handle", &SizeOp::fileId, kfsFileId_t(-1)) + .Def("Chunk-handle", &SizeOp::chunkId, kfsChunkId_t(-1)) + .Def("Chunk-version", &SizeOp::chunkVersion, int64_t(-1)) + ; + } +}; + +// used for reserving space in a chunk +struct ChunkSpaceReserveOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t writeId; /* value for the local server */ + string servers; /* input: set of servers on which to write */ + uint32_t numServers; /* input */ + // client to provide transaction id (in the daisy chain, the + // upstream node is a proxy for the client; since the data fwding + // for a record append is all serialized over a single TCP + // connection, we need to pass the transaction id so that the + // receivers in the daisy chain can update state + // + size_t nbytes; + ChunkSpaceReserveOp(kfsSeq_t s = 0) + : KfsOp(CMD_SPC_RESERVE, s), + chunkId(-1), + writeId(-1), + servers(), + numServers(0), + nbytes(0) + {} + ChunkSpaceReserveOp(kfsSeq_t s, kfsChunkId_t c, size_t n) + : KfsOp(CMD_SPC_RESERVE, s), + chunkId(c), + writeId(-1), + servers(), + numServers(0), + nbytes(n) + {} + + void Execute(); + string Show() const { + ostringstream os; + + os << "space reserve: chunkId: " << chunkId << " nbytes: " << nbytes; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", 
&ChunkSpaceReserveOp::chunkId, kfsChunkId_t(-1)) + .Def("Num-bytes", &ChunkSpaceReserveOp::nbytes) + .Def("Num-servers", &ChunkSpaceReserveOp::numServers) + .Def("Servers", &ChunkSpaceReserveOp::servers) + ; + } +}; + +// used for releasing previously reserved chunk space reservation +struct ChunkSpaceReleaseOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t writeId; /* value for the local server */ + string servers; /* input: set of servers on which to write */ + uint32_t numServers; /* input */ + size_t nbytes; + ChunkSpaceReleaseOp(kfsSeq_t s = 0) + : KfsOp(CMD_SPC_RELEASE, s), + chunkId(-1), + writeId(-1), + servers(), + numServers(0), + nbytes(0) + {} + ChunkSpaceReleaseOp(kfsSeq_t s, kfsChunkId_t c, int n) + : KfsOp(CMD_SPC_RELEASE, s), + chunkId(c), + writeId(-1), + servers(), + numServers(0), + nbytes(n) + {} + + void Execute(); + string Show() const { + ostringstream os; + + os << "space release: chunkId: " << chunkId << " nbytes: " << nbytes; + return os.str(); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &ChunkSpaceReleaseOp::chunkId, kfsChunkId_t(-1)) + .Def("Num-bytes", &ChunkSpaceReleaseOp::nbytes) + .Def("Num-servers", &ChunkSpaceReleaseOp::numServers) + .Def("Servers", &ChunkSpaceReleaseOp::servers) + ; + } +}; + +struct GetChunkMetadataOp : public KfsOp { + kfsChunkId_t chunkId; // input + int64_t chunkVersion; // output + bool readVerifyFlag; + int64_t chunkSize; // output + IOBuffer* dataBuf; // buffer with the checksum info + size_t numBytesIO; + ReadOp readOp; // internally generated + int64_t numBytesScrubbed; + enum { kChunkReadSize = 1 << 20, kChunkMetaReadSize = 16 << 10 }; + + GetChunkMetadataOp(kfsSeq_t s = 0) + : KfsOp(CMD_GET_CHUNK_METADATA, s), + chunkId(-1), + chunkVersion(0), + readVerifyFlag(false), + chunkSize(0), + dataBuf(0), + numBytesIO(0), + readOp(0), + numBytesScrubbed(0) + {} + ~GetChunkMetadataOp() + { delete dataBuf; } + void Execute(); + // handler for 
reading in the chunk meta-data + int HandleChunkMetaReadDone(int code, void *data); + + // We scrub the chunk 1MB at a time and validate checksums; once + // the chunk is fully scrubbed and checksums are good, we return + // the values to the client + int HandleScrubReadDone(int code, void *data); + + void Request(ostream &os); + void Response(ostream &os); + void ResponseContent(IOBuffer*& buf, int& size) { + buf = status >= 0 ? dataBuf : 0; + size = buf ? numBytesIO : 0; + } + string Show() const { + ostringstream os; + + os << "get-chunk-metadata:" + " chunkid: " << chunkId << + " chunkversion: " << chunkVersion + ; + return os.str(); + } + int HandleDone(int code, void *data); + virtual bool IsChunkReadOp(int64_t& outNumBytes, kfsChunkId_t& outChunkId) { + outChunkId = chunkId; + outNumBytes = readVerifyFlag ? kChunkReadSize : kChunkMetaReadSize; + return true; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Chunk-handle", &GetChunkMetadataOp::chunkId, kfsChunkId_t(-1)) + .Def("Read-verify", &GetChunkMetadataOp::readVerifyFlag) + ; + } +}; + +// used for pinging the server and checking liveness +struct PingOp : public KfsOp { + int64_t totalSpace; + int64_t usedSpace; + int64_t totalFsSpace; + int evacuateInFlightCount; + PingOp(kfsSeq_t s = 0) + : KfsOp(CMD_PING, s), + totalSpace(-1), + usedSpace(-1), + totalFsSpace(-1), + evacuateInFlightCount(-1) + {} + void Response(ostream &os); + void Execute(); + string Show() const { + return "monitoring ping"; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + ; + } +}; + +// used to dump chunk map +struct DumpChunkMapOp : public KfsOp { + DumpChunkMapOp(kfsSeq_t s = 0) + : KfsOp(CMD_DUMP_CHUNKMAP, s) + {} + void Response(ostream &os); + void Execute(); + string Show() const { + return "dumping chunk map"; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + ; + } +}; + +// used to extract out all the 
counters we have +struct StatsOp : public KfsOp { + string stats; // result + StatsOp(kfsSeq_t s = 0) + : KfsOp(CMD_STATS, s), + stats() + {} + void Response(ostream &os); + void Execute(); + string Show() const { + return "monitoring stats"; + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + ; + } +}; + +struct LeaseRenewOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t leaseId; + string leaseType; + LeaseRenewOp(kfsSeq_t s, kfsChunkId_t c, int64_t l, string t) + : KfsOp(CMD_LEASE_RENEW, s), + chunkId(c), + leaseId(l), + leaseType(t) + { + SET_HANDLER(this, &LeaseRenewOp::HandleDone); + } + void Request(ostream &os); + // To be called whenever we get a reply from the server + int HandleDone(int code, void *data); + void Execute() { }; + string Show() const { + ostringstream os; + + os << "lease-renew:" + " chunkid: " << chunkId << + " leaseId: " << leaseId << + " type: " << leaseType + ; + return os.str(); + } +}; + +// Whenever we want to give up a lease early, we notify the metaserver +// using this op. +struct LeaseRelinquishOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t leaseId; + string leaseType; + int64_t chunkSize; + uint32_t chunkChecksum; + bool hasChecksum; + LeaseRelinquishOp(kfsSeq_t s, kfsChunkId_t c, int64_t l, string t) + : KfsOp(CMD_LEASE_RELINQUISH, s), + chunkId(c), + leaseId(l), + leaseType(t), + chunkSize(-1), + chunkChecksum(0), + hasChecksum(false) + { + SET_HANDLER(this, &LeaseRelinquishOp::HandleDone); + } + void Request(ostream &os); + // To be called whenever we get a reply from the server + int HandleDone(int code, void *data); + void Execute() { }; + string Show() const { + ostringstream os; + + os << "lease-relinquish:" + " chunkid: " << chunkId << + " leaseId: " << leaseId << + " type: " << leaseType << + " size: " << chunkSize << + " checksum: " << chunkChecksum + ; + return os.str(); + } +}; + +// This is just a helper op for building a hello request to the metaserver. 
+struct HelloMetaOp : public KfsOp { + typedef vector LostChunkDirs; + + ServerLocation myLocation; + string clusterKey; + string md5sum; + int rackId; + int64_t totalSpace; + int64_t totalFsSpace; + int64_t usedSpace; + vector chunks; + vector notStableChunks; + vector notStableAppendChunks; + LostChunkDirs lostChunkDirs; + HelloMetaOp(kfsSeq_t s, const ServerLocation& l, + const string& k, const string& m, int r) + : KfsOp(CMD_META_HELLO, s), + myLocation(l), + clusterKey(k), + md5sum(m), + rackId(r), + totalSpace(0), + totalFsSpace(0), + usedSpace(0), + chunks(), + notStableChunks(), + notStableAppendChunks(), + lostChunkDirs() + {} + void Execute(); + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "meta-hello:" + " seq: " << seq << + " mylocation: " << myLocation.ToString() << + " cluster-key: " << clusterKey << + " md5sum: " << md5sum << + " rackId: " << rackId << + " space: " << totalSpace << + " used: " << usedSpace << + " chunks: " << chunks.size() << + " not-stable: " << notStableChunks.size() << + " append: " << notStableAppendChunks.size() + ; + return os.str(); + } +}; + +struct CorruptChunkOp : public KfsOp { + kfsFileId_t fid; // input: fid whose chunk is bad + kfsChunkId_t chunkId; // input: chunkid of the corrupted chunk + // input: set if chunk was lost---happens when we disconnect from metaserver and miss messages + bool isChunkLost; + bool dirOkFlag; + string chunkDir; + + CorruptChunkOp(kfsSeq_t s, kfsFileId_t f, kfsChunkId_t c, + const string* cDir = 0, bool dOkFlag = false) + : KfsOp(CMD_CORRUPT_CHUNK, s), + fid(f), + chunkId(c), + isChunkLost(false), + dirOkFlag(dOkFlag), + chunkDir(cDir ? 
*cDir : string()), + refCount(1) + { + noReply = true; + noRetry = true; + SET_HANDLER(this, &CorruptChunkOp::HandleDone); + } + int Ref() { return refCount++; } + void UnRef() { + if (--refCount <= 0) { + delete this; + } + } + int GetRef() const { return refCount; } + void Request(ostream &os); + // To be called whenever we get a reply from the server + int HandleDone(int code, void *data); + void Execute() { }; + string Show() const { + ostringstream os; + os << "corrupt chunk:" + " fileid: " << fid << + " chunkid: " << chunkId + ; + return os.str(); + } +private: + int refCount; +}; + +struct EvacuateChunksOp : public KfsOp { + enum { kMaxChunkIds = 32 }; + kfsChunkId_t chunkIds[kMaxChunkIds]; // input + int numChunks; + int chunkDirs; + int writableChunkDirs; + int evacuateInFlightCount; + int evacuateChunks; + int64_t totalSpace; + int64_t totalFsSpace; + int64_t usedSpace; + int64_t evacuateByteCount; + + EvacuateChunksOp(kfsSeq_t s = 0, KfsCallbackObj* c = 0) + : KfsOp(CMD_EVACUATE_CHUNKS, s, c), + numChunks(0), + chunkDirs(-1), + writableChunkDirs(-1), + evacuateInFlightCount(-1), + evacuateChunks(0), + totalSpace(-1), + totalFsSpace(-1), + usedSpace(-1), + evacuateByteCount(-1) + { + SET_HANDLER(this, &EvacuateChunksOp::HandleDone); + } + void Request(ostream &os); + // To be called whenever we get a reply from the server + int HandleDone(int code, void *data) { + if (clnt) { + return KfsOp::HandleDone(code, data); + } + delete this; + return 0; + } + void Execute() {}; + string Show() const { + ostringstream os; + os << "evacuate chunks:"; + for (int i = 0; i < numChunks; i++) { + os << " " << chunkIds[i]; + } + return os.str(); + } +}; + +struct SetProperties : public KfsOp { + int contentLength; + Properties properties; // input + SetProperties(kfsSeq_t seq = 0) + : KfsOp(CMD_SET_PROPERTIES, seq), + contentLength(0), + properties() + {} + virtual void Request(ostream &os); + virtual void Execute(); + virtual string Show() const { + string 
ret("set-properties: " ); + properties.getList(ret, "", ";"); + return ret; + } + virtual int GetContentLength() const { return contentLength; } + virtual bool ParseContent(istream& is); + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + .Def("Content-length", &SetProperties::contentLength) + ; + } +}; + +struct RestartChunkServerOp : public KfsOp { + RestartChunkServerOp(kfsSeq_t seq = 0) + : KfsOp(CMD_RESTART_CHUNK_SERVER, seq) + {} + virtual void Execute(); + virtual string Show() const { + return string("restart"); + } + template static T& ParserDef(T& parser) + { + return KfsOp::ParserDef(parser) + ; + } +}; + +struct KillRemoteSyncOp : public KfsOp { + + // pass in the remote sync SM that needs to be nuked + KillRemoteSyncOp(kfsSeq_t s, KfsCallbackObj *owner) : + KfsOp(CMD_KILL_REMOTE_SYNC, s, owner) + { + + } + void Request(ostream &os) { } + void Execute(); + string Show() const { return "kill remote sync"; } +}; + +// Helper functor that matches ops based on seq # + +class OpMatcher { + kfsSeq_t seqNum; +public: + OpMatcher(kfsSeq_t s) : seqNum(s) { }; + bool operator() (KfsOp *op) { + return op->seq == seqNum; + } +}; + +extern int ParseCommand(const IOBuffer& ioBuf, int len, KfsOp** res); + +extern void SubmitOp(KfsOp *op); +extern void SubmitOpResponse(KfsOp *op); + +} + +#endif // CHUNKSERVER_KFSOPS_H diff --git a/src/cc/chunk/LeaseClerk.cc b/src/cc/chunk/LeaseClerk.cc new file mode 100644 index 000000000..6f3c0aa0e --- /dev/null +++ b/src/cc/chunk/LeaseClerk.cc @@ -0,0 +1,286 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/10/09 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Code for dealing with chunk write leases. +// +//---------------------------------------------------------------------------- + +#include "LeaseClerk.h" +#include "kfsio/Globals.h" + +#include "ChunkManager.h" +#include "MetaServerSM.h" +#include "AtomicRecordAppender.h" + +namespace KFS +{ +using KFS::libkfsio::globalNetManager; + +LeaseClerk gLeaseClerk; + +inline time_t +LeaseClerk::Now() +{ + return globalNetManager().Now(); +} + +LeaseClerk::LeaseClerk() +{ + mLastLeaseCheckTime = 0; + SET_HANDLER(this, &LeaseClerk::HandleEvent); +} + +void +LeaseClerk::RegisterLease(kfsChunkId_t chunkId, int64_t leaseId, bool appendFlag) +{ + // Get replace the old lease if there is one + LeaseInfo_t& lease = mLeases[chunkId]; + lease.leaseId = leaseId; + lease.lastWriteTime = Now(); + lease.expires = lease.lastWriteTime + LEASE_INTERVAL_SECS; + lease.leaseRenewSent = false; + lease.invalidFlag = false; + lease.appendFlag = appendFlag; + KFS_LOG_STREAM_DEBUG << + "registered lease:" + " chunk: " << chunkId << + " lease: " << leaseId << + KFS_LOG_EOM; +} + +void +LeaseClerk::UnRegisterLease(kfsChunkId_t chunkId) +{ + if (mLeases.erase(chunkId) <= 0) { + return; + } + KFS_LOG_STREAM_DEBUG << + "Lease for chunk: " << chunkId << " unregistered" << + KFS_LOG_EOM; +} + +void +LeaseClerk::InvalidateLease(kfsChunkId_t chunkId) +{ + LeaseMap::iterator const iter = mLeases.find(chunkId); + if (iter == mLeases.end() || + iter->second.invalidFlag || + iter->second.appendFlag || + iter->second.expires < Now()) { + return; + } + // Keep meta server's lease valid 
to allow the client to re-allocate the + // chunk. + // Re-allocation will replace the lease. + iter->second.lastWriteTime = Now(); + iter->second.invalidFlag = true; + KFS_LOG_STREAM_DEBUG << + "Lease for chunk: " << chunkId << " invalidated" << + KFS_LOG_EOM; +} + +void +LeaseClerk::UnregisterAllLeases() +{ + KFS_LOG_STREAM_DEBUG << + "Unregistered all " << mLeases.size() << " leases" << + KFS_LOG_EOM; + mLeases.clear(); +} + +void +LeaseClerk::DoingWrite(kfsChunkId_t chunkId) +{ + LeaseMap::iterator const iter = mLeases.find(chunkId); + if (iter == mLeases.end()) { + return; + } + iter->second.lastWriteTime = Now(); +} + +bool +LeaseClerk::IsLeaseValid(kfsChunkId_t chunkId) const +{ + // now <= lease.expires ==> lease hasn't expired and is therefore + // valid. + LeaseMap::const_iterator const iter = mLeases.find(chunkId); + return (iter != mLeases.end() && ! iter->second.invalidFlag && + Now() <= iter->second.expires); +} + +time_t +LeaseClerk::GetLeaseExpireTime(kfsChunkId_t chunkId) const +{ + LeaseMap::const_iterator const iter = mLeases.find(chunkId); + return ((iter == mLeases.end() || iter->second.invalidFlag) ? + Now() - 1 : iter->second.expires); +} + +void +LeaseClerk::LeaseRenewed(kfsChunkId_t chunkId) +{ + LeaseMap::iterator const iter = mLeases.find(chunkId); + if (iter == mLeases.end()) { + return; // Ignore stale renew reply. + } + LeaseInfo_t& lease = iter->second; + lease.expires = Now() + LEASE_INTERVAL_SECS; + lease.leaseRenewSent = false; + KFS_LOG_STREAM_INFO << + "lease renewed for:" + " chunk: " << chunkId << + " lease: " << lease.leaseId << + KFS_LOG_EOM; +} + +int +LeaseClerk::HandleEvent(int code, void *data) +{ + switch(code) { + case EVENT_CMD_DONE: { + // we got a reply for a lease renewal + const KfsOp* const op = reinterpret_cast(data); + if (! 
op) { + break; + } + if (op->op == CMD_LEASE_RENEW) { + const LeaseRenewOp* const renewOp = + static_cast(op); + if (renewOp->status == 0) { + LeaseRenewed(renewOp->chunkId); + } else { + UnRegisterLease(renewOp->chunkId); + } + } else if (op->op != CMD_LEASE_RELINQUISH) { + // Relinquish op will get here with its default handler, but + // no other op should, + KFS_LOG_STREAM_DEBUG << "unexpected op: " << op->op << + KFS_LOG_EOM; + } + delete op; + } + break; + + default: + assert(!"Unknown event"); + break; + } + return 0; +} + +void +LeaseClerk::Timeout() +{ + const time_t now = Now(); + if (mLastLeaseCheckTime + 1 >= now) { + return; + } + mLastLeaseCheckTime = now; + // once per second, check the state of the leases + for (LeaseMap::iterator it = mLeases.begin(); it != mLeases.end(); ) { + // messages could be in-flight...so wait for a full + // lease-interval before discarding dead leases + if (it->second.expires + LEASE_INTERVAL_SECS < now) { + KFS_LOG_STREAM_INFO << + "cleanup lease: " << it->second.leaseId << + " chunk: " << it->first << + KFS_LOG_EOM; + mLeases.erase(it++); + continue; + } + const kfsChunkId_t chunkId = it->first; + LeaseInfo_t& lease = it->second; + ++it; + + /// Before the lease expires at the server, we submit we a renew + /// request, so that the lease remains valid. So, back-off a few + /// secs before the leases and submit the renew + if (lease.leaseRenewSent || + now + LEASE_INTERVAL_SECS - 60 < lease.expires) { + // If the lease is valid for a while or a lease renew is in flight, + // move on + continue; + } + // Renew the lease if a write is pending or a write + // occured when we had a valid lease or if we are doing record + // appends to the chunk and some client has space reserved or + // there is some data buffered in the appender. + if (lease.lastWriteTime + LEASE_INTERVAL_SECS < now && + ! (lease.appendFlag ? 
+ gAtomicRecordAppendManager.WantsToKeepLease(chunkId) : + gChunkManager.IsWritePending(chunkId) + )) { + continue; + } + // The metaserverSM will fill seq#. + LeaseRenewOp* const op = new LeaseRenewOp( + -1, chunkId, lease.leaseId, "WRITE_LEASE"); + KFS_LOG_STREAM_INFO << + "sending lease renew for:" + " chunk: " << chunkId << + " lease: " << lease.leaseId << + " expires in: " << (lease.expires - now) << " sec" << + KFS_LOG_EOM; + op->noRetry = true; + op->clnt = this; + lease.leaseRenewSent = true; + gMetaServerSM.EnqueueOp(op); + } +} + +void +LeaseClerk::RelinquishLease(kfsChunkId_t chunkId, int64_t size, + bool hasChecksum, uint32_t checksum) +{ + LeaseMap::iterator const it = mLeases.find(chunkId); + if (it == mLeases.end()) { + KFS_LOG_STREAM_DEBUG << + "lease relinquish: no lease exists for:" + " chunk: " << chunkId << + " size: " << size << + " checksum: " << (hasChecksum ? int64_t(checksum) : int64_t(-1)) << + KFS_LOG_EOM; + return; + } + // Notify metaserver if the lease exists, even if lease expired or renew is + // in flight, then delete the lease. + const LeaseInfo_t& lease = it->second; + LeaseRelinquishOp *op = new LeaseRelinquishOp( + -1, chunkId, lease.leaseId, "WRITE_LEASE"); + KFS_LOG_STREAM_INFO << + "sending lease relinquish for:" + " chunk: " << chunkId << + " lease: " << lease.leaseId << + " expires in: " << (lease.expires - Now()) << " sec" << + " size: " << size << + " checksum: " << (hasChecksum ? int64_t(checksum) : int64_t(-1)) << + KFS_LOG_EOM; + op->noRetry = true; // On disconnect meta server expires write leases. 
+ op->hasChecksum = hasChecksum; + op->chunkChecksum = checksum; + op->chunkSize = size; + op->clnt = this; + mLeases.erase(it); + gMetaServerSM.EnqueueOp(op); +} +} // namespace KFS diff --git a/src/cc/chunk/LeaseClerk.h b/src/cc/chunk/LeaseClerk.h new file mode 100644 index 000000000..8c567a2de --- /dev/null +++ b/src/cc/chunk/LeaseClerk.h @@ -0,0 +1,110 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/10/09 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A lease clerk interacts with the metaserver for renewing +// leases. There are two assumptions here: +// 1. The lease is for writes and only those need to be renewed +// 2. Prior to renewing a lease, the lease clerk checks with the +// ChunkManager to see if writes are outstanding on the chunk +// associated with the lease; only if writes are pending, is the lease +// renewed. 
+//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_LEASECLERK_H +#define CHUNKSERVER_LEASECLERK_H + +#include + +#include "kfsio/event.h" +#include "common/kfstypes.h" +#include "Chunk.h" + +namespace KFS +{ + +// mapping from a chunk id to its lease + +class LeaseClerk : public KfsCallbackObj +{ +public: + LeaseClerk(); + ~LeaseClerk() + {} + /// Register a lease with the clerk. The clerk will renew the + /// lease with the server. + /// @param[in] chunkId The chunk associated with the lease. + /// @param[in] leaseId The lease id to be registered with the clerk + /// @param[in] appendFlag True if chunk created in write append mode + void RegisterLease(kfsChunkId_t chunkId, int64_t leaseId, bool appendFlag); + void UnRegisterLease(kfsChunkId_t chunkId); + void InvalidateLease(kfsChunkId_t chunkId); + + /// Used for voluntarily giving up a write lease. + /// + void RelinquishLease(kfsChunkId_t chunkId, int64_t size = -1, + bool hasChecksum = false, uint32_t checksum = 0); + /// Record the occurence of a write. This notifies the clerk to + /// renew the lease prior to the end of the lease period. + void DoingWrite(kfsChunkId_t chunkId); + + /// Check if lease is still valid. + /// @param[in] chunkId The chunk whose lease we are checking for validity. + bool IsLeaseValid(kfsChunkId_t chunkId) const; + + // Lease renew op completion handler. 
+ int HandleEvent(int code, void *data); + + time_t GetLeaseExpireTime(kfsChunkId_t chunkId) const; + void UnregisterAllLeases(); + + void Timeout(); + +private: + struct LeaseInfo_t { + int64_t leaseId; + time_t expires; + time_t lastWriteTime; + bool leaseRenewSent:1; + bool appendFlag:1; + bool invalidFlag:1; + }; + typedef std::tr1::unordered_map LeaseMap; + + /// All the leases registered with the clerk + LeaseMap mLeases; + time_t mLastLeaseCheckTime; + + void LeaseRenewed(kfsChunkId_t chunkId); + void LeaseExpired(kfsChunkId_t chunkId); + + inline static time_t Now(); +private: + LeaseClerk(const LeaseClerk&); + LeaseClerk& operator=(const LeaseClerk&); +}; + +extern LeaseClerk gLeaseClerk; + +} + +#endif // CHUNKSERVER_LEASECLERK_H diff --git a/src/cc/chunk/Logger.cc b/src/cc/chunk/Logger.cc new file mode 100644 index 000000000..f41bc27c4 --- /dev/null +++ b/src/cc/chunk/Logger.cc @@ -0,0 +1,76 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/20 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include +#include + +#include "Logger.h" +#include "KfsOps.h" + +namespace KFS +{ +using std::ifstream; +using std::string; + +Logger gLogger; + +// Remains of the chunk server transaction log. Presently transaction log is not +// used. +void +Logger::Submit(KfsOp *op) +{ + if (op->op == CMD_CHECKPOINT) { + delete op; + return; + } + if (op->op == CMD_WRITE) { + KFS::SubmitOpResponse(op); + } else { + assert(op->clnt != NULL); + op->clnt->HandleEvent(EVENT_CMD_DONE, op); + } +} + +int +Logger::GetVersionFromCkpt() +{ + const string lastCP(mLogDir + "/ckpt_latest"); + ifstream ifs(lastCP.c_str(), ifstream::in); + if (!ifs) { + return KFS_LOG_VERSION; + } + + // Read the header + // Line 1 is the version + string versStr; + int vers = 0; + if (! (ifs >> versStr >> vers) || versStr != "version:") { + return 0; + } + return vers; +} + +} diff --git a/src/cc/chunk/Logger.h b/src/cc/chunk/Logger.h new file mode 100644 index 000000000..89f189536 --- /dev/null +++ b/src/cc/chunk/Logger.h @@ -0,0 +1,82 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/20 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \file Logger.h +// \brief Code for handling logging between checkpoints +// +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_LOGGER_H +#define CHUNKSERVER_LOGGER_H + +#include + +namespace KFS +{ +using std::string; + +struct KfsOp; + +// Remains of the chunk server transaction log. Presently transaction log is not +// used, Submit just dispatches the op. +class Logger { +public: + Logger() + : mLogDir() + {} + ~Logger() + {} + + void Init(const string &logDir) { + mLogDir = logDir; + } + void Start() {} + + /// Submit a request for logging. This is called by the main + /// thread and the request is sent down to the logger thread. + /// @param[in] op The op that needs to be logged + void Submit(KfsOp *op); + + int GetVersionFromCkpt(); + + int GetLoggerVersionNum() const { + return KFS_LOG_VERSION; + } + +private: + /// Version # to be written out in the ckpt file + static const int KFS_LOG_VERSION = 2; + static const int KFS_LOG_VERSION_V1 = 1; + + /// The path to the directory for writing out logs + string mLogDir; +private: + Logger(const Logger&); + Logger& operator=(const Logger&); +}; + +extern Logger gLogger; + +} + +#endif // CHUNKSERVER_LOGGER_H diff --git a/src/cc/chunk/MetaServerSM.cc b/src/cc/chunk/MetaServerSM.cc new file mode 100644 index 000000000..13c7a5895 --- /dev/null +++ b/src/cc/chunk/MetaServerSM.cc @@ -0,0 +1,703 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/07 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file MetaServerSM.cc +// \brief Handle interactions with the meta server. +// +//---------------------------------------------------------------------------- + +#include "common/MsgLogger.h" +#include "MetaServerSM.h" +#include "ChunkManager.h" +#include "ChunkServer.h" +#include "utils.h" +#include "LeaseClerk.h" +#include "Replicator.h" + +#include "kfsio/NetManager.h" +#include "kfsio/Globals.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include + +#include +#include +#include + +namespace KFS +{ +using std::ostringstream; +using std::istringstream; +using std::make_pair; +using std::string; +using KFS::libkfsio::globalNetManager; + +MetaServerSM gMetaServerSM; + +MetaServerSM::MetaServerSM() + : KfsCallbackObj(), + ITimeout(), + mCmdSeq(GetRandomSeq()), + mRackId(-1), + mSentHello(false), + mHelloOp(0), + mNetConnection(), + mInactivityTimeout(65), + mMaxReadAhead(4 << 10), + mLastRecvCmdTime(0), + mLastConnectTime(0), + mConnectedTime(0), + mReconnectFlag(false), + mCounters(), + mWOStream() +{ + // Force net manager construction here, to insure that net manager + // destructor is called after gMetaServerSM destructor. 
+ globalNetManager(); + SET_HANDLER(this, &MetaServerSM::HandleRequest); + mCounters.Clear(); +} + +MetaServerSM::~MetaServerSM() +{ + globalNetManager().UnRegisterTimeoutHandler(this); + FailOps(true); + delete mHelloOp; +} + +void +MetaServerSM::SetMetaInfo(const ServerLocation& metaLoc, const string& clusterKey, + int rackId, const string& md5sum, const Properties& prop) +{ + mLocation = metaLoc; + mClusterKey = clusterKey; + mRackId = rackId; + mMD5Sum = md5sum; + SetParameters(prop); +} + +void +MetaServerSM::SetParameters(const Properties& prop) +{ + mInactivityTimeout = prop.getValue( + "chunkServer.meta.inactivityTimeout", mInactivityTimeout); + mMaxReadAhead = prop.getValue( + "chunkServer.meta.maxReadAhead", mMaxReadAhead); +} + +void +MetaServerSM::Init() +{ + globalNetManager().RegisterTimeoutHandler(this); +} + +void +MetaServerSM::Timeout() +{ + if (mReconnectFlag) { + mReconnectFlag = false; + KFS_LOG_STREAM_WARN << + "meta server reconnect requested" << + KFS_LOG_EOM; + HandleRequest(EVENT_INACTIVITY_TIMEOUT, 0); + } + const time_t now = globalNetManager().Now(); + if (IsConnected() && + IsHandshakeDone() && + mLastRecvCmdTime + mInactivityTimeout < now) { + KFS_LOG_STREAM_ERROR << + "meta server inactivity timeout, last request received: " << + (now - mLastRecvCmdTime) << " secs ago" << + KFS_LOG_EOM; + HandleRequest(EVENT_INACTIVITY_TIMEOUT, 0); + } + if (! IsConnected()) { + if (mHelloOp) { + if (! mSentHello) { + return; // Wait for hello to come back. + } + delete mHelloOp; + mHelloOp = 0; + mSentHello = false; + } + if (mLastConnectTime + 1 < now) { + mLastConnectTime = now; + Connect(); + } + return; + } + if (! IsHandshakeDone()) { + return; + } + DispatchOps(); + mNetConnection->StartFlush(); +} + +time_t +MetaServerSM::ConnectionUptime() const +{ + return (IsUp() ? 
(globalNetManager().Now() - mLastConnectTime) : 0); +} + +int +MetaServerSM::Connect() +{ + if (mHelloOp) { + return 0; + } + mCounters.mConnectCount++; + mSentHello = false; + TcpSocket * const sock = new TcpSocket(); + const bool nonBlocking = true; + const int ret = sock->Connect(mLocation, nonBlocking); + if (ret < 0 && ret != -EINPROGRESS) { + KFS_LOG_STREAM_ERROR << + "connection to meter server failed:" + " error: " << QCUtils::SysError(-ret) << + KFS_LOG_EOM; + delete sock; + return -1; + } + KFS_LOG_STREAM_INFO << + (ret < 0 ? "connecting" : "connected") << + " to metaserver " << mLocation.ToString() << + KFS_LOG_EOM; + mNetConnection.reset(new NetConnection(sock, this)); + if (ret != 0) { + mNetConnection->SetDoingNonblockingConnect(); + } + // when the system is overloaded, we still want to add this + // connection to the poll vector for reads; this ensures that we + // get the heartbeats and other RPCs from the metaserver + mNetConnection->EnableReadIfOverloaded(); + mNetConnection->SetInactivityTimeout(mInactivityTimeout); + mNetConnection->SetMaxReadAhead(mMaxReadAhead); + // Add this to the poll vector + globalNetManager().AddConnection(mNetConnection); + if (ret == 0) { + SendHello(); + } + return 0; +} + +inline void +ChunkServer::SetLocation(const ServerLocation& loc) +{ + mLocation = loc; +} + +static inline int +IsIpHostedAndNotLoopBack(const char* ip) +{ + if (! ip) { + return -EINVAL; + } + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = 0; // any port + if (! inet_aton(ip, &addr.sin_addr)) { + return -EINVAL; + } + if (addr.sin_addr.s_addr == htonl(INADDR_LOOPBACK)) { + return -EACCES; + } + const int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (fd < 0) { + return -errno; + } + const int ret = bind(fd, (struct sockaddr*)&addr, sizeof(addr)) == 0 ? + 0 : -errno; + close(fd); + return ret; +} + +int +MetaServerSM::SendHello() +{ + if (mHelloOp) { + return 0; + } + if (! 
IsConnected()) { + KFS_LOG_STREAM_DEBUG << + "unable to connect to meta server" << + KFS_LOG_EOM; + return -1; + } + if (gChunkServer.CanUpdateServerIp()) { + // Advertise the same ip address to the clients, as used + // for the meta connection. + ServerLocation loc(gChunkServer.GetLocation()); + loc.hostname = mNetConnection->GetSockName(); + const size_t colonPos = loc.hostname.find(char(':')); + if (colonPos == string::npos) { + KFS_LOG_STREAM_ERROR << + "invalid socket name: " << loc.hostname << + " resetting meta server connection" << + KFS_LOG_EOM; + mNetConnection->Close(); + return -1; + } + loc.hostname.erase(colonPos); + // Paperover for cygwin / win 7 with no nics configured: + // check if getsockname returns INADDR_ANY, and retry if it does. + // Moving this logic into TcpSocket isn't appropriate: INADDR_ANY is + // valid for unconnected socket bound to INADDR_ANY. + const char* const kAddrAny = "0.0.0.0"; + if (! loc.IsValid() || loc.hostname == kAddrAny) { + KFS_LOG_STREAM_ERROR << + "invalid chunk server location: " << loc << + " resetting meta server connection" << + KFS_LOG_EOM; + mNetConnection->Close(); + return -1; + } + const string prevIp = gChunkServer.GetLocation().hostname; + if (loc.hostname != prevIp) { + if (prevIp.empty()) { + KFS_LOG_STREAM_INFO << + "setting chunk server ip to: " << loc.hostname << + KFS_LOG_EOM; + gChunkServer.SetLocation(loc); + } else { + const int err = IsIpHostedAndNotLoopBack(prevIp.c_str()); + KFS_LOG_STREAM_WARN << + "meta server connection local address: " << loc.hostname << + " current chunk server ip: " << prevIp << + (err == 0 ? string() : + " is no longer valid: " + QCUtils::SysError(-err)) << + KFS_LOG_EOM; + if (err) { + gChunkServer.SetLocation(loc); + } + } + } + } + mHelloOp = new HelloMetaOp( + nextSeq(), gChunkServer.GetLocation(), mClusterKey, mMD5Sum, mRackId); + mHelloOp->clnt = this; + // Send the op and wait for the reply. 
+ SubmitOp(mHelloOp); + return 0; +} + +void +MetaServerSM::DispatchHello() +{ + if (! IsConnected()) { + // don't have a connection...so, need to start the process again... + delete mHelloOp; + mHelloOp = 0; + mSentHello = false; + return; + } + mSentHello = true; + mHelloOp->Request(mWOStream.Set(mNetConnection->GetOutBuffer())); + mWOStream.Reset(); + KFS_LOG_STREAM_INFO << + "Sending hello to meta server: " << mHelloOp->Show() << + KFS_LOG_EOM; + mNetConnection->StartFlush(); +} + +/// +/// Generic event handler. Decode the event that occurred and +/// appropriately extract out the data and deal with the event. +/// @param[in] code: The type of event that occurred +/// @param[in] data: Data being passed in relative to the event that +/// occurred. +/// @retval 0 to indicate successful event handling; -1 otherwise. +/// +int +MetaServerSM::HandleRequest(int code, void* data) +{ + switch (code) { + case EVENT_NET_READ: { + // We read something from the network. Run the RPC that + // came in. + IOBuffer * const iobuf = (IOBuffer *) data; + bool hasMsg; + int cmdLen = 0; + while ((hasMsg = IsMsgAvail(iobuf, &cmdLen))) { + // if we don't have all the data for the command, bail + if (! HandleMsg(iobuf, cmdLen)) { + break; + } + } + int hdrsz; + if (! hasMsg && + (hdrsz = iobuf->BytesConsumable()) > MAX_RPC_HEADER_LEN) { + KFS_LOG_STREAM_ERROR << + "exceeded max request header size: " << hdrsz << + ">" << MAX_RPC_HEADER_LEN << + " closing connection: " << (IsConnected() ? + mNetConnection->GetPeerName() : + string("not connected")) << + KFS_LOG_EOM; + iobuf->Clear(); + return HandleRequest(EVENT_NET_ERROR, 0); + } + } + break; + + case EVENT_NET_WROTE: + if (! mSentHello && ! mHelloOp) { + SendHello(); + } + // Something went out on the network. For now, we don't + // track it. Later, we may use it for tracking throttling + // and such. + break; + + case EVENT_CMD_DONE: { + // An op finished execution. 
Send a response back + KfsOp* const op = reinterpret_cast(data); + if (op->op == CMD_META_HELLO) { + DispatchHello(); + } else { + SendResponse(op); + delete op; + } + } + break; + + case EVENT_INACTIVITY_TIMEOUT: + case EVENT_NET_ERROR: + if (mNetConnection) { + KFS_LOG_STREAM_ERROR << + mLocation.ToString() << + " closing meta server connection due to " << + (code == EVENT_INACTIVITY_TIMEOUT ? + "inactivity timeout" : "network error") << + KFS_LOG_EOM; + mNetConnection->Close(); + // Drop all leases. + gLeaseClerk.UnregisterAllLeases(); + // Meta server will fail all replication requests on + // disconnect anyway. + Replicator::CancelAll(); + gChunkManager.MetaServerConnectionLost(); + } + FailOps(! globalNetManager().IsRunning()); + break; + + default: + assert(!"Unknown event"); + break; + } + return 0; +} + +void +MetaServerSM::FailOps(bool shutdownFlag) +{ + // Fail all no retry ops, if any, or all ops on shutdown. + OpsQueue doneOps; + for (DispatchedOps::iterator it = mDispatchedOps.begin(); + it != mDispatchedOps.end(); + ) { + KfsOp* const op = it->second; + if (op->noRetry || shutdownFlag) { + mDispatchedOps.erase(it++); + doneOps.push_back(op); + } else { + ++it; + } + } + for (; ;) { + for (OpsQueue::const_iterator it = doneOps.begin(); + it != doneOps.end(); + ++it) { + KfsOp* const op = *it; + op->status = -EHOSTUNREACH; + SubmitOpResponse(op); + } + if (! 
shutdownFlag || mPendingOps.empty()) { + break; + } + doneOps.clear(); + mPendingOps.swap(doneOps); + } +} + +bool +MetaServerSM::HandleMsg(IOBuffer *iobuf, int msgLen) +{ + char buf[3]; + if (iobuf->CopyOut(buf, 3) == 3 && + buf[0] == 'O' && buf[1] == 'K' && (buf[2] & 0xFF) <= ' ') { + // This is a response to some op we sent earlier + return HandleReply(iobuf, msgLen); + } else { + // is an RPC from the server + return HandleCmd(iobuf, msgLen); + } +} + +bool +MetaServerSM::HandleReply(IOBuffer *iobuf, int msgLen) +{ + Properties prop; + { + IOBuffer::IStream is(*iobuf, msgLen); + const char separator = ':'; + prop.loadProperties(is, separator, false); + } + iobuf->Consume(msgLen); + + const kfsSeq_t seq = prop.getValue("Cseq", (kfsSeq_t)-1); + const int status = prop.getValue("Status", -1); + if (mHelloOp) { + if (status == -EBADCLUSTERKEY) { + KFS_LOG_STREAM_FATAL << + "exiting due to cluster key mismatch; our key: " << mClusterKey << + KFS_LOG_EOM; + globalNetManager().Shutdown(); + return false; + } + mCounters.mHelloCount++; + const bool err = seq != mHelloOp->seq || status != 0; + if (err) { + KFS_LOG_STREAM_ERROR << + " bad hello response:" + " seq: " << seq << "/" << mHelloOp->seq << + " status: " << status << + KFS_LOG_EOM; + mCounters.mHelloErrorCount++; + } + HelloMetaOp::LostChunkDirs lostDirs; + lostDirs.swap(mHelloOp->lostChunkDirs); + delete mHelloOp; + mHelloOp = 0; + if (err) { + HandleRequest(EVENT_NET_ERROR, 0); + return false; + } + mConnectedTime = globalNetManager().Now(); + ResubmitOps(); + for (HelloMetaOp::LostChunkDirs::const_iterator it = lostDirs.begin(); + it != lostDirs.end(); + ++it) { + EnqueueOp(new CorruptChunkOp(0, -1, -1, &(*it), false)); + } + return true; + } + DispatchedOps::iterator const iter = mDispatchedOps.find(seq); + if (iter == mDispatchedOps.end()) { + string reply; + prop.getList(reply, string(), string(" ")); + KFS_LOG_STREAM_DEBUG << "meta reply:" + " no op found for: " << reply << + KFS_LOG_EOM; + return true; + 
} + KfsOp* const op = iter->second; + mDispatchedOps.erase(iter); + op->status = status; + KFS_LOG_STREAM_DEBUG << + "recv meta reply:" + " seq: " << seq << + " status: " << status << + " " << op->Show() << + KFS_LOG_EOM; + // The op will be gotten rid of by this call. + SubmitOpResponse(op); + return true; +} + +/// +/// We have a command in a buffer. It is possible that we don't have +/// everything we need to execute it (for example, for a stale chunks +/// RPC, we may not have received all the chunkids). So, parse +/// out the command and if we have everything execute it. +/// + +bool +MetaServerSM::HandleCmd(IOBuffer* iobuf, int cmdLen) +{ + KfsOp* op = 0; + if (ParseCommand(*iobuf, cmdLen, &op) != 0) { + IOBuffer::IStream is(*iobuf, cmdLen); + const string peer = IsConnected() ? + mNetConnection->GetPeerName() : string("not connected"); + string line; + int numLines = 32; + while (--numLines >= 0 && getline(is, line)) { + KFS_LOG_STREAM_ERROR << peer << + " invalid meta request: " << line << + KFS_LOG_EOM; + } + iobuf->Clear(); + HandleRequest(EVENT_NET_ERROR, 0); + // got a bogus command + return false; + } + + const int contentLength = op->GetContentLength(); + const int remLen = cmdLen + contentLength - iobuf->BytesConsumable(); + if (remLen > 0) { + // if we don't have all the data wait... + if (remLen > mMaxReadAhead && mNetConnection) { + mNetConnection->SetMaxReadAhead(remLen); + } + delete op; + return false; + } + if (mNetConnection) { + mNetConnection->SetMaxReadAhead(mMaxReadAhead); + } + iobuf->Consume(cmdLen); + if (contentLength > 0) { + IOBuffer::IStream is(*iobuf, contentLength); + if (! op->ParseContent(is)) { + KFS_LOG_STREAM_ERROR << + (IsConnected() ? 
mNetConnection->GetPeerName() : "") << + " invalid content: " << op->statusMsg << + " cmd: " << op->Show() << + KFS_LOG_EOM; + delete op; + HandleRequest(EVENT_NET_ERROR, 0); + return false; + } + iobuf->Consume(contentLength); + } + mLastRecvCmdTime = globalNetManager().Now(); + op->clnt = this; + KFS_LOG_STREAM_DEBUG << + "recv meta cmd:" + " seq: " << op->seq << + " " << op->Show() << + KFS_LOG_EOM; + SubmitOp(op); + return true; +} + +void +MetaServerSM::EnqueueOp(KfsOp* op) +{ + op->seq = nextSeq(); + if (mPendingOps.empty() && IsUp()) { + if (! op->noReply && + ! mDispatchedOps.insert(make_pair(op->seq, op)).second) { + die("duplicate seq. number"); + } + op->Request(mWOStream.Set(mNetConnection->GetOutBuffer())); + mWOStream.Reset(); + op->status = 0; + if (op->noReply) { + SubmitOpResponse(op); + } + } else { + if (globalNetManager().IsRunning()) { + mPendingOps.push_back(op); + } else { + op->status = -EHOSTUNREACH; + SubmitOpResponse(op); + } + } + globalNetManager().Wakeup(); +} + +/// +/// Queue the response to the meta server request. The response is +/// generated by MetaRequest as per the protocol. +/// @param[in] op The request for which we finished execution. +/// + +void +MetaServerSM::SendResponse(KfsOp* op) +{ + if (! mSentHello || ! IsConnected()) { + // Hello does full chunk inventory synchronization. + // Meta server assumes undefined state for all requests that were in + // in flight at the time of disconnect, and will discard the responses + // anyway, as it will purge its pending response queue at the time of + // disconnect. + return; + } + // fire'n'forget. + KFS_LOG_STREAM_DEBUG << + "send meta reply:" + " seq: " << op->seq << + (op->statusMsg.empty() ? 
"" : " msg: ") << op->statusMsg << + " status: " << op->status << + " " << op->Show() << + KFS_LOG_EOM; + if (op->op == CMD_ALLOC_CHUNK) { + mCounters.mAllocCount++; + if (op->status < 0) { + mCounters.mAllocErrorCount++; + } + } + op->Response(mWOStream.Set(mNetConnection->GetOutBuffer())); + mWOStream.Reset(); + globalNetManager().Wakeup(); +} + +void +MetaServerSM::DispatchOps() +{ + while (! mPendingOps.empty() && IsHandshakeDone()) { + if (! IsConnected()) { + KFS_LOG_STREAM_INFO << + "meta handshake is not done, will dispatch later" << + KFS_LOG_EOM; + return; + } + KfsOp* const op = mPendingOps.front(); + mPendingOps.pop_front(); + assert(op->op != CMD_META_HELLO); + if (! op->noReply && + ! mDispatchedOps.insert(make_pair(op->seq, op)).second) { + die("duplicate seq. number"); + } + KFS_LOG_STREAM_DEBUG << + "send meta cmd:" + " seq: " << op->seq << + " " << op->Show() << + KFS_LOG_EOM; + op->Request(mWOStream.Set(mNetConnection->GetOutBuffer())); + mWOStream.Reset(); + } +} + +// After re-establishing connection to the server, resubmit the ops. +void +MetaServerSM::ResubmitOps() +{ + if (mDispatchedOps.empty()) { + return; + } + ostream& os = mWOStream.Set(mNetConnection->GetOutBuffer()); + for (DispatchedOps::const_iterator it = mDispatchedOps.begin(); + it != mDispatchedOps.end(); + ++it) { + it->second->Request(os); + } + mWOStream.Reset(); +} + +} // namespace KFS diff --git a/src/cc/chunk/MetaServerSM.h b/src/cc/chunk/MetaServerSM.h new file mode 100644 index 000000000..f6fb8baa2 --- /dev/null +++ b/src/cc/chunk/MetaServerSM.h @@ -0,0 +1,242 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/07 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). 
+// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file MetaServerSM.h +// \brief State machine that interfaces with the meta server and +// handles the RPCs sent by the meta server. +// +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_METASERVERSM_H +#define CHUNKSERVER_METASERVERSM_H + +#include "KfsOps.h" + +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/ITimeout.h" +#include "kfsio/NetConnection.h" +#include "kfsio/IOBuffer.h" +#include "common/StdAllocator.h" + +#include +#include +#include +#include + +namespace KFS +{ +using std::string; +using std::deque; +using std::map; + +class MetaServerSMTimeoutImpl; +class Properties; + +class MetaServerSM : public KfsCallbackObj, private ITimeout { +public: + struct Counters + { + typedef int64_t Counter; + + Counter mConnectCount; + Counter mHelloCount; + Counter mHelloErrorCount; + Counter mAllocCount; + Counter mAllocErrorCount; + + void Clear() + { + mConnectCount = 0; + mHelloCount = 0; + mHelloErrorCount = 0; + mAllocCount = 0; + mAllocErrorCount = 0; + } + }; + + MetaServerSM(); + ~MetaServerSM(); + + /// In each hello to the metaserver, we send an MD5 sum of the + /// binaries. This should be "acceptable" to the metaserver and + /// only then is the chunkserver allowed in. 
This provides a + /// simple mechanism to identify nodes that didn't receive a + /// binary update or are running versions that the metaserver + /// doesn't know about and shouldn't be inlcuded in the system. + void SetMetaInfo(const ServerLocation &metaLoc, const string &clusterKey, int rackId, + const string &md5sum, const Properties& prop); + + void Init(); + + /// Send HELLO message. This sends an op down to the event + /// processor to get all the info. + int SendHello(); + + /// Generic event handler to handle RPC requests sent by the meta server. + int HandleRequest(int code, void *data); + + bool HandleMsg(IOBuffer *iobuf, int msgLen); + + void EnqueueOp(KfsOp *op); + + /// If the connection to the server breaks, periodically, retry to + /// connect; also dispatch ops. + virtual void Timeout(); + + /// Return the meta server name/port information + ServerLocation GetLocation() const { + return mLocation; + } + + kfsSeq_t nextSeq() { + return mCmdSeq++; + } + + time_t GetLastRecvCmdTime() const { + return mLastRecvCmdTime; + } + + bool IsConnected() const { + return (mNetConnection && mNetConnection->IsGood()); + } + + bool IsHandshakeDone() const { + return (mSentHello && ! mHelloOp); + } + + bool IsUp() const { + return (IsConnected() && IsHandshakeDone()); + } + + time_t ConnectionUptime() const; + + void GetCounters(Counters& counters) { + counters = mCounters; + } + + void Reconnect() { + mReconnectFlag = true; + } + + void SetParameters(const Properties& prop); +private: + typedef deque OpsQueue; + typedef std::map< + kfsSeq_t, + KfsOp*, + std::less, + StdFastAllocator< + std::pair + > + > DispatchedOps; + + kfsSeq_t mCmdSeq; + /// where is the meta server located? + ServerLocation mLocation; + + /// An id that specifies the rack on which the server is located; + /// this is used to do rack-aware replication + int mRackId; + + /// "shared secret" key for this cluster. 
This is used to prevent + /// config mishaps: When we connect to a metaserver, we have to + /// agree on the cluster key. + string mClusterKey; + + /// An MD5 sum computed over the binaries that we send to the metaserver. + string mMD5Sum; + + /// the port that the metaserver tells the clients to connect to us at. + int mChunkServerPort; + + /// the hostname to use for discovering our IP address + /// (instead of using gethostname) + string mChunkServerHostname; + + /// Track if we have sent a "HELLO" to metaserver + bool mSentHello; + + /// a handle to the hello op. The model: the network processor + /// queues the hello op to the event processor; the event + /// processor pulls the result and enqueues the op back to us; the + /// network processor dispatches the op and gets rid of it. + HelloMetaOp *mHelloOp; + + /// list of ops that need to be dispatched: this is the queue that + /// is shared between the event processor and the network + /// dispatcher. When the network dispatcher runs, it pulls ops + /// from this queue and stashes them away in the dispatched list. + OpsQueue mPendingOps; + + /// ops that we have sent to metaserver and are waiting for reply. + DispatchedOps mDispatchedOps; + + /// Our connection to the meta server. + NetConnectionPtr mNetConnection; + + /// A timer to periodically check that the connection to the + /// server is good; if the connection broke, reconnect and do the + /// handshake again. Also, we use the timeout to dispatch pending + /// messages to the server. + int mInactivityTimeout; + int mMaxReadAhead; + time_t mLastRecvCmdTime; + time_t mLastConnectTime; + time_t mConnectedTime; + bool mReconnectFlag; + Counters mCounters; + IOBuffer::WOStream mWOStream; + + /// Connect to the meta server + /// @retval 0 if connect was successful; -1 otherwise + int Connect(); + + /// Given a (possibly) complete op in a buffer, run it. + bool HandleCmd(IOBuffer *iobuf, int cmdLen); + /// Handle a reply to an RPC we previously sent. 
+ bool HandleReply(IOBuffer *iobuf, int msgLen); + + /// Op has finished execution. Send a response to the meta + /// server. + void SendResponse(KfsOp *op); + + /// This is special: we dispatch mHelloOp and get rid of it. + void DispatchHello(); + + /// Submit all the enqueued ops + void DispatchOps(); + + /// We reconnected to the metaserver; so, resend all the pending ops. + void ResubmitOps(); + void FailOps(bool shutdownFlag); +private: + // No copy. + MetaServerSM(const MetaServerSM&); + MetaServerSM& operator=(const MetaServerSM&); +}; + +extern MetaServerSM gMetaServerSM; + +} + +#endif // CHUNKSERVER_METASERVERSM_H diff --git a/src/cc/chunk/RemoteSyncSM.cc b/src/cc/chunk/RemoteSyncSM.cc new file mode 100644 index 000000000..0a5110fb1 --- /dev/null +++ b/src/cc/chunk/RemoteSyncSM.cc @@ -0,0 +1,529 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/27 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "RemoteSyncSM.h" +#include "utils.h" +#include "ChunkServer.h" +#include "kfsio/NetManager.h" +#include "kfsio/Globals.h" + +#include "common/MsgLogger.h" +#include "common/Properties.h" + +#include +#include +#include +#include +#include +#include + +namespace KFS +{ + +using std::for_each; +using std::list; +using std::istringstream; +using std::string; +using std::make_pair; + +using namespace KFS::libkfsio; + +const int kMaxCmdHeaderLength = 2 << 10; +bool RemoteSyncSM::sTraceRequestResponse = false; +int RemoteSyncSM::sOpResponseTimeoutSec = 5 * 60; // 5 min op response timeout + +static kfsSeq_t +NextSeq() +{ + static kfsSeq_t sSeqno = GetRandomSeq(); + return sSeqno++; +} + +inline void +RemoteSyncSM::UpdateRecvTimeout() +{ + if (sOpResponseTimeoutSec < 0 || ! mNetConnection) { + return; + } + const time_t now = globalNetManager().Now(); + const time_t end = mLastRecvTime + sOpResponseTimeoutSec; + mNetConnection->SetInactivityTimeout(end > now ? end - now : 0); +} + +// State machine for communication with other chunk servers. +RemoteSyncSM::RemoteSyncSM(const ServerLocation &location) + : KfsCallbackObj(), + mNetConnection(), + mLocation(location), + mSeqnum(NextSeq()), + mDispatchedOps(), + mReplySeqNum(-1), + mReplyNumBytes(0), + mRecursionCount(0), + mLastRecvTime(0), + mWOStream() +{ +} + +kfsSeq_t +RemoteSyncSM::NextSeqnum() +{ + mSeqnum = NextSeq(); + return mSeqnum; +} + +RemoteSyncSM::~RemoteSyncSM() +{ + if (mNetConnection) + mNetConnection->Close(); + assert(mDispatchedOps.size() == 0); +} + +bool +RemoteSyncSM::Connect() +{ + assert(! 
mNetConnection); + + KFS_LOG_STREAM_DEBUG << + "Trying to connect to: " << mLocation.ToString() << + KFS_LOG_EOM; + + TcpSocket* const sock = new TcpSocket(); + // do a non-blocking connect + const int res = sock->Connect(mLocation, true); + if ((res < 0) && (res != -EINPROGRESS)) { + KFS_LOG_STREAM_INFO << + "Connect to remote server (" << mLocation << + ") failed: code = " << res << + KFS_LOG_EOM; + delete sock; + return false; + } + + KFS_LOG_STREAM_INFO << + "Connect to remote server " << mLocation.ToString() << + " succeeded (res = " << res << ")" << + KFS_LOG_EOM; + + SET_HANDLER(this, &RemoteSyncSM::HandleEvent); + + mNetConnection.reset(new NetConnection(sock, this)); + mNetConnection->SetDoingNonblockingConnect(); + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + mLastRecvTime = globalNetManager().Now(); + + // If there is no activity on this socket, we want + // to be notified, so that we can close connection. + mNetConnection->SetInactivityTimeout(sOpResponseTimeoutSec); + // Add this to the poll vector + globalNetManager().AddConnection(mNetConnection); + + return true; +} + +void +RemoteSyncSM::Enqueue(KfsOp* op) +{ + if (mNetConnection && ! mNetConnection->IsGood()) { + KFS_LOG_STREAM_INFO << + "Lost connection to peer " << mLocation.ToString() << + " failed; failing ops" << + KFS_LOG_EOM; + mNetConnection->Close(); + mNetConnection.reset(); + } + op->seq = NextSeqnum(); + if (! mNetConnection && ! Connect()) { + KFS_LOG_STREAM_INFO << + "Connect to peer " << mLocation.ToString() << + " failed; failing ops" << + KFS_LOG_EOM; + if (! mDispatchedOps.insert(make_pair(op->seq, op)).second) { + die("duplicate seq. 
number"); + } + FailAllOps(); + return; + } + if (mDispatchedOps.empty()) { + mLastRecvTime = globalNetManager().Now(); + } + KFS_LOG_STREAM_DEBUG << + "forwarding to " << mLocation.ToString() << + " " << op->Show() << + KFS_LOG_EOM; + IOBuffer& buf = mNetConnection->GetOutBuffer(); + const int start = buf.BytesConsumable(); + op->Request(mWOStream.Set(buf)); + mWOStream.Reset(); + if (sTraceRequestResponse) { + IOBuffer::IStream is(buf, buf.BytesConsumable()); + is.ignore(start); + char buf[128]; + KFS_LOG_STREAM_DEBUG << reinterpret_cast(this) << + " send to: " << mLocation.ToString() << + KFS_LOG_EOM; + while (is.getline(buf, sizeof(buf))) { + KFS_LOG_STREAM_DEBUG << reinterpret_cast(this) << + " request: " << buf << + KFS_LOG_EOM; + } + } + if (op->op == CMD_CLOSE) { + // fire'n'forget + op->status = 0; + SubmitOpResponse(op); + } else if (op->op == CMD_WRITE_PREPARE_FWD) { + // send the data as well + WritePrepareFwdOp* const wpfo = static_cast(op); + op->status = 0; + mNetConnection->WriteCopy(wpfo->owner.dataBuf, + wpfo->owner.dataBuf->BytesConsumable()); + if (wpfo->owner.replyRequestedFlag) { + if (! mDispatchedOps.insert(make_pair(op->seq, op)).second) { + die("duplicate seq. number"); + } + } else { + // fire'n'forget + SubmitOpResponse(op); + } + } else { + if (op->op == CMD_RECORD_APPEND) { + // send the append over; we'll get an ack back + RecordAppendOp* const ra = static_cast(op); + mNetConnection->Write(&ra->dataBuf, ra->numBytes); + } + if (! mDispatchedOps.insert(make_pair(op->seq, op)).second) { + die("duplicate seq. number"); + } + } + UpdateRecvTimeout(); + if (mRecursionCount <= 0 && mNetConnection) { + mNetConnection->StartFlush(); + } +} + +int +RemoteSyncSM::HandleEvent(int code, void *data) +{ + IOBuffer *iobuf; + int msgLen = 0; + // take a ref to prevent the object from being deleted + // while we are still in this function. 
+ RemoteSyncSMPtr self = shared_from_this(); + const char *reason = "error"; + + mRecursionCount++; + assert(mRecursionCount > 0); + switch (code) { + case EVENT_NET_READ: + mLastRecvTime = globalNetManager().Now(); + // We read something from the network. Run the RPC that + // came in if we got all the data for the RPC + iobuf = (IOBuffer *) data; + while ((mReplyNumBytes > 0 || IsMsgAvail(iobuf, &msgLen)) && + HandleResponse(iobuf, msgLen) >= 0) + {} + UpdateRecvTimeout(); + break; + + case EVENT_NET_WROTE: + // Something went out on the network. For now, we don't + // track it. Later, we may use it for tracking throttling + // and such. + UpdateRecvTimeout(); + break; + + + case EVENT_INACTIVITY_TIMEOUT: + reason = "inactivity timeout"; + case EVENT_NET_ERROR: + // If there is an error or there is no activity on the socket + // for N mins, we close the connection. + KFS_LOG_STREAM_INFO << "Closing connection to peer: " << + mLocation.ToString() << " due to " << reason << + KFS_LOG_EOM; + if (mNetConnection) { + mNetConnection->Close(); + mNetConnection.reset(); + } + break; + + default: + assert(!"Unknown event"); + break; + } + assert(mRecursionCount > 0); + if (mRecursionCount <= 1) { + const bool connectedFlag = mNetConnection && mNetConnection->IsGood(); + if (connectedFlag) { + mNetConnection->StartFlush(); + } + if (! connectedFlag || ! mNetConnection || ! mNetConnection->IsGood()) { + // we are done... 
+ Finish(); + } + } + mRecursionCount--; + return 0; + +} + +int +RemoteSyncSM::HandleResponse(IOBuffer *iobuf, int msgLen) +{ + DispatchedOps::iterator i = mDispatchedOps.end(); + int nAvail = iobuf->BytesConsumable(); + + if (mReplyNumBytes <= 0) { + assert(msgLen >= 0 && msgLen <= nAvail); + if (sTraceRequestResponse) { + IOBuffer::IStream is(*iobuf, msgLen); + const string loc(mLocation.ToString()); + string line; + while (getline(is, line)) { + KFS_LOG_STREAM_DEBUG << reinterpret_cast(this) << + loc << " response: " << line << + KFS_LOG_EOM; + } + } + Properties prop; + { + const char separator(':'); + IOBuffer::IStream is(*iobuf, msgLen); + prop.loadProperties(is, separator, false); + } + iobuf->Consume(msgLen); + mReplySeqNum = prop.getValue("Cseq", (kfsSeq_t) -1); + if (mReplySeqNum < 0) { + KFS_LOG_STREAM_ERROR << + "invalid or missing Cseq header: " << mReplySeqNum << + ", resetting connection" << + KFS_LOG_EOM; + HandleEvent(EVENT_NET_ERROR, 0); + } + mReplyNumBytes = prop.getValue("Content-length", (long long) 0); + nAvail -= msgLen; + i = mDispatchedOps.find(mReplySeqNum); + KfsOp* const op = i != mDispatchedOps.end() ? 
i->second : 0; + if (op) { + op->status = prop.getValue("Status", -1); + if (op->op == CMD_WRITE_ID_ALLOC) { + WriteIdAllocOp *wiao = static_cast(op); + wiao->writeIdStr = prop.getValue("Write-id", ""); + wiao->writePrepareReplyFlag = + prop.getValue("Write-prepare-reply", 0) != 0; + } else if (op->op == CMD_READ) { + ReadOp *rop = static_cast (op); + const int checksumEntries = prop.getValue("Checksum-entries", 0); + if (checksumEntries > 0) { + istringstream is(prop.getValue("Checksums", "")); + uint32_t cks; + for (int i = 0; i < checksumEntries; i++) { + is >> cks; + rop->checksum.push_back(cks); + } + } + const int off(rop->offset % IOBufferData::GetDefaultBufferSize()); + if (off > 0) { + IOBuffer buf; + buf.ReplaceKeepBuffersFull(iobuf, off, nAvail); + iobuf->Move(&buf); + iobuf->Consume(off); + } else { + iobuf->MakeBuffersFull(); + } + } else if (op->op == CMD_SIZE) { + SizeOp *sop = static_cast(op); + sop->size = prop.getValue("Size", 0); + } else if (op->op == CMD_GET_CHUNK_METADATA) { + GetChunkMetadataOp *gcm = static_cast(op); + gcm->chunkVersion = prop.getValue("Chunk-version", 0); + gcm->chunkSize = prop.getValue("Size", 0); + } + } + } + + // if we don't have all the data for the write, hold on... + if (nAvail < mReplyNumBytes) { + // the data isn't here yet...wait... + if (mNetConnection) { + mNetConnection->SetMaxReadAhead(mReplyNumBytes - nAvail); + } + return -1; + } + + // now, we got everything... 
+ if (mNetConnection) { + mNetConnection->SetMaxReadAhead(kMaxCmdHeaderLength); + } + + // find the matching op + if (i == mDispatchedOps.end()) { + i = mDispatchedOps.find(mReplySeqNum); + } + if (i != mDispatchedOps.end()) { + KfsOp *const op = i->second; + mDispatchedOps.erase(i); + if (op->op == CMD_READ) { + ReadOp *rop = static_cast (op); + if (rop->dataBuf == NULL) + rop->dataBuf = new IOBuffer(); + rop->dataBuf->Move(iobuf, mReplyNumBytes); + rop->numBytesIO = mReplyNumBytes; + } else if (op->op == CMD_GET_CHUNK_METADATA) { + GetChunkMetadataOp *gcm = static_cast(op); + if (gcm->dataBuf == NULL) + gcm->dataBuf = new IOBuffer(); + gcm->dataBuf->Move(iobuf, mReplyNumBytes); + } + mReplyNumBytes = 0; + // op->HandleEvent(EVENT_DONE, op); + SubmitOpResponse(op); + } else { + KFS_LOG_STREAM_DEBUG << + "Discarding a reply for unknown seq #: " << mReplySeqNum << + KFS_LOG_EOM; + mReplyNumBytes = 0; + } + return 0; +} + +// Helper functor that fails an op with an error code. +class OpFailer +{ + const int errCode; +public: + OpFailer(int c) + : errCode(c) + {} + template + void operator()(const T& val) + { + KfsOp* const op = val.second; + op->status = errCode; + SubmitOpResponse(op); + } +}; + + +void +RemoteSyncSM::FailAllOps() +{ + if (mDispatchedOps.empty()) { + return; + } + // There is a potential recursive call: if a client owns this + // object and the client got a network error, the client will call + // here to fail the outstandnig ops; when an op the client is + // notified and the client calls to close out this object. We'll + // be right back here trying to fail an op and will core. To + // avoid, swap out the ops and try. 
+ DispatchedOps opsToFail; + + mDispatchedOps.swap(opsToFail); + for_each(opsToFail.begin(), opsToFail.end(), + OpFailer(-EHOSTUNREACH)); + opsToFail.clear(); +} + +void +RemoteSyncSM::Finish() +{ + FailAllOps(); + if (mNetConnection) { + mNetConnection->Close(); + mNetConnection.reset(); + } + // if the object was owned by the chunkserver, have it release the reference + gChunkServer.RemoveServer(this); +} + +// +// Utility functions to operate on a list of remotesync servers +// + +class RemoteSyncSMMatcher { + const ServerLocation myLoc; +public: + RemoteSyncSMMatcher(const ServerLocation &loc) + : myLoc(loc) + {} + bool operator() (const RemoteSyncSMPtr& other) + { + return other->GetLocation() == myLoc; + } +}; + +RemoteSyncSMPtr +FindServer(list &remoteSyncers, const ServerLocation &location, + bool connect) +{ + RemoteSyncSMPtr peer; + + list::iterator const i = find_if( + remoteSyncers.begin(), remoteSyncers.end(), + RemoteSyncSMMatcher(location)); + if (i != remoteSyncers.end()) { + peer = *i; + return peer; + } + if (!connect) { + return peer; + } + peer.reset(new RemoteSyncSM(location)); + if (peer->Connect()) { + remoteSyncers.push_back(peer); + } else { + // we couldn't connect...so, force destruction + peer.reset(); + } + return peer; +} + +void +RemoveServer(list& remoteSyncers, RemoteSyncSM* target) +{ + if (! target) { + return; + } + list::iterator const i = find( + remoteSyncers.begin(), remoteSyncers.end(), target->shared_from_this()); + if (i != remoteSyncers.end()) { + remoteSyncers.erase(i); + } +} + +void +ReleaseAllServers(list& remoteSyncers) +{ + while (! 
remoteSyncers.empty()) { + RemoteSyncSMPtr const r = remoteSyncers.front(); + remoteSyncers.pop_front(); + r->Finish(); + } +} + +} diff --git a/src/cc/chunk/RemoteSyncSM.h b/src/cc/chunk/RemoteSyncSM.h new file mode 100644 index 000000000..7d3f4d78f --- /dev/null +++ b/src/cc/chunk/RemoteSyncSM.h @@ -0,0 +1,132 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/27 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_REMOTESYNCSM_H +#define CHUNKSERVER_REMOTESYNCSM_H + +#include "common/kfsdecls.h" +#include "common/StdAllocator.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/NetConnection.h" + +#include + +#include +#include +#include +#include + +namespace KFS +{ +using std::map; +using std::list; +using std::less; + +class RemoteSyncSMTimeoutImpl; +struct KfsOp; + +// State machine for communication with other chunk servers: daisy chain rpc +// forwarding, and re-replication data and meta-data chunk read. 
+class RemoteSyncSM : public KfsCallbackObj, + public boost::enable_shared_from_this +{ +public: + + RemoteSyncSM(const ServerLocation &location); + + ~RemoteSyncSM(); + + bool Connect(); + + void Enqueue(KfsOp *op); + + void Finish(); + + int HandleEvent(int code, void *data); + + ServerLocation GetLocation() const { + return mLocation; + } + static void SetTraceRequestResponse(bool flag) { + sTraceRequestResponse = flag; + } + static void SetResponseTimeoutSec(int timeoutSec) { + sOpResponseTimeoutSec = timeoutSec; + } + static int GetResponseTimeoutSec() { + return sOpResponseTimeoutSec; + } + +private: + typedef map< + kfsSeq_t, + KfsOp*, + less, + StdFastAllocator< + std::pair + > + > DispatchedOps; + + NetConnectionPtr mNetConnection; + ServerLocation mLocation; + /// Assign a sequence # for each op we send to the remote server + kfsSeq_t mSeqnum; + /// Queue of outstanding ops sent to remote server. + DispatchedOps mDispatchedOps; + kfsSeq_t mReplySeqNum; + int mReplyNumBytes; + int mRecursionCount; + time_t mLastRecvTime; + IOBuffer::WOStream mWOStream; + + kfsSeq_t NextSeqnum(); + + /// We (may) have got a response from the peer. If we are doing + /// re-replication, then we need to wait until we got all the data + /// for the op; in such cases, we need to know if we got the full + /// response. + /// @retval 0 if we got the response; -1 if we need to wait + int HandleResponse(IOBuffer *iobuf, int cmdLen); + void FailAllOps(); + inline void UpdateRecvTimeout(); + static bool sTraceRequestResponse; + static int sOpResponseTimeoutSec; +private: + // No copy. 
+ RemoteSyncSM(const RemoteSyncSM&); + RemoteSyncSM& operator=(const RemoteSyncSM&); +}; + +typedef boost::shared_ptr RemoteSyncSMPtr; + +RemoteSyncSMPtr FindServer(list& remoteSyncers, + const ServerLocation &location, bool connect); +void RemoveServer(list& remoteSyncers, RemoteSyncSM* target); +void ReleaseAllServers(list &remoteSyncers); + +} + +#endif // CHUNKSERVER_REMOTESYNCSM_H diff --git a/src/cc/chunk/Replicator.cc b/src/cc/chunk/Replicator.cc new file mode 100644 index 000000000..1afa74e39 --- /dev/null +++ b/src/cc/chunk/Replicator.cc @@ -0,0 +1,1144 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2007/01/17 +// Author: Sriram Rao +// Mike Ovsiannikov -- rework re-replication to protect against +// duplicate requests. Implement chunk recovery. +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2007-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Code for dealing with chunk re-replication and recovery. +// The meta server instructs chunk server to obtain a copy of a chunk from a +// source chunk server, or recover chunk by reading other available chunks in +// the RS block and recomputing the missing chunk data. The chunk server reads +// the chunk data from the other chunk server(s) writes chunk replica to disk. 
+// At the end replication, the destination chunk server notifies the meta +// server. +// +//---------------------------------------------------------------------------- + +#include "Replicator.h" +#include "ChunkServer.h" +#include "utils.h" +#include "RemoteSyncSM.h" +#include "KfsOps.h" +#include "Logger.h" +#include "BufferManager.h" +#include "DiskIo.h" + +#include "common/MsgLogger.h" +#include "common/StdAllocator.h" +#include "qcdio/qcstutils.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/NetConnection.h" +#include "kfsio/Globals.h" +#include "kfsio/checksum.h" +#include "libclient/KfsNetClient.h" +#include "libclient/Reader.h" +#include "libclient/KfsOps.h" + +#include +#include + +namespace KFS +{ + +using std::string; +using std::string; +using std::ostringstream; +using std::istringstream; +using std::pair; +using std::make_pair; +using std::max; +using std::min; +using KFS::libkfsio::globalNetManager; +using KFS::client::Reader; +using KFS::client::KfsNetClient; + +class ReplicatorImpl : + public KfsCallbackObj, + public QCRefCountedObj, + public BufferManager::Client +{ +public: + // Model for doing a chunk replication involves 3 steps: + // - First, figure out the size of the chunk. + // - Second in a loop: + // - read N bytes from the source + // - write N bytes to disk + // - Third, notify the metaserver of the status (0 to mean + // success, -1 on failure). + // + // During replication, the chunk isn't part of the chunkTable data + // structure that is maintained locally. This is done for + // simplifying failure handling: if we die in the midst of + // replication, upon restart, we will find an incomplete chunk, i.e. + // chunk with with 0 version in the the dirty directory. Such chunks + // will be deleted upon restart. 
+ // + typedef Replicator::Counters Counters; + static int GetNumReplications(); + static void CancelAll(); + static void SetParameters(const Properties& props) + { + sUseConnectionPoolFlag = props.getValue( + "chunkServer.rsReader.meta.idleTimeoutSec", + sUseConnectionPoolFlag ? 1 : 0 + ) != 0; + } + static void GetCounters(Replicator::Counters& counters); + + ReplicatorImpl(ReplicateChunkOp *op, const RemoteSyncSMPtr &peer); + void Run(); + // Handle the callback for a size request + int HandleStartDone(int code, void *data); + // Handle the callback for a remote read request + int HandleReadDone(int code, void *data); + // Handle the callback for a write + int HandleWriteDone(int code, void *data); + // When replication done, we write out chunk meta-data; this is + // the handler that gets called when this event is done. + int HandleReplicationDone(int code, void *data); + virtual void Granted(ByteCount byteCount); + static Counters& Ctrs() + { return sCounters; }; + static bool GetUseConnectionPoolFlag() + { return sUseConnectionPoolFlag; } + +protected: + // Inputs from the metaserver + kfsFileId_t mFileId; + kfsChunkId_t mChunkId; + kfsSeq_t mChunkVersion; + // What we obtain from the src from where we download the chunk. + int64_t mChunkSize; + // The op that triggered this replication operation. + ReplicateChunkOp* mOwner; + // What is the offset we are currently reading at + int64_t mOffset; + // Handle to the peer from where we have to get data + RemoteSyncSMPtr mPeer; + + GetChunkMetadataOp mChunkMetadataOp; + ReadOp mReadOp; + WriteOp mWriteOp; + // Are we done yet? + bool mDone; + bool mCancelFlag; + + virtual ~ReplicatorImpl(); + // Cleanup... + void Terminate(); + string GetPeerName() const; + // Start by sending out a size request + virtual void Start(); + // Send out a read request to the peer + virtual void Read(); + virtual void Cancel() + { + mCancelFlag = true; + if (IsWaiting()) { + // Cancel buffers wait, and fail the op. 
+ CancelRequest(); + Terminate(); + } + } + virtual ByteCount GetBufferBytesRequired() const; + +private: + typedef std::map< + kfsChunkId_t, ReplicatorImpl*, + std::less, + StdFastAllocator< + std::pair + > + > InFlightReplications; + + static InFlightReplications sInFlightReplications; + static Counters sCounters; + static int sReplicationCount; + static bool sUseConnectionPoolFlag; +private: + // No copy. + ReplicatorImpl(const ReplicatorImpl&); + ReplicatorImpl& operator=(const ReplicatorImpl&); +}; + +const int kDefaultReplicationReadSize = (int)( + ((1 << 20) + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); +ReplicatorImpl::InFlightReplications ReplicatorImpl::sInFlightReplications; +ReplicatorImpl::Counters ReplicatorImpl::sCounters; +int ReplicatorImpl::sReplicationCount = 0; +bool ReplicatorImpl::sUseConnectionPoolFlag = false; + +int +ReplicatorImpl::GetNumReplications() +{ + if (sInFlightReplications.empty()) { + sReplicationCount = 0; + } + return sReplicationCount; +} + +void +ReplicatorImpl::CancelAll() +{ + for (InFlightReplications::iterator it = sInFlightReplications.begin(); + it != sInFlightReplications.end(); + ++it) { + it->second->Cancel(); + } + sReplicationCount = 0; +} + +void ReplicatorImpl::GetCounters(ReplicatorImpl::Counters& counters) +{ + counters = sCounters; +} + +ReplicatorImpl::ReplicatorImpl(ReplicateChunkOp *op, const RemoteSyncSMPtr &peer) : + KfsCallbackObj(), + QCRefCountedObj(), + BufferManager::Client(), + mFileId(op->fid), + mChunkId(op->chunkId), + mChunkVersion(op->chunkVersion), + mOwner(op), + mOffset(0), + mPeer(peer), + mChunkMetadataOp(0), + mReadOp(0), + mWriteOp(op->chunkId, op->chunkVersion), + mDone(false), + mCancelFlag(false) +{ + mReadOp.chunkId = op->chunkId; + mReadOp.chunkVersion = op->chunkVersion; + mReadOp.clnt = this; + mWriteOp.clnt = this; + mChunkMetadataOp.clnt = this; + mWriteOp.Reset(); + mWriteOp.isFromReReplication = true; + SET_HANDLER(&mReadOp, 
&ReadOp::HandleReplicatorDone); + Ctrs().mReplicatorCount++; +} + +ReplicatorImpl::~ReplicatorImpl() +{ + InFlightReplications::iterator const it = + sInFlightReplications.find(mChunkId); + if (it != sInFlightReplications.end() && it->second == this) { + if (! mCancelFlag && sReplicationCount > 0) { + sReplicationCount--; + } + sInFlightReplications.erase(it); + } + assert(! mOwner && Ctrs().mReplicatorCount > 0); + Ctrs().mReplicatorCount--; +} + +void +ReplicatorImpl::Run() +{ + pair const ret = + sInFlightReplications.insert(make_pair(mChunkId, this)); + if (ret.second) { + sReplicationCount++; + } else { + assert(ret.first->second && ret.first->second != this); + ReplicatorImpl& other = *ret.first->second; + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << ret.first->first << + " peer: " << other.GetPeerName() << + " offset: " << other.mOffset << + "canceling:" << + (other.mCancelFlag ? " already canceled?" : "") << + " restarting from" + " peer: " << GetPeerName() << + KFS_LOG_EOM; + other.Cancel(); + // Cancel can delete the "other" replicator if it was waiting for + // buffers for example, and make the iterator invalid. + pair const res = + sInFlightReplications.insert(make_pair(mChunkId, this)); + if (! res.second) { + assert(ret == res); + res.first->second = this; + } + if (mCancelFlag) { + // Non debug version -- an attempt to restart? &other == this + // Delete chunk and declare error. 
+ mCancelFlag = false; + Terminate(); + return; + } + } + + const ByteCount kChunkHeaderSize = 16 << 10; + const ByteCount bufBytes = max(kChunkHeaderSize, GetBufferBytesRequired()); + BufferManager& bufMgr = DiskIo::GetBufferManager(); + if (bufMgr.IsOverQuota(*this, bufBytes)) { + KFS_LOG_STREAM_ERROR << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " bytes: " << bufBytes << + " total: " << GetByteCount() << + " over quota: " << bufMgr.GetMaxClientQuota() << + KFS_LOG_EOM; + Terminate(); + return; + } + if (bufMgr.GetForDiskIo(*this, bufBytes)) { + Start(); + return; + } + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " denined: " << bufBytes << + " waiting for buffers" << + KFS_LOG_EOM; +} + +ReplicatorImpl::ByteCount +ReplicatorImpl::GetBufferBytesRequired() const +{ + return kDefaultReplicationReadSize; +} + +void +ReplicatorImpl::Granted(ByteCount byteCount) +{ + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " granted: " << byteCount << + KFS_LOG_EOM; + Start(); +} + +void +ReplicatorImpl::Start() +{ + assert(mPeer); + + mChunkMetadataOp.chunkId = mChunkId; + mChunkMetadataOp.readVerifyFlag = false; + SET_HANDLER(this, &ReplicatorImpl::HandleStartDone); + mPeer->Enqueue(&mChunkMetadataOp); +} + +int +ReplicatorImpl::HandleStartDone(int code, void *data) +{ + if (mCancelFlag || mChunkMetadataOp.status < 0) { + Terminate(); + return 0; + } + mChunkSize = mChunkMetadataOp.chunkSize; + mChunkVersion = mChunkMetadataOp.chunkVersion; + if (mChunkSize < 0 || mChunkSize > (int64_t)CHUNKSIZE) { + KFS_LOG_STREAM_INFO << "replication:" + " invalid chunk size: " << mChunkSize << + KFS_LOG_EOM; + Terminate(); + return 0; + } + + mReadOp.chunkVersion = mChunkVersion; + // Delete stale copy if it exists, before replication. + // Replication request implicitly makes previous copy stale. 
+ const bool kDeleteOkFlag = true; + gChunkManager.StaleChunk(mChunkId, kDeleteOkFlag); + // set the version to a value that will never be used; if + // replication is successful, we then bump up the counter. + mWriteOp.chunkVersion = 0; + if (gChunkManager.AllocChunk(mFileId, mChunkId, 0, true) < 0) { + Terminate(); + return -1; + } + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " starting:" + " size: " << mChunkSize << + KFS_LOG_EOM; + Read(); + return 0; +} + +void +ReplicatorImpl::Read() +{ + assert(! mCancelFlag && mOwner); + StRef ref(*this); + + if (mOffset >= mChunkSize) { + mDone = mOffset == mChunkSize; + KFS_LOG_STREAM(mDone ? + MsgLogger::kLogLevelNOTICE : + MsgLogger::kLogLevelERROR) << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + (mDone ? " done" : " failed") << + " position: " << mOffset << + " size: " << mChunkSize << + " " << mOwner->Show() << + KFS_LOG_EOM; + Terminate(); + return; + } + + assert(mPeer); + SET_HANDLER(this, &ReplicatorImpl::HandleReadDone); + mReadOp.checksum.clear(); + mReadOp.status = 0; + mReadOp.offset = mOffset; + mReadOp.numBytesIO = 0; + mReadOp.numBytes = (int)min( + mChunkSize - mOffset, int64_t(kDefaultReplicationReadSize)); + mPeer->Enqueue(&mReadOp); +} + +int +ReplicatorImpl::HandleReadDone(int code, void *data) +{ + assert(code == EVENT_CMD_DONE && data == &mReadOp); + + const int numRd = mReadOp.dataBuf ? mReadOp.dataBuf->BytesConsumable() : 0; + if (mReadOp.status < 0) { + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " read failed:" + " error: " << mReadOp.status << + KFS_LOG_EOM; + } else if (! 
mCancelFlag && + numRd < (int)mReadOp.numBytes && + mOffset + numRd < mChunkSize) { + KFS_LOG_STREAM_ERROR << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " short read:" + " got: " << numRd << + " expected: " << mReadOp.numBytes << + KFS_LOG_EOM; + mReadOp.status = -EINVAL; + } + if (mCancelFlag || mReadOp.status < 0 || mOffset == mChunkSize) { + mDone = mOffset == mChunkSize && mReadOp.status >= 0 && ! mCancelFlag; + Terminate(); + return 0; + } + + const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + assert(mOffset % kChecksumBlockSize == 0); + // Swap read and write buffer pointers. + IOBuffer* const dataBuf = mWriteOp.dataBuf; + if (dataBuf) { + dataBuf->Clear(); + } + mWriteOp.Reset(); + mWriteOp.dataBuf = mReadOp.dataBuf; + mWriteOp.numBytes = numRd; + mWriteOp.offset = mOffset; + mWriteOp.isFromReReplication = true; + mReadOp.dataBuf = dataBuf; + + // align the writes to checksum boundaries + if (numRd > kChecksumBlockSize) { + // Chunk manager only handles checksum block aligned writes. + const int numBytes = numRd % kChecksumBlockSize; + const int64_t endPos = mOffset + numRd; + assert(numBytes == 0 || endPos == mChunkSize); + mWriteOp.numBytes = numRd - numBytes; + if (numBytes > 0 && endPos == mChunkSize) { + // Swap buffers back, and move the tail back into the read buffer. + IOBuffer* const dataBuf = + mReadOp.dataBuf ? 
mReadOp.dataBuf : new IOBuffer(); + mReadOp.dataBuf = mWriteOp.dataBuf; + mWriteOp.dataBuf = dataBuf; + mWriteOp.dataBuf->Move(mReadOp.dataBuf, mWriteOp.numBytes); + mReadOp.dataBuf->MakeBuffersFull(); + mReadOp.offset = mOffset + mWriteOp.numBytes; + mReadOp.numBytesIO = numBytes; + mReadOp.numBytes = numBytes; + } + } + + SET_HANDLER(this, &ReplicatorImpl::HandleWriteDone); + if (gChunkManager.WriteChunk(&mWriteOp) < 0) { + // abort everything + Terminate(); + } + return 0; +} + +int +ReplicatorImpl::HandleWriteDone(int code, void *data) +{ + assert( + (code == EVENT_DISK_ERROR) || + (code == EVENT_DISK_WROTE) || + (code == EVENT_CMD_DONE && data == &mWriteOp) + ); + StRef ref(*this); + mWriteOp.diskIo.reset(); + if (mWriteOp.status < 0) { + KFS_LOG_STREAM_ERROR << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " write failed:" + " error: " << mWriteOp.status << + KFS_LOG_EOM; + } + if (mCancelFlag || mWriteOp.status < 0) { + Terminate(); + return 0; + } + mOffset += mWriteOp.numBytesIO; + if (mReadOp.offset == mOffset && + mReadOp.dataBuf && ! mReadOp.dataBuf->IsEmpty()) { + assert(mReadOp.dataBuf->BytesConsumable() < (int)CHECKSUM_BLOCKSIZE); + // Write the remaining tail. + HandleReadDone(EVENT_CMD_DONE, &mReadOp); + return 0; + } + Read(); + return 0; +} + +void +ReplicatorImpl::Terminate() +{ + int res = -1; + if (mDone && ! 
mCancelFlag) { + KFS_LOG_STREAM_INFO << "replication:" + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + " finished" << + KFS_LOG_EOM; + // now that replication is all done, set the version appropriately, and write + // meta data + SET_HANDLER(this, &ReplicatorImpl::HandleReplicationDone); + const bool stableFlag = true; + res = gChunkManager.ChangeChunkVers( + mChunkId, mChunkVersion, stableFlag, this); + if (res == 0) { + return; + } + } + HandleReplicationDone(EVENT_DISK_ERROR, &res); +} + +int +ReplicatorImpl::HandleReplicationDone(int code, void *data) +{ + assert(mOwner); + + const int status = data ? *reinterpret_cast(data) : 0; + mOwner->status = status >= 0 ? 0 : -1; + if (status < 0) { + KFS_LOG_STREAM_ERROR << "replication:" << + " chunk: " << mChunkId << + " peer: " << GetPeerName() << + (mCancelFlag ? " cancelled" : " failed") << + " status: " << status << + " " << mOwner->Show() << + KFS_LOG_EOM; + } else { + const ChunkInfo_t* const ci = gChunkManager.GetChunkInfo(mChunkId); + KFS_LOG_STREAM_NOTICE << mOwner->Show() << + " chunk size: " << (ci ? ci->chunkSize : -1) << + KFS_LOG_EOM; + } + bool notifyFlag = ! mCancelFlag; + if (mCancelFlag) { + InFlightReplications::iterator const it = + sInFlightReplications.find(mChunkId); + notifyFlag = it != sInFlightReplications.end() && it->second == this; + } + if (notifyFlag) { + gChunkManager.ReplicationDone(mChunkId, status); + } + // Notify the owner of completion + mOwner->chunkVersion = (! mCancelFlag && status >= 0) ? 
mChunkVersion : -1; + if (mOwner->status < 0 || mCancelFlag) { + if (mOwner->location.IsValid()) { + if (mCancelFlag) { + Ctrs().mReplicationCanceledCount++; + } else { + Ctrs().mReplicationErrorCount++; + } + } else { + if (mCancelFlag) { + Ctrs().mRecoveryCanceledCount++; + } else { + Ctrs().mRecoveryErrorCount++; + } + } + } + ReplicateChunkOp* const op = mOwner; + mOwner = 0; + UnRef(); + SubmitOpResponse(op); + return 0; +} + +string +ReplicatorImpl::GetPeerName() const +{ + return (mPeer ? mPeer->GetLocation().ToString() : "none"); +} + +class RSReplicatorImpl : + public ReplicatorImpl, + public Reader::Completion +{ +public: + static void SetParameters(const Properties& props) + { + const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + sRSReaderMaxRetryCount = props.getValue( + "chunkServer.rsReader.maxRetryCount", + sRSReaderMaxRetryCount + ); + sRSReaderTimeSecBetweenRetries = props.getValue( + "chunkServer.rsReader.timeSecBetweenRetries", + sRSReaderTimeSecBetweenRetries + ); + sRSReaderOpTimeoutSec = props.getValue( + "chunkServer.rsReader.opTimeoutSec", + sRSReaderOpTimeoutSec + ); + sRSReaderIdleTimeoutSec = props.getValue( + "chunkServer.rsReader.idleTimeoutSec", + sRSReaderIdleTimeoutSec + ); + sRSReaderMaxReadSize = (max(1, props.getValue( + "chunkServer.rsReader.maxReadSize", + sRSReaderMaxReadSize + )) + kChecksumBlockSize - 1) / kChecksumBlockSize * kChecksumBlockSize; + sRSReaderMaxChunkReadSize = props.getValue( + "chunkServer.rsReader.maxChunkReadSize", + max(sRSReaderMaxReadSize, sRSReaderMaxChunkReadSize) + ); + sRSReaderLeaseRetryTimeout = props.getValue( + "chunkServer.rsReader.leaseRetryTimeout", + sRSReaderLeaseRetryTimeout + ); + sRSReaderLeaseWaitTimeout = props.getValue( + "chunkServer.rsReader.leaseWaitTimeout", + sRSReaderLeaseWaitTimeout + ); + sRSReaderMetaMaxRetryCount = props.getValue( + "chunkServer.rsReader.meta.maxRetryCount", + sRSReaderMetaMaxRetryCount + ); + sRSReaderMetaTimeSecBetweenRetries = props.getValue( + 
"chunkServer.rsReader.meta.timeSecBetweenRetries", + sRSReaderMetaTimeSecBetweenRetries + ); + sRSReaderMetaOpTimeoutSec = props.getValue( + "chunkServer.rsReader.meta.opTimeoutSec", + sRSReaderMetaOpTimeoutSec + ); + sRSReaderMetaIdleTimeoutSec = props.getValue( + "chunkServer.rsReader.meta.idleTimeoutSec", + sRSReaderMetaIdleTimeoutSec + ); + sRSReaderMetaResetConnectionOnOpTimeoutFlag = props.getValue( + "chunkServer.rsReader.meta.idleTimeoutSec", + sRSReaderMetaResetConnectionOnOpTimeoutFlag ? 1 : 0 + ) != 0; + } + RSReplicatorImpl(ReplicateChunkOp* op) + : ReplicatorImpl(op, RemoteSyncSMPtr()), + Reader::Completion(), + mReader( + GetMetaserver(op->location.port), + this, + sRSReaderMaxRetryCount, + sRSReaderTimeSecBetweenRetries, + sRSReaderOpTimeoutSec, + sRSReaderIdleTimeoutSec, + sRSReaderMaxChunkReadSize, + sRSReaderLeaseRetryTimeout, + sRSReaderLeaseWaitTimeout, + MakeLogPrefix(mChunkId), + GetSeqNum() + ), + mReadTail(), + mReadSize(GetReadSize(*op)), + mReadInFlightFlag(false), + mPendingCloseFlag(false) + { + assert(mReadSize % IOBufferData::GetDefaultBufferSize() == 0); + mReadOp.clnt = 0; // Should not queue read op. 
+ } + virtual void Start() + { + assert(mOwner); + mChunkMetadataOp.chunkSize = CHUNKSIZE; + mChunkMetadataOp.chunkVersion = mOwner->chunkVersion; + mReadOp.status = 0; + mReadOp.numBytes = 0; + const bool kSkipHolesFlag = true; + const bool kUseDefaultBufferAllocatorFlag = true; + mChunkMetadataOp.status = mReader.Open( + mFileId, + mOwner->pathName.c_str(), + mOwner->fileSize, + mOwner->striperType, + mOwner->stripeSize, + mOwner->numStripes, + mOwner->numRecoveryStripes, + kSkipHolesFlag, + kUseDefaultBufferAllocatorFlag, + mOwner->chunkOffset + ); + HandleStartDone(EVENT_CMD_DONE, &mChunkMetadataOp); + } + virtual void Done( + Reader& inReader, + int inStatusCode, + Reader::Offset inOffset, + Reader::Offset inSize, + IOBuffer* inBufferPtr, + Reader::RequestId inRequestId) + { + StRef ref(*this); + + if (&inReader != &mReader || (inBufferPtr && + (inRequestId.mPtr != this || + inOffset < 0 || + (mOwner && mOwner->chunkOffset + mOffset != inOffset) || + inSize > (Reader::Offset)mReadOp.numBytes || + ! mReadInFlightFlag))) { + die("recovery: invalid read completion"); + mReadOp.status = -EINVAL; + } + if (mPendingCloseFlag) { + if (! mReader.IsActive()) { + KFS_LOG_STREAM_DEBUG << "recovery:" + " chunk: " << mChunkId << + " chunk reader closed" << + KFS_LOG_EOM; + mPendingCloseFlag = false; + UnRef(); + } + return; + } + if (! mReadInFlightFlag) { + if (mReadOp.status >= 0 && inStatusCode < 0) { + mReadOp.status = inStatusCode; + } + return; + } + mReadInFlightFlag = false; + if (! mOwner) { + return; + } + if (mReadOp.status != 0 || (! 
inBufferPtr && inStatusCode == 0)) { + return; + } + assert(mReadOp.dataBuf); + mReadOp.status = inStatusCode; + if (mReadOp.status == 0 && inBufferPtr) { + const bool endOfChunk = + mReadSize > inBufferPtr->BytesConsumable() || + mOffset + mReadSize >= mChunkSize; + IOBuffer& buf = *mReadOp.dataBuf; + buf.Clear(); + if (endOfChunk) { + buf.Move(&mReadTail); + buf.Move(inBufferPtr); + mReadOp.numBytes = buf.BytesConsumable(); + mReadOp.numBytesIO = mReadOp.numBytes; + mChunkSize = mOffset + mReadOp.numBytesIO; + mReader.Close(); + if (mReader.IsActive()) { + mPendingCloseFlag = true; + Ref(); + } + } else { + const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + int nmv = (mReadTail.BytesConsumable() + + inBufferPtr->BytesConsumable()) / + kChecksumBlockSize * kChecksumBlockSize; + if (nmv <= 0) { + mReadTail.Move(inBufferPtr); + Read(); + return; + } + nmv -= buf.Move(&mReadTail, nmv); + buf.Move(inBufferPtr, nmv); + mReadTail.Move(inBufferPtr); + mReadOp.numBytes = buf.BytesConsumable(); + mReadOp.numBytesIO = mReadOp.numBytes; + } + } else if (inStatusCode < 0 && inBufferPtr && + ! inBufferPtr->IsEmpty()) { + // Report invalid stripes. + const int ns = mOwner->numStripes + mOwner->numRecoveryStripes; + int n = 0; + ostringstream os; + while (! inBufferPtr->IsEmpty()) { + if (n >= ns) { + die("recovery: completion: invalid number of bad stripes"); + n = 0; + break; + } + int idx = -1; + kfsChunkId_t chunkId = -1; + int64_t chunkVersion = -1; + ReadVal(*inBufferPtr, idx); + ReadVal(*inBufferPtr, chunkId); + ReadVal(*inBufferPtr, chunkVersion); + if (idx < 0 || idx >= ns) { + die("recovery: completion: invalid bad stripe index"); + n = 0; + break; + } + os << (n > 0 ? 
" " : "") << idx << + " " << chunkId << " " << chunkVersion; + n++; + } + if (n > 0) { + mOwner->invalidStripeIdx = os.str(); + KFS_LOG_STREAM_ERROR << "recovery: " + " status: " << inStatusCode << + " invalid stripes: " << mOwner->invalidStripeIdx << + KFS_LOG_EOM; + } + } + HandleReadDone(EVENT_CMD_DONE, &mReadOp); + } + static void CancelAll() + { GetMetaserver(-1); } + +private: + Reader mReader; + IOBuffer mReadTail; + const int mReadSize; + bool mReadInFlightFlag; + bool mPendingCloseFlag; + + virtual ~RSReplicatorImpl() + { + KFS_LOG_STREAM_DEBUG << "~RSReplicatorImpl" + " chunk: " << mChunkId << + KFS_LOG_EOM; + mReader.Register(0); + mReader.Shutdown(); + } + virtual void Cancel() + { + StRef ref(*this); + + const int prevRef = GetRefCount(); + mReader.Unregister(this); + mReader.Shutdown(); + ReplicatorImpl::Cancel(); + if (mReadInFlightFlag && prevRef <= GetRefCount()) { + assert(mOwner); + mReadInFlightFlag = false; + mReadOp.status = -ETIMEDOUT; + HandleReadDone(EVENT_CMD_DONE, &mReadOp); + } + } + virtual void Read() + { + assert(! mCancelFlag && mOwner && ! mReadInFlightFlag); + if (mOffset >= mChunkSize || mReadOp.status < 0) { + ReplicatorImpl::Read(); + return; + } + + StRef ref(*this); + + if (! mReadOp.dataBuf) { + mReadOp.dataBuf = new IOBuffer(); + } + mReadOp.status = 0; + mReadOp.numBytes = mReadSize; + mReadOp.numBytesIO = 0; + mReadOp.offset = mOffset; + mReadOp.dataBuf->Clear(); + Reader::RequestId reqId = Reader::RequestId(); + reqId.mPtr = this; + mReadInFlightFlag = true; + IOBuffer buf; + const int status = mReader.Read( + buf, + mReadSize, + mOffset + mReadTail.BytesConsumable(), + reqId + ); + if (status != 0 && mReadInFlightFlag) { + mReadInFlightFlag = false; + mReadOp.status = status; + HandleReadDone(EVENT_CMD_DONE, &mReadOp); + } + } + virtual ByteCount GetBufferBytesRequired() const + { + return (mReadSize * (mOwner ? 
mOwner->numStripes + 1 : 0)); + } + template static void ReadVal(IOBuffer& buf, T& val) + { + const int len = (int)sizeof(val); + if (buf.Consume(buf.CopyOut( + reinterpret_cast(&val), len)) != len) { + die("invalid buffer size"); + } + } + struct AddExtraClientHeaders + { + AddExtraClientHeaders(const char* hdrs) + { + client::KfsOp::AddExtraRequestHeaders(hdrs); + client::KfsOp::AddDefaultRequestHeaders( + kKfsUserRoot, kKfsGroupRoot); + } + }; + static KfsNetClient& GetMetaserver(int port) + { + static AddExtraClientHeaders sAddHdrs("From-chunk-server: 1\r\n"); + static KfsNetClient sMetaServerClient( + globalNetManager(), + string(), // inHost + 0, // inPort + sRSReaderMetaMaxRetryCount, + sRSReaderMetaTimeSecBetweenRetries, + sRSReaderMetaOpTimeoutSec, + sRSReaderMetaIdleTimeoutSec, + GetRandomSeq(), + "RSR", + sRSReaderMetaResetConnectionOnOpTimeoutFlag + ); + static int sMetaPort = -1; + if (port <= 0) { + sMetaPort = -1; + sMetaServerClient.Stop(); + } else if (sMetaPort != port) { + if (sMetaPort > 0) { + KFS_LOG_STREAM_INFO << "recovery:" + " meta server client port has changed" + " from: " << sMetaPort << + " to: " << port << + KFS_LOG_EOM; + } + sMetaPort = port; + sMetaServerClient.SetServer(ServerLocation( + gMetaServerSM.GetLocation().hostname, sMetaPort)); + } + return sMetaServerClient; + } + static const char* MakeLogPrefix(kfsChunkId_t chunkId) + { + static ostringstream os; + static string pref; + os.str(string()); + os << "CR: " << chunkId; + pref = os.str(); + return pref.c_str(); + } + static kfsSeq_t GetSeqNum() + { + static kfsSeq_t sInitialSeqNum = GetRandomSeq(); + static uint32_t sNextRand = (uint32_t)sInitialSeqNum; + sNextRand = sNextRand * 1103515245 + 12345; + sInitialSeqNum += 100000 + ((uint32_t)(sNextRand / 65536) % 32768); + return sInitialSeqNum; + } + static int GetReadSize(const ReplicateChunkOp& op) + { + // Align read on checksum block boundary, and align on stripe size, + // if possible. 
+ const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + const int kIoBufferSize = IOBufferData::GetDefaultBufferSize(); + assert( + sRSReaderMaxReadSize >= kChecksumBlockSize && + op.stripeSize > 0 && + sRSReaderMaxReadSize % kChecksumBlockSize == 0 && + kChecksumBlockSize % kIoBufferSize == 0 + ); + const int size = max(kChecksumBlockSize, (int)min( + int64_t(sRSReaderMaxReadSize), + (DiskIo::GetBufferManager().GetMaxClientQuota() / + max(1, op.numStripes + 1)) / + kChecksumBlockSize * kChecksumBlockSize) + ); + if (size <= op.stripeSize) { + KFS_LOG_STREAM_DEBUG << "recovery:" + " large stripe: " << op.stripeSize << + " read size: " << size << + KFS_LOG_EOM; + return size; + } + int lcm = GetLcm(kChecksumBlockSize, op.stripeSize); + if (lcm > size) { + lcm = GetLcm(kIoBufferSize, op.stripeSize); + if (lcm > size) { + KFS_LOG_STREAM_WARN << "recovery:" + "invalid read parameters:" + " max read size: " << sRSReaderMaxReadSize << + " io buffer size: " << kIoBufferSize << + " stripe size: " << op.stripeSize << + " set read size: " << lcm << + KFS_LOG_EOM; + return lcm; + } + } + return (size / lcm * lcm); + } + static int GetGcd(int nl, int nr) + { + int a = nl; + int b = nr; + while (b != 0) { + const int t = b; + b = a % b; + a = t; + } + return a; + } + static int GetLcm(int nl, int nr) + { return ((nl == 0 || nr == 0) ? 0 : nl / GetGcd(nl, nr) * nr); } + + static int sRSReaderMaxRetryCount; + static int sRSReaderTimeSecBetweenRetries; + static int sRSReaderOpTimeoutSec; + static int sRSReaderIdleTimeoutSec; + static int sRSReaderMaxChunkReadSize; + static int sRSReaderMaxReadSize; + static int sRSReaderLeaseRetryTimeout; + static int sRSReaderLeaseWaitTimeout; + static int sRSReaderMetaMaxRetryCount; + static int sRSReaderMetaTimeSecBetweenRetries; + static int sRSReaderMetaOpTimeoutSec; + static int sRSReaderMetaIdleTimeoutSec; + static bool sRSReaderMetaResetConnectionOnOpTimeoutFlag; +private: + // No copy. 
+ RSReplicatorImpl(const RSReplicatorImpl&); + RSReplicatorImpl& operator=(const RSReplicatorImpl&); +}; +int RSReplicatorImpl::sRSReaderMaxRetryCount = 3; +int RSReplicatorImpl::sRSReaderTimeSecBetweenRetries = 10; +int RSReplicatorImpl::sRSReaderOpTimeoutSec = 30; +int RSReplicatorImpl::sRSReaderIdleTimeoutSec = 5 * 30; +int RSReplicatorImpl::sRSReaderMaxReadSize = + kDefaultReplicationReadSize; +int RSReplicatorImpl::sRSReaderMaxChunkReadSize = + max(kDefaultReplicationReadSize, 1 << 20); +int RSReplicatorImpl::sRSReaderLeaseRetryTimeout = 3; +int RSReplicatorImpl::sRSReaderLeaseWaitTimeout = 30; +int RSReplicatorImpl::sRSReaderMetaMaxRetryCount = 2; +int RSReplicatorImpl::sRSReaderMetaTimeSecBetweenRetries = 10; +int RSReplicatorImpl::sRSReaderMetaOpTimeoutSec = 4 * 60; +int RSReplicatorImpl::sRSReaderMetaIdleTimeoutSec = 5 * 60; +bool RSReplicatorImpl::sRSReaderMetaResetConnectionOnOpTimeoutFlag = true; + +int +Replicator::GetNumReplications() +{ + return ReplicatorImpl::GetNumReplications(); +} + +void +Replicator::CancelAll() +{ + ReplicatorImpl::CancelAll(); + RSReplicatorImpl::CancelAll(); +} + +void +Replicator::SetParameters(const Properties& props) +{ + ReplicatorImpl::SetParameters(props); + RSReplicatorImpl::SetParameters(props); +} + +void +Replicator::GetCounters(Replicator::Counters& counters) +{ + ReplicatorImpl::GetCounters(counters); +} + +void +Replicator::Run(ReplicateChunkOp *op) +{ + assert(op); + KFS_LOG_STREAM_DEBUG << op->Show() << KFS_LOG_EOM; + + ReplicatorImpl* impl = 0; + if (op->location.IsValid()) { + ReplicatorImpl::Ctrs().mReplicationCount++; + RemoteSyncSMPtr peer; + if (ReplicatorImpl::GetUseConnectionPoolFlag()) { + peer = gChunkServer.FindServer(op->location); + } else { + peer.reset(new RemoteSyncSM(op->location)); + if (! peer->Connect()) { + peer.reset(); + } + } + if (! 
peer) { + KFS_LOG_STREAM_ERROR << "replication:" + "unable to find peer: " << op->location.ToString() << + " " << op->Show() << + KFS_LOG_EOM; + op->status = -1; + ReplicatorImpl::Ctrs().mReplicationErrorCount++; + } else { + impl = new ReplicatorImpl(op, peer); + } + } else { + ReplicatorImpl::Ctrs().mRecoveryCount++; + if (op->chunkOffset < 0 || + op->chunkOffset % int64_t(CHUNKSIZE) != 0 || + op->striperType != KFS_STRIPED_FILE_TYPE_RS || + op->numStripes <= 0 || + op->numRecoveryStripes <= 0 || + op->stripeSize < KFS_MIN_STRIPE_SIZE || + op->stripeSize > KFS_MAX_STRIPE_SIZE || + CHUNKSIZE % op->stripeSize != 0 || + op->stripeSize % KFS_STRIPE_ALIGNMENT != 0 || + op->location.port <= 0) { + op->status = -EINVAL; + KFS_LOG_STREAM_ERROR << "replication:" + "invalid request: " << op->Show() << + KFS_LOG_EOM; + ReplicatorImpl::Ctrs().mRecoveryErrorCount++; + } else { + impl = new RSReplicatorImpl(op); + } + } + if (impl) { + impl->Ref(); + impl->Run(); + } else { + SubmitOpResponse(op); + } +} + +} // namespace KFS diff --git a/src/cc/chunk/Replicator.h b/src/cc/chunk/Replicator.h new file mode 100644 index 000000000..9b6e6b2cf --- /dev/null +++ b/src/cc/chunk/Replicator.h @@ -0,0 +1,73 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2007/01/17 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2007-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Code to deal with chunk re-replication and recovery. +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_REPLICATOR_H +#define CHUNKSERVER_REPLICATOR_H + +#include + +namespace KFS +{ + +struct ReplicateChunkOp; +class Properties; + +class Replicator +{ +public: + struct Counters + { + typedef int64_t Counter; + + Counter mReplicationCount; + Counter mReplicationErrorCount; + Counter mReplicationCanceledCount; + Counter mRecoveryCount; + Counter mRecoveryErrorCount; + Counter mRecoveryCanceledCount; + Counter mReplicatorCount; + Counters() + : mReplicationCount(0), + mReplicationErrorCount(0), + mReplicationCanceledCount(0), + mRecoveryCount(0), + mRecoveryErrorCount(0), + mRecoveryCanceledCount(0), + mReplicatorCount(0) + {} + void Reset() + { *this = Counters(); } + }; + static void Run(ReplicateChunkOp* op); + static int GetNumReplications(); + static void CancelAll(); + static void SetParameters(const Properties& props); + static void GetCounters(Counters& counters); +}; + +} + +#endif // CHUNKSERVER_REPLICATOR_H diff --git a/src/cc/chunk/chunkscrubber_main.cc b/src/cc/chunk/chunkscrubber_main.cc new file mode 100644 index 000000000..ba66053fe --- /dev/null +++ b/src/cc/chunk/chunkscrubber_main.cc @@ -0,0 +1,332 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/06/11 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A tool that scrubs the chunks in a directory and validates checksums. +// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kfsio/checksum.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include "Chunk.h" +#include "ChunkManager.h" + +namespace KFS +{ +#ifndef O_DIRECT +const int O_DIRECT = 0; +#endif + +using std::cout; + +const int kIoBlkSize = 4 << 10; + +static int +Deserialize(ChunkInfo_t& chunkInfo, int fd, char* buf, bool hdrChksumRequiredFlag) +{ + const DiskChunkInfo_t& dci = + *reinterpret_cast(buf); + const uint64_t& checksum = + *reinterpret_cast(&dci + 1); + const size_t readsz = (sizeof(dci) + sizeof(checksum) + + kIoBlkSize - 1) / kIoBlkSize * kIoBlkSize; + + const ssize_t res = pread(fd, buf, readsz, 0); + if (res != (ssize_t)readsz) { + return (res < 0 ? 
-errno : -EINVAL); + } + uint32_t headerChecksum = 0; + if ((checksum != 0 || hdrChksumRequiredFlag) && + (headerChecksum = ComputeBlockChecksum(buf, sizeof(dci))) != + checksum) { + KFS_LOG_STREAM_ERROR << + "chunk header checksum mismatch:" + " computed: " << headerChecksum << + " expected: " << checksum << + KFS_LOG_EOM; + return -EBADCKSUM; + } + KFS_LOG_STREAM_DEBUG << + " chunk header checksum: " << checksum << + KFS_LOG_EOM; + return chunkInfo.Deserialize(dci, true); +} + +static bool +scrubFile(const string& fn, bool hdrChksumRequiredFlag, + char* buf, chunkOff_t infilesz) +{ + const int kNumComponents = 3; + long long components[kNumComponents]; + const char* ptr = fn.c_str(); + char* end = 0; + int64_t filesz = infilesz; + int i; + const char* p; + + if ((p = strrchr(ptr, '/'))) { + ptr = p + 1; + } + for (i = 0; i < kNumComponents; i++) { + components[i] = strtoll(ptr, &end, 10); + if (components[i] < 0) { + break; + } + if ((*end & 0xFF) != '.') { + if (*end == 0) { + i++; + } + break; + } + ptr = end + 1; + } + if (i != kNumComponents || *end) { + KFS_LOG_STREAM_ERROR << + fn << ": malformed chunk file name" << + KFS_LOG_EOM; + return false; + } + // Allow files bigger than chunk size. If file wasn't properly closed, + // but was in the stable directory, its header needs to be read, + // validated and proper size must be set. + // The file might be bigger by one io buffer size, and io buffer size is + // guaranteed to be less or equal to the KFS_CHUNK_HEADER_SIZE. 
+ const int64_t kMaxChunkFileSize = (int64_t)(KFS_CHUNK_HEADER_SIZE + CHUNKSIZE); + if (filesz < (int64_t)KFS_CHUNK_HEADER_SIZE || + filesz > (int64_t)(kMaxChunkFileSize + KFS_CHUNK_HEADER_SIZE)) { + KFS_LOG_STREAM_ERROR << + fn << ": invalid file size: " << filesz << + KFS_LOG_EOM; + return false; + } + const chunkId_t chunkId = components[1]; + const kfsSeq_t chunkVers = components[2]; + + const int fd = open(fn.c_str(), O_RDONLY | O_DIRECT); + if (fd < 0) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + fn << ":" << QCUtils::SysError(err) << + KFS_LOG_EOM; + return false; + } + + ChunkInfo_t chunkInfo; + const int err = Deserialize(chunkInfo, fd, buf, hdrChksumRequiredFlag); + if (err != 0) { + KFS_LOG_STREAM_ERROR << + fn << ":" << QCUtils::SysError(-err) << + KFS_LOG_EOM; + close(fd); + return false; + } + if (chunkInfo.chunkId != chunkId) { + KFS_LOG_STREAM_ERROR << + fn << ":" << "chunk id mismatch: " << chunkInfo.chunkId << + KFS_LOG_EOM; + close(fd); + return false; + } + if (chunkInfo.chunkVersion != chunkVers) { + KFS_LOG_STREAM_ERROR << + fn << ":" << "chunk id version: " << chunkInfo.chunkId << + KFS_LOG_EOM; + close(fd); + return false; + } + KFS_LOG_STREAM_DEBUG << + "fid: " << chunkInfo.fileId << + " chunkId: " << chunkInfo.chunkId << + " size: " << chunkInfo.chunkSize << + " version: " << chunkInfo.chunkVersion << + KFS_LOG_EOM; + if (chunkInfo.chunkSize < 0 || + chunkInfo.chunkSize > (chunkOff_t)CHUNKSIZE) { + KFS_LOG_STREAM_ERROR << + fn << ":" << " invalid chunk size: " << chunkInfo.chunkSize << + KFS_LOG_EOM; + close(fd); + return false; + } + const ssize_t res = pread(fd, buf, CHUNKSIZE, KFS_CHUNK_HEADER_SIZE); + if (res < 0) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + fn << ":" << QCUtils::SysError(err) << + KFS_LOG_EOM; + close(fd); + return false; + } + if (res != chunkInfo.chunkSize) { + KFS_LOG_STREAM_ERROR << + "chunk size mismatch:" + " chunk header: " << chunkInfo.chunkSize << + " read returned: " << res << + 
KFS_LOG_EOM; + if (res < chunkInfo.chunkSize) { + close(fd); + return false; + } + } + const size_t off = chunkInfo.chunkSize % CHECKSUM_BLOCKSIZE; + if (off > 0) { + memset(buf + chunkInfo.chunkSize, 0, CHECKSUM_BLOCKSIZE - off); + } + // go thru block by block and verify checksum + bool ok = true; + for (int i = 0, b = 0; + i < chunkInfo.chunkSize; + i += CHECKSUM_BLOCKSIZE, b++) { + const uint32_t cksum = ComputeBlockChecksum(buf + i, CHECKSUM_BLOCKSIZE); + if (cksum != chunkInfo.chunkBlockChecksum[b]) { + KFS_LOG_STREAM_ERROR << + fn << ": checksum mismatch" + " block: " << b << + " pos: " << i << + " computed: " << cksum << + " expected: " << chunkInfo.chunkBlockChecksum[b] << + KFS_LOG_EOM; + ok = false; + } + } + close(fd); + return ok; +} + +static int +ChunkScrubberMain(int argc, char **argv) +{ + int optchar; + bool help = false; + bool verbose = false; + bool hdrChksumRequiredFlag = false; + bool throttleFlag = false; + double sampling = -1; + + while ((optchar = getopt(argc, argv, "hvtcs:")) != -1) { + switch (optchar) { + case 'v': + verbose = true; + break; + case 's': + sampling = atof(optarg); + break; + case 't': + throttleFlag = true; + break; + case 'h': + default: + help = true; + break; + } + } + + if (help || optind >= argc) { + cout << + "Usage: " << argv[0] << "{-v} {-s 0.1} {-t} ...\n" + " -s -- sampling: scrub only about 10% of the files\n" + " -v -- verbose\n" + " -t -- throttle\n" + ; + return 1; + } + + MsgLogger::Init(0, verbose ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelINFO); + + char* const allocBuf = + new char[CHUNKSIZE + KFS_CHUNK_HEADER_SIZE + kIoBlkSize]; + char* const buf = + allocBuf + (kIoBlkSize - (allocBuf - (char*)0) % kIoBlkSize); + srand48(time(0)); + + int ret = 0; + while (optind < argc) { + const char* const chunkDir = argv[optind++]; + DIR* const dirStream = opendir(chunkDir); + if (! 
dirStream) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + chunkDir << ": " << QCUtils::SysError(err) << + KFS_LOG_EOM; + ret = 1; + continue; + } + double randval; + struct stat statBuf = {0}; + struct dirent const* dent; + int i = 0; + while ((dent = readdir(dirStream))) { + ++i; + string fn(chunkDir); + fn = fn + "/" + dent->d_name; + if (stat(fn.c_str(), &statBuf)) { + const int err = errno; + KFS_LOG_STREAM_ERROR << + fn << ":" << QCUtils::SysError(err) << + KFS_LOG_EOM; + ret = 1; + continue; + } + if (! S_ISREG(statBuf.st_mode)) { + continue; + } + if (sampling > 0) { + randval = drand48(); + if (randval > 0.1) { + continue; + } + } + if (! scrubFile(fn, hdrChksumRequiredFlag, buf, statBuf.st_size)) { + ret = 1; + } + // scrubs will keep the disk very busy; slow it down so that + // the system isn't overwhelmed + if (throttleFlag && (i % 10) == 0) { + sleep(1); + } + } + closedir(dirStream); + } + delete [] allocBuf; + + MsgLogger::Stop(); + return ret; +} + +} + +int +main(int argc, char** argv) +{ + return KFS::ChunkScrubberMain(argc, argv); +} diff --git a/src/cc/chunk/chunkserver_main.cc b/src/cc/chunk/chunkserver_main.cc new file mode 100644 index 000000000..89932fd6b --- /dev/null +++ b/src/cc/chunk/chunkserver_main.cc @@ -0,0 +1,545 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/22 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "ChunkServer.h" +#include "ClientManager.h" +#include "ChunkManager.h" +#include "Logger.h" +#include "AtomicRecordAppender.h" +#include "RemoteSyncSM.h" + +#include "common/Properties.h" +#include "common/MdStream.h" +#include "common/MemLock.h" +#include "kfsio/NetManager.h" +#include "kfsio/Globals.h" +#include "kfsio/NetErrorSimulator.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +extern char **environ; + +namespace KFS { + +using std::string; +using std::vector; +using std::cout; +using std::cerr; +using std::fstream; +using std::ostream; +using std::istringstream; +using std::min; +using KFS::libkfsio::globalNetManager; +using KFS::libkfsio::InitGlobals; + +// Restart the chunk chunk server process by issuing exec when / if requested. +// Fork is more reliable, but might confuse existing scripts. Using debugger +// with fork is a little bit more involved. +// The intention here is to do graceful restart, this is not intended as an +// external "nanny" / monitoring / watchdog. 
+class Restarter +{ +public: + Restarter() + : mCwd(0), + mArgs(0), + mEnv(0), + mMaxGracefulRestartSeconds(60 * 6), + mExitOnRestartFlag(false) + {} + ~Restarter() + { Cleanup(); } + bool Init(int argc, char **argv) + { + ::alarm(0); + if (::signal(SIGALRM, &Restarter::SigAlrmHandler) == SIG_ERR) { + QCUtils::FatalError("signal(SIGALRM)", errno); + } + Cleanup(); + if (argc < 1 || ! argv) { + return false; + } + for (int len = PATH_MAX; len < PATH_MAX * 1000; len += PATH_MAX) { + mCwd = (char*)::malloc(len); + if (! mCwd || ::getcwd(mCwd, len)) { + break; + } + const int err = errno; + ::free(mCwd); + mCwd = 0; + if (err != ERANGE) { + break; + } + } + if (! mCwd) { + return false; + } + mArgs = new char*[argc + 1]; + int i; + for (i = 0; i < argc; i++) { + if (! (mArgs[i] = ::strdup(argv[i]))) { + Cleanup(); + return false; + } + } + mArgs[i] = 0; + char** ptr = environ; + for (i = 0; *ptr; i++, ptr++) + {} + mEnv = new char*[i + 1]; + for (i = 0, ptr = environ; *ptr; ) { + if (! (mEnv[i++] = ::strdup(*ptr++))) { + Cleanup(); + return false; + } + } + mEnv[i] = 0; + return true; + } + void SetParameters(const Properties& props, string prefix) + { + mMaxGracefulRestartSeconds = props.getValue( + prefix + "maxGracefulRestartSeconds", + mMaxGracefulRestartSeconds + ); + mExitOnRestartFlag = props.getValue( + prefix + "exitOnRestartFlag", + mExitOnRestartFlag + ); + } + string Restart() + { + if (! mCwd || ! mArgs || ! mEnv || ! mArgs[0] || ! mArgs[0][0]) { + return string("not initialized"); + } + if (! mExitOnRestartFlag) { + struct stat res = {0}; + if (::stat(mCwd, &res) != 0) { + return QCUtils::SysError(errno, mCwd); + } + if (! S_ISDIR(res.st_mode)) { + return (mCwd + string(": not a directory")); + } + string execpath(mArgs[0][0] == '/' ? mArgs[0] : mCwd); + if (mArgs[0][0] != '/') { + if (! 
execpath.empty() && + execpath.at(execpath.length() - 1) != '/') { + execpath += "/"; + } + execpath += mArgs[0]; + } + if (::stat(execpath.c_str(), &res) != 0) { + return QCUtils::SysError(errno, execpath.c_str()); + } + if (! S_ISREG(res.st_mode)) { + return (execpath + string(": not a file")); + } + } + if (::signal(SIGALRM, &Restarter::SigAlrmHandler) == SIG_ERR) { + QCUtils::FatalError("signal(SIGALRM)", errno); + } + if (mMaxGracefulRestartSeconds > 0) { + if (sInstance) { + return string("restart in progress"); + } + sInstance = this; + if (::atexit(&Restarter::RestartSelf)) { + sInstance = 0; + return QCUtils::SysError(errno, "atexit"); + } + ::alarm((unsigned int)mMaxGracefulRestartSeconds); + globalNetManager().Shutdown(); + } else { + ::alarm((unsigned int)-mMaxGracefulRestartSeconds); + Exec(); + } + return string(); + } +private: + char* mCwd; + char** mArgs; + char** mEnv; + int mMaxGracefulRestartSeconds; + bool mExitOnRestartFlag; + + static Restarter* sInstance; + + static void FreeArgs(char** args) + { + if (! args) { + return; + } + char** ptr = args; + while (*ptr) { + ::free(*ptr++); + } + delete [] args; + } + void Cleanup() + { + free(mCwd); + mCwd = 0; + FreeArgs(mArgs); + mArgs = 0; + FreeArgs(mEnv); + mEnv = 0; + } + void Exec() + { + if (mExitOnRestartFlag) { + _exit(0); + } +#ifdef KFS_OS_NAME_LINUX + ::clearenv(); +#else + environ = 0; +#endif + if (mEnv) { + for (char** ptr = mEnv; *ptr; ptr++) { + if (::putenv(*ptr)) { + QCUtils::FatalError("putenv", errno); + } + } + } + if (::chdir(mCwd) != 0) { + QCUtils::FatalError(mCwd, errno); + } + execvp(mArgs[0], mArgs); + QCUtils::FatalError(mArgs[0], errno); + } + static void RestartSelf() + { + if (! 
sInstance) { + ::abort(); + } + sInstance->Exec(); + } + static void SigAlrmHandler(int /* sig */) + { + write(2, "SIGALRM\n", 8); + ::abort(); + } +}; +Restarter* Restarter::sInstance = 0; +static Restarter sRestarter; + +string RestartChunkServer() +{ + return sRestarter.Restart(); +} + +class StdErrAndOutRedirector +{ +public: + StdErrAndOutRedirector(const char* outName, const char* errName) + : mCout(outName), + mCerr(errName), + mPrevCout(*outName ? cout.tie(&mCout) : 0), + mPrevCerr(*errName ? cerr.tie(&mCerr) : 0) + { + if (*outName) { + freopen(outName, "a", stdout); + } + if (*errName) { + freopen(errName, "a", stderr); + } + } + ~StdErrAndOutRedirector() + { + if (mPrevCerr) { + cerr.tie(mPrevCerr); + } + if (mPrevCout) { + cout.tie(mPrevCout); + } + } +private: + fstream mCout; + fstream mCerr; + ostream* const mPrevCout; + ostream* const mPrevCerr; +}; + +class ChunkServerMain +{ +public: + ChunkServerMain() + : mProp(), + mLogDir(), + mChunkDirs(), + mMD5Sum(), + mMetaServerLoc(), + mChunkServerClientPort(-1), + mChunkServerHostname(), + mClusterKey(), + mChunkServerRackId(-1), + mMaxLockedMemorySize(0) + {} + int Run(int argc, char **argv); + +private: + Properties mProp; + string mLogDir; + vector mChunkDirs; + string mMD5Sum; + ServerLocation mMetaServerLoc; + int mChunkServerClientPort; // Port at which kfs clients connect to us + string mChunkServerHostname; // Our hostname to use (instead of using gethostname() ) + string mClusterKey; + int mChunkServerRackId; + int64_t mMaxLockedMemorySize; + + void ComputeMD5(const char *pathname); + bool LoadParams(const char *fileName); +}; + +void +ChunkServerMain::ComputeMD5(const char* pathname) +{ + const size_t kBufSize = size_t(1) << 10; + char* const buf = new char[kBufSize]; + fstream is(pathname, fstream::in | fstream::binary); + MdStream mds(0, false, string(), 0); + + while (is && mds) { + is.read(buf, kBufSize); + mds.write(buf, is.gcount()); + } + delete [] buf; + + if (! is.eof() || ! 
mds) { + KFS_LOG_STREAM_ERROR << + "md5sum " << QCUtils::SysError(errno, pathname) << + KFS_LOG_EOM; + } else { + mMD5Sum = mds.GetMd(); + KFS_LOG_STREAM_INFO << + "md5sum " << pathname << ": " << mMD5Sum << + KFS_LOG_EOM; + } + is.close(); +} + +/// +/// Read and validate the configuration settings for the chunk +/// server. The configuration file is assumed to contain lines of the +/// form: xxx.yyy.zzz = +/// @result 0 on success; -1 on failure +/// @param[in] fileName File that contains configuration information +/// for the chunk server. +bool +ChunkServerMain::LoadParams(const char* fileName) +{ + static StdErrAndOutRedirector redirector( + mProp.getValue("chunkServer.stdout", ""), + mProp.getValue("chunkServer.stderr", "") + ); + + if (mProp.loadProperties(fileName, '=', false) != 0) { + KFS_LOG_STREAM_FATAL << + "Invalid properties file: " << fileName << + KFS_LOG_EOM; + return false; + } + + MsgLogger::GetLogger()->SetLogLevel( + mProp.getValue("chunkServer.loglevel", + MsgLogger::GetLogLevelNamePtr(MsgLogger::GetLogger()->GetLogLevel()))); + MsgLogger::GetLogger()->SetMaxLogWaitTime(0); + MsgLogger::GetLogger()->SetParameters(mProp, "chunkServer.msgLogWriter."); + sRestarter.SetParameters(mProp, "chunkServer."); + + string displayProps(fileName); + displayProps += ":\n"; + mProp.getList(displayProps, string()); + KFS_LOG_STREAM_INFO << displayProps << KFS_LOG_EOM; + + mMetaServerLoc.hostname = mProp.getValue("chunkServer.metaServer.hostname", ""); + mMetaServerLoc.port = mProp.getValue("chunkServer.metaServer.port", -1); + if (! 
mMetaServerLoc.IsValid()) { + KFS_LOG_STREAM_FATAL << "invalid meta-server host or port: " << + mMetaServerLoc.hostname << ':' << mMetaServerLoc.port << + KFS_LOG_EOM; + return false; + } + + mChunkServerClientPort = mProp.getValue( + "chunkServer.clientPort", mChunkServerClientPort); + if (mChunkServerClientPort < 0) { + KFS_LOG_STREAM_FATAL << "invalid client port: " << mChunkServerClientPort << + KFS_LOG_EOM; + return false; + } + KFS_LOG_STREAM_INFO << "chunk server client port: " << + mChunkServerClientPort << + KFS_LOG_EOM; + + mChunkServerHostname = mProp.getValue("chunkServer.hostname", mChunkServerHostname); + if (! mChunkServerHostname.empty()) { + KFS_LOG_STREAM_INFO << "chunk server hostname: " << + mChunkServerHostname << + KFS_LOG_EOM; + } + + // Paths are space separated directories for storing chunks + istringstream is(mProp.getValue("chunkServer.chunkDir", "chunks")); + string dir; + while ((is >> dir)) { + KFS_LOG_STREAM_INFO << "chunk dir: " << dir << KFS_LOG_EOM; + mChunkDirs.push_back(dir); + } + + mLogDir = mProp.getValue("chunkServer.logDir", "logs"); + KFS_LOG_STREAM_INFO << "log dir: " << mLogDir << KFS_LOG_EOM; + + mChunkServerRackId = mProp.getValue("chunkServer.rackId", mChunkServerRackId); + KFS_LOG_STREAM_INFO << "rack: " << mChunkServerRackId << + KFS_LOG_EOM; + + mClusterKey = mProp.getValue("chunkServer.clusterKey", mClusterKey); + KFS_LOG_STREAM_INFO << "cluster key: " << mClusterKey << KFS_LOG_EOM; + + mMD5Sum = mProp.getValue("chunkServer.md5sum", mMD5Sum); + NetErrorSimulatorConfigure( + globalNetManager(), + mProp.getValue("chunkServer.netErrorSimulator", "") + ); + + mMaxLockedMemorySize = (int64_t)mProp.getValue( + "chunkServer.maxLockedMemory", (double)mMaxLockedMemorySize); + string errMsg; + int err = LockProcessMemory( + mMaxLockedMemorySize, + min(int64_t(16) << 20, mMaxLockedMemorySize / 3), + min(int64_t(16) << 20, mMaxLockedMemorySize / 4), + &errMsg + ); + if (err != 0) { + KFS_LOG_STREAM_FATAL << + errMsg << + 
(errMsg.empty() ? QCUtils::SysError( + err, "lock process memory") : string()) << + KFS_LOG_EOM; + return false; + } + return true; +} + +static void SigQuitHandler(int /* sig */) +{ + write(1, "SIGQUIT\n", 8); + globalNetManager().Shutdown(); +} + +static void SigHupHandler(int /* sig */) +{ + gMetaServerSM.Reconnect(); +} + +int +ChunkServerMain::Run(int argc, char **argv) +{ + if (argc < 2) { + cout << "Usage: " << argv[0] << + " {}" + "\n"; + return 0; + } + + sRestarter.Init(argc, argv); + MsgLogger::Init(argc > 2 ? argv[2] : 0); + srand((int)microseconds()); + InitGlobals(); + MdStream::Init(); + + // set the coredump size to unlimited + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim)) { + KFS_LOG_STREAM_INFO << "Unable to increase coredump file size: " << + QCUtils::SysError(errno, "RLIMIT_CORE") << + KFS_LOG_EOM; + } + + // compute the MD5 of the binary +#ifdef KFS_OS_NAME_LINUX + ComputeMD5("/proc/self/exe"); +#endif + if (mMD5Sum.empty()) { + ComputeMD5(argv[0]); + } + if (! LoadParams(argv[1])) { + return 1; + } + + KFS_LOG_STREAM_INFO << "Starting chunkserver..." << KFS_LOG_EOM; + KFS_LOG_STREAM_INFO << + "md5sum to send to metaserver: " << mMD5Sum << + KFS_LOG_EOM; + + gChunkServer.Init(); + int ret = 1; + if (gChunkManager.Init(mChunkDirs, mProp)) { + gLogger.Init(mLogDir); + gMetaServerSM.SetMetaInfo( + mMetaServerLoc, mClusterKey, mChunkServerRackId, mMD5Sum, mProp); + signal(SIGPIPE, SIG_IGN); + signal(SIGQUIT, &SigQuitHandler); + signal(SIGHUP, &SigHupHandler); + + ret = gChunkServer.MainLoop( + mChunkServerClientPort, mChunkServerHostname) ? 
0 : 1; + gChunkManager.Shutdown(); + } + NetErrorSimulatorConfigure(globalNetManager()); + MdStream::Cleanup(); + + return ret; +} +static ChunkServerMain sChunkServerMain; + +} // namespace KFS + +int +main(int argc, char **argv) +{ + return KFS::sChunkServerMain.Run(argc, argv); +} diff --git a/src/cc/chunk/chunktrimmer_main.cc b/src/cc/chunk/chunktrimmer_main.cc new file mode 100644 index 000000000..bf6805098 --- /dev/null +++ b/src/cc/chunk/chunktrimmer_main.cc @@ -0,0 +1,228 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/06/11 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief This works only on Linux: Whenever chunks are moved to +// lost+found, it is likely because we had trouble reading data off +// disk. We'd like to leave the "bad" sectors pinned while freeing up +// the other readable sectors. This takes out the bad sectors from +// allocation while improving disk health. On Linux, we use XFS +// ioctl's to punch holes for all the readable data from a file, while +// leaving the regions that return EIO pinned on disk. 
+// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef KFS_OS_NAME_LINUX +#include +#endif + +using std::cout; +using std::endl; +using std::string; +using boost::scoped_array; + +static void trimFile(string &fn, int64_t filesz, bool verbose); + +int main(int argc, char **argv) +{ + char optchar; + bool help = false; + string pathname = ""; + int res, count; + int ndaysOld = 5; + struct dirent **entries; + struct stat statBuf; + bool verbose = false; + + while ((optchar = getopt(argc, argv, "hvd:n:")) != -1) { + switch (optchar) { + case 'd': + pathname = optarg; + break; + case 'v': + verbose = true; + break; + case 'n': + ndaysOld = atoi(optarg); + break; + case 'h': + default: + help = true; + break; + } + } + + if ((help) || (pathname == "")) { + cout << "Usage: " << argv[0] << " -d {-n } {-v} " << endl; + cout << "where, files older than n days are pruned" << endl; + exit(-1); + } + + time_t now = time(0); + + res = stat(pathname.c_str(), &statBuf); + if (res < 0) { + cout << "Unable to open: " << pathname << endl; + exit(-1); + } + if (S_ISREG(statBuf.st_mode)) { + if ((now - statBuf.st_mtime) < (ndaysOld * 24 * 60 * 60)) { + cout << "Ignoring file: " << pathname << " since it is not old enough for pruning" << endl; + exit(0); + } + trimFile(pathname, statBuf.st_size, verbose); + exit(0); + } + + count = scandir(pathname.c_str(), &entries, 0, alphasort); + if (count < 0) { + cout << "Unable to open: " << pathname << endl; + exit(-1); + } + + + for (int i = 0; i < count; i++) { + string fn = pathname; + fn = fn + "/" + entries[i]->d_name; + res = stat(fn.c_str(), &statBuf); + free(entries[i]); + if ((res < 0) || (!S_ISREG(statBuf.st_mode)) || + (fn.rfind(".corrupt") != string::npos)) { + // either it is not a file or it has been trimmed earlier + // and renamed to .corrupt + continue; + } + if ((now - statBuf.st_mtime) < (ndaysOld 
* 24 * 60 * 60)) { + cout << "Ignoring file: " << fn << " since it is not old enough for pruning" << endl; + continue; + } + trimFile(fn, statBuf.st_size, verbose); + } + free(entries); + exit(0); +} + +#ifdef KFS_OS_NAME_LINUX + +static const int oneMB = 1 << 20; +static const int fourK = 4096; + +static int trimPart(int fd, int64_t startP, int64_t endP, int blkSz, int &numFreed, int &numKept); + +static void trimFile(string &fn, int64_t filesz, bool verbose) +{ + int fd, res; + int numFreed = 0, numKept = 0; + + fd = open(fn.c_str(), O_RDWR | O_DIRECT); + // fd = open(fn.c_str(), O_RDONLY | O_DIRECT); + if (fd < 0) { + cout << "Unable to open: " << fn << endl; + return; + } + bool shouldPreserveFile = false; + int64_t filePos = 0; + int dummy; + while (1) { + // remember the filePos, since on a successful read it will be updated + filePos = lseek(fd, 0, SEEK_CUR); + // read in big chunks; if any fail, we will switch down to + // reading in 4k and pinning only the unreadable 4k blocks + res = trimPart(fd, filePos, filePos + oneMB, oneMB, numFreed, dummy); + if ((res == 0) || (filePos > filesz)) { + // hit EOF + break; + } + if (res < 0) { + filePos = lseek(fd, filePos, SEEK_SET); + shouldPreserveFile = true; + trimPart(fd, filePos, filePos + oneMB, fourK, numFreed, numKept); + } + } + close(fd); + if (shouldPreserveFile) { + string corruptFn = fn + ".corrupt"; + rename(fn.c_str(), corruptFn.c_str()); + cout << "Renamed: " << fn << " -> " << corruptFn << endl; + cout << "Filesize: " << filePos << endl; + cout << "Num 4k-blocks kept: " << numKept << endl; + cout << "Num 4k-blocks freed: " << numFreed << endl; + } else { + cout << "Non-corrupt file: " << fn << " ; so nuking" << endl; + unlink(fn.c_str()); + } +} + +static int +trimPart(int fd, int64_t startP, int64_t endP, int blkSz, int &numFreed, int &numKept) +{ + static char *data = NULL; + xfs_flock64_t theFree = {0}; + int64_t currP = startP; + int retval = 1, res; + + if (data == NULL) { + data = (char *) 
memalign(fourK, oneMB * sizeof(char)); + } + while (currP < endP) { + if (currP >= endP) + break; + res = read(fd, data, blkSz); + if (res == 0) + return (retval == -1 ? -1 : 0); + if (res < 0) { + // we couldn't read a block; so, reset the position and skip over the block + // we failed to read + currP = lseek(fd, currP + blkSz, SEEK_SET); + cout << "Couldn't read block at: " << currP - blkSz << " of len = " << blkSz << endl; + numKept += (blkSz / 4096); + // file needs to be preserved + retval = -1; + continue; + } + // got data, so free the block and punch a hole in the file + theFree.l_whence = 0; + theFree.l_start = currP; + theFree.l_len = res; + xfsctl(0, fd, XFS_IOC_UNRESVSP64, &theFree); + numFreed += (res / 4096); + currP += res; + } + return retval; +} + +#else +static void trimFile(string &fn, int64_t filesz, bool verbose) +{ + +} +#endif diff --git a/src/cc/chunk/chunkupgrade_main.cc b/src/cc/chunk/chunkupgrade_main.cc new file mode 100644 index 000000000..46d8396f5 --- /dev/null +++ b/src/cc/chunk/chunkupgrade_main.cc @@ -0,0 +1,190 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/06/11 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Convert v1 chunks to the current version. 
+// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kfsio/checksum.h" +#include "kfsio/Globals.h" +#include "common/MsgLogger.h" +#include "kfsio/FileHandle.h" +#include "Chunk.h" +#include "ChunkManager.h" + +using std::cout; +using std::endl; +using std::string; +using std::ostringstream; +using boost::scoped_array; + +string KFS::RestartChunkServer(); +string KFS::RestartChunkServer() +{ + return string("not supported"); +} + +using namespace KFS; + +// This structure is on-disk +struct DiskChunkInfoV1_t { + DiskChunkInfoV1_t() : metaMagic (CHUNK_META_MAGIC), metaVersion(CHUNK_META_VERSION) { } + DiskChunkInfoV1_t(kfsFileId_t f, kfsChunkId_t c, int64_t s, kfsSeq_t v) : + metaMagic (CHUNK_META_MAGIC), metaVersion(CHUNK_META_VERSION), + fileId(f), chunkId(c), chunkSize(s), chunkVersion(v) { } + void SetChecksums(const uint32_t *checksums) { + memcpy(chunkBlockChecksum, checksums, MAX_CHUNK_CHECKSUM_BLOCKS * sizeof(uint32_t)); + } + int metaMagic; + int metaVersion; + + kfsFileId_t fileId; + kfsChunkId_t chunkId; + int64_t chunkSize; + uint32_t chunkBlockChecksum[MAX_CHUNK_CHECKSUM_BLOCKS]; + // some statistics about the chunk: + // -- version # has an estimate of the # of writes + // -- track the # of reads + // ... 
+ uint32_t chunkVersion; + uint32_t numReads; + char filename[MAX_FILENAME_LEN]; +}; + +static void upgradeChunkFile(string chunkDir, string fn, bool verbose); +static string makeChunkFilename(const string &chunkDir, const DiskChunkInfo_t &chunkInfo); + +int main(int argc, char **argv) +{ + char optchar; + bool help = false; + const char *chunkDir = NULL; + int res, count; + struct dirent **entries; + struct stat statBuf; + bool verbose = false; + + KFS::MsgLogger::Init(0, KFS::MsgLogger::kLogLevelINFO); + + while ((optchar = getopt(argc, argv, "hvd:")) != -1) { + switch (optchar) { + case 'd': + chunkDir = optarg; + break; + case 'v': + verbose = true; + break; + case 'h': + default: + help = true; + break; + } + } + + if ((help) || (chunkDir == NULL)) { + cout << "Usage: " << argv[0] << " -d {-v}" << endl; + exit(-1); + } + + res = scandir(chunkDir, &entries, 0, alphasort); + if (res < 0) { + cout << "Unable to open: " << chunkDir << endl; + exit(-1); + } + + count = res; + for (int i = 0; i < count; i++) { + string fn = chunkDir; + fn = fn + "/" + entries[i]->d_name; + res = stat(fn.c_str(), &statBuf); + if ((res < 0) || (!S_ISREG(statBuf.st_mode))) + continue; + upgradeChunkFile(chunkDir, entries[i]->d_name, verbose); + free(entries[i]); + } + free(entries); + exit(0); +} + +static void upgradeChunkFile(string chunkDir, string chunkfn, bool verbose) +{ + DiskChunkInfoV1_t chunkInfoV1; + int fd, res; + FileHandlePtr f; + scoped_array data; + string fn, newfn; + + fn = chunkDir + "/" + chunkfn; + + fd = open(fn.c_str(), O_RDWR); + if (fd < 0) { + cout << "Unable to open: " << fn << endl; + return; + } + f.reset(new FileHandle_t(fd)); + res = pread(fd, &chunkInfoV1, sizeof(DiskChunkInfoV1_t), 0); + if (res < 0) { + cout << "Unable to read chunkinfo for: " << fn << endl; + return; + } + + DiskChunkInfo_t chunkInfo(chunkInfoV1.fileId, chunkInfoV1.chunkId, + chunkInfoV1.chunkSize, chunkInfoV1.chunkVersion); + chunkInfo.SetChecksums(chunkInfoV1.chunkBlockChecksum); + 
res = pwrite(fd, &chunkInfo, sizeof(DiskChunkInfo_t), 0); + if (res < 0) { + cout << "Unable to write chunkinfo for: " << fn << endl; + return; + } + + if (verbose) { + cout << "fid: "<< chunkInfo.fileId << endl; + cout << "chunkId: "<< chunkInfo.chunkId << endl; + cout << "size: "<< chunkInfo.chunkSize << endl; + cout << "version: "<< chunkInfo.chunkVersion << endl; + } + // upgrade the meta-data + + newfn = makeChunkFilename(chunkDir, chunkInfo); + res = rename(fn.c_str(), newfn.c_str()); + if (res < 0) + perror("rename"); +} + +string makeChunkFilename(const string &chunkDir, const DiskChunkInfo_t &chunkInfo) +{ + ostringstream os; + + os << chunkDir << '/' << chunkInfo.fileId << '.' << chunkInfo.chunkId << '.' << chunkInfo.chunkVersion; + return os.str(); +} diff --git a/src/cc/chunk/utils.cc b/src/cc/chunk/utils.cc new file mode 100644 index 000000000..fabc8b84a --- /dev/null +++ b/src/cc/chunk/utils.cc @@ -0,0 +1,73 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/27 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include +#include +#include + +#include "utils.h" +#include "common/MsgLogger.h" +#include "kfsio/IOBuffer.h" + +namespace KFS +{ +using std::string; + +/// +/// Return true if there is a sequence of "\r\n\r\n". +/// @param[in] iobuf: Buffer with data sent by the client +/// @param[out] msgLen: string length of the command in the buffer +/// @retval true if a command is present; false otherwise. +/// +bool IsMsgAvail(IOBuffer *iobuf, int *msgLen) +{ + const int idx = iobuf->IndexOf(0, "\r\n\r\n"); + if (idx < 0) { + return false; + } + *msgLen = idx + 4; // including terminating seq. length. + return true; +} + +void die(const string &msg) +{ + string lm = "panic: " + msg; + KFS_LOG_STREAM_FATAL << lm << KFS_LOG_EOM; + lm += "\n"; + write(2, msg.data(), msg.size()); + MsgLogger::Stop(); + abort(); +} + +kfsSeq_t GetRandomSeq() +{ + kfsSeq_t id = 0; + RAND_pseudo_bytes( + reinterpret_cast(&id), int(sizeof(id))); + return ((id < 0 ? -id : id) >> 1); +} + +} diff --git a/src/cc/chunk/utils.h b/src/cc/chunk/utils.h new file mode 100644 index 000000000..3e04881e8 --- /dev/null +++ b/src/cc/chunk/utils.h @@ -0,0 +1,61 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/27 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_UTILS_H +#define CHUNKSERVER_UTILS_H + +#include "common/kfstypes.h" + +#include + +namespace KFS +{ +using std::string; + +class IOBuffer; +/// +/// Given some data in a buffer, determine if we have a received a +/// valid op---one that ends with "\r\n\r\n". +/// @param[in] iobuf : buffer containing data +/// @param[out] msgLen : if we do have a valid command, return the length of +/// the command +/// @retval True if we have a valid command; False otherwise. +/// +bool IsMsgAvail(IOBuffer* iobuf, int* msgLen); + +/// +/// \brief bomb out on "impossible" error +/// \param[in] msg panic text +/// +void die(const string &msg); + +/// +/// \brief random initial seq. number +/// +kfsSeq_t GetRandomSeq(); +} + +#endif // CHUNKSERVER_UTILS_H diff --git a/src/cc/common/.gitignore b/src/cc/common/.gitignore new file mode 100644 index 000000000..b7e928a62 --- /dev/null +++ b/src/cc/common/.gitignore @@ -0,0 +1,2 @@ +Version.cc +Version.cc.* diff --git a/src/cc/common/BufferedLogWriter.cc b/src/cc/common/BufferedLogWriter.cc new file mode 100644 index 000000000..dc04aa965 --- /dev/null +++ b/src/cc/common/BufferedLogWriter.cc @@ -0,0 +1,1466 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/03/02 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Double buffered message log writer implementation. +// +//---------------------------------------------------------------------------- + +#include "BufferedLogWriter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "qcdio/QCMutex.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCThread.h" +#include "Properties.h" + +namespace KFS +{ + +using std::string; +using std::max; +using std::min; +using std::ostringstream; +using std::ostream; +using std::streambuf; +using std::setw; +using std::setfill; +using std::streamsize; +using std::vector; + +const char* const kBufferedLogWriter_LogLevels[] = { + "FATAL", + "ALERT", + "CRIT", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", + "NOTSET" +}; + +const int64_t kLogWriterMinLogFileSize = 16 << 10; +const int64_t kLogWriterMinOpenRetryIntervalMicroSec = 10000; +const int kLogWriterMinLogBufferSize = 16 << 10; +const int64_t kLogWirterDefaultTimeToKeepSecs = 60 * 60 * 24 * 30; +const int kLogWriterDefaulOpenFlags = + O_CREAT | O_APPEND | O_WRONLY /* | O_SYNC */; + +class BufferedLogWriter::Impl : public QCRunnable +{ +public: + Impl( + int inFd, + const char* inFileNamePtr, + int inBufSize, + const char* inTrucatedSuffixPtr, + int64_t inOpenRetryIntervalMicroSec, + int64_t inFlushIntervalMicroSec, + int64_t inMaxLogFileSize, + int inMaxLogsFiles, + int64_t inMaxLogWaitTimeMicroSec, + const char* 
inTimeStampFormatPtr, + bool inUseGMTFlag) + : mMutex(), + mWriteCond(), + mWriteDoneCond(), + mThread(0, "LogWriter"), + mLogFileNamePrefixes(), + mFileName(inFileNamePtr ? inFileNamePtr : ""), + mTruncatedSuffix(inTrucatedSuffixPtr ? inTrucatedSuffixPtr : "..."), + mTimeStampFormat(inTimeStampFormatPtr ? + inTimeStampFormatPtr : "%m-%d-%Y %H:%M:%S"), + mNewLogSuffix(), + mDroppedCount(0), + mTotalDroppedCount(0), + mTruncatedCount(0), + mBufWaitersCount(0), + mWriteErrCount(0), + mWriteBytesDiscardedCount(0), + mCurWritten(0), + mBufWaitedCount(0), + mMaxLogFileSize(inMaxLogFileSize <= 0 ? inMaxLogFileSize : + max(kLogWriterMinLogFileSize, inMaxLogFileSize)), + mMaxLogFiles(inMaxLogFileSize > 0 ? inMaxLogsFiles : -1), + mNextOpenRetryTime(Now() - Seconds(10)), + mOpenRetryInterval(max(kLogWriterMinOpenRetryIntervalMicroSec, + inOpenRetryIntervalMicroSec)), + mFlushInterval(max((int64_t)0, inFlushIntervalMicroSec)), + mNextFlushTime(-1), + mTotalLogWaitedTime(0), + mCurLogWatedTime(0), + mMaxLogWaitTime(inMaxLogWaitTimeMicroSec), + mNextDeleteOldLogsTimeSec(0), + mMinModTimeToKeepSec(kLogWirterDefaultTimeToKeepSecs), + mOpenFlags(kLogWriterDefaulOpenFlags), + mOpenMode(0644), + mBufSize(max(inBufSize > 0 ? inBufSize : (1 << 20), + (int)mTruncatedSuffix.size() + kLogWriterMinLogBufferSize)), + mLastError(0), + mFd(inFd >= 0 ? dup(inFd) : inFd), + mMaxAppendLength(mBufSize - mTruncatedSuffix.size() + 1), + mRunFlag(false), + mDeleteOldLogsFlag(false), + mBuf0Ptr(new char [mBufSize * 2]), + mBuf1Ptr(mBuf0Ptr + mBufSize), + mBufPtr(mBuf0Ptr), + mCurPtr(mBufPtr), + mEndPtr(mBuf0Ptr + mBufSize), + mWritePtr(0), + mWriteEndPtr(0), + mCloseFlag(false), + mUseGMTFlag(inUseGMTFlag), + mTimeSec(0), + mLogTimeStampSec(0), + mMsgAppendCount(0), + mMaxMsgStreamCount(256), + mMsgStreamCount(0), + mMsgStreamHeadPtr(0) + { + if (! 
mFileName.empty()) { + mLogFileNamePrefixes.push_back(mFileName); + } + mLogTimeStampPrefixStr[0] = 0; + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + mTimeSec = theSec - 1; + mLogTimeStampSec = mTimeSec; + UpdateTimeTm(theSec); + mLastLogTm = mTimeTm; + GetLogTimeStampPrefixPtr(theSec); + mNextFlushTime = Seconds(theSec) + theMicroSec + mFlushInterval; + } + static const char* GetLogLevelNamePtr( + LogLevel inLogLevel) + { + const int theLogLevel = inLogLevel / 100; + return ( + (theLogLevel < 0 || theLogLevel >= + int(sizeof(kBufferedLogWriter_LogLevels) / + sizeof(kBufferedLogWriter_LogLevels[0]))) ? + "INVALID" : kBufferedLogWriter_LogLevels[theLogLevel] + ); + } + virtual ~Impl() + { + Impl::Stop(); + delete [] mBuf0Ptr; + while (mMsgStreamHeadPtr) { + QCASSERT(mMsgStreamCount > 0); + MsgStream* const thePtr = mMsgStreamHeadPtr; + mMsgStreamHeadPtr = thePtr->Next(); + delete thePtr; + mMsgStreamCount--; + } + } + void SetParameters( + string inPropsPrefix, + const Properties& inProps) + { + QCStMutexLocker theLocker(mMutex); + mTruncatedSuffix = inProps.getValue( + "truncatedSuffix", mTruncatedSuffix.c_str()); + if (mTruncatedSuffix.empty()) { + mTruncatedSuffix = "..."; + } + mBufSize = inProps.getValue( + inPropsPrefix + "bufferSize", mBufSize); + mBufSize = max( + mBufSize <= 0 ? (1 << 20) : mBufSize, + (int)mTruncatedSuffix.size() + (16 << 10) + ); + mOpenRetryInterval = max((Time)kLogWriterMinOpenRetryIntervalMicroSec, + (Time)inProps.getValue( + inPropsPrefix + "openRetryIntervalMicroSec", + (double)mOpenRetryInterval)); + mMaxLogFileSize = max(kLogWriterMinLogFileSize, + (int64_t)inProps.getValue( + inPropsPrefix + "maxLogFileSize", (double)mMaxLogFileSize)); + mMaxLogFiles = mMaxLogFileSize <= 0 ? -1 : (int)inProps.getValue( + inPropsPrefix + "maxLogFiles", (double)mMaxLogFiles); + mUseGMTFlag = inProps.getValue( + inPropsPrefix + "useGMT", mUseGMTFlag ? 
1 : 0) != 0; + mTimeStampFormat = inProps.getValue( + inPropsPrefix + "timeStampFormat", mTimeStampFormat); + mMinModTimeToKeepSec = (int64_t)inProps.getValue( + inPropsPrefix + "minOldLogModTimeSec", + (double)mMinModTimeToKeepSec); + mMaxMsgStreamCount = inProps.getValue( + inPropsPrefix + "maxMsgStreamCount", + mMaxMsgStreamCount); + string theLogFilePrefixes; + for (LogFileNames::const_iterator theIt = + mLogFileNamePrefixes.begin(); + theIt != mLogFileNamePrefixes.end(); + ++theIt) { + if (theIt != mLogFileNamePrefixes.begin()) { + theLogFilePrefixes += ":"; + } + theLogFilePrefixes += *theIt; + } + theLogFilePrefixes = inProps.getValue( + inPropsPrefix + "logFilePrefixes", theLogFilePrefixes); + ClearLogFileNamePrefixes(); + size_t theStart = 0; + while (theStart < theLogFilePrefixes.length()) { + const size_t theEnd = theLogFilePrefixes.find(':', theStart); + if (theStart != theEnd) { + AddLogFileNamePrefix(theLogFilePrefixes.substr(theStart, + theEnd == string::npos ? string::npos : theEnd - theStart)); + } + if (theEnd == string::npos) { + break; + } + theStart = theEnd + 1; + } + if (! 
mLogFileNamePrefixes.empty()) { + if (mFileName.empty()) { + if (mFd >= 0) { + ::close(mFd); + } + mFd = -1; + mFileName = mLogFileNamePrefixes.front(); + } else { + LogFileNames::const_iterator theIt; + for (theIt = mLogFileNamePrefixes.begin(); + theIt != mLogFileNamePrefixes.end() && + *theIt != mFileName; + ++theIt) + {} + if (theIt == mLogFileNamePrefixes.end()) { + mCloseFlag = true; + mFileName = mLogFileNamePrefixes.front(); + } + } + } + SetFlushInterval((int64_t)inProps.getValue( + inPropsPrefix + "flushIntervalMicroSec", (double)mFlushInterval) + ); + SetMaxLogWaitTime((int64_t)inProps.getValue( + inPropsPrefix + "waitMicroSec", (double)mMaxLogWaitTime) + ); + while (mMsgStreamHeadPtr && mMaxMsgStreamCount < mMsgStreamCount) { + QCASSERT(mMsgStreamCount > 0); + MsgStream* const thePtr = mMsgStreamHeadPtr; + mMsgStreamHeadPtr = thePtr->Next(); + delete thePtr; + mMsgStreamCount--; + } + + } + void Stop() + { + QCStMutexLocker theLocker(mMutex); + if (! mRunFlag) { + return; + } + FlushSelf(); + mRunFlag = false; + mWriteCond.Notify(); + theLocker.Unlock(); + mThread.Join(); + } + void Start() + { + QCStMutexLocker theLocker(mMutex); + if (mRunFlag) { + return; + } + mRunFlag = true; + const int kStackSize = 32 << 10; + mThread.Start(this, kStackSize, "LogWriter"); + } + bool Reopen() + { + QCStMutexLocker theLocker(mMutex); + if (mFileName.empty() || mFd < 0) { + return false; + } + mCloseFlag = true; + return true; + } + int Open( + const char* inFileNamePtr, + int inFlags, + int inMode, + bool inOpenHereFlag) + { + if (! inFileNamePtr || ! 
*inFileNamePtr) { + return -EINVAL; + } + int theFd = -1; + int theErr = -1; + while (inOpenHereFlag && + (theFd = ::open(inFileNamePtr, inFlags, inMode)) < 0) { + theErr = errno; + if (theErr != EINTR && theErr != EAGAIN) { + break; + } + } + if (inOpenHereFlag) { + if (theFd < 0) { + return theErr; + } + ::fcntl(theFd, FD_CLOEXEC, 1); + } + QCStMutexLocker theLocker(mMutex); + if (theFd >= 0) { + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + UpdateTimeTm(theSec); + mLastLogTm = mTimeTm; + } + mNextOpenRetryTime = Now() - Seconds(10); + mFileName = inFileNamePtr; + mOpenFlags = inFlags; + mOpenMode = inMode; + mFd = theFd; + AddLogFileNamePrefix(mFileName); + return 0; + } + bool AddLogFileNamePrefix( + string inName) + { + if (inName.empty()) { + return false; + } + QCStMutexLocker theLocker(mMutex); + for (LogFileNames::const_iterator theIt = + mLogFileNamePrefixes.begin(); + theIt != mLogFileNamePrefixes.end(); + ++theIt) { + if (*theIt == inName) { + return false; + } + } + mLogFileNamePrefixes.push_back(inName); + return true; + } + void ClearLogFileNamePrefixes() + { + QCStMutexLocker theLocker(mMutex); + mLogFileNamePrefixes.clear(); + if (! 
mFileName.empty()) { + mLogFileNamePrefixes.push_back(mFileName); + } + } + void Close() + { + QCStMutexLocker theLocker(mMutex); + if (mFd >= 0) { + ::close(mFd); + } + mFd = -1; + mFileName.clear(); + } + void Flush() + { + QCStMutexLocker theLocker(mMutex); + FlushSelf(); + } + void SetMaxLogWaitTime( + int64_t inMaxLogWaitTimeMicroSec) + { + QCStMutexLocker theLocker(mMutex); + const Time theMaxLogWaitTime = max((Time)0, inMaxLogWaitTimeMicroSec); + if (mMaxLogWaitTime != theMaxLogWaitTime) { + const bool theWakeupFlag = + mMaxLogWaitTime > theMaxLogWaitTime; + mMaxLogWaitTime = theMaxLogWaitTime; + if (theWakeupFlag) { + mWriteDoneCond.NotifyAll(); + } + } + } + void SetFlushInterval( + int64_t inMicroSecs) + { + QCStMutexLocker theLocker(mMutex); + const Time theInterval = max(int64_t(0), inMicroSecs); + if (mFlushInterval == theInterval) { + return; + } + mNextFlushTime -= mFlushInterval; + mFlushInterval = theInterval; + mNextFlushTime += mFlushInterval; + } + void GetCounters( + Counters& outCounters) + { + QCStMutexLocker theLocker(mMutex); + outCounters.mAppendCount = mMsgAppendCount; + outCounters.mDroppedCount = mTotalDroppedCount; + outCounters.mWriteErrorCount = mWriteErrCount; + outCounters.mAppendWaitCount = mBufWaitedCount; + outCounters.mAppendWaitMicroSecs = mTotalLogWaitedTime; + } + void PrepareToFork() + { mMutex.Lock(); } + void ForkDone() + { mMutex.Unlock(); } + void ChildAtFork() + { + mRunFlag = false; + if (mFd >= 0) { + close(mFd); + mFd = 0; + } + } + void Append( + LogLevel inLogLevel, + Writer& inWriter) + { + if (! mRunFlag) { + return; + } + QCStMutexLocker theLocker(mMutex); + static va_list theArgs; // dummy + AppendSelf(inLogLevel, &inWriter, 0, "", theArgs); + } + void Append( + LogLevel inLogLevel, + const char* inFmtStrPtr, + va_list inArgs) + { + if (! 
mRunFlag) { + return; + } + QCStMutexLocker theLocker(mMutex); + AppendSelf(inLogLevel, 0, -1, inFmtStrPtr, inArgs); + } + void AppendSelf( + LogLevel inLogLevel, + const char* inStrPtr, + int inStrLen) + { + static va_list theArgs; // dummy + AppendSelf(inLogLevel, 0, max(0, inStrLen), inStrPtr, theArgs); + } + void AppendSelf( + LogLevel inLogLevel, + Writer* inWriterPtr, + int inStrLen, + const char* inFmtStrPtr, + va_list inArgs) + { + mMsgAppendCount++; + + // The time includes mutex wait, but guarantees that the time stamps in + // the log are monotonically increasing. In the case of log wait this + // holds only in the case when thread library services the condition + // queue in FIFO order. + int64_t theSec = 0; + int64_t theMicroSec = 0; + bool theGetTimeFlag = true; + const bool theWasFlushingFlag = IsFlushing(); + const size_t kAvgPrefSize = 64; + const int kAvgMsgSize = 256; + const size_t theBufSize = min(GetMaxRecordSize(), + kAvgPrefSize + (inWriterPtr ? inWriterPtr->GetMsgLength() : 0) + + (inStrLen >= 0 ? inStrLen : kAvgMsgSize)); + Time theTimeWaited = 0; + for (int i = 0; ; i++) { + if ((i > 0 || mCurPtr + theBufSize >= mEndPtr) && ! FlushSelf()) { + if (theTimeWaited >= mMaxLogWaitTime || ! 
mRunFlag || i >= 4) { + mDroppedCount++; + mTotalDroppedCount++; + break; + } + if (theGetTimeFlag) { + Now(theSec, theMicroSec); + theGetTimeFlag = false; + } + mBufWaitersCount++; + const QCMutex::Time theTimeoutNanoSecs = + NanoSec(mMaxLogWaitTime - theTimeWaited); + mWriteDoneCond.Wait(mMutex, theTimeoutNanoSecs); + mBufWaitersCount--; + mBufWaitedCount++; + mCurLogWatedTime -= theTimeWaited; + mTotalLogWaitedTime -= theTimeWaited; + theTimeWaited = Now() - (Seconds(theSec) + theMicroSec); + mTotalLogWaitedTime += theTimeWaited; + mCurLogWatedTime += theTimeWaited; + } + if (theGetTimeFlag) { + Now(theSec, theMicroSec); + theGetTimeFlag = false; + } + const size_t theLen = MsgPrefix(theSec, theMicroSec, inLogLevel); + if (theLen <= 0) { + continue; + } + const size_t theMaxLen = + min(GetMaxRecordSize(), (size_t)(mEndPtr - mCurPtr)); + if (theMaxLen <= theLen + 1) { + continue; + } + size_t theMaxMsgLen = theMaxLen - theLen; // with \0 + int theRet = inWriterPtr ? + inWriterPtr->Write(mCurPtr + theLen, theMaxMsgLen) : + 0; + if (inStrLen < 0) { + if (theMaxMsgLen > 0) { + theRet += ::vsnprintf( + mCurPtr + theLen, theMaxMsgLen, inFmtStrPtr, inArgs); + } + } else { + theRet += inStrLen; + if ((size_t)inStrLen < theMaxMsgLen || + theMaxLen >= GetMaxRecordSize()) { + memcpy(mCurPtr + theLen, inFmtStrPtr, + min((size_t)inStrLen, theMaxMsgLen)); + } + } + if (theRet < 0 || theRet >= (int)theMaxMsgLen) { + if (theMaxLen >= GetMaxRecordSize()) { + mTruncatedCount++; + mCurPtr += theMaxLen; + ::memcpy(mCurPtr - theMaxLen, + mTruncatedSuffix.c_str(), mTruncatedSuffix.size()); + mCurPtr[-1] = '\n'; + mDroppedCount = 0; + mCurLogWatedTime = 0; + break; + } + if (theRet < 0 && i >= 2) { + mDroppedCount++; + mTotalDroppedCount++; + break; + } + } else { + mCurPtr += theLen + theRet; + QCASSERT(mCurPtr < mEndPtr); + if (mCurPtr < mEndPtr) { + *mCurPtr++ = '\n'; + } + mDroppedCount = 0; + mCurLogWatedTime = 0; + break; + } + } + if (! theGetTimeFlag) { + if (! 
theWasFlushingFlag) { + // Call even if it is now flushing, to update the next flush + // time. + FlushIfNeeded(theSec, theMicroSec); + } + RunTimers(theSec, theMicroSec); + } + } + void Run() + { + QCStMutexLocker theLocker(mMutex); + QCASSERT( + mBufSize > 0 && + mBuf0Ptr && + mBuf1Ptr + ); + for (; ;) { + while (mRunFlag && ! mWritePtr) { + const QCMutex::Time theTimeoutNanoSecs = NanoSec(max( + QCMutex::Time(10000), + (QCMutex::Time)(mFlushInterval > 0 ? + mFlushInterval / 2 : Time(500000)) + )); + mWriteCond.Wait(mMutex, theTimeoutNanoSecs); + if (mWritePtr) { + break; + } + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + FlushIfNeeded(theSec, theMicroSec); + RunTimers(theSec, theMicroSec); + if (mDeleteOldLogsFlag && mMaxLogFiles > 0) { + mDeleteOldLogsFlag = false; + } + if (mDeleteOldLogsFlag || mWritePtr) { + break; + } + } + if (! mWritePtr && ! mRunFlag) { + QCASSERT(mBufWaitersCount <= 0); + break; + } + if (mCloseFlag && mFd >= 0 && ! mFileName.empty()) { + const int theFd = mFd; + mFd = -1; + mCurWritten = 0; + mNextOpenRetryTime = Now() - Seconds(10); + QCStMutexUnlocker theUnlocker(mMutex); + ::close(theFd); + } + mCloseFlag = false; + if (mFd >= 0 && ! mFileName.empty()) { + if (mMaxLogFiles > 0) { + if ((int64_t)mCurWritten > mMaxLogFileSize) { + const int theFd = mFd; + const string theFileName = mFileName; + const int theMaxLogFiles = mMaxLogFiles; + mFd = -1; + mCurWritten = 0; + mNextOpenRetryTime = Now() - Seconds(10); + { + QCStMutexUnlocker theUnlocker(mMutex); + ::close(theFd); + } + RotateLogs(theFileName, theMaxLogFiles, mMutex); + } + } else if (! mNewLogSuffix.empty() || mDeleteOldLogsFlag) { + const string theNewLogSuffix = mNewLogSuffix; + const string theNewLogName = + theNewLogSuffix.empty() ? 
+ theNewLogSuffix : mFileName + theNewLogSuffix; + const string theFileName = mFileName; + const int thePrevFd = mFd; + int theFd = mFd; + const int64_t theMinModTimeToKeepSec = mMinModTimeToKeepSec; + { + QCStMutexUnlocker theUnlocker(mMutex); + if (! theNewLogName.empty() && + ::rename( + theFileName.c_str(), + theNewLogName.c_str()) == 0) { + ::close(theFd); + theFd = -1; + } + } + DeleteOldLogsFiles(theFileName, theMinModTimeToKeepSec, + mMutex); + mDeleteOldLogsFlag = false; + if (theNewLogSuffix == mNewLogSuffix) { + mNewLogSuffix.clear(); + } + if (theFd < 0) { + if (thePrevFd == mFd) { + mFd = -1; + mCurWritten = 0; + mNextOpenRetryTime = Now() - Seconds(10); + } + } else if (thePrevFd != mFd) { + QCStMutexUnlocker theUnlocker(mMutex); + ::close(theFd); + } + } + } + if (mFd < 0 && ! mFileName.empty() && mNextOpenRetryTime < Now()) { + OpenLogFile(); + } + if (mFd >= 0 && mWritePtr < mWriteEndPtr) { + bool theRetryWriteFlag = false; + const int theFd = mFd; + const char* thePtr = mWritePtr; + const char* const theEndPtr = mWriteEndPtr; + QCStMutexUnlocker theUnlocker(mMutex); + int theError = 0; + while (thePtr < theEndPtr) { + const ssize_t theNWr = ::write( + theFd, thePtr, theEndPtr - thePtr); + if (theNWr < 0) { + theError = errno; + if (theError == EINTR) { + continue; + } + if (theError == EAGAIN) { + struct pollfd thePoll = { 0 }; + thePoll.fd = theFd; + thePoll.events = POLLOUT; + const int theRet = poll(&thePoll, 1, 1000); + if (theRet > 0) { + continue; + } + theError = errno; + if (theError == EAGAIN || theError == EINTR) { + continue; + } + } + break; + } + if (theNWr == 0) { + theError = -1; + break; + } + thePtr += theNWr; + } + theUnlocker.Lock(); + if (thePtr < theEndPtr) { + mWriteErrCount++; + if (theFd == mFd && mLogFileNamePrefixes.size() > 1) { + mFd = -1; // Close it, and swith to a new one. 
+ theRetryWriteFlag = true; + } else { + mWriteBytesDiscardedCount += theEndPtr - thePtr; + } + } + mCurWritten += thePtr - mWritePtr; + if (theError != 0) { + mLastError = theError; + } + if (theFd != mFd) { + QCStMutexUnlocker theUnlocker(mMutex); + ::close(theFd); + } + if (theRetryWriteFlag) { + continue; + } + } else if (mWritePtr < mWriteEndPtr) { + mWriteErrCount++; + mWriteBytesDiscardedCount += mWriteEndPtr - mWritePtr; + } + mWritePtr = 0; + mWriteEndPtr = 0; + mWriteDoneCond.Notify(); + } + if (mFd >= 0) { + ::close(mFd); + mFd = -1; + } + } + ostream& GetStream( + LogLevel inLogLevel, + bool inDiscardFlag) + { + QCStMutexLocker theLocker(mMutex); + + MsgStream* theRetPtr = mMsgStreamHeadPtr; + if (theRetPtr) { + QCASSERT(mMsgStreamCount > 0); + mMsgStreamHeadPtr = theRetPtr->Next(); + theRetPtr->Clear(inLogLevel, inDiscardFlag); + mMsgStreamCount--; + } else { + QCASSERT(mMsgStreamCount == 0); + theRetPtr = new MsgStream(inLogLevel, inDiscardFlag); + } + return *theRetPtr; + } + void PutStream( + ostream& inStream) + { + QCStMutexLocker theLocker(mMutex); + + MsgStream& theStream = static_cast(inStream); + if (! theStream.IsDiscard()) { + AppendSelf(theStream.GetLogLevel(), + theStream.GetMsgPtr(), theStream.GetMsgLength()); + } + if (mMsgStreamCount < mMaxMsgStreamCount) { + theStream.tie(0); + theStream.Next() = mMsgStreamHeadPtr; + mMsgStreamHeadPtr = &theStream; + mMsgStreamCount++; + } else { + delete &theStream; + } + } +private: + typedef int64_t Time; + typedef uint64_t Count; + typedef vector LogFileNames; + class MsgStream : + private streambuf, + public ostream + { + public: + MsgStream( + LogLevel inLogLevel, + bool inDiscardFlag) + : streambuf(), + ostream(this), + mLogLevel(inLogLevel), + mNextPtr(0) + { setp(mBuffer, mBuffer + (inDiscardFlag ? 
0 : kMaxMsgSize)); } + virtual ~MsgStream() + {} + MsgStream*& Next() + { return mNextPtr; } + virtual streamsize xsputn( + const char* inBufPtr, + streamsize inLength) + { + char* const theEndPtr = epptr(); + char* const theCurPtr = pptr(); + const streamsize theSize(min(max(streamsize(0), inLength), + streamsize(theEndPtr - theCurPtr))); + memcpy(theCurPtr, inBufPtr, theSize); + pbump(theSize); + return theSize; + } + const char* GetMsgPtr() const + { + *pptr() = 0; + return mBuffer; + } + const int GetMsgLength() const + { return (pptr() - mBuffer); } + size_t GetLength() const + { return (epptr() - pptr()); } + LogLevel GetLogLevel() const + { return mLogLevel; } + void Clear( + LogLevel inLogLevel, + bool inDiscardFlag) + { + mLogLevel = inLogLevel; + mNextPtr = 0; + ostream::clear(); + ostream::flags(ostream::dec | ostream::skipws); + ostream::precision(6); + ostream::width(0); + ostream::fill(' '); + ostream::tie(0); + setp(mBuffer, mBuffer + (inDiscardFlag ? 0 : kMaxMsgSize)); + } + bool IsDiscard() const + { return (mBuffer == epptr()); } + private: + enum { kMaxMsgSize = 512 << 10 }; + LogLevel mLogLevel; + MsgStream* mNextPtr; + char mBuffer[kMaxMsgSize + 1]; + private: + MsgStream( + const MsgStream&); + MsgStream& operator=( + const MsgStream&); + }; + + QCMutex mMutex; + QCCondVar mWriteCond; + QCCondVar mWriteDoneCond; + QCThread mThread; + LogFileNames mLogFileNamePrefixes; + string mFileName; + string mTruncatedSuffix; + string mTimeStampFormat; + string mNewLogSuffix; + Count mDroppedCount; + Count mTotalDroppedCount; + Count mTruncatedCount; + Count mBufWaitersCount; + Count mWriteErrCount; + Count mWriteBytesDiscardedCount; + Count mCurWritten; + Count mBufWaitedCount; + int64_t mMaxLogFileSize; + int64_t mMaxLogFiles; + Time mNextOpenRetryTime; + Time mOpenRetryInterval; + Time mFlushInterval; + Time mNextFlushTime; + Time mTotalLogWaitedTime; + Time mCurLogWatedTime; + Time mMaxLogWaitTime; + int64_t mNextDeleteOldLogsTimeSec; + int64_t 
mMinModTimeToKeepSec; + int mOpenFlags; + int mOpenMode; + int mBufSize; + int mLastError; + int mFd; + const int mMaxAppendLength; + bool mRunFlag; + bool mDeleteOldLogsFlag; + char* const mBuf0Ptr; + char* const mBuf1Ptr; + char* mBufPtr; + char* mCurPtr; + char* mEndPtr; + const char* mWritePtr; + const char* mWriteEndPtr; + bool mCloseFlag; + bool mUseGMTFlag; + int64_t mTimeSec; + int64_t mLogTimeStampSec; + int64_t mMsgAppendCount; + struct tm mTimeTm; + struct tm mLastLogTm; + int mMaxMsgStreamCount; + int mMsgStreamCount; + MsgStream* mMsgStreamHeadPtr; + char mLogTimeStampPrefixStr[256]; + + static inline Time Seconds( + Time inSec) + { return (inSec * 1000000); } + static inline Time NanoSec( + Time inMicroSec) + { return (inMicroSec * 1000); } + bool IsFlushing() const + { return (mWritePtr != 0); } + bool FlushSelf() + { + QCASSERT(mMutex.IsOwned()); + if (mCurPtr <= mBufPtr) { + return true; + } + if (mWritePtr) { + return false; + } + mWritePtr = mBufPtr; + mWriteEndPtr = mCurPtr; + mBufPtr = mBufPtr == mBuf0Ptr ? mBuf1Ptr : mBuf0Ptr; + mCurPtr = mBufPtr; + mEndPtr = mBufPtr + mBufSize; + mWriteCond.Notify(); + return true; + } + void FlushIfNeeded( + int64_t inSec, + int64_t inMicroSec) + { + if (mCurPtr <= mBufPtr) { + return; + } + const Time theNowMicroSecs = Seconds(inSec) + inMicroSec; + if (mNextFlushTime > theNowMicroSecs) { + return; + } + mNextFlushTime = theNowMicroSecs + mFlushInterval; + FlushSelf(); + } + size_t GetMaxRecordSize() const + { return (mMaxAppendLength + mTruncatedSuffix.size() + 1); } + static void DeleteOldLogsFiles( + string inFileName, + int64_t inMinModTimeSec, + QCMutex& inMutex) + { + const size_t thePos = inFileName.rfind('/'); + const string theDirName(thePos != string::npos ? + inFileName.substr(0, thePos) : "."); + const string thePrefix(thePos != string::npos ? + (thePos < inFileName.length() ? 
+ inFileName.substr(thePos + 1, inFileName.length() - thePos - 1) : + "") : + inFileName); + if (theDirName == "/" && thePrefix.empty()) { + return; + } + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + struct dirent** theNamesPtr = 0; + const int theNameCount = + ::scandir(theDirName.c_str(), &theNamesPtr, 0, alphasort); + for (int i = 0; i < theNameCount; i++) { + const char* const theNamePtr = theNamesPtr[i]->d_name; + if (::strlen(theNamePtr) > thePrefix.length() && + ::memcmp(thePrefix.data(), theNamePtr, + thePrefix.length()) == 0) { + const string thePath = theDirName + "/" + theNamePtr; + struct stat theStat = {0}; + QCStMutexUnlocker theUnlocker(inMutex); + if (::stat(thePath.c_str(), &theStat) == 0 && + S_ISREG(theStat.st_mode) && + theStat.st_mtime + inMinModTimeSec < theSec) { + // std::cout << "deleting: " << thePath.c_str() << "\n"; + ::unlink(thePath.c_str()); + } + } + ::free(theNamesPtr[i]); + } + ::free(theNamesPtr); + } + static void RotateLogs( + string inFileName, + int inKeepCount, + QCMutex& inMutex) + { + for (int i = inKeepCount - 1; i >= 0; i--) { + const string theFrom(i == 0 ? inFileName : MakeName(inFileName, i)); + const string theTo(MakeName(inFileName, i + 1)); + const char* const theFromPtr = theFrom.c_str(); + const char* const theToPtr = theTo.c_str(); + QCStMutexUnlocker theUnlocker(inMutex); + ::rename(theFromPtr, theToPtr); + } + } + static string MakeName( + string inFileName, + int inIndex) + { + ostringstream theStream; + theStream << inFileName << "." 
<< inIndex; + return theStream.str(); + } + static void Now( + int64_t& outSec, + int64_t& outMicroSec) + { + struct timeval theTime = {0}; + if (::gettimeofday(&theTime, 0)) { + QCUtils::FatalError("gettimeofday", errno); + } + outSec = theTime.tv_sec; + outMicroSec = theTime.tv_usec; + } + static Time Now() + { + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + return (Seconds(theSec) + theMicroSec); + } + void OpenLogFile() + { + const LogFileNames theLogFileNames = mLogFileNamePrefixes; + const string theCurFileName = mFileName; + const int theOpenFlags = mOpenFlags; + const int theOpenMode = mOpenMode; + const int theMaxLogFiles = mMaxLogFiles; + const int64_t theMaxLogFileSize = mMaxLogFileSize; + string theFileName = mFileName; + int theError = -1; + int theFd = -1; + Count theSize = 0; + for (LogFileNames::const_iterator theIt = theLogFileNames.begin(); ;) { + const char* const theFileNamePtr = theFileName.c_str(); + { + QCStMutexUnlocker theUnlocker(mMutex); + while ((theFd = ::open( + theFileNamePtr, theOpenFlags, theOpenMode)) < 0) { + theError = errno; + if (theError != EINTR && theError != EAGAIN) { + break; + } + } + } + if (theFd >= 0) { + QCStMutexUnlocker theUnlocker(mMutex); + struct stat theStat = {0}; + if (::fstat(theFd, &theStat) == 0) { + theSize = (Count)max((off_t)0, (off_t)theStat.st_size); + } else { + theSize = 0; + } + if (mMaxLogFiles > 0 && theMaxLogFileSize > 0 && + theSize >= (Count)theMaxLogFileSize) { + ::close(theFd); + theUnlocker.Lock(); + RotateLogs(theFileName, theMaxLogFiles, mMutex); + continue; + } + ::fcntl(theFd, FD_CLOEXEC, 1); + break; + } + while (theIt != theLogFileNames.end() && + theCurFileName == *theIt) { + ++theIt; + } + if (theIt == theLogFileNames.end()) { + break; + } + theFileName = *theIt++; + } + if (mFd >= 0) { + if (theFd >= 0) { + QCStMutexUnlocker theUnlocker(mMutex); + ::close(theFd); + } + } else { + mFd = theFd; + mCurWritten = theSize; + mNextOpenRetryTime = mFd < 0 ? 
+ Now() + mOpenRetryInterval : Now() - Seconds(10); + if (mFd >= 0) { + mFileName = theFileName; + int64_t theSec = 0; + int64_t theMicroSec = 0; + Now(theSec, theMicroSec); + UpdateTimeTm(theSec); + mLastLogTm = mTimeTm; + } + } + } + size_t MsgPrefix( + int64_t inSec, + int64_t inMicroSec, + LogLevel inLogLevel) + { + if (mEndPtr <= mCurPtr) { + return 0; + } + if (mDroppedCount <= 0 && mCurLogWatedTime < Seconds(2)) { + return MsgPrefixSelf(inSec, inMicroSec, inLogLevel); + } + size_t theLen = MsgPrefixSelf(inSec, inMicroSec, kLogLevelINFO); + if (theLen <= 0) { + return 0; + } + char* thePtr = mCurPtr + theLen; + if (thePtr + 1 >= mEndPtr) { + return 0; + } + const size_t theMaxSz = mEndPtr - thePtr; + const int theNWr = ::snprintf(thePtr, theMaxSz, + "*** log records dropped: %.0f, %.0f total," + " wated: %g sec., %g sec. %.0f times total", + (double)mDroppedCount, + (double)mTotalDroppedCount, + (double)mCurLogWatedTime * 1e-6, + (double)mTotalLogWaitedTime * 1e-6, + (double)mBufWaitedCount + ); + if (theNWr < 0 || theNWr >= (int)theMaxSz) { + return 0; + } + thePtr += theNWr; + *thePtr++ = '\n'; + { + QCStValueChanger theChanger(mCurPtr, thePtr); + theLen = MsgPrefixSelf(inSec, inMicroSec, inLogLevel); + } + return (theLen <= 0 ? 0 : (thePtr - mCurPtr) + theLen); + } + size_t MsgPrefixSelf( + int64_t inSec, + int64_t inMicroSec, + LogLevel inLogLevel) + { + if (mEndPtr <= mCurPtr) { + return 0; + } + const size_t theMaxSz = mEndPtr - mCurPtr; + const int theLen = ::snprintf(mCurPtr, theMaxSz, + "%s.%03ld %s - ", + GetLogTimeStampPrefixPtr(inSec), + (long)(inMicroSec / 1000), + GetLogLevelNamePtr(inLogLevel) + ); + return ((theLen < 0 || theLen >= (int)theMaxSz) ? 0 : theLen); + } + void RunTimers( + int64_t inSec, + int64_t inMicroSec) + { + if (mFd < 0 || mMaxLogFiles > 0 || ! mNewLogSuffix.empty()) { + return; + } + UpdateTimeTm(inSec); + if (! 
mDeleteOldLogsFlag && mNextDeleteOldLogsTimeSec < inSec) { + mDeleteOldLogsFlag = true; + const int64_t kMinCleanupIntervalSec = 60 * 60 * 12; + mNextDeleteOldLogsTimeSec = inSec + min( + kMinCleanupIntervalSec, (60 + mMinModTimeToKeepSec / 3 * 2)); + } + if (mLastLogTm.tm_mday == mTimeTm.tm_mday && + mLastLogTm.tm_mon == mTimeTm.tm_mon && + mLastLogTm.tm_year == mTimeTm.tm_year) { + return; + } + char theBuf[64]; + const size_t theLen = ::strftime( + theBuf, sizeof(theBuf), ".%Y-%m-%d", &mLastLogTm); + if (theLen <= 0 || theLen >= sizeof(theBuf)) { + ostringstream theStream; + theStream << + "." << (1900 + mLastLogTm.tm_year) << + "-" << setfill('0') << setw(2) << (mLastLogTm.tm_mon + 1) << + "-" << setw(2) << mLastLogTm.tm_mday + ; + mNewLogSuffix = theStream.str(); + } else { + mNewLogSuffix.assign(theBuf, theLen); + } + mLastLogTm = mTimeTm; + } + void UpdateTimeTm( + int64_t inSec) + { + if (inSec != mTimeSec) { + const time_t theTime = (time_t)inSec; + if (mUseGMTFlag) { + ::gmtime_r(&theTime, &mTimeTm); + } else { + ::localtime_r(&theTime, &mTimeTm); + } + } + } + const char* GetLogTimeStampPrefixPtr( + int64_t inSec) + { + if (mLogTimeStampSec == inSec) { + return mLogTimeStampPrefixStr; + } + UpdateTimeTm(inSec); + const size_t theLen = mTimeStampFormat.empty() ? 
0 : + ::strftime( + mLogTimeStampPrefixStr, sizeof(mLogTimeStampPrefixStr), + mTimeStampFormat.c_str(), &mTimeTm + ); + if (theLen <= 0 || theLen >= sizeof(mLogTimeStampPrefixStr)) { + const int theLen = ::snprintf( + mLogTimeStampPrefixStr, sizeof(mLogTimeStampPrefixStr), + "%ld.", (long)inSec + ); + if (theLen <= 0 || theLen >= (int)sizeof(mLogTimeStampPrefixStr)) { + ::strncpy(mLogTimeStampPrefixStr, "X", + sizeof(mLogTimeStampPrefixStr)); + } + } + mLogTimeStampSec = inSec; + return mLogTimeStampPrefixStr; + } +private: + Impl( + const Impl& inImpl); + Impl& operator=( + const Impl& inImpl); +}; + +BufferedLogWriter::BufferedLogWriter( + int inFd, + const char* inFileNamePtr /* = 0 */, + int inBufSize /* = 1 << 20 */, + const char* inTrucatedSuffixPtr /* = 0 */, + int64_t inOpenRetryIntervalMicroSec /* = 5000000 */, + int64_t inFlushIntervalMicroSec /* = 1000000 */, + int64_t inMaxLogFileSize /* = -1 */, + int inMaxLogsFiles /* = -1 */, + BufferedLogWriter::LogLevel inLogLevel /* = DEBUG */, + int64_t inMaxLogWaitTimeMicroSec /* = -1 */, + const char* inTimeStampFormatPtr /* = 0 */, + bool inUseGMTFlag /* = false */) + : mLogLevel(inLogLevel), + mImpl(*(new Impl( + inFd, + inFileNamePtr, + inBufSize, + inTrucatedSuffixPtr, + inOpenRetryIntervalMicroSec, + inFlushIntervalMicroSec, + inMaxLogFileSize, + inMaxLogsFiles, + inMaxLogWaitTimeMicroSec, + inTimeStampFormatPtr, + inUseGMTFlag) + )) +{ + mImpl.Start(); +} + +BufferedLogWriter::~BufferedLogWriter() +{ + delete &mImpl; +} + +void +BufferedLogWriter::SetParameters( + const Properties& inProps, + const char* inPropsPrefixPtr /* = 0 */) +{ + const string thePropsPrefix = inPropsPrefixPtr ? 
inPropsPrefixPtr : ""; + mImpl.SetParameters(thePropsPrefix, inProps); + SetLogLevel(inProps.getValue(thePropsPrefix + "logLevel", + Impl::GetLogLevelNamePtr(mLogLevel) + )); +} + +bool +BufferedLogWriter::Reopen() +{ + return mImpl.Reopen(); +} + +void +BufferedLogWriter::Close() +{ + mImpl.Close(); +} + +void +BufferedLogWriter::Stop() +{ + mImpl.Stop(); +} + +int +BufferedLogWriter::Open( + const char* inFileNamePtr, + int inOpenMode, + int inOpenFlags, + bool inOpenHereFlag) +{ + return mImpl.Open(inFileNamePtr, inOpenMode, inOpenFlags, inOpenHereFlag); +} + +int +BufferedLogWriter::Open( + const char* inFileNamePtr) +{ + return mImpl.Open(inFileNamePtr, + kLogWriterDefaulOpenFlags, 0644, false); +} + +void +BufferedLogWriter::Flush() +{ + mImpl.Flush(); +} + +void +BufferedLogWriter::SetMaxLogWaitTime( + int64_t inTimeMicroSec) +{ + mImpl.SetMaxLogWaitTime(inTimeMicroSec); +} + +void +BufferedLogWriter::SetFlushInterval( + int64_t inMicroSecs) +{ + mImpl.SetFlushInterval(inMicroSecs); +} + +bool +BufferedLogWriter::AddLogFileNamePrefix( + const char* inFileNamePtr) +{ + return (inFileNamePtr && mImpl.AddLogFileNamePrefix(inFileNamePtr)); +} + +void +BufferedLogWriter::ClearLogFileNamePrefixes() +{ + mImpl.ClearLogFileNamePrefixes(); +} + +void +BufferedLogWriter::Append( + BufferedLogWriter::LogLevel inLogLevel, + const char* inFmtStrPtr, + va_list inArgs) +{ + if (mLogLevel < inLogLevel) { + return; + } + mImpl.Append(inLogLevel, inFmtStrPtr, inArgs); +} + +bool +BufferedLogWriter::SetLogLevel( + const char* inLogLevelNamePtr) +{ + if (! inLogLevelNamePtr || ! 
*inLogLevelNamePtr) { + return false; + } + struct { const char* mNamePtr; LogLevel mLevel; } const kLogLevels[] = { + { "EMERG", kLogLevelEMERG }, + { "FATAL", kLogLevelFATAL }, + { "ALERT", kLogLevelALERT }, + { "CRIT", kLogLevelCRIT }, + { "ERROR", kLogLevelERROR }, + { "WARN", kLogLevelWARN }, + { "NOTICE", kLogLevelNOTICE }, + { "INFO", kLogLevelINFO }, + { "DEBUG", kLogLevelDEBUG }, + { "NOTSET", kLogLevelNOTSET } + }; + const size_t kNumLogLevels = sizeof(kLogLevels) / sizeof(kLogLevels[0]); + for (size_t i = 0; i < kNumLogLevels; i++) { + if (::strcmp(kLogLevels[i].mNamePtr, inLogLevelNamePtr) == 0) { + mLogLevel = kLogLevels[i].mLevel; + return true; + } + } + return false; +} + +/* static */ const char* +BufferedLogWriter::GetLogLevelNamePtr( + BufferedLogWriter::LogLevel inLogLevel) +{ + return Impl::GetLogLevelNamePtr(inLogLevel); +} + +void +BufferedLogWriter::GetCounters( + BufferedLogWriter::Counters& outCounters) +{ + mImpl.GetCounters(outCounters); +} + +void +BufferedLogWriter::ChildAtFork() +{ + mImpl.ChildAtFork(); +} + +void +BufferedLogWriter::PutStream( + ostream& inStream) +{ + mImpl.PutStream(inStream); +} + +ostream& +BufferedLogWriter::GetStream( + LogLevel inLogLevel) +{ + return mImpl.GetStream(inLogLevel, mLogLevel < inLogLevel); +} + +void +BufferedLogWriter::Append( + LogLevel inLogLevel, + Writer& inWriter) +{ + return mImpl.Append(inLogLevel, inWriter); +} + +void +BufferedLogWriter::PrepareToFork() +{ + mImpl.PrepareToFork(); +} + +void +BufferedLogWriter::ForkDone() +{ + mImpl.ForkDone(); +} + +} diff --git a/src/cc/common/BufferedLogWriter.h b/src/cc/common/BufferedLogWriter.h new file mode 100644 index 000000000..582f80ce8 --- /dev/null +++ b/src/cc/common/BufferedLogWriter.h @@ -0,0 +1,200 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/03/02 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. 
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef BUFFEREDLOGWRITER_H +#define BUFFEREDLOGWRITER_H + +#include +#include +#include + +namespace KFS +{ +using std::ostream; +class Properties; + +// Double buffered message log writer. +// Message log writer can be configured to write into files with max file size, +// number of files, and roll over time interval, or it can be configured to +// write into file descriptor (stderr or socket for example). +// The max wait / block time on the message write can be configured as well. +// if the wait exceeds max time (0 -- never wait) then the corresponding +// message(s) are discarded, and the log record with the number of discarded +// log messages will appear in the next successful buffer write / flush. This is +// need to prevent blocking on message log write with "bad" disks in the cases +// where the disk becomes unavailable or just cannot keep up. Chunk and meta +// servers message log writes are configured with 0 write wait time by default. 
+class BufferedLogWriter +{ +public: + enum LogLevel + { + kLogLevelEMERG = 0, + kLogLevelFATAL = 0, + kLogLevelALERT = 100, + kLogLevelCRIT = 200, + kLogLevelERROR = 300, + kLogLevelWARN = 400, + kLogLevelNOTICE = 500, + kLogLevelINFO = 600, + kLogLevelDEBUG = 700, + kLogLevelNOTSET = 800 + }; + struct Counters + { + int64_t mAppendCount; + int64_t mDroppedCount; + int64_t mWriteErrorCount; + int64_t mAppendWaitCount; + int64_t mAppendWaitMicroSecs; + }; + BufferedLogWriter( + int inFd = -1, + const char* inFileNamePtr = 0, + int inBufSize = 1 << 20, + const char* inTrucatedSuffixPtr = 0, + int64_t inOpenRetryIntervalMicroSec = 5000000, + int64_t inFlushIntervalMicroSec = 1000000, + int64_t inMaxLogFileSize = -1, + int inMaxLogsFiles = -1, + LogLevel inLogLevel = kLogLevelDEBUG, + int64_t inMaxLogWaitTimeMicroSec = -1, + const char* inTimeStampFormatPtr = 0, // see strftime + bool inUseGMTFlag = false); // GMT vs local + ~BufferedLogWriter(); + void SetParameters( + const Properties& inProps, + const char* inPropsPrefixPtr = 0); + bool Reopen(); + void Close(); + void Stop(); + int Open( + const char* inFileNamePtr, + int inOpenMode, + int inOpenFlags, + bool inOpenHereFlag = false); + int Open( + const char* inFileNamePtr); + bool AddLogFileNamePrefix( + const char* inFileNamePtr); + void ClearLogFileNamePrefixes(); + void Flush(); + void SetMaxLogWaitTime( + int64_t inMaxLogWaitTimeMicroSec); + void SetFlushInterval( + int64_t inMicroSecs); + void SetLogLevel( + LogLevel inLogLevel) + { mLogLevel = inLogLevel; } + bool SetLogLevel( + const char* inLogLevelNamePtr); + LogLevel GetLogLevel() const + { return mLogLevel; } + static const char* GetLogLevelNamePtr( + LogLevel inLogLevel); + bool IsLogLevelEnabled( + LogLevel inLogLevel) const + { return (mLogLevel >= inLogLevel); } + void Append( + LogLevel inLogLevel, + const char* inFmtStrPtr, + ...) 
+ { + if (mLogLevel < inLogLevel) { + return; + } + va_list theArgs; + va_start(theArgs, inFmtStrPtr); + Append(inLogLevel, inFmtStrPtr, theArgs); + va_end(theArgs); + } + void Append( + LogLevel inLogLevel, + const char* inFmtStrPtr, + va_list inArgs); + void GetCounters( + Counters& outCounters); + ostream& GetStream(LogLevel inLogLevel); + void PutStream(ostream& inStreamPtr); + void ChildAtFork(); + + class StStream + { + public: + StStream( + BufferedLogWriter& inLogWriter, + LogLevel inLogLevel) + : mLogWriter(inLogWriter), + mStream(inLogWriter.GetStream(inLogLevel)) + {} + ~StStream() + { mLogWriter.PutStream(mStream); } + operator ostream& () + { return mStream; } + ostream& GetStream() + { return mStream; } + private: + BufferedLogWriter& mLogWriter; + ostream& mStream; + }; + + class Writer + { + public: + Writer() + {} + virtual ~Writer() + {} + virtual int GetMsgLength() = 0; + virtual int Write( + char* inBufPtr, + int inBufSize) = 0; + protected: + Writer( + const Writer&) + {} + Writer& operator=( + const Writer&) + { return *this; } + }; + void Append( + LogLevel inLogLevel, + Writer& inWriter); + void PrepareToFork(); + void ForkDone(); +private: + class Impl; + volatile LogLevel mLogLevel; + Impl& mImpl; + +private: + BufferedLogWriter( + const BufferedLogWriter& inWriter); + BufferedLogWriter& operator=( + const BufferedLogWriter& inWriter); +}; +} + +#endif /* BUFFEREDLOGWRITER_H */ diff --git a/src/cc/common/CMakeLists.txt b/src/cc/common/CMakeLists.txt new file mode 100644 index 000000000..24c886d87 --- /dev/null +++ b/src/cc/common/CMakeLists.txt @@ -0,0 +1,76 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# + +# Take all the .cc files and build a library out of them +set (lib_src + BufferedLogWriter.cc + hsieh_hash.cc + MsgLogger.cc + Properties.cc + time.cc + kfsatomic.cc + MemLock.cc + RequestParser.cc + rusage.cc +) + +# for the version file +set (VERSION_CC ${CMAKE_CURRENT_SOURCE_DIR}/Version.cc) +set_source_files_properties(${VERSION_CC} PROPERTIES GENERATED ON) + +# +# Build both static/dynamic libraries. Force the linking of all apps +# with a statically linked library. Since kfsCommon is the symbol +# used everywhere, associate that symbol with the STATIC library. +# + +add_custom_target ( + version ALL COMMAND sh ${CMAKE_CURRENT_SOURCE_DIR}/buildversgit.sh + ${CMAKE_BUILD_TYPE} + ${CMAKE_SOURCE_DIR} + ${VERSION_CC} + "boost: ${Boost_INCLUDE_DIRS} ${Boost_LIBRARIES}" + "source dir: ${CMAKE_SOURCE_DIR}" + VERBATIM +) + +add_library (kfsCommon-shared SHARED ${lib_src} ${VERSION_CC}) +add_library (kfsCommon STATIC ${lib_src} ${VERSION_CC}) +add_dependencies (kfsCommon-shared version) +add_dependencies (kfsCommon version) +set_target_properties (kfsCommon PROPERTIES OUTPUT_NAME "kfs_common") +set_target_properties (kfsCommon-shared PROPERTIES OUTPUT_NAME "kfs_common") +set_target_properties (kfsCommon PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties (kfsCommon-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +target_link_libraries (kfsCommon qcdio pthread) +target_link_libraries (kfsCommon-shared qcdio-shared pthread) + +# +install (TARGETS kfsCommon-shared kfsCommon + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) + +install (FILES 
kfstypes.h kfsdecls.h DESTINATION include/kfs/common) diff --git a/src/cc/common/DynamicArray.h b/src/cc/common/DynamicArray.h new file mode 100644 index 000000000..fbc33e963 --- /dev/null +++ b/src/cc/common/DynamicArray.h @@ -0,0 +1,284 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/17 +// Author: Mike Ovsainnikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Dynamic array implementation with no re-allocation / copy. Suitable for +// array with large dynamic size ranges. 
+// +//---------------------------------------------------------------------------- + +#ifndef DYNAMIC_ARRAY_H +#define DYNAMIC_ARRAY_H + +#include + +namespace KFS +{ + +template +class DynamicArray +{ +public: + typedef T value_type; + typedef std::size_t size_t; + + DynamicArray() + : mSize(0), + mBufferCount(0), + mLastBufferIdx(0) + { + for (size_t i = 0; i < MaxBufferCount(); i++) { + mBuffersPtr[i] = 0; + } + } + DynamicArray( + const DynamicArray& inArray) + : mSize(0), + mBufferCount(0), + mLastBufferIdx(0) + { (*this) = inArray; } + ~DynamicArray() + { DynamicArray::Clear(); } + DynamicArray& operator=( + const DynamicArray& inArray) + { + size_t theBufIdx = 0; + size_t theBufSize = FirstBufSize(); + size_t theCnt = inArray.mSize; + while (theCnt > 0) { + if (theBufIdx >= mBufferCount) { + mBuffersPtr[mBufferCount++] = new T[theBufSize]; + } + const T* theSrcPtr = inArray.mBuffersPtr[theBufIdx]; + T* theDstPtr = mBuffersPtr[theBufIdx]; + const T* theEndPtr = theSrcPtr; + if (theCnt > theBufSize) { + theEndPtr += theBufSize; + theCnt -= theBufSize; + theBufIdx++; + theBufSize += theBufSize; + } else { + theEndPtr += theCnt; + theCnt = 0; + } + while (theSrcPtr < theEndPtr) { + *theDstPtr++ = *theSrcPtr++; + } + } + mLastBufferIdx = theBufIdx; + DeleteBuffers(theBufIdx + 2); // Hysteresis: keep the last one. 
+ mSize = inArray.mSize; + return *this; + } + void Clear() + { + DeleteBuffers(0); + mSize = 0; + mLastBufferIdx = 0; + } + size_t GetSize() const + { return mSize; } + bool IsEmpty() const + { return (mSize <= 0); } + T& operator []( + size_t inIndex) const + { + size_t theIdx = inIndex; + size_t theBufSize = FirstBufSize(); + size_t theBufIdx = 0; + while (theIdx >= theBufSize) { + theIdx -= theBufSize; + theBufSize += theBufSize; + theBufIdx++; + } + return *(mBuffersPtr[theBufIdx] + theIdx); + } + void Swap( + DynamicArray& inArray) + { + for (size_t i = 0; i < MaxBufferCount(); i++) { + T* const thePtr = inArray.mBuffersPtr[i]; + inArray.mBuffersPtr[i] = mBuffersPtr[i]; + mBuffersPtr[i] = thePtr; + } + size_t const theSize = inArray.mSize; + inArray.mSize = mSize; + mSize = theSize; + size_t const theCnt = inArray.mBufferCount; + inArray.mBufferCount = mBufferCount; + mBufferCount = theCnt; + } + T& PushBack( + const T& inElem) + { + if (mLastBufferIdx >= mBufferCount || + (Capacity(mLastBufferIdx + 1) <= mSize && + ++mLastBufferIdx >= mBufferCount)) { + mBuffersPtr[mBufferCount] = new T[BufSize(mBufferCount)]; + mBufferCount++; + } + T& theRet = *(mBuffersPtr[mLastBufferIdx] + + (mSize - Capacity(mLastBufferIdx))); + theRet = inElem; + mSize++; + return theRet; + } + size_t PopBack() + { + if (mSize <= 0) { + return mSize; + } + mSize--; + if (mLastBufferIdx > 0 && Capacity(mLastBufferIdx) == mSize) { + // Hysteresis: keep the last buffer. 
+ DeleteBuffers(mLastBufferIdx); + mLastBufferIdx--; + } + return mSize; + } + T& Front() + { return *(mBuffersPtr[0]); } + const T& Front() const + { return *(mBuffersPtr[0]); } + T& Back() + { + return *(mBuffersPtr[mLastBufferIdx] + + (mSize - 1 - Capacity(mLastBufferIdx))); + } + const T& Back() const + { + return *(mBuffersPtr[mLastBufferIdx] + + (mSize - 1 - Capacity(mLastBufferIdx))); + } + void Resize( + size_t inSize) + { + if (inSize <= mSize) { + RemoveBack(mSize - inSize); + return; + } + size_t theBufSize = BufSize(mLastBufferIdx); + size_t theCapacity = Capacity(mLastBufferIdx) + + (mLastBufferIdx < mBufferCount ? theBufSize : size_t(0)); + while (theCapacity < inSize) { + if (++mLastBufferIdx >= mBufferCount) { + mBuffersPtr[mBufferCount++] = new T[theBufSize]; + } + theCapacity += theBufSize; + theBufSize += theBufSize; + } + mSize = inSize; + } + size_t RemoveBack( + size_t inCnt) + { + if (inCnt <= 0) { + return mSize; + } + size_t theBufIdx = 0; + if (inCnt >= mSize) { + mSize = 0; + } else { + mSize -= inCnt; + size_t theBufSize = FirstBufSize(); + size_t theIdx = mSize; + while (theBufSize <= theIdx) { + theIdx -= theBufSize; + theBufSize += theBufSize; + theBufIdx++; + } + } + mLastBufferIdx = theBufIdx; + DeleteBuffers(theBufIdx + 2); // Hysteresis: keep the last one. 
+ return mSize; + } + template + class IteratorT + { + public: + typedef DynamicArray DArray; + IteratorT( + const DArray& inArray) + : mIdx(0), + mBufIdx(0), + mBufSize(FirstBufSize()), + mBufPtr(inArray.mBuffersPtr), + mArray(inArray) + {} + ET* Next() + { + if (mIdx >= mArray.mSize) { + return 0; + } + if (mBufIdx >= mBufSize) { + mBufSize += mBufSize; + mBufIdx = 0; + mBufPtr++; + } + ++mIdx; + return (*mBufPtr + mBufIdx++); + } + bool HasNext() const + { return (mIdx < mArray.mSize); } + private: + typedef typename DArray::value_type BufsT; + size_t mIdx; + size_t mBufIdx; + size_t mBufSize; + BufsT* const* mBufPtr; + const DArray& mArray; + }; + friend class IteratorT; + friend class IteratorT; + typedef IteratorT Iterator; + typedef IteratorT ConstIterator; +private: + size_t mSize; + size_t mBufferCount; + size_t mLastBufferIdx; + T* mBuffersPtr[sizeof(size_t) * 8 - Log2FirstBufferSize]; + + static inline size_t BufSize( + size_t inIdx) + { return (size_t(1) << (Log2FirstBufferSize + inIdx)); } + static inline size_t FirstBufSize() + { return BufSize(0); } + static inline size_t Capacity( + size_t inBufCount) + { + return ((size_t(1) << (Log2FirstBufferSize + inBufCount)) - + (size_t(1) << Log2FirstBufferSize)); + } + inline size_t MaxBufferCount() + { return (sizeof(mBuffersPtr) / sizeof(mBuffersPtr[0])); } + void DeleteBuffers( + size_t inCnt) + { + while (mBufferCount > inCnt) { + delete [] mBuffersPtr[--mBufferCount]; + mBuffersPtr[mBufferCount] = 0; + } + } +}; + +} + +#endif /* DYNAMIC_ARRAY_H */ diff --git a/src/cc/common/FdWriter.h b/src/cc/common/FdWriter.h new file mode 100644 index 000000000..f4ee6823f --- /dev/null +++ b/src/cc/common/FdWriter.h @@ -0,0 +1,76 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/05/01 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). 
+// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file FdWriter.h +// \brief Basic file descriptor writer which can be used with MdStreamT. +// +//---------------------------------------------------------------------------- + +#ifndef FD_WRITER_H +#define FD_WRITER_H + +#include +#include + +namespace KFS +{ + +class FdWriter +{ +public: + FdWriter( + int inFd) + : mFd(inFd), + mError(0) + {} + ~FdWriter() + {} + void flush() + {} + bool write( + const void* inBufPtr, + size_t inLength) + { + const char* thePtr = static_cast(inBufPtr); + const char* const theEndPtr = thePtr + inLength; + while (thePtr < theEndPtr) { + const ssize_t theNWr = ::write(mFd, thePtr, theEndPtr - thePtr); + if (theNWr < 0 && errno != EINTR) { + mError = errno; + return false; + } + thePtr += theNWr; + } + return true; + } + void ClearError() + { mError = 0; } + int GetError() const + { return mError; } +private: + const int mFd; + int mError; +}; + +} + +#endif /* FD_WRITER_H */ diff --git a/src/cc/common/KfsTraceNew.cc b/src/cc/common/KfsTraceNew.cc new file mode 100644 index 000000000..d58e3191c --- /dev/null +++ b/src/cc/common/KfsTraceNew.cc @@ -0,0 +1,205 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/11/02 +// Author: Mike Ovsiannikov +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). 
+// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Compile and link against this to replace global operator new and delete, +// and trace / debug memory allocation. +// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef KFS_OS_NAME_LINUX +#include +#include +#else +struct mallinfo {}; +#endif + +struct KfsTraceNew +{ + KfsTraceNew() + : mParamsSetFlag(false), + mFailedSize(0), + mMapsSize(0), + mTraceFd(-1), + mMaxNewSize(0), + mContinueOnMallocFailureFlag(false), + mParametersPtr(getenv("KFS_TRACE_NEW_PARAMS")), + mStackTraceDepth(0), + mRlimAs(), + mRlimData(), + mRlimCore(), + mMallinfo() + { + // Format: fd,maxsize,abort + // Example: KFS_TRACE_NEW_PARAMS=2,1e3,1 + + mMapsBuf[0] = 0; + if (! mParametersPtr || ! *mParametersPtr) { + return; + } + char* theEndPtr = 0; + mTraceFd = (int)strtol(mParametersPtr, &theEndPtr, 0); + if ((*theEndPtr & 0xFF) == ',') { + const char* thePtr = theEndPtr + 1; + theEndPtr = 0; + mMaxNewSize = (size_t)strtod(thePtr, &theEndPtr); + } + if ((*theEndPtr & 0xFF) == ',') { + const char* thePtr = theEndPtr + 1; + theEndPtr = 0; + mContinueOnMallocFailureFlag = strtol(thePtr, &theEndPtr, 0) == 0; + } + mParamsSetFlag = true; + } + void Trace( + const char* inMsgPtr, + size_t inSize, + const void* inPtr) + { + if (mTraceFd < 0 || ! 
mParamsSetFlag) { + return; + } + char theBuf[64]; + const int theLen = snprintf(theBuf, sizeof(theBuf), + "%s %lu %p\n", inMsgPtr, (unsigned long)inSize, inPtr); + if (theLen <= 0) { + return; + } + write(mTraceFd, theBuf, theLen); + } + void MallocFailed( + size_t inSize); + void* Allocate( + size_t inSize, + bool inRaiseExceptionOnFailureFlag) + { + void* const thePtr = (mMaxNewSize <= 0 || inSize <= mMaxNewSize) ? + malloc(inSize) : 0; + Trace(inRaiseExceptionOnFailureFlag ? "new" : "new_nt", inSize, thePtr); + if (! thePtr) { + MallocFailed(inSize); + if (inRaiseExceptionOnFailureFlag) { + throw std::bad_alloc(); + } + } + return thePtr; + } + void Free( + const char* inMsgPtr, + void* inPtr) + { + Trace(inMsgPtr, -1, inPtr); + free(inPtr); + } + + enum { kMaxStackTraceDepth = 64 }; + + bool mParamsSetFlag; + size_t mFailedSize; + int mMapsSize; + int mTraceFd; + size_t mMaxNewSize; + bool mContinueOnMallocFailureFlag; + const char* const mParametersPtr; + int mStackTraceDepth; + struct rlimit mRlimAs; + struct rlimit mRlimData; + struct rlimit mRlimCore; + struct mallinfo mMallinfo; + char mMapsBuf[16 << 10]; + void* mStackTrace[kMaxStackTraceDepth]; +} gKfsTraceNew; + +void +KfsTraceNew::MallocFailed( + size_t inSize) +{ + getrlimit(RLIMIT_AS, &mRlimAs); + getrlimit(RLIMIT_DATA, &mRlimData); + getrlimit(RLIMIT_CORE, &mRlimCore); +#ifdef KFS_OS_NAME_LINUX + struct mallinfo info = mallinfo(); + mFailedSize = inSize; + mMallinfo = info; + const int theFd = open("/proc/self/maps", O_RDONLY); + int theMapsSize = 0; + if (theFd >= 0) { + theMapsSize = (int)read(theFd, mMapsBuf, sizeof(mMapsBuf)); + mMapsSize = theMapsSize; + close(theFd); + } + void* theStackTrace[kMaxStackTraceDepth]; + const int theDepth = backtrace(theStackTrace, kMaxStackTraceDepth); + if (theDepth > 0) { + char theBuf[64]; + const int theLen = snprintf(theBuf, sizeof(theBuf), + "malloc(%lu) failure:\n", (unsigned long)inSize); + if (theLen > 0) { + write(2, theBuf, theLen); + } + 
backtrace_symbols_fd(theStackTrace, theDepth, 2); + memcpy(mStackTrace, theStackTrace, theDepth * sizeof(mStackTrace[0])); + if (theMapsSize > 0) { + write(2, mMapsBuf, theMapsSize); + } + mStackTraceDepth = theDepth; + } +#endif + if (mContinueOnMallocFailureFlag) { + return; + } + abort(); +} + +void* +operator new(std::size_t inSize) throw (std::bad_alloc) +{ + return gKfsTraceNew.Allocate(inSize, true); +} + +void +operator delete(void* inPtr) throw() +{ + gKfsTraceNew.Free("delete", inPtr); +} + +void* +operator new(std::size_t inSize, const std::nothrow_t&) throw() +{ + return gKfsTraceNew.Allocate(inSize, false); +} + +void +operator delete(void* inPtr, const std::nothrow_t&) throw() +{ + gKfsTraceNew.Free("delete_nt", inPtr); +} diff --git a/src/cc/common/LinearHash.h b/src/cc/common/LinearHash.h new file mode 100644 index 000000000..3ff2e2a5c --- /dev/null +++ b/src/cc/common/LinearHash.h @@ -0,0 +1,453 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/18 +// Author: Mike Ovsainnikov +// +// Copyright 2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// [Sorted] linear hash table implementation. The insertion and removal cost is +// constant (assuming "good" hash -- low number of collisions) as adding or +// removing item always re-hashes single bucket. The overhead per item is 2 +// pointers. 
Lookup cost (with "good" hash) is ~2 dram cache misses. Suitable +// for implementing map/set with large dynamic size ranges. +// +//---------------------------------------------------------------------------- + +#ifndef LINEAR_HASH_H +#define LINEAR_HASH_H + +#include "DynamicArray.h" +#include "SingleLinkedList.h" + +#include +#include +#include + +namespace KFS +{ + +template +struct KeyCompare +{ + typedef std::size_t size_t; + + static bool Equals( + const T& inLhs, + const T& inRhs) + { return inLhs == inRhs; } + static bool Less( + const T& inLhs, + const T& inRhs) + { return inLhs < inRhs; } + static size_t Hash( + const T& inVal) + { return size_t(inVal); } +}; + +template +class KVPair +{ +public: + typedef KeyT Key; + typedef ValT Val; + + KVPair( + const Key& inKey, + const Val& inVal) + : mKey(inKey), + mVal(inVal) + {} + Key& GetKey() { return mKey; } + const Key& GetKey() const { return mKey; } + Val& GetVal() { return mVal; } + const Val& GetVal() const { return mVal; } +private: + Key mKey; + Val mVal; +}; + +template +class KeyOnly +{ +public: + typedef KeyT Key; + typedef KeyT Val; + + KeyOnly( + const Key& inKey, + const Val& /* inVal */) + : mKey(inKey) + {} + Key& GetKey() { return mKey; } + const Key& GetKey() const { return mKey; } + Val& GetVal() { return mKey; } + const Val& GetVal() const { return mKey; } +private: + Key mKey; +}; + +template +class DeleteObserver +{ +public: + void operator()(T&) {} +}; + +template< + typename KVPairT, + typename KeyIdT = KeyCompare, + typename DArrayT = DynamicArray*>, + typename AllocT = std::allocator, + typename DeleteObserverT = DeleteObserver +> +class LinearHash +{ +public: + typedef typename KVPairT::Key Key; + typedef typename KVPairT::Val Val; + typedef SingleLinkedList Entry; + typedef typename AllocT::template rebind::other Allocator; + typedef std::size_t size_t; + + LinearHash() + : mSplitIdx(0), + mMaxSplitIdx(1), + mLastBucketIdx(0), + mLastEntryPtr(0), + mBuckets(), + mKeyId(), + 
mAlloc(), + mDelObserverPtr(0) + {} + LinearHash( + const LinearHash& inHash) + : mSplitIdx(0), + mMaxSplitIdx(1), + mLastBucketIdx(0), + mLastEntryPtr(0), + mBuckets(), + mKeyId(), + mAlloc(), + mDelObserverPtr(0) + { *this = inHash; } + ~LinearHash() + { LinearHash::Clear(); } + LinearHash& operator=( + const LinearHash& inHash) + { + if (this == &inHash) { + return *this; + } + Clear(); + for (size_t i = 0; i < inHash.GetSize(); i++) { + const Entry* thePtr = inHash.mBuckets[i]; + while (thePtr) { + bool theInsertedFlag; + Insert(thePtr->GetData().GetKey(), thePtr->GetData().GetVal(), + theInsertedFlag); + thePtr = thePtr->GetNextPtr(); + } + } + } + void SetDeleteObserver( + DeleteObserverT* inObserverPtr) + { mDelObserverPtr = inObserverPtr; } + const Allocator& GetAllocator() const + { return mAlloc; } + size_t GetSize() const + { return mBuckets.GetSize(); } + size_t IsEmpty() const + { return mBuckets.IsEmpty(); } + void Clear() + { + mLastBucketIdx = 0; + mLastEntryPtr = 0; + while (! mBuckets.IsEmpty()) { + Entry*& theBackPtr = mBuckets.Back(); + Entry* thePtr = theBackPtr; + theBackPtr = 0; + while (thePtr) { + Entry* const theNextPtr = thePtr->GetNextPtr(); + Delete(*thePtr); + thePtr = theNextPtr; + } + mBuckets.PopBack(); + } + mSplitIdx = 0; + mMaxSplitIdx = 1; + } + Val* Find( + const Key& inKey) const + { + if (IsEmpty()) { + return 0; + } + Entry* thePtr = GetBucket(inKey); + while (thePtr) { + if (mKeyId.Equals(inKey, thePtr->GetData().GetKey())) { + return &(thePtr->GetData().GetVal()); + } + thePtr = thePtr->GetNextPtr(); + } + return 0; + } + Val* Insert( + const Key& inKey, + const Val& inVal, + bool& outInsertedFlag) + { + if (IsEmpty()) { + Entry& theEntry = New(Entry(KVPairT(inKey, inVal), 0)); + mBuckets.PushBack(&theEntry); + outInsertedFlag = true; + return &(theEntry.GetData().GetVal()); + } + Entry*& theBucketPtr = GetBucket(inKey); + Entry* thePtr = theBucketPtr; + if (! 
thePtr) { + Entry& theEntry = New(Entry(KVPairT(inKey, inVal), thePtr)); + theBucketPtr = &theEntry; + Split(); + outInsertedFlag = true; + return &(theEntry.GetData().GetVal()); + } + while (! mKeyId.Equals(inKey, thePtr->GetData().GetKey())) { + Entry* const thePrevPtr = thePtr; + thePtr = thePtr->GetNextPtr(); + if (! thePtr || mKeyId.Less(inKey, thePtr->GetData().GetKey())) { + Entry& theEntry = New(Entry(KVPairT(inKey, inVal), thePtr)); + thePrevPtr->GetNextPtr() = &theEntry; + Split(); + outInsertedFlag = true; + return &(theEntry.GetData().GetVal()); + } + } + outInsertedFlag = false; + return &(thePtr->GetData().GetVal()); + } + size_t Erase( + const Key& inKey) + { + if (IsEmpty()) { + return 0; + } + Entry*& theBucketPtr = GetBucket(inKey); + Entry* thePtr = theBucketPtr; + if (! thePtr) { + return 0; + } + if (mKeyId.Equals(inKey, thePtr->GetData().GetKey())) { + theBucketPtr = thePtr->GetNextPtr(); + Delete(*thePtr); + Merge(); + mBuckets.PopBack(); + return 1; + } + // With good hash function the lists should be short enough. + // Early termination using Less() wouldn't get much with short + // lists. + for (; ;) { + Entry* const thePrevPtr = thePtr; + if (! 
(thePtr = thePtr->GetNextPtr())) { + break; + } + if (mKeyId.Equals(inKey, thePtr->GetData().GetKey())) { + thePrevPtr->GetNextPtr() = thePtr->GetNextPtr(); + Delete(*thePtr); + Merge(); + mBuckets.PopBack(); + return 1; + } + } + return 0; + } + void First() + { + mLastEntryPtr = 0; + mLastBucketIdx = 0; + } + const KVPairT* Next() + { + if (mLastEntryPtr) { + mLastEntryPtr = mLastEntryPtr->GetNextPtr(); + if (mLastEntryPtr) { + return &(mLastEntryPtr->GetData()); + } + mLastBucketIdx++; + } + while (mLastBucketIdx < GetSize()) { + if ((mLastEntryPtr = mBuckets[mLastBucketIdx])) { + return &(mLastEntryPtr->GetData()); + } + mLastBucketIdx++; + } + return 0; + } + void Swap(LinearHash& inHash) + { + if (this == &inHash) { + return; + } + mBuckets.Swap(inHash.mBuckets); + std::swap(mSplitIdx, inHash.mSplitIdx); + std::swap(mMaxSplitIdx, inHash.mMaxSplitIdx); + std::swap(mLastBucketIdx, inHash.mLastBucketIdx); + std::swap(mLastEntryPtr, inHash.mLastEntryPtr); + std::swap(mKeyId, inHash.mKeyId); + std::swap(mAlloc, inHash.mAlloc); + std::swap(mDelObserverPtr, inHash.mDelObserverPtr); + } + +private: + size_t mSplitIdx; + size_t mMaxSplitIdx; // Split upper bound during this expansion. + size_t mLastBucketIdx; // Cursor. + Entry* mLastEntryPtr; // Cursor. + DArrayT mBuckets; // Hash table buckets. 
+ KeyIdT mKeyId; + Allocator mAlloc; + DeleteObserverT* mDelObserverPtr; + + Entry& New( + const Entry& inEntry) + { + Entry& theEntry = *mAlloc.allocate(1); + mAlloc.construct(&theEntry, inEntry); + return theEntry; + } + void Delete( + Entry& inEntry) + { + if (&inEntry == mLastEntryPtr) { + Next(); + } + if (mDelObserverPtr) { + (*mDelObserverPtr)(inEntry.GetData()); + } + mAlloc.destroy(&inEntry); + mAlloc.deallocate(&inEntry, 1); + } + static size_t BucketIdx( + size_t inMaxSplitIdx, + size_t inSplitIdx, + size_t inHash) + { + // maxSplit always power of 2, thus: + // hash % maxSplit == hash & (maxSplit - 1) + const size_t theIdx = inHash & (inMaxSplitIdx - 1); + return (theIdx < inSplitIdx ? + (inHash & (inMaxSplitIdx + inMaxSplitIdx - 1)) : + theIdx); + } + Entry*& GetBucket( + const Key& inKey) const + { + return mBuckets[BucketIdx(mMaxSplitIdx, mSplitIdx, mKeyId.Hash(inKey))]; + } + void Split() + { + if (IsEmpty()) { + return; // Nothing to split. + } + Entry*& theBucketPtr = mBuckets.PushBack(0); + const size_t thePrevIdx = mSplitIdx; + if (++mSplitIdx >= mMaxSplitIdx) { + // Start new expansion round. + mSplitIdx = 0; + mMaxSplitIdx += mMaxSplitIdx; + } + // Split into prev and new buckets, preserving the order. + Entry* theTailPtr = 0; + Entry*& thePrevBucketPtr = mBuckets[thePrevIdx]; + Entry* thePtr = thePrevBucketPtr; + Entry* thePrevPtr = 0; + while (thePtr) { + Entry* const theNextPtr = thePtr->GetNextPtr(); + if (BucketIdx(mMaxSplitIdx, mSplitIdx, + mKeyId.Hash(thePtr->GetData().GetKey())) == thePrevIdx) { + thePrevPtr = thePtr; + } else { + // Move the entry into new bucket. + if (thePrevPtr) { + thePrevPtr->GetNextPtr() = theNextPtr; + } else { + thePrevBucketPtr = theNextPtr; + } + if (theTailPtr) { + theTailPtr->GetNextPtr() = thePtr; + } else { + theBucketPtr = thePtr; + } + theTailPtr = thePtr; + thePtr->GetNextPtr() = 0; + } + thePtr = theNextPtr; + } + } + void Merge() + { + if (mSplitIdx == 0) { + // Start new collapse round. 
+ // The size here is +1: the bucket removal happens after Merge(). + if (GetSize() <= 1) { + return; + } + mMaxSplitIdx >>= 1; + mSplitIdx = mMaxSplitIdx - 1; + } else { + --mSplitIdx; + } + Entry*& thePrevBucketPtr = mBuckets.Back(); + Entry* thePtr = thePrevBucketPtr; + if (! thePtr) { + return; + } + thePrevBucketPtr = 0; + Entry*& theBucketPtr = mBuckets[mSplitIdx]; + Entry* theInsertPtr = theBucketPtr; + if (! theInsertPtr) { + theBucketPtr = thePtr; + return; + } + Entry* theInsertPrevPtr = 0; + while (thePtr) { + Entry* const theNextPtr = thePtr->GetNextPtr(); + while (theInsertPtr && mKeyId.Less(theInsertPtr->GetData().GetKey(), + thePtr->GetData().GetKey())) { + theInsertPrevPtr = theInsertPtr; + theInsertPtr = theInsertPtr->GetNextPtr(); + } + if (! theInsertPtr) { + theInsertPrevPtr->GetNextPtr() = thePtr; + return; + } + thePtr->GetNextPtr() = theInsertPtr; + if (theInsertPrevPtr) { + theInsertPrevPtr->GetNextPtr() = thePtr; + } else { + theBucketPtr = thePtr; + } + theInsertPrevPtr = thePtr; + thePtr = theNextPtr; + } + } +}; + +} + +#endif /* LINEAR_HASH_H */ diff --git a/src/cc/common/MdStream.h b/src/cc/common/MdStream.h new file mode 100644 index 000000000..fe2647737 --- /dev/null +++ b/src/cc/common/MdStream.h @@ -0,0 +1,256 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/09/16 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \file MdStream.h +// \brief Message digest stream object. +// +//---------------------------------------------------------------------------- + +#ifndef MD_STREAM_H +#define MD_STREAM_H + +#include +#include +#include + +#include "openssl/evp.h" + +namespace KFS +{ +using std::string; +using std::ostream; +using std::streambuf; +using std::streamsize; +using std::max; + +template +class MdStreamT : + private streambuf, + public ostream +{ +public: + static void Init() + { + OpenSSL_add_all_digests(); + } + static void Cleanup() + { + EVP_cleanup(); + } + MdStreamT( + OStreamT* inStreamPtr = 0, + bool inSyncFlag = true, + const string& inDigestName = string(), + size_t inBufSize = (1 << 20)) + : streambuf(), + ostream(this), + mDigestName(inDigestName), + mBufferPtr(new char[max(size_t(1), inBufSize)]), + mCurPtr(mBufferPtr), + mEndPtr(mCurPtr + max(size_t(1), inBufSize)), + mSyncFlag(inSyncFlag), + mWriteTroughFlag(false), + mStreamPtr(inStreamPtr) + { + EVP_MD_CTX_init(&mCtx); + MdStreamT::InitMd(); + } + virtual ~MdStreamT() + { + EVP_MD_CTX_cleanup(&mCtx); + delete [] mBufferPtr; + } + string GetMd() + { + flush(); + SyncSelf(); + + string theRet; + if (fail()) { + return theRet; + } + + EVP_MD_CTX theCtx; + EVP_MD_CTX_init(&theCtx); + if (! EVP_MD_CTX_copy_ex(&theCtx, &mCtx)) { + setstate(failbit); + return theRet; + } + unsigned char theMd[EVP_MAX_MD_SIZE]; + unsigned int theLen = 0; + if (! 
EVP_DigestFinal_ex(&theCtx, theMd, &theLen)) { + setstate(failbit); + } + EVP_MD_CTX_cleanup(&theCtx); + if (fail()) { + return theRet; + } + + theRet.resize(2 * theLen); + string::iterator theIt = theRet.begin(); + const char* const kHexDigits = "0123456789abcdef"; + for (unsigned int i = 0; i < theLen; i++) { + const unsigned int theDigit = theMd[i] & 0xFF; + *theIt++ = kHexDigits[(theDigit >> 4) & 0xF]; + *theIt++ = kHexDigits[theDigit & 0xF]; + } + return theRet; + } + void SetSync( + bool inFlag) + { mSyncFlag = inFlag; } + bool IsSync() const + { return mSyncFlag; } + void SetStream( + OStreamT* inStreamPtr) + { + flush(); + SyncSelf(); + mStreamPtr = inStreamPtr; + } + void Reset( + OStreamT* inStreamPtr = 0) + { + flush(); + SyncSelf(); + unsigned char theMd[EVP_MAX_MD_SIZE]; + EVP_DigestFinal_ex(&mCtx, theMd, 0); + clear(); + InitMd(); + mStreamPtr = inStreamPtr; + } + void SetWriteTrough( + bool inWriteTroughFlag) + { mWriteTroughFlag = inWriteTroughFlag; } + +protected: + virtual int overflow( + int inSym = EOF) + { + if (inSym == EOF) { + return 0; + } + if (mCurPtr < mEndPtr) { + *mCurPtr++ = inSym; + return inSym; + } + SyncSelf(); + return inSym; + } + virtual streamsize xsputn( + const char* inBufferPtr, + streamsize inSize) + { + if (inSize <= 0) { + return inSize; + } + if (! mWriteTroughFlag && + mBufferPtr + inSize * 3 / 2 < mEndPtr) { + streamsize theSize = 0; + streamsize theRem = inSize; + if (mCurPtr < mEndPtr) { + if (mCurPtr + inSize > mEndPtr) { + theSize = (streamsize)(mEndPtr - mCurPtr); + } else { + theSize = inSize; + } + memcpy(mCurPtr, inBufferPtr, theSize); + mCurPtr += theSize; + theRem -= theSize; + if (theRem <= 0) { + return inSize; + } + } + SyncSelf(); + memcpy(mCurPtr, inBufferPtr + theSize, theRem); + mCurPtr += theRem; + return inSize; + } + if (SyncSelf() == 0 && + ! fail() && ! 
EVP_DigestUpdate( + &mCtx, inBufferPtr, inSize)) { + setstate(failbit); + } + if (mStreamPtr) { + mStreamPtr->write(inBufferPtr, inSize); + } + return inSize; + } + virtual int sync() + { + if (! mSyncFlag) { + return 0; + } + const int theRet = SyncSelf(); + if (mStreamPtr) { + mStreamPtr->flush(); + } + return theRet; + } + int SyncSelf() + { + int theRet = 0; + if (mCurPtr <= mBufferPtr) { + return theRet; + } + const size_t theSize = mCurPtr - mBufferPtr; + if (! fail() && ! EVP_DigestUpdate( + &mCtx, mBufferPtr, theSize)) { + setstate(failbit); + theRet = -1; + } + if (mStreamPtr) { + if (! mStreamPtr->write(mBufferPtr, theSize)) { + setstate(failbit); + } + } + mCurPtr = mBufferPtr; + return theRet; + } + void InitMd() + { + const EVP_MD* const theMdPtr = mDigestName.empty() ? + EVP_md5() : + EVP_get_digestbyname(mDigestName.c_str()); + if (! theMdPtr || ! EVP_DigestInit_ex(&mCtx, theMdPtr, 0)) { + setstate(failbit); + } + } + +private: + const string mDigestName; + char* const mBufferPtr; + char* mCurPtr; + char* const mEndPtr; + bool mSyncFlag; + bool mWriteTroughFlag; + OStreamT* mStreamPtr; + EVP_MD_CTX mCtx; + + MdStreamT(const MdStreamT& inStream); + MdStreamT& operator=( const MdStreamT& inStream); +}; + +typedef MdStreamT MdStream; + +} // namespace KFS +#endif /* MD_STREAM_H */ diff --git a/src/cc/common/MemLock.cc b/src/cc/common/MemLock.cc new file mode 100644 index 000000000..2d5b93157 --- /dev/null +++ b/src/cc/common/MemLock.cc @@ -0,0 +1,202 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/10/11 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file MemLock.cc +// \brief Process common memory locking code. +// +//---------------------------------------------------------------------------- + +#include "MemLock.h" +#include "StdAllocator.h" + +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include + +#include + +#ifdef KFS_OS_NAME_LINUX +# include +#endif + +namespace KFS +{ +using std::string; +using std::min; +using std::max; + +class MallocSetup +{ +public: + MallocSetup( + int64_t inSize, + int64_t inMaxStlPoolSize, + const char** inErrMsgPtr = 0) + : mAllocPtr(0), + mStdPtr(0), + mAllocator() + { +#ifdef KFS_OS_NAME_LINUX + if (mallopt(M_TRIM_THRESHOLD, -1)) { + if (inErrMsgPtr) { + *inErrMsgPtr = "mallopt trim_threshold:"; + } + return; + } +#endif + if (inMaxStlPoolSize > 0) { + const int64_t theAllocCnt = min( + int64_t(10) << 20, + inMaxStlPoolSize / kStdMaxAllocSize + ) + 1; + mStdPtr = new StdAlloc::pointer[theAllocCnt]; + int64_t i; + for (i = 0; i < theAllocCnt - 1; i++) { + if (! (mStdPtr[i] = mAllocator.allocate(kStdMaxAllocSize))) { + return; + } + } + mStdPtr[i] = 0; + } + // Force sbrk / heap allocation. + mAllocPtr = new ForcePageAlloc[ + max(int64_t(0), inSize / ForcePageAlloc::kBufSize) + ]; + } + ~MallocSetup() + { + if (mStdPtr) { + StdAlloc::pointer* thePtr = mStdPtr; + while (*thePtr) { + mAllocator.deallocate(*thePtr, kStdMaxAllocSize); + } + delete [] mStdPtr; + } + delete [] mAllocPtr; + + } + bool IsError() + { return (! 
mAllocPtr); } +private: + enum { kStdMaxAllocSize = 256 }; + struct ForcePageAlloc + { + enum { kPageSize = 4 << 10 }; + enum { kMallocMmapThreshold = 64 << 10 }; // Default is 128K + enum { kBufSize = kMallocMmapThreshold - kPageSize }; + ForcePageAlloc() + : mBufPtr(new char[kBufSize]) + { + char* thePtr = mBufPtr; + char* const theEndPtr = thePtr + kBufSize; + while (thePtr < theEndPtr) { + *thePtr = 0xFF; + thePtr += kPageSize; + } + } + ~ForcePageAlloc() + { delete [] mBufPtr; } + char* const mBufPtr; + }; + typedef StdAllocator StdAlloc; + + ForcePageAlloc* mAllocPtr; + StdAlloc::pointer* mStdPtr; + StdAlloc mAllocator; + + MallocSetup( + const MallocSetup&); + MallocSetup& operator=( + const MallocSetup&); +}; + +static void +AllocThreadStack() +{ + char theBuf[256 << 10]; + memset(theBuf, 0xFF, sizeof(theBuf)); +} + +int +LockProcessMemory( + int64_t inMaxLockedMemorySize, + int64_t inMaxHeapSize, + int64_t inMaxStlPoolSize, + string* outErrMsgPtr) +{ + if (inMaxLockedMemorySize == 0) { + return 0; + } + int theRet = 0; +#ifndef KFS_OS_NAME_CYGWIN + const char* theMsgPtr = 0; + if (inMaxLockedMemorySize == 0 && munlockall()) { + theRet = errno; + theMsgPtr = "munlockall:"; + } else { + const int64_t kPgAlign = 8 <<10; + struct rlimit theLim = {0}; + if (getrlimit(RLIMIT_MEMLOCK, &theLim)) { + theRet = errno; + theMsgPtr = "getrlimit memlock:"; + } else { + theLim.rlim_cur = (rlim_t)((inMaxLockedMemorySize + kPgAlign - 1) / + kPgAlign * kPgAlign); + if (theLim.rlim_max < theLim.rlim_cur) { + // An attempt to raise it should succeed with enough privileges. + theLim.rlim_max = theLim.rlim_cur; + } + if (setrlimit(RLIMIT_MEMLOCK, &theLim)) { + theRet = errno; + theMsgPtr = "setrlimit memlock:"; + } else { + AllocThreadStack(); + // Try to grow the heap. + MallocSetup mallocSetup( + inMaxHeapSize, inMaxStlPoolSize, &theMsgPtr); + if (mallocSetup.IsError()) { + theRet = errno; + if (! 
theMsgPtr) { + theMsgPtr = "malloc:"; + } + } + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + theRet = errno; + theMsgPtr = "mlockall:"; + } + } + } + } + if (theRet != 0 && outErrMsgPtr) { + *outErrMsgPtr = QCUtils::SysError(theRet, theMsgPtr); + } +#else + AllocThreadStack(); +#endif + return theRet; +} + +} // namespace KFS diff --git a/src/cc/common/MemLock.h b/src/cc/common/MemLock.h new file mode 100644 index 000000000..f81308ed2 --- /dev/null +++ b/src/cc/common/MemLock.h @@ -0,0 +1,46 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/10/11 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file MemLock.h +// \brief Process common memory locking interface. 
+// +//---------------------------------------------------------------------------- + +#ifndef MEM_LOCK_H +#define MEM_LOCK_H + +#include +#include + +namespace KFS +{ +using std::string; + +int +LockProcessMemory( + int64_t inMaxLockedMemorySize, + int64_t inMaxHeapSize = 0, + int64_t inMaxStlPoolSize = 0, + string* outErrMsgPtr = 0); +} + +#endif /* MEM_LOCK_H */ diff --git a/src/cc/common/MsgLogger.cc b/src/cc/common/MsgLogger.cc new file mode 100644 index 000000000..50b130c1f --- /dev/null +++ b/src/cc/common/MsgLogger.cc @@ -0,0 +1,113 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2005/03/01 +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +//---------------------------------------------------------------------------- + +#include "MsgLogger.h" +#include "Version.h" +#include +#include + +namespace KFS +{ + +MsgLogger* MsgLogger::logger = 0; +const MsgLogger::LogLevel kMsgLoggerDefaultLogLevel = +#ifdef NDEBUG + MsgLogger::kLogLevelINFO +#else + MsgLogger::kLogLevelDEBUG +#endif +; + +MsgLogger::MsgLogger( + const char* filename, + MsgLogger::LogLevel logLevel, + const Properties* props, + const char* propPrefix) + : BufferedLogWriter(filename ? 
-1 : fileno(stderr), filename) +{ + BufferedLogWriter::SetLogLevel(logLevel); + BufferedLogWriter::SetMaxLogWaitTime(std::numeric_limits::max()); + if (props) { + BufferedLogWriter::SetParameters(*props, propPrefix); + } + BufferedLogWriter::Append(kLogLevelDEBUG, "version: %s %s", + KFS_BUILD_VERSION_STRING.c_str(), KFS_SOURCE_REVISION_STRING.c_str()); +} + +MsgLogger::~MsgLogger() +{ + if (this == logger) { + logger = 0; + } +} + +void +MsgLogger::Init( + const char* filename) +{ + Init(filename, kMsgLoggerDefaultLogLevel, 0, 0); +} + +void +MsgLogger::Init( + const char* filename, + MsgLogger::LogLevel logLevel) +{ + Init(filename, logLevel, 0, 0); +} + +void +MsgLogger::Init( + const Properties& props, + const char* propPrefix) +{ + Init(0, kMsgLoggerDefaultLogLevel, &props, propPrefix); +} + +void +MsgLogger::Init( + const char* filename, + MsgLogger::LogLevel logLevel, + const Properties* props, + const char* propPrefix) +{ + if (logger) { + if (props) { + logger->SetParameters(*props, propPrefix); + } + } else { + static MsgLogger sLogger(filename, logLevel, props, propPrefix); + logger = &sLogger; + } +} + +void +MsgLogger::Stop() +{ + if (logger) { + logger->BufferedLogWriter::Stop(); + } +} + +} // namespace KFS diff --git a/src/cc/common/MsgLogger.h b/src/cc/common/MsgLogger.h new file mode 100644 index 000000000..601cc089c --- /dev/null +++ b/src/cc/common/MsgLogger.h @@ -0,0 +1,122 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2007/10/17 +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2007-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A message logging facility. +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_MSG_LOGGER_H +#define COMMON_MSG_LOGGER_H + +#include "BufferedLogWriter.h" +#include + +namespace KFS +{ + // Have a singleton logger for an application + class MsgLogger : public BufferedLogWriter + { + private: + MsgLogger(const char *filename, LogLevel logLevel, + const Properties* props, const char* propPrefix); + ~MsgLogger(); + MsgLogger(const MsgLogger &other); + MsgLogger& operator=(const MsgLogger &other); + static MsgLogger *logger; + public: + static void Stop(); + static MsgLogger* GetLogger() { return logger; } + static void Init(const char *filename); + static void Init(const char *filename, LogLevel logLevel); + static void Init(const Properties& props, const char* propPrefix = 0); + static void Init(const char *filename, LogLevel logLevel, + const Properties* props, const char* propPrefix); + static void SetLevel(LogLevel logLevel) { + if (logger) { + logger->SetLogLevel(logLevel); + } + } + static bool IsLoggerInited() { return (logger != 0); } + static const char* SourceFileName(const char* name) { + if (! name) { + return ""; + } + const char* const ret = strrchr(name, '/'); + if (! ret || ! ret[1]) { + return name; + } + return ret + 1; + } + }; + +// The following if prevents arguments evaluation (and possible side effect). +// The following supports all +// std stream manipulators, has lower # of allocations, and free of possible +// problems with stream object scope / lifetime. 
+// The price for this is that insertion has to be always terminated with +// KFS_LOG_EOM, otherwise you'll get possibly unintelligible compile time error. +#ifndef KFS_LOG_STREAM_START +# define KFS_LOG_STREAM_START(logLevel, streamVarName) \ + if (MsgLogger::GetLogger() && \ + MsgLogger::GetLogger()->IsLogLevelEnabled(logLevel)) {\ + MsgLogger::StStream streamVarName( \ + *MsgLogger::GetLogger(), logLevel); \ + streamVarName.GetStream() << "(" << \ + MsgLogger::SourceFileName(__FILE__) << ":" << __LINE__ << ") " +#endif +#ifndef KFS_LOG_STREAM_END +# define KFS_LOG_STREAM_END \ + } (void)0 +#endif + +#ifndef KFS_LOG_STREAM +# define KFS_LOG_STREAM(logLevel) \ + KFS_LOG_STREAM_START(logLevel, _msgStream_015351104260035312) +#endif +#ifndef KFS_LOG_EOM +# define KFS_LOG_EOM \ + std::flush; \ + KFS_LOG_STREAM_END +#endif + +#ifndef KFS_LOG_STREAM_DEBUG +# define KFS_LOG_STREAM_DEBUG KFS_LOG_STREAM(MsgLogger::kLogLevelDEBUG) +#endif +#ifndef KFS_LOG_STREAM_INFO +# define KFS_LOG_STREAM_INFO KFS_LOG_STREAM(MsgLogger::kLogLevelINFO) +#endif +#ifndef KFS_LOG_STREAM_NOTICE +# define KFS_LOG_STREAM_NOTICE KFS_LOG_STREAM(MsgLogger::kLogLevelNOTICE) +#endif +#ifndef KFS_LOG_STREAM_WARN +# define KFS_LOG_STREAM_WARN KFS_LOG_STREAM(MsgLogger::kLogLevelWARN) +#endif +#ifndef KFS_LOG_STREAM_ERROR +# define KFS_LOG_STREAM_ERROR KFS_LOG_STREAM(MsgLogger::kLogLevelERROR) +#endif +#ifndef KFS_LOG_STREAM_FATAL +# define KFS_LOG_STREAM_FATAL KFS_LOG_STREAM(MsgLogger::kLogLevelFATAL) +#endif + +} // namespace KFS + +#endif // COMMON_MSG_LOGGER_H diff --git a/src/cc/common/PoolAllocator.h b/src/cc/common/PoolAllocator.h new file mode 100644 index 000000000..7da3b4eeb --- /dev/null +++ b/src/cc/common/PoolAllocator.h @@ -0,0 +1,165 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/02/22 +// Author: Mike Ovsiannikov +// +// Copyright 2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). 
+// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Memory pool of TItemSize fixed size blocks. Larger blocks are allocated +// starting from TMinStorageAlloc, and the size of the allocated block doubles +// when the pool grows until the size reaches TMaxStorageAlloc. The allocated +// space isn't accessed (added to the free list and such) until it is really +// needed to minimize cache / tlb misses with locked memory, and defer the +// actual memory allocation by the os without locked memory. Deallocate adds +// block to the free list. The free list is LIFO to reduce dram cache and tlb +// misses. If free list is empty then a new "next size" large block is +// allocated. Allocated blocks are never released back, until the pool +// destroyed. If TForceCleanupFlag set to false, and in use count is greater +// than 0 then all allocated blocks are "leaked". If element is larger or +// equal to the pointer size, then the allocation has no overhead. +// Suitable for allocating very large number of small elements. 
+// +//---------------------------------------------------------------------------- + +#ifndef POOL_ALLOCATOR_H +#define POOL_ALLOCATOR_H + +#include +#include +#include + +#include + +namespace KFS +{ + +using std::max; +using std::min; + +template< + size_t TItemSize, + size_t TMinStorageAlloc, + size_t TMaxStorageAlloc, + bool TForceCleanupFlag +> +class PoolAllocator +{ +public: + PoolAllocator() + : mFreeStoragePtr(0), + mFreeStorageEndPtr(0), + mStorageListPtr(0), + mFreeListPtr(0), + mAllocSize(max(TMinStorageAlloc, GetElemSize())), + mStorageSize(0), + mInUseCount(0) + {} + ~PoolAllocator() + { + if (! TForceCleanupFlag && mInUseCount > 0) { + return; // Memory leak + } + while (mStorageListPtr) { + char* const theCurPtr = mStorageListPtr; + char** thePtr = reinterpret_cast(theCurPtr); + mStorageListPtr = *thePtr++; + assert(*thePtr == theCurPtr); + delete [] *thePtr; + } + } + char* Allocate() + { + if (mFreeListPtr) { + mInUseCount++; + return GetNextFree(); + } + char* theEndPtr = mFreeStoragePtr + GetElemSize(); + if (theEndPtr > mFreeStorageEndPtr) { + // Maintain 2 * sizeof(size_t) alignment. + const size_t theHdrSize = 2 * sizeof(mStorageListPtr); + const size_t theSize = mAllocSize + 2 * sizeof(mStorageListPtr); + mFreeStoragePtr = new char[theSize]; + mFreeStorageEndPtr = mFreeStoragePtr + theSize; + char** thePtr = reinterpret_cast(mFreeStoragePtr); + *thePtr++ = mStorageListPtr; + *thePtr = mFreeStoragePtr; // store ptr to catch buffer overrun. + mStorageListPtr = mFreeStoragePtr; + mFreeStoragePtr += theHdrSize; + mAllocSize = min(TMaxStorageAlloc, mAllocSize << 1); + mStorageSize += theSize; + theEndPtr = mFreeStoragePtr + GetElemSize(); + } + char* const theRetPtr = mFreeStoragePtr; + mFreeStoragePtr = theEndPtr; + mInUseCount++; + return theRetPtr; + } + void Deallocate( + void* inPtr) + { + if (! 
inPtr) { + return; + } + assert(mInUseCount > 0); + mInUseCount--; + Put(inPtr); + } + size_t GetInUseCount() const + { return mInUseCount; } + size_t GetStorageSize() const + { return mStorageSize; } + static size_t GetItemSize() + { return TItemSize; } + static size_t GetElemSize() + { return max(TItemSize, sizeof(char*)); } +private: + char* mFreeStoragePtr; + char* mFreeStorageEndPtr; + char* mStorageListPtr; + char* mFreeListPtr; + size_t mAllocSize; + size_t mStorageSize; + size_t mInUseCount; + + char* GetNextFree() + { + char* const theRetPtr = mFreeListPtr; + if (GetElemSize() % sizeof(mFreeListPtr) == 0) { + mFreeListPtr = *reinterpret_cast(theRetPtr); + } else { + memcpy(&mFreeListPtr, theRetPtr, sizeof(mFreeListPtr)); + } + return theRetPtr; + } + void Put(void* inPtr) + { + if (GetElemSize() % sizeof(mFreeListPtr) == 0) { + *reinterpret_cast(inPtr) = mFreeListPtr; + } else { + memcpy(inPtr, &mFreeListPtr, sizeof(mFreeListPtr)); + } + mFreeListPtr = reinterpret_cast(inPtr); + } + + PoolAllocator( PoolAllocator& inAlloc); + PoolAllocator& operator=( PoolAllocator& inAlloc); +}; + +} + +#endif /* POOL_ALLOCATOR_H */ diff --git a/src/cc/common/Properties.cc b/src/cc/common/Properties.cc new file mode 100644 index 000000000..a988bf102 --- /dev/null +++ b/src/cc/common/Properties.cc @@ -0,0 +1,254 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// \brief Properties implementation. +// +// Created 2004/05/05 +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include "Properties.h" + +namespace KFS +{ + +using std::string; +using std::istream; +using std::ifstream; +using std::cerr; +using std::cout; +using std::endl; + +inline static int +AsciiCharToLower(int c) +{ + return ((c >= 'A' && c <= 'Z') ? 'a' + (c - 'A') : c); +} + +inline string +removeLTSpaces(const string& str, string::size_type start, string::size_type end, + bool asciiToLower = false) +{ + char const* const delims = " \t\r\n"; + + if (start >= str.length()) { + return string(); + } + string::size_type const first = str.find_first_not_of(delims, start); + if (end <= first || first == string::npos) { + return string(); + } + string::size_type const last = str.find_last_not_of( + delims, end == string::npos ? string::npos : end - 1); + return ( + asciiToLower ? 
+ Properties::AsciiToLower(str.substr(first, last - first + 1)) : + str.substr(first, last - first + 1) + ); +} + +/* static */ string +Properties::AsciiToLower(const string& str) +{ + string s(str); + for (string::iterator i = s.begin(); i != s.end(); ++i) { + const int c = AsciiCharToLower(*i); + if (c != *i) { + *i = c; + } + } + return s; +} + +inline +Properties::iterator Properties::find(const string& key) const +{ + return propmap.find(key); +} + +Properties::Properties(int base) + : intbase(base), + propmap() +{ +} + +Properties::Properties(const Properties &p) + : intbase(p.intbase), + propmap(p.propmap) +{ +} + +Properties::~Properties() +{ +} + +int +Properties::loadProperties(const char* fileName, char delimiter, + bool verbose /* = false */, bool multiline /* =false */, + bool keysAsciiToLower /* = false */) +{ + ifstream input(fileName); + if(! input.is_open()) { + cerr << "Properties::loadProperties() Could not open the file:" << + fileName << endl; + return(-1); + } + loadProperties(input, delimiter, verbose, multiline, keysAsciiToLower); + input.close(); + return 0; +} + +int +Properties::loadProperties(istream &ist, char delimiter, + bool verbose /* = false */, bool multiline /* = false */, + bool keysAsciiToLower /* = false */) +{ + string line; + while (ist) { + getline(ist, line); //read one line at a time + if (line.empty() || line[0] == '#') { + continue; // ignore comments + } + // find the delimiter + string::size_type const pos = line.find(delimiter); + if (pos == string::npos) { + continue; // ignore if no delimiter is found + } + const string key(removeLTSpaces(line, 0, pos, keysAsciiToLower)); + const string val(removeLTSpaces(line, pos + 1, string::npos)); + if (multiline) { + // allow properties to span across multiple lines + propmap[key] += val; + } else { + propmap[key] = val; + } + if (verbose) { + cout << "Loading key " << key << + " with value " << propmap[key] << endl; + } + } + return 0; +} + +void +Properties::setValue(const 
string& key, const string& value) +{ + propmap[key] = value; + return; +} + +string +Properties::getValue(const string& key, const string& def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : i->second); +} + +const char* +Properties::getValue(const string& key, const char* def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : i->second.c_str()); +} + +int +Properties::getValue(const string& key, int def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : + (int)strtol(i->second.c_str(), 0, intbase)); +} + +unsigned int +Properties::getValue(const string& key, unsigned int def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : + (unsigned int)strtoul(i->second.c_str(), 0, intbase)); +} + +long +Properties::getValue(const string& key, long def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : strtol(i->second.c_str(), 0, intbase)); +} + +unsigned long +Properties::getValue(const string& key, unsigned long def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : strtoul(i->second.c_str(), 0, intbase)); +} + +long long +Properties::getValue(const string& key, long long def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : strtoll(i->second.c_str(), 0, intbase)); +} + +unsigned long long +Properties::getValue(const string& key, unsigned long long def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? def : strtoull(i->second.c_str(), 0, intbase)); +} + +double +Properties::getValue(const string& key, double def) const +{ + PropMap::const_iterator const i = find(key); + return (i == propmap.end() ? 
def : atof(i->second.c_str())); +} + +void +Properties::getList(string &outBuf, + const string& linePrefix, const string& lineSuffix) const +{ + PropMap::const_iterator iter; + for (iter = propmap.begin(); iter != propmap.end(); iter++) { + if (iter->first.size() > 0) { + outBuf += linePrefix; + outBuf += iter->first; + outBuf += '='; + outBuf += iter->second; + outBuf += lineSuffix; + } + } + return; +} + +void +Properties::copyWithPrefix(const string& prefix, Properties& props) const +{ + PropMap::const_iterator iter; + for (iter = propmap.begin(); iter != propmap.end(); iter++) { + const string& key = iter->first; + if (key.compare(0, prefix.length(), prefix) == 0) { + props.propmap[key] = iter->second; + } + } +} + +} // namespace KFS diff --git a/src/cc/common/Properties.h b/src/cc/common/Properties.h new file mode 100644 index 000000000..27fb392e7 --- /dev/null +++ b/src/cc/common/Properties.h @@ -0,0 +1,97 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// \brief Properties file similar to java.util.Properties +// +// Created 2004/05/05 +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +//---------------------------------------------------------------------------- + +#ifndef COMMON_PROPERTIES_H +#define COMMON_PROPERTIES_H + +#include +#include +#include + +#include "StdAllocator.h" + +namespace KFS +{ + +using std::map; +using std::string; +using std::istream; + +// Key: value properties. +// Can be used to parse rfc822 style request headers, or configuration files. +class Properties +{ +private : + int intbase; + //Map that holds the (key,value) pairs + typedef map, + StdFastAllocator > + > PropMap; + PropMap propmap; + inline PropMap::const_iterator find(const string& key) const; + +public: + static string AsciiToLower(const string& str); + + typedef PropMap::const_iterator iterator; + iterator begin() const { return propmap.begin(); } + iterator end() const { return propmap.end(); } + // load the properties from a file + int loadProperties(const char* fileName, char delimiter, + bool verbose, bool multiline = false, bool keysAsciiToLower = false); + // load the properties from an in-core buffer + int loadProperties(istream &ist, char delimiter, + bool verbose, bool multiline = false, bool keysAsciiToLower = false); + string getValue(const string& key, const string& def) const; + const char* getValue(const string& key, const char* def) const; + int getValue(const string& key, int def) const; + unsigned int getValue(const string& key, unsigned int def) const; + long getValue(const string& key, long def) const; + unsigned long getValue(const string& key, unsigned long def) const; + long long getValue(const string& key, long long def) const; + unsigned long long getValue(const string& key, unsigned long long def) const; + double getValue(const string& key, double def) const; + void setValue(const string& key, const string& value); + void getList(string &outBuf, const string& linePrefix, + const string& lineSuffix = string("\n")) const; + void clear() { propmap.clear(); } + bool empty() const { return propmap.empty(); } + size_t size() const { 
return propmap.size(); } + void copyWithPrefix(const string& prefix, Properties& props) const; + void swap(Properties& props) + { propmap.swap(props.propmap); } + void setIntBase(int base) + { intbase = base; } + Properties(int base = 10); + Properties(const Properties &p); + ~Properties(); + +}; + +} + +#endif // COMMON_PROPERTIES_H diff --git a/src/cc/common/RequestParser.cc b/src/cc/common/RequestParser.cc new file mode 100644 index 000000000..b0f8637c5 --- /dev/null +++ b/src/cc/common/RequestParser.cc @@ -0,0 +1,84 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/12/07 +// Author: Mike Ovsiannikov +// +// Copyright 2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "RequestParser.h" + +namespace KFS { + +const unsigned char HexIntParser::sChar2Hex[256] = { +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0x00 /* 0 */, 0x01 /* 1 */, +0x02 /* 2 */, 0x03 /* 3 */, 0x04 /* 4 */, 0x05 /* 5 */, 0x06 /* 6 */, +0x07 /* 7 */, 0x08 /* 8 */, 0x09 /* 9 */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0x0a /* A */, 0x0b /* B */, 0x0c /* C */, 0x0d /* D */, 0x0e /* E */, +0x0f /* F */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0x0a /* a */, 0x0b /* b */, 0x0c /* c */, +0x0d /* d */, 0x0e /* e */, 0x0f /* f */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff 
/* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, 0xff /* */, +0xff /* */ +}; +} diff --git a/src/cc/common/RequestParser.h b/src/cc/common/RequestParser.h new file mode 100644 index 000000000..006c8db95 --- /dev/null +++ b/src/cc/common/RequestParser.h @@ -0,0 +1,719 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/05/14 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Classes to build high performance rfc822 style request header parsers. +// +//---------------------------------------------------------------------------- + +#ifndef REQUEST_PARSER_H +#define REQUEST_PARSER_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "StBuffer.h" + +namespace KFS +{ +using std::string; +using std::streambuf; +using std::istream; +using std::min; +using std::make_pair; +using std::map; +using std::less; + +// Multiple inheritance below used only to enforce construction order. +class BufferInputStream : + private streambuf, + public istream +{ +public: + BufferInputStream( + const char* inPtr = 0, + size_t inLen = 0) + : streambuf(), + istream(this) + { + char* const thePtr = const_cast(inPtr); + streambuf::setg(thePtr, thePtr, thePtr + inLen); + } + istream& Set( + const char* inPtr, + size_t inLen) + { + clear(); + char* const thePtr = const_cast(inPtr); + streambuf::setg(thePtr, thePtr, thePtr + inLen); + return *this; + } +}; + +class DecIntParser +{ +public: + template + static bool Parse( + const char*& ioPtr, + size_t inLen, + T& outValue) + { + const char* thePtr = ioPtr; + const char* const theEndPtr = thePtr + inLen; + while (thePtr < theEndPtr && (*thePtr & 0xFF) <= ' ') { + thePtr++; + } + if (thePtr >= theEndPtr) { + return false; + } + const bool theMinusFlag = *thePtr == '-'; + if ((theMinusFlag || *thePtr == '+') && ++thePtr >= theEndPtr) { + return false; + } + // Do not use std::numeric_limits<>::max(), the code with these appears + // to be less 
efficient than the constants below, probably due to + // function call / exception handling overhead for + // std::numeric_limits<>::max() throw() invocation. + const int kRadix = 10; + const T kMax = (T)( + ~T(0) > 0 ? ~T(0) : ~(T(1) << (sizeof(T) * 8 - 1))); + const T theMaxDivRadix = kMax / kRadix; + const char* const theStartPtr = thePtr; + T theVal = 0; + while (thePtr < theEndPtr) { + const int theDigit = (*thePtr & 0xFF) - '0'; + if (theDigit < 0 || theDigit >= kRadix) { + if (thePtr <= theStartPtr) { + return false; + } + break; + } + thePtr++; + if (theVal > theMaxDivRadix || + (theVal == theMaxDivRadix && + T(theDigit) > kMax - theMaxDivRadix * kRadix)) { + // Overflow. Negative overflow for signed types will always be + // min() + 1, instead of min(), but this should be OK for now. + theVal = kMax; + break; + } + theVal = theVal * kRadix + theDigit; + // theVal = (theVal << 3) + (theVal << 1) + theDigit; + } + outValue = theMinusFlag ? -theVal : theVal; + ioPtr = thePtr; + return true; + } +}; + +class HexIntParser +{ +public: + template + static bool Parse( + const char*& ioPtr, + size_t inLen, + T& outValue) + { + if (inLen <= 0) { + return -1; + } + const unsigned char* thePtr = + reinterpret_cast(ioPtr); + const unsigned char* const theEndPtr = thePtr + inLen; + while (thePtr < theEndPtr && (*thePtr & 0xFF) <= ' ') { + thePtr++; + } + if (thePtr >= theEndPtr) { + return false; + } + const bool theMinusFlag = *thePtr == '-'; + if ((theMinusFlag || *thePtr == '+') && ++thePtr >= theEndPtr) { + return false; + } + T theVal = 0; + const unsigned char* const theNEndPtr = thePtr + sizeof(theVal) * 2 + 1; + while (thePtr < theEndPtr) { + const unsigned char theHex = sChar2Hex[*thePtr]; + if (theHex == (unsigned char)0xFF || thePtr == theNEndPtr) { + if ((*thePtr & 0xFF) > ' ') { + return false; + } + break; + } + theVal = (theVal << 4) | theHex; + ++thePtr; + } + outValue = theMinusFlag ? 
-theVal : theVal; + ioPtr = reinterpret_cast(thePtr); + return true; + } + static const unsigned char* GetChar2Hex() { return sChar2Hex; } +private: + static const unsigned char sChar2Hex[256]; +}; + +template +class ValueParserT +{ +public: + // The most generic version that handles all the types for which extraction + // operator (>>) exists. One wouldn't expect this to be very efficient + // though, mainly due to istream/streambuf call overhead (virtual + // function calls etc). + template + static void SetValue( + const char* inPtr, + size_t inLen, + const T& inDefaultValue, + T& outValue) + { + if (inLen <= 0) { + outValue = inDefaultValue; + } else { + BufferInputStream theStream(inPtr, inLen); + if (! (theStream >> outValue)) { + outValue = inDefaultValue; + } + } + } + // The following two do not trim whitespace. + // This is intentional, and it is up to the caller to handle this + // appropriately. + // For example PropertiesTokenizer trims white space. + static void SetValue( + const char* inPtr, + size_t inLen, + const string& inDefaultValue, + string& outValue) + { + if (inLen <= 0) { + outValue = inDefaultValue; + } else { + outValue.assign(inPtr, inLen); + } + } + template + static void SetValue( + const char* inPtr, + size_t inLen, + const StringBufT& inDefaultValue, + StringBufT& outValue) + { + if (inLen <= 0) { + outValue = inDefaultValue; + } else { + outValue.Copy(inPtr, inLen); + } + } + // The following is used for integer overloaded versions of SetValue, in + // the hope that this would be more efficient than the preceding generic + // version the above. 
+ template + static bool ParseInt( + const char*& ioPtr, + size_t inLen, + T& outValue) + { + return INT_PARSER::Parse(ioPtr, inLen, outValue); + } + +#define _KFS_ValueParser_IntTypes(f) \ + f(char) f(short int) f(int) f(long int) \ + f(unsigned char) f(unsigned short int) f(unsigned int) f(unsigned long int)\ + f(long long int) f(unsigned long long int) + +#define _KFS_DEFINE_ValueParser_IntSetValue(IT) \ + static void SetValue( \ + const char* inPtr, \ + size_t inLen, \ + const IT& inDefaultValue, \ + IT& outValue) \ + { \ + if (! ParseInt(inPtr, inLen, outValue)) { \ + outValue = inDefaultValue; \ + } \ + } + +_KFS_ValueParser_IntTypes(_KFS_DEFINE_ValueParser_IntSetValue) +#undef _KFS_DEFINE_ValueParser_IntSetValue +#undef _KFS_DEFINE_ValueParser_IntTypes + + static void SetValue( + const char* inPtr, + size_t inLen, + const bool& inDefaultValue, + bool& outValue) + { + int theVal = 0; + if (! ParseInt(inPtr, inLen, theVal)) { + outValue = inDefaultValue; + } + outValue = theVal != 0; + } +}; + +typedef ValueParserT ValueParser; + +class PropertiesTokenizer +{ +public: + enum { kSeparator = ':' }; + struct Token + { + Token( + const char* inPtr, + const char* inEndPtr) + : mPtr(inPtr), + mLen(inEndPtr <= inPtr ? 0 : inEndPtr - inPtr) + {} + Token( + const char* inPtr, + size_t inLen) + : mPtr(inPtr), + mLen(inLen) + {} + Token( + const char* inPtr = 0) + : mPtr(inPtr), + mLen(inPtr ? strlen(inPtr) : 0) + {} + Token operator=( + const Token& inToken) + { + const_cast(mPtr) = inToken.mPtr; + const_cast(mLen) = inToken.mLen; + return *this; + } + bool operator<( + const Token& inToken) const + { + const int theRet = memcmp( + mPtr, inToken.mPtr, min(mLen, inToken.mLen)); + return (theRet < 0 || (theRet == 0 && mLen < inToken.mLen)); + } + bool operator==( + const Token& inToken) const + { + return ( + mLen == inToken.mLen && + memcmp(mPtr, inToken.mPtr, mLen) == 0 + ); + } + bool operator!=( + const Token& inToken) const + { return (! 
operator==(inToken)); } + const char* GetEndPtr() const + { return (mPtr + mLen); } + const char* const mPtr; + size_t const mLen; + }; + + PropertiesTokenizer( + const char* inPtr, + size_t inLen, + bool inIgnoreMalformedFlag = true) + : mPtr(inPtr), + mEndPtr(inPtr + inLen), + mKey(), + mValue(), + mIgnoreMalformedFlag(inIgnoreMalformedFlag) + {} + static bool IsWSpace( + char inChar) + { return ((inChar & 0xFF) <= ' '); } + bool Next(int inSeparator = kSeparator) + { + while (mPtr < mEndPtr) { + // Skip leading white space. + while (mPtr < mEndPtr && IsWSpace(*mPtr)) { + mPtr++; + } + if (mPtr >= mEndPtr) { + break; + } + // Find delimiter, and discard white space before delimeter. + const char* const theKeyPtr = mPtr; + while (mPtr < mEndPtr && *mPtr != inSeparator && + ! IsWSpace(*mPtr)) { + mPtr++; + } + if (mPtr >= mEndPtr) { + break; + } + const char* theKeyEndPtr = mPtr; + while (mPtr < mEndPtr && *mPtr != inSeparator && + *mPtr != '\r' && *mPtr != '\n') { + if (! IsWSpace(*mPtr)) { + theKeyEndPtr = mPtr + 1; + } + mPtr++; + } + if (*mPtr != inSeparator) { + // Ignore malformed line. + while (mPtr < mEndPtr && *mPtr != '\n') { + mPtr++; + } + if (mIgnoreMalformedFlag) { + continue; + } + mKey = Token(theKeyPtr, theKeyEndPtr); + break; + } + mPtr++; + // Skip leading white space after the delimiter. + while (mPtr < mEndPtr && IsWSpace(*mPtr) && + *mPtr != '\r' && *mPtr != '\n') { + mPtr++; + } + // Find end of line and discard trailing white space. + const char* const theValuePtr = mPtr; + const char* theValueEndPtr = mPtr; + while (mPtr < mEndPtr && *mPtr != '\r' && *mPtr != '\n') { + if (! 
IsWSpace(*mPtr)) { + theValueEndPtr = mPtr + 1; + } + mPtr++; + } + mKey = Token(theKeyPtr, theKeyEndPtr); + mValue = Token(theValuePtr, theValueEndPtr); + return true; + } + return false; + } + const Token& GetKey() const + { return mKey; } + const Token& GetValue() const + { return mValue; } +private: + const char* mPtr; + const char* const mEndPtr; + Token mKey; + Token mValue; + const bool mIgnoreMalformedFlag; +}; + +// Create parser for object fields, and invoke appropriate parsers based on the +// request header names. +template +class ObjectParser +{ +public: + typedef PropertiesTokenizer Tokenizer; + + // inNamePtr arguments are assumed to be static strings. + // The strings must remain constant and valid during the lifetime of + // this object. + + ObjectParser() + : mDefDoneFlag(false), + mFields() + {} + virtual ~ObjectParser() + { + for (typename Fields::iterator theIt = mFields.begin(); + theIt != mFields.end(); + ++theIt) { + delete theIt->second; + theIt->second = 0; + } + } + template + ObjectParser& Def( + const char* inNamePtr, + T OT::* inFieldPtr, + T inDefault = T()) + { + if (! mDefDoneFlag && ! mFields.insert(make_pair( + Key(inNamePtr), new Field(inFieldPtr, inDefault) + )).second) { + // Duplicate key name in the definition. 
+ abort(); + } + return *this; + } + ObjectParser& DefDone() + { + mDefDoneFlag = true; + return *this; + } + bool IsDefined() const + { return mDefDoneFlag; } + void Parse( + Tokenizer& inTokenizer, + OBJ* inObjPtr) const + { + while (inTokenizer.Next()) { + typename Fields::const_iterator const + theIt = mFields.find(inTokenizer.GetKey()); + if (theIt != mFields.end()) { + theIt->second->Set(inObjPtr, inTokenizer.GetValue()); + } + } + } +private: + typedef Tokenizer::Token Key; + typedef Tokenizer::Token Value; + + class AbstractField + { + public: + AbstractField() + {} + virtual ~AbstractField() + {} + virtual void Set( + OBJ* inObjPtr, + const Value& inValue) const = 0; + }; + + template + class Field : public AbstractField + { + public: + Field( + T OT::* inFieldPtr, + const T& inDefault) + : AbstractField(), + mFieldPtr(inFieldPtr), + mDefault(inDefault) + {} + virtual ~Field() + {} + virtual void Set( + OBJ* inObjPtr, + const Value& inValue) const + { + // The implicit cast below from OBJ to OT is crucial. + // This is the primary reason why this code *is not* in the + // AbstractRequestParser, and why the parser definition can not be + // done with the AbstractRequestParser. + // In other words this is the reason why the definition has to be + // in one class, and can not be "inherited" from the super classes + // of the OBJ with abstract parser. + // The implicit cast correctly handles multiple inheritance where + // the result of the cast depends on the type of the "casted from" + // object. 
+ VALUE_PARSER::SetValue( + inValue.mPtr, + inValue.mLen, + mDefault, + inObjPtr->*mFieldPtr + ); + } + private: + T OT::* const mFieldPtr; + T const mDefault; + }; + + typedef map > Fields; + + bool mDefDoneFlag; + Fields mFields; +}; + +template +class AbstractRequestParser +{ +public: + typedef unsigned int Checksum; + + AbstractRequestParser() + {} + virtual ~AbstractRequestParser() + {} + virtual ABSTRACT_OBJ* Parse( + const char* inBufferPtr, + size_t inLen, + const char* inRequestNamePtr, + size_t inRequestNameLen, + bool inHasHeaderChecksumFlag, + Checksum inChecksum) const = 0; +}; + +// Create concrete object and invoke corresponding parser. +template +class RequestParser : + public AbstractRequestParser, + public ObjectParser +{ +public: + typedef PropertiesTokenizer Tokenizer; + typedef AbstractRequestParser Super; + typedef ObjectParser ObjParser; + typedef typename Super::Checksum Checksum; + + RequestParser() + : Super(), + ObjParser() + {} + virtual ~RequestParser() + {} + virtual ABSTRACT_OBJ* Parse( + const char* inBufferPtr, + size_t inLen, + const char* inRequestNamePtr, + size_t inRequestNameLen, + bool inHasHeaderChecksumFlag, + Checksum inChecksum) const + { + OBJ* const theObjPtr = new OBJ(); + if (! theObjPtr->ValidateRequestHeader( + inRequestNamePtr, + inRequestNameLen, + inBufferPtr, + inLen, + inHasHeaderChecksumFlag, + inChecksum)) { + delete theObjPtr; + return 0; + } + Tokenizer theTokenizer(inBufferPtr, inLen); + ObjParser::Parse(theTokenizer, theObjPtr); + if (theObjPtr->Validate()) { + return theObjPtr; + } + delete theObjPtr; + return 0; + } + template + RequestParser& Def( + const char* inNamePtr, + T OT::* inFieldPtr, + T inDefault = T()) + { + ObjParser::Def(inNamePtr, inFieldPtr, inDefault); + return *this; + } + RequestParser& DefDone() + { + ObjParser::DefDone(); + return *this; + } +}; + +// Invoke appropriate request parser based on RPC name. 
+template +class RequestHandler +{ +public: + typedef AbstractRequestParser Parser; + typedef typename Parser::Checksum Checksum; + + RequestHandler() + {} + ~RequestHandler() + {} + static bool IsWSpace( + char inChar) + { return ((inChar & 0xFF) <= ' '); } + ABSTRACT_OBJ* Handle( + const char* inBufferPtr, + size_t inLen) const + { + const char* thePtr = inBufferPtr; + const char* const theEndPtr = thePtr + inLen; + while (thePtr < theEndPtr && IsWSpace(*thePtr)) { + thePtr++; + } + const char* const theNamePtr = thePtr; + while (thePtr < theEndPtr && ! IsWSpace(*thePtr)) { + thePtr++; + } + const size_t theNameLen = thePtr - theNamePtr; + typename Parsers::const_iterator const theIt = + mParsers.find(Name(theNamePtr, theNameLen)); + if (theIt == mParsers.end()) { + return 0; + } + // Get optional header checksum. + const char* theChecksumPtr = thePtr; + while (thePtr < theEndPtr && + *thePtr != '\r' && *thePtr != '\n') { + thePtr++; + } + Checksum theChecksum = 0; + const bool theChecksumFlag = ValueParser::ParseInt( + theChecksumPtr, thePtr - theChecksumPtr, theChecksum); + while (thePtr < theEndPtr && IsWSpace(*thePtr)) { + thePtr++; + } + return theIt->second->Parse( + thePtr, + theEndPtr - thePtr, + theNamePtr, + theNameLen, + theChecksumFlag, + theChecksum + ); + } + template + RequestParser& BeginMakeParser( + const OBJ* inNullPtr = 0) + { + static RequestParser sParser; + return sParser; + } + template + RequestHandler& EndMakeParser( + const char* inNamePtr, + T& inParser) + { + if (! mParsers.insert(make_pair( + Name(inNamePtr), &inParser.DefDone())).second) { + // Duplicate name -- definition error. 
+ abort(); + } + return *this; + } + template + RequestHandler& MakeParser( + const char* inNamePtr, + const OBJ* inNullPtr = 0) + { + return + EndMakeParser( + inNamePtr, + OBJ::ParserDef( + BeginMakeParser(inNullPtr) + ) + ); + } + +private: + typedef PropertiesTokenizer::Token Name; + typedef map Parsers; + + Parsers mParsers; +}; + +} + +#endif /* REQUEST_PARSER_H */ diff --git a/src/cc/common/SingleLinkedList.h b/src/cc/common/SingleLinkedList.h new file mode 100644 index 000000000..1bbd18cc5 --- /dev/null +++ b/src/cc/common/SingleLinkedList.h @@ -0,0 +1,89 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/18 +// Author: Mike Ovsainnikov +// +// Copyright 2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Simple singly linked list. 
+// +//---------------------------------------------------------------------------- + +#ifndef SINGLE_LINKED_LIST_H +#define SINGLE_LINKED_LIST_H + +namespace KFS +{ + +template +class SingleLinkedList +{ +public: + typedef T value_type; + + SingleLinkedList() + : mData(), + mNextPtr(0) + {} + SingleLinkedList( + const SingleLinkedList& inNode) + : mData(inNode.mData), + mNextPtr(inNode.mNextPtr) + {} + SingleLinkedList( + const T& inData, + SingleLinkedList* inNextPtr = 0) + : mData(inData), + mNextPtr(inNextPtr) + {} + SingleLinkedList& operator=( + const SingleLinkedList& inNode) + { + mData = inNode.mData; + mNextPtr = inNode.mNextPtr; + return *this; + } + T& GetData() + { return mData; } + const T& GetData() const + { return mData; } + SingleLinkedList*& GetNextPtr() + { return mNextPtr; } + const SingleLinkedList*& GetNextPtr() const + { return mNextPtr; } + SingleLinkedList& Reverse() + { + SingleLinkedList* theRetPtr = 0; + SingleLinkedList* theCurPtr = this; + do { + SingleLinkedList* const theNextPtr = theCurPtr->mNextPtr; + theCurPtr->mNextPtr = theRetPtr; + theRetPtr = theCurPtr; + theCurPtr = theNextPtr; + } while (theCurPtr); + return *theRetPtr; + } + +private: + T mData; + SingleLinkedList* mNextPtr; +}; + +} + +#endif /* SINGLE_LINKED_LIST_H */ diff --git a/src/cc/common/StBuffer.h b/src/cc/common/StBuffer.h new file mode 100644 index 000000000..bd4e7d279 --- /dev/null +++ b/src/cc/common/StBuffer.h @@ -0,0 +1,324 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/05/14 +// Author: Mike Ovsiannikov +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef ST_BUFFER_H +#define ST_BUFFER_H + +#include +#include +#include + +#include +#include + +namespace KFS +{ + +// Stack based buffer. The intention is to use buffer mBuf allocated on the +// stack (or as part of other object) in most cases, and do real buffer +// allocation if the size exceeds default capacity. +template +class StBufferT +{ +public: + StBufferT() + : mBufPtr(mBuf), + mCapacity(DEFAULT_CAPACITY), + mSize(0) + {} + StBufferT( + const T* inPtr, + size_t inLen) + : mBufPtr(mBuf), + mCapacity(DEFAULT_CAPACITY), + mSize(0) + { Copy(inPtr, inLen); } + StBufferT( + const StBufferT& inBuf) + : mBufPtr(mBuf), + mCapacity(DEFAULT_CAPACITY), + mSize(0) + { Copy(inBuf, inBuf.GetSize()); } + template + StBufferT( + const StBufferT& inBuf) + : mBufPtr(mBuf), + mCapacity(DEFAULT_CAPACITY), + mSize(0) + { Copy(inBuf, inBuf.GetSize()); } + template + StBufferT& operator=( + const StBufferT& inBuf) + { return Copy(inBuf, inBuf.GetSize()); } + ~StBufferT() + { + if (mBufPtr != mBuf) { + delete [] mBufPtr; + } + } + size_t Capacity() const + { return mCapacity; } + size_t GetSize() const + { return mSize; } + T* Resize( + size_t inSize) + { + EnsureCapacity(inSize); + mSize = inSize; + return mBufPtr; + } + T* GetPtr() + { return mBufPtr; } + const T* GetPtr() const + { return mBufPtr; } + template + StBufferT& Copy( + const StBufferT& inBuf, + size_t inLen) + { + mSize = 0; + std::copy( + inBuf.GetPtr(), + inBuf.GetPtr() + std::min(inBuf.GetSize(), inLen), + 
Resize(inBuf.GetSize()) + ); + return *this; + } + StBufferT& Copy( + const T* inPtr, + size_t inLen) + { + mSize = 0; + std::copy(inPtr, inPtr + inLen, Resize(inLen)); + return *this; + } + StBufferT& Append( + const T& inVal) + { + Resize(mSize + 1); + mBufPtr[mSize - 1] = inVal; + return *this; + } +protected: + T* mBufPtr; + size_t mCapacity; + size_t mSize; + T mBuf[DEFAULT_CAPACITY]; + + T* EnsureCapacity( + size_t inCapacity) + { + if (inCapacity <= mCapacity) { + return mBufPtr; + } + T* const theBufPtr = new T[inCapacity]; + std::copy(mBufPtr, mBufPtr + mSize, theBufPtr); + if (mBufPtr != mBuf) { + delete [] mBufPtr; + } + mCapacity = inCapacity; + mBufPtr = theBufPtr; + return mBufPtr; + } +}; + +// String buffer, with lazy conversion to std::string. +template +class StringBufT +{ +public: + StringBufT() + : mStr(), + mSize(-1) + {} + StringBufT( + const StringBufT& inBuf) + : mStr(), + mSize(-1) + { Copy(inBuf); } + ~StringBufT() + {} + template + StringBufT( + const StringBufT& inBuf) + : mStr(), + mSize(-1) + { Copy(inBuf); } + StringBufT( + const std::string& inStr) + : mStr(inStr), + mSize(-1) + {} + template + StringBufT& operator=( + const StringBufT& inBuf) + { return Copy(inBuf); } + StringBufT& operator=( + const std::string& inStr) + { return Copy(inStr); } + const char* GetPtr() const + { return (mSize < 0 ? mStr.c_str() : mBuf); } + size_t GetSize() const + { return (mSize < 0 ? mStr.size() : size_t(mSize)); } + StringBufT& Copy( + const char* inPtr, + size_t inLen) + { + if (inLen < DEFAULT_CAPACITY) { + // memcpy appears slightly faster, if it isn't inlined. 
+ if (mBuf <= inPtr && inPtr < mBuf + DEFAULT_CAPACITY) { + memmove(mBuf, inPtr, inLen); + } else { + memcpy(mBuf, inPtr, inLen); + } + mSize = inLen; + mBuf[mSize] = 0; + mStr.clear(); + } else { + mSize = -1; + mStr.assign(inPtr, inLen); + } + return *this; + } + StringBufT& Copy( + const std::string& inStr) + { + mSize = -1; + mStr = inStr; + return *this; + } + template + StringBufT& Copy( + const StringBufT& inBuf) + { + if (inBuf.mSize > 0) { + Copy(inBuf.mBuf, inBuf.mSize); + } else { + mStr = inBuf.mStr; + mSize = -1; + } + return *this; + } + std::string GetStr() const + { + if (mSize > 0) { + std::string& theStr = const_cast(mStr); + theStr.assign(mBuf, mSize); + const_cast(mSize) = -1; + } + return mStr; + } + template + int Compare( + const StringBufT& inBuf) const + { + const int theRet = memcmp( + GetPtr(), inBuf.GetPtr(), std::min(GetSize(), inBuf.GetSize())); + return (theRet == 0 ? GetSize() - inBuf.GetSize() : theRet); + } + template + bool operator==( + const StringBufT& inBuf) const + { + return ( + GetSize() == inBuf.GetSize() && + memcmp(GetPtr(), inBuf.GetPtr(), GetSize()) == 0 + ); + } + // The following two aren't necessarily the same as string.compare(). + int Compare( + const std::string& inStr) const + { + const int theRet = memcmp( + GetPtr(), inStr.data(), std::min(GetSize(), inStr.size())); + return (theRet == 0 ? 
GetSize() - inStr.size() : theRet); + } + bool operator==( + const std::string& inStr) const + { + return ( + GetSize() == inStr.size() && + memcmp(GetPtr(), inStr.data(), GetSize()) == 0 + ); + } + int Compare( + const char* inStrPtr) const + { return strcmp(GetPtr(), inStrPtr); } + bool operator==( + const char* inStrPtr) const + { return (Compare(inStrPtr) == 0); } + template + bool operator<( + const StringBufT& inBuf) const + { return (Compare(inBuf) < 0); } + bool operator<( + const std::string& inStr) const + { return (Compare(inStr) < 0); } + const char* c_str() const + { return GetPtr(); } + bool empty() const + { return (GetSize() <= 0); } + size_t size() const + { return GetSize(); } + size_t length() const + { return GetSize(); } + void clear() + { + mSize = -1; + mStr.clear(); + } + //operator std::string () const + // { return GetStr(); } +private: + std::string mStr; + char mBuf[DEFAULT_CAPACITY + 1]; + int mSize; +}; + +template +inline static bool operator==( + const std::string& inStr, + const StringBufT& inBuf) +{ + return (inBuf == inStr); +} + +template +inline static bool operator==( + const char* inStrPtr, + const StringBufT& inBuf) +{ + return (inBuf == inStrPtr); +} + +template +inline static std::ostream& operator<<( + std::ostream& inStream, + const StringBufT& inBuf) +{ return inStream.write(inBuf.GetPtr(), inBuf.GetSize()); } + +} + +#endif /* ST_BUFFER_H */ diff --git a/src/cc/common/StTmp.h b/src/cc/common/StTmp.h new file mode 100644 index 000000000..fb73e98ab --- /dev/null +++ b/src/cc/common/StTmp.h @@ -0,0 +1,104 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/06/09 +// Author: Mike Ovsainnikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Temporary recursion protected "variable". +// +//---------------------------------------------------------------------------- + +#ifndef ST_TMP_H +#define ST_TMP_H + +#include + +namespace KFS +{ + +template class StTmp +{ +public: + template class TmpT + { + public: + TmpT() + : mTmp(), + mInUseFlag(false) + {} + ~TmpT() { assert(! mInUseFlag); } + private: + Tp mTmp; + bool mInUseFlag; + + TmpT* Get() + { + if (mInUseFlag) { + return 0; + } + mInUseFlag = true; + return this; + } + void Put() + { + assert(mInUseFlag); + mTmp.clear(); + mInUseFlag = false; + } + TmpT(const TmpT&); + TmpT& operator=(const TmpT&); + friend class StTmp; + }; + typedef TmpT Tmp; + + StTmp(Tmp& inTmp) + : mTmpPtr(inTmp.Get()), + mTmp(mTmpPtr ? mTmpPtr->mTmp : + *(new (&mTmpStorage) T())) + {} + ~StTmp() + { + if (mTmpPtr) { + mTmpPtr->Put(); + } else { + mTmp.~T(); + } + } + T& Get() + { + mTmp.clear(); + return mTmp; + } +private: + Tmp* const mTmpPtr; + struct { + size_t mStorage[ + (sizeof(T) + sizeof(size_t) - 1) / + sizeof(size_t) + ]; + } mTmpStorage; + T& mTmp; + + StTmp(const StTmp&); + StTmp& operator=(const StTmp&); +}; + +} + +#endif /* ST_TMP_H */ diff --git a/src/cc/common/StdAllocator.h b/src/cc/common/StdAllocator.h new file mode 100644 index 000000000..512696023 --- /dev/null +++ b/src/cc/common/StdAllocator.h @@ -0,0 +1,127 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/08/09 +// Author: Mike Ovsainnikov +// +// Copyright 2011 Quantcast Corp. 
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Stl pool allocators. Boost fast allocator has a bug with 0 size allocation. +// To paper over this bug return pointer to an empty sting, and do not invoke +// de-allocation with 0 element count. +// If available use GNU pool allocator, instead of boost pool allocator. +// GNU allocator can be turned off at run time by setting environment variable +// GLIBCXX_FORCE_NEW=1 to check for memory leaks with valgrind and such. +// +//---------------------------------------------------------------------------- + +#ifndef STD_ALLOCATOR_H +#define STD_ALLOCATOR_H + +#if ! 
defined(__GNUC__) || (__GNUC__ < 3 || \ + (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) +# include "boost/pool/pool_alloc.hpp" +# define KFS_STD_POOL_ALLOCATOR_T boost::pool_allocator +# define KFS_STD_FAST_POOL_ALLOCATOR_T boost::fast_pool_allocator +#else +# include +# define KFS_STD_POOL_ALLOCATOR_T __gnu_cxx::__pool_alloc +# define KFS_STD_FAST_POOL_ALLOCATOR_T __gnu_cxx::__pool_alloc +#endif + +namespace KFS +{ + +template < + typename T, + typename ALLOCATOR = KFS_STD_POOL_ALLOCATOR_T +> +class StdAllocator : public ALLOCATOR::template rebind::other +{ +private: + typedef typename ALLOCATOR::template rebind::other MySuper; +public: + typedef typename MySuper::pointer pointer; + typedef typename MySuper::size_type size_type; + + pointer allocate( + size_type inCount) + { + return (inCount == 0 ? (pointer)"" : MySuper::allocate(inCount)); + } + void deallocate( + pointer inPtr, + size_type inCount) + { + if (inCount != 0) { + MySuper::deallocate(inPtr, inCount); + } + } + template + struct rebind + { + typedef StdAllocator other; + }; + StdAllocator() + : MySuper() + {} + StdAllocator( + const StdAllocator& inAlloc) + : MySuper(inAlloc) + {} + template + StdAllocator( + const StdAllocator& inAlloc) + : MySuper(inAlloc) + {} + ~StdAllocator() + {} +}; + +template < + typename T, + typename ALLOCATOR = KFS_STD_FAST_POOL_ALLOCATOR_T +> +class StdFastAllocator : public StdAllocator +{ +private: + typedef StdAllocator MySuper; +public: + template + struct rebind + { + typedef StdFastAllocator other; + }; + StdFastAllocator() + : MySuper() + {} + StdFastAllocator( + const StdFastAllocator& inAlloc) + : MySuper(inAlloc) + {} + template + StdFastAllocator( + const StdFastAllocator& inAlloc) + : MySuper(inAlloc) + {} + ~StdFastAllocator() + {} +}; + +} // namespace KFS + +#endif /* STD_ALLOCATOR_H */ diff --git a/src/cc/common/ValueSampler.h b/src/cc/common/ValueSampler.h new file mode 100644 index 000000000..68e786231 --- /dev/null +++ b/src/cc/common/ValueSampler.h @@ -0,0 
+1,147 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/06/21 +// Author: Mike Ovsiannikov +// +// Copyright 2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef VALUE_SAMPLER_H +#define VALUE_SAMPLER_H + +#include + +namespace KFS +{ + +// Sample / history buffer. Put puts current value and the corresponding time +// stamp into the buffer. +// Can be used to implement filters, compute average, etc. 
+class ValueSampler +{ +public: + typedef int64_t Sample; + typedef int64_t Time; + + ValueSampler( + int inMaxSamples, + Sample inSample, + Time inTime) + : mMaxSamples(0), + mCurIdx(0), + mSamplesPtr(0), + mTotal(0) + { ValueSampler::SetMaxSamples(inMaxSamples, inSample, inTime); } + ~ValueSampler() + { delete [] mSamplesPtr; } + void Put( + Sample inSample, + Time inTime) + { + SampleEntry& theEntry = mSamplesPtr[mCurIdx]; + mTotal -= theEntry.mSample; + mTotal += inSample; + theEntry.mSample = inSample; + theEntry.mTime = inTime; + if (++mCurIdx >= mMaxSamples) { + mCurIdx = 0; + } + } + int GetMaxSamples() const + { return mMaxSamples; } + Time GetTimeIterval() const + { + return (mSamplesPtr[GetLastIdx()].mTime - + mSamplesPtr[GetFirstIdx()].mTime); + } + Sample GetTotal() const + { return mTotal; } + Sample GetLastFirstDiffByTime() const + { + SampleEntry& theFirst = mSamplesPtr[GetFirstIdx()]; + SampleEntry& theLast = mSamplesPtr[GetLastIdx()]; + const Time theTime = theLast.mTime - theFirst.mTime; + if (theTime <= 0) { + return 0; + } + return ((theLast.mSample - theFirst.mSample) / theTime); + } + void SetMaxSamples( + int inMaxSamples, + Sample inSample, + Time inTime) + { + if (inMaxSamples == mMaxSamples && mMaxSamples > 0) { + return; + } + mMaxSamples = inMaxSamples > 0 ? 
inMaxSamples : 1; + delete [] mSamplesPtr; + mSamplesPtr = new SampleEntry[mMaxSamples]; + Reset(inSample, inTime); + } + void Reset( + Sample inSample, + Time inTime) + { mTotal = 0; + for (int i = 0; i < mMaxSamples; i++) { + mSamplesPtr[i].mSample = inSample; + mSamplesPtr[i].mTime = inTime; + mTotal += inSample; + } + } + void GetLastFirstDiff( + Sample& outSampleDiff, + Time& outTimeDiff) const + { + SampleEntry& theFirst = mSamplesPtr[GetFirstIdx()]; + SampleEntry& theLast = mSamplesPtr[GetLastIdx()]; + outTimeDiff = theLast.mTime - theFirst.mTime; + outSampleDiff = theLast.mSample - theFirst.mSample; + } +private: + struct SampleEntry + { + SampleEntry() + : mSample(), + mTime() + {} + Sample mSample; + Time mTime; + }; + + int mMaxSamples; + int mCurIdx; + SampleEntry* mSamplesPtr; + Sample mTotal; + + int GetFirstIdx() const + { return mCurIdx; } + int GetLastIdx() const + { return ((mCurIdx > 0 ? mCurIdx : mMaxSamples) - 1); } +private: + ValueSampler( + const ValueSampler& inSampler); + ValueSampler& operator=( + const ValueSampler& inSampler); +}; + +} // namespace KFS + +#endif /* VALUE_SAMPLER_H */ diff --git a/src/cc/common/Version.h b/src/cc/common/Version.h new file mode 100644 index 000000000..71ea213cd --- /dev/null +++ b/src/cc/common/Version.h @@ -0,0 +1,42 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id: Version.h 192 2008-10-22 05:33:26Z sriramsrao $ +// +// \brief Header file for getting KFS version #'s related to builds. +// +// Created 2008/10/20 +// Author: Sriram Rao +// +// Copyright 2008-2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_KFSVERSION_H +#define COMMON_KFSVERSION_H + +#include + +// Store build version information in executables. +// Build info can be retrieved with strings | awk +// See buildversgit.sh +namespace KFS { + extern const std::string KFS_BUILD_VERSION_STRING; + extern const std::string KFS_SOURCE_REVISION_STRING; + extern const std::string KFS_BUILD_INFO_STRING; +} + + +#endif diff --git a/src/cc/common/buildVers.py b/src/cc/common/buildVers.py new file mode 100755 index 000000000..83c1013e3 --- /dev/null +++ b/src/cc/common/buildVers.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# +# $Id: buildVers.py 192 2008-10-22 05:33:26Z sriramsrao $ +# +# Copyright 2008-2010 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# Script to generate Version.cc. What we are interested in is the +# revision # from the repository for the entire tree as we built the +# source. 
This script is invoked by the makefile. +# + +import os,sys + +# get the version info from p4 +buildstr = "perforce1:1666/trunk@1000" +srcRevision = "100" + +fh = open(sys.argv[2], "w") +print >> fh, "//" +print >> fh, "// This file is generated during compilation. DO NOT EDIT!" +print >> fh, "//" +print >> fh, "#include \"Version.h\" " +print >> fh, "const std::string KFS::KFS_BUILD_VERSION_STRING=\"%s\";" % buildstr +print >> fh, "const std::string KFS::KFS_SOURCE_REVISION_STRING=\"%s\";" % srcRevision + diff --git a/src/cc/common/buildversgit.sh b/src/cc/common/buildversgit.sh new file mode 100755 index 000000000..f99f9582f --- /dev/null +++ b/src/cc/common/buildversgit.sh @@ -0,0 +1,148 @@ +#!/bin/sh +# +# $Id$ +# +# Created 2011/05/04 +# Author: Mike Ovsiannikov +# +# Copyright 2011-2012 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# Create Version.cc file with the relevant source / build information. +# +# + +if [ x"$1" = x'-g' -o x"$1" = x'-get' ]; then + while [ $# -gt 1 ]; do + shift + strings -a "$1" | awk '/KFS_BUILD_INFO_START/,/KFS_BUILD_INFO_END/' + done + exit +fi + +# Set official release version here. 
+# kfs_release_version + +if [ $# -eq 1 -a x"$1" = x'-v' ]; then + if [ x"$kfs_release_version" != x ]; then + echo "$kfs_release_version" + exit + fi + ( + cd "`dirname "$0"`" >/dev/null || exit + git log -n 1 --pretty=format:%H + ) 2>/dev/null + exit 0 +fi + +if [ $# -lt 3 ]; then + echo "Usage: $0 " + echo "or: $0 -g " + exit 1 +fi + + +buildtype=$1 +shift +sourcedir=$1 +shift +outfile=$1 +shift + +if [ x"$kfs_release_version" != x ]; then + kfs_version_prefix="${kfs_release_version}-" +else + kfs_version_prefix='' +fi + +lastchangeid=`git log -n 1 --pretty=format:%H -- "$sourcedir" 2>/dev/null` +if [ x"$lastchangeid" = x ]; then + remote='unspecified' + branch='unspecified' +else + remote=`git remote -v show | awk '{if($NF=="(fetch)") { printf("%s", $2); exit; }}'` + branch=`git branch --no-color | awk '{if($1=="*") { if ($3 != "branch)") printf("%s", $2); exit; }}'` +fi + +tmpfile="$outfile.$$.tmp"; + +{ +echo ' +// Generated by '"$0"'. Do not edit. + +#include "Version.h" +#include "hsieh_hash.h" + +namespace KFS { + +const std::string KFS_BUILD_INFO_STRING=' + +{ +echo KFS_BUILD_INFO_START +echo "host: `hostname`" +echo "user: $USER" +echo "date: `date`" +echo "build type: $buildtype" +while [ $# -gt 0 ]; do + echo "$1" + shift +done +if [ x"$kfs_release_version" != x ]; then + echo "release: $kfs_release_version" +else + echo "release: none" +fi +if [ x"$lastchangeid" != x ]; then + echo "git config:" + git config -l + echo "git status:" + git status --porcelain -- "$sourcedir" + echo "git branch:" + git branch -v --no-abbrev --no-color + echo "git remote:" + git remote -v + echo "version:" + echo "${remote}/${branch}@$lastchangeid" +else + echo 'git source build version not available' +fi +echo KFS_BUILD_INFO_END +} | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/^/"/' -e 's/$/\\n"/' + +echo '; +static std::string MakeVersionHash() +{ + Hsieh_hash_fcn f; + unsigned int h = (unsigned int)f(KFS_BUILD_INFO_STRING); + std::string ret(2 * sizeof(h), '"'0'"'); + for 
(size_t i = ret.length() - 1; h != 0; i--) { + ret[i] = "0123456789ABCDEF"[h & 0xF]; + h >>= 4; + } + return ret; +} +const std::string KFS_BUILD_VERSION_STRING( + std::string("'"${kfs_release_version}${lastchangeid}-${buildtype}"'-") + + MakeVersionHash() +); +const std::string KFS_SOURCE_REVISION_STRING( + "'"${kfs_release_version}${remote}/${branch}@$lastchangeid"'" +); +} +' + +} > "$tmpfile" +mv "$tmpfile" $outfile diff --git a/src/cc/common/buildversp4.sh b/src/cc/common/buildversp4.sh new file mode 100755 index 000000000..04ac9e3f5 --- /dev/null +++ b/src/cc/common/buildversp4.sh @@ -0,0 +1,122 @@ +#!/bin/sh +# +# $Id$ +# +# Created 2010/10/20 +# Author: Mike Ovsiannikov +# +# Copyright 2010-2011 Quantcast Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +if [ x"$1" = x'-g' -o x"$1" = x'-get' ]; then + while [ $# -gt 1 ]; do + shift + strings -a "$1" | awk '/KFS_BUILD_INFO_START/,/KFS_BUILD_INFO_END/' + done + exit +fi + +if [ $# -lt 3 ]; then + echo "Usage: $0 " + echo "or: $0 -g " + exit 1; +fi + +buildtype=$1 +shift +sourcedir=$1 +shift +outfile=$1 +shift + +if [ x"$P4PORT" = x -o x"$KFS_BUILD_VERS_NO_P4" != x ]; then + lastchangenum=0 +else + lastchange=`p4 changes -m 1 -t "$sourcedir/...#have"` + lastchangenum=`echo "$lastchange" | awk '/Change /{c=$2;} END{printf("%d", c);}'` +fi +if [ $lastchangenum -ne 0 ]; then + p4path=`p4 have "$sourcedir/CMakeLists.txt" | sed -e 's/CMakeLists.txt.*$//'` +else + p4path='//unspecified/' +fi + +tmpfile="$outfile.$$.tmp"; + +{ +echo ' +// Generated by '"$0"'. Do not edit. + +#include "Version.h" +#include "hsieh_hash.h" + +namespace KFS { + +const std::string KFS_BUILD_INFO_STRING=' + +{ +echo KFS_BUILD_INFO_START +echo "host: `hostname`" +echo "user: $USER" +echo "date: `date`" +echo "build type: $buildtype" +while [ $# -gt 0 ]; do + echo "$1" + shift +done +echo "p4: $P4PORT" +if [ $lastchangenum -ne 0 ]; then + p4 info + echo "$lastchange" + echo "${p4path}...@$lastchangenum" + { + p4 have "$sourcedir"/... + echo 'opened:' + p4 opened "$sourcedir"/... 
2>/dev/null + } | sed -e 's/\(#[0-9]*\) - .*$/\1/' +else + echo 'p4 source build version disabled' + echo "KFS_BUILD_VERS_NO_P4: $KFS_BUILD_VERS_NO_P4" +fi +echo KFS_BUILD_INFO_END +} | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/^/"/' -e 's/$/\\n"/' + +echo '; +static std::string MakeVersionHash() +{ + Hsieh_hash_fcn f; + unsigned int h = (unsigned int)f(KFS_BUILD_INFO_STRING); + std::string ret(2 * sizeof(h), '"'0'"'); + for (size_t i = ret.length() - 1; h != 0; i--) { + ret[i] = "0123456789ABCDEF"[h & 0xF]; + h >>= 4; + } + return ret; +} +const std::string KFS_BUILD_VERSION_STRING( + std::string("'"${lastchangenum}-${buildtype}"'-") + + MakeVersionHash() +); +const std::string KFS_SOURCE_REVISION_STRING( + "'"${p4path}...@$lastchangenum"'" +); +} +' + +} > "$tmpfile" +mv "$tmpfile" $outfile diff --git a/src/cc/common/config.h b/src/cc/common/config.h new file mode 100644 index 000000000..14ad73448 --- /dev/null +++ b/src/cc/common/config.h @@ -0,0 +1,50 @@ + +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/08/23 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +//---------------------------------------------------------------------------- + +#ifndef CC_CONFIG_H +#define CC_CONFIG_H + +//---------------------------------------------------------------------------- +// Attribute to disable "unused variable" warnings +// +// Example: +// int UNUSED_ATTR r = aFunctionThatAlwaysAlwaysAlwaysReturnsZero(); +// assert(r == 0); +// +// Note, this doesn't break the variable when it actually *is* used, +// as in a debug build. It just makes the compiler keep quiet about +// not using it in release builds. +//---------------------------------------------------------------------------- +#if !defined(UNUSED_ATTR) +#if defined(__GNUC__) +#define UNUSED_ATTR __attribute__((unused)) +#else +#define UNUSED_ATTR +#endif +#endif + + +#endif // CC_CONFIG_H diff --git a/src/cc/common/hsieh_hash.cc b/src/cc/common/hsieh_hash.cc new file mode 100644 index 000000000..2edc68b7f --- /dev/null +++ b/src/cc/common/hsieh_hash.cc @@ -0,0 +1,101 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/07/17 +// +// By Paul Hsieh (C) 2004, 2005. Covered under the Paul Hsieh derivative +// license. See: +// http://www.azillionmonkeys.com/qed/weblicense.html for license details. +// +// http://www.azillionmonkeys.com/qed/hash.html +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "hsieh_hash.h" + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +using std::string; + +using namespace KFS; + +uint32_t SuperFastHash (const char *data, int len) +{ + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + +// the wrappers we need +size_t Hsieh_hash_fcn::operator() (const char * data, int len) const +{ + return SuperFastHash(data, len); +} + +size_t Hsieh_hash_fcn::operator() (const string &data) const +{ + return SuperFastHash(data.c_str(), data.size()); +} diff --git a/src/cc/common/hsieh_hash.h b/src/cc/common/hsieh_hash.h new file mode 100644 index 000000000..581ded634 --- /dev/null +++ b/src/cc/common/hsieh_hash.h @@ -0,0 +1,41 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/07/17 +// 
Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief String hash using Hsieh hash function. +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_HSIEH_HASH_H +#define COMMON_HSIEH_HASH_H + +#include +#include +#include +namespace KFS +{ + struct Hsieh_hash_fcn { + std::size_t operator()(const char *data, int len) const; + std::size_t operator()(const std::string &data) const; + }; +} + +#endif // COMMON_HSIEH_HASH_H diff --git a/src/cc/common/httputils.h b/src/cc/common/httputils.h new file mode 100644 index 000000000..069ee9cb1 --- /dev/null +++ b/src/cc/common/httputils.h @@ -0,0 +1,219 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/04/09 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// Very basic http utility functions. +// +//---------------------------------------------------------------------------- + +#ifndef HTTPUTILS_H +#define HTTPUTILS_H + +#include "RequestParser.h" +#include + +namespace KFS +{ +namespace httputils +{ + +using std::string; + +template +inline static int +GetHeaderLength(const IOBufferT& iobuf) +{ + int idx = iobuf.IndexOf(0, "\r\n\r\n"); + if (idx >= 0) { + return idx + 4; + } + idx = iobuf.IndexOf(0, "\n\n"); + if (idx >= 0) { + return idx + 2; + } + idx = iobuf.IndexOf(0, "\r\r"); + return (idx >= 0 ? idx + 2 : idx); +} + +inline static int +HexToChar(int c) +{ + return (c >= '0' && c <= '9' ? c - '0' + : c >= 'A' && c <= 'F' ? c - 'A' + 10 + : c - 'a' + 10 + ); +} + +inline static int +AsciiCharToLower(int c) +{ + return ((c >= 'A' && c <= 'Z') ? 'a' + (c - 'A') : c); +} + +inline static bool +EqualsAsciiIgnoreCase(const char* s1, const char* s2, int len) +{ + while (len-- > 0) { + if (AsciiCharToLower(*s1++) != AsciiCharToLower(*s2++)) { + return false; + } + } + return true; +} + +inline static bool +EqualsToLowerAscii(const char* s1, const char* s2, int len) +{ + while (len-- > 0) { + if (AsciiCharToLower(*s1++) != *s2++) { + return false; + } + } + return true; +} + +inline static bool +IsProtoAtLeast11(const char* ptr, const char* end) +{ + while (ptr < end && *ptr <= ' ') { + ++ptr; + } + while (ptr < end && *ptr == '/') { + ++ptr; + } + int major = -1; + int minor = -1; + return ( + ValueParser::ParseInt(ptr, end - ptr, major) && + (major > 1 || + (ptr < end && *ptr++ == '.' && + ValueParser::ParseInt(ptr, end - ptr, minor) && + minor >= 1 + ))); +} + +class ByteIterator +{ +public: + ByteIterator( + const char* start, + const char* end) + : mPtr(start), + mEnd(end) + {} + const char* Next() + { return (mPtr < mEnd ? 
mPtr++ : 0); } +private: + const char* mPtr; + const char* const mEnd; +}; + +template +inline static void +LoadUrlEncodedParams(T& it, PropertiesT& props) +{ + const char* ptr; + while ((ptr = it.Next()) && *ptr <= ' ') + {} + + string buf; + string key; + for (; ptr && *ptr > ' '; ptr = it.Next()) { + int sym = *ptr; + if (sym == '%') { + if (! (ptr = it.Next())) { + break; + } + sym = (HexToChar(*ptr & 0xFF) & 0xF) << 4; + if (! (ptr = it.Next())) { + break; + } + sym += HexToChar(*ptr & 0xFF) & 0xF; + } else if (sym == '+') { + sym = ' '; + } else if (sym == '=') { + key = buf; + buf.clear(); + continue; + } else if (sym == '&') { + if (! key.empty()) { + props.setValue(key, buf); + key.clear(); + } + buf.clear(); + continue; + } + buf.append(1, char(sym)); + } + if (! key.empty()) { + props.setValue(key, buf); + } +} + +const PropertiesTokenizer::Token kHttpContentLengthKey("content-length"); +const PropertiesTokenizer::Token kHttpHostKey("host"); + +template +inline static bool +GetContentLengthAndHost(const char* ptr, const char* end, + LengthT& contentLength, HostT* host) +{ + bool found = true; + PropertiesTokenizer tokenizer(ptr, end - ptr); + while (tokenizer.Next()) { + if (tokenizer.GetKey().mLen == kHttpContentLengthKey.mLen && + EqualsToLowerAscii( + tokenizer.GetKey().mPtr, + kHttpContentLengthKey.mPtr, + kHttpContentLengthKey.mLen + )) { + const char* ptr = tokenizer.GetValue().mPtr; + found = ValueParser::ParseInt( + ptr, tokenizer.GetValue().mLen, contentLength + ) || found; + } else if (host && + tokenizer.GetKey().mLen == kHttpHostKey.mLen && + EqualsToLowerAscii( + tokenizer.GetKey().mPtr, + kHttpHostKey.mPtr, + kHttpHostKey.mLen + )) { + ValueParser::SetValue( + tokenizer.GetValue().mPtr, + tokenizer.GetValue().mLen, + HostT(), + *host + ); + } + } + return found; +} + +template +inline static bool +GetContentLength(const char* ptr, const char* end, T& contentLength) +{ + return GetContentLengthAndHost(ptr, end, contentLength, 0); +} + +}} + 
+#endif /* HTTPUTILS_H */ diff --git a/src/cc/common/kfsatomic.cc b/src/cc/common/kfsatomic.cc new file mode 100644 index 000000000..90337aeb5 --- /dev/null +++ b/src/cc/common/kfsatomic.cc @@ -0,0 +1,87 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/15 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "kfsatomic.h" + +#include +#include + +#ifndef _KFS_ATOMIC_USE_MUTEX +# ifndef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +# define _KFS_ATOMIC_GCC_IMPLEMENT_ATOMIC_8_BYTE_OPS +# endif +#endif + +namespace KFS +{ + +#if defined(_KFS_ATOMIC_USE_MUTEX) || \ + defined(_KFS_ATOMIC_GCC_IMPLEMENT_ATOMIC_8_BYTE_OPS) + +static pthread_mutex_t sKfsAtomicMutex = PTHREAD_MUTEX_INITIALIZER; + +inline static void KfsAtomicLockImpl() +{ + const int err = pthread_mutex_lock(&sKfsAtomicMutex); + if (err) { + abort(); + } +} + +inline static void KfsAtomicUnlockImpl() +{ + const int err = pthread_mutex_unlock(&sKfsAtomicMutex); + if (err) { + abort(); + } +} + +#endif + +#ifdef _KFS_ATOMIC_USE_MUTEX +namespace atomicmpl +{ +void AtomicLock() { KfsAtomicLockImpl(); } +void AtomicUnlock() { KfsAtomicUnlockImpl(); } +} +#endif /* _KFS_ATOMIC_USE_MUTEX */ + +#ifdef _KFS_ATOMIC_GCC_IMPLEMENT_ATOMIC_8_BYTE_OPS +extern "C" +{ +long long unsigned int __sync_add_and_fetch_8( + volatile void* val, long long unsigned int inc) +{ + KfsAtomicLockImpl(); + long long unsigned int ret = *((long long unsigned int*)val); + ret += inc; + *((long long unsigned int*)val) = ret; + KfsAtomicUnlockImpl(); + return ret; +} +} +#endif /* _KFS_ATOMIC_GCC_IMPLEMENT_ATOMIC_8_BYTE_OPS */ + +} // namespace KFS diff --git a/src/cc/common/kfsatomic.h b/src/cc/common/kfsatomic.h new file mode 100644 index 000000000..a90802230 --- /dev/null +++ b/src/cc/common/kfsatomic.h @@ -0,0 +1,67 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/15 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// "Atomic" variables of different sizes (well up to 64 bits on 64 bit +// platforms). +// +//---------------------------------------------------------------------------- + +#ifndef KFS_ATOMIC_H +#define KFS_ATOMIC_H + +namespace KFS +{ +#if ! defined(_KFS_ATOMIC_USE_MUTEX) && (\ + ! defined(__GNUC__) || (__GNUC__ < 4 || \ + (__GNUC__ == 4 && (__GNUC_MINOR__ < 1 || \ + (__GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ < 2))))) +# define _KFS_ATOMIC_USE_MUTEX +#endif + +#ifdef _KFS_ATOMIC_USE_MUTEX + +namespace atomicmpl +{ +void AtomicLock(); +void AtomicUnlock(); +} + +template T SyncAddAndFetch(volatile T& val, T inc) +{ + atomicmpl::AtomicLock(); + val += inc; + const T ret = val; + atomicmpl::AtomicUnlock(); + return ret; +} + +#else + +template T SyncAddAndFetch(volatile T& val, T inc) +{ + return __sync_add_and_fetch(&val, inc); +} + +#endif /* _KFS_ATOMIC_USE_MUTEX */ +} + +#endif /* KFS_ATOMIC_H */ diff --git a/src/cc/common/kfsdecls.h b/src/cc/common/kfsdecls.h new file mode 100644 index 000000000..dd941ec5d --- /dev/null +++ b/src/cc/common/kfsdecls.h @@ -0,0 +1,205 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// \brief Common declarations of KFS structures +// +// Created 2006/10/20 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_KFSDECLS_H +#define COMMON_KFSDECLS_H + +#include "kfstypes.h" + +#include +#include +#include +#include +#include + +namespace KFS +{ +using std::ostringstream; +using std::istringstream; +using std::ostream; +using std::istream; +using std::string; +using std::min; + +/// +/// Define a server process' location: hostname and the port at which +/// it is listening for incoming connections +/// +struct ServerLocation { + ServerLocation(): hostname(""), port(-1) { } + ServerLocation(const ServerLocation &other): + hostname(other.hostname), port(other.port) { } + ServerLocation(const string &h, int p): hostname(h), port(p) { } + ServerLocation & operator = (const ServerLocation &other) { + hostname = other.hostname; + port = other.port; + return *this; + } + void Reset(const char *h, int p) { + hostname = h; + port = p; + } + bool operator == (const ServerLocation &other) const { + return hostname == other.hostname && port == other.port; + } + bool operator != (const ServerLocation &other) const { + return hostname != other.hostname || port != other.port; + } + bool operator < (const ServerLocation &other) const { + const int res = hostname.compare(other.hostname); + return (res < 0 || (res == 0 && port < other.port)); + } + bool IsValid() const { + // Hostname better be non-null and port better + // be a positive number + return (! 
hostname.empty() && port > 0); + } + + // a proxy for distance between two hosts: take the difference + // between their hostnames. this will mostly work as long as all + // machines in the cluster are named as nodeXXX, where XXX is a number + int Distance(const string &otherhost) { + int len = (int) min(hostname.size(), otherhost.size()); + int hosta = 0, hostb = 0; + int scalefactor = 1; + + for (int i = len - 1; i >= 0; --i) { + if (isdigit(hostname[i])) + hosta += (hostname[i] - '0') * scalefactor; + if (isdigit(otherhost[i])) + hostb += (otherhost[i] - '0') * scalefactor; + scalefactor *= 10; + } + return abs(hosta - hostb); + } + string ToString() const { + ostringstream os; + Display(os); + return os.str(); + } + ostream& Display(ostream& os) const { + return (os << hostname << ' ' << port); + } + void FromString(const string &s) { + istringstream is(s); + is >> hostname; + is >> port; + } + + string hostname; //!< Location of the server: machine name/IP addr + int port; //!< Location of the server: port to connect to +}; + +inline static ostream& +operator<<(ostream& os, const ServerLocation& loc) { + return loc.Display(os); +} +inline static istream& +operator>>(istream& is, ServerLocation& loc) { + return (is >> loc.hostname >> loc.port); +} + +// I-node (file / directory) permissions. 
+class Permissions +{ +public: + enum PBits + { + kExec = 1, + kWrite = 2, + kRead = 4 + }; + enum { kStickyBit = 1 << (3 * 3) }; + enum { kAccessModeMask = 0777 }; + enum { kFileModeMask = kAccessModeMask }; + enum { kDirModeMask = kStickyBit | kAccessModeMask }; + + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + + Permissions( + kfsUid_t u = kKfsUserNone, + kfsGid_t g = kKfsGroupNone, + kfsMode_t m = kKfsModeUndef) + : user(u), + group(g), + mode(m) + {} + kfsMode_t GetPermissions(kfsUid_t euser, kfsGid_t egroup) const + { + if (user == euser) { + return ((mode >> 6) & 0x7); + } + if (group == egroup) { + return ((mode >> 3) & 0x7); + } + return (mode & 0x7); + } + bool Access(kfsUid_t euser, kfsGid_t egroup, PBits perm) const + { + return (euser == kKfsUserRoot || + (GetPermissions(euser, egroup) & perm) != 0); + } + bool CanExec(kfsUid_t euser, kfsGid_t egroup) const + { return Access(euser, egroup, kExec); } + bool CanWrite(kfsUid_t euser, kfsGid_t egroup) const + { return Access(euser, egroup, kWrite); } + bool CanRead(kfsUid_t euser, kfsGid_t egroup) const + { return Access(euser, egroup, kRead); } + bool CanSearch(kfsUid_t euser, kfsGid_t egroup) const + { return Access(euser, egroup, kExec); } + bool IsAnyPermissionDefined() const + { + return ( + mode != kKfsModeUndef || + user != kKfsUserNone || + group != kKfsGroupNone + ); + } + bool IsPermissionValid() const + { + return ( + mode != kKfsModeUndef && + user != kKfsUserNone && + group != kKfsGroupNone + ); + } + bool IsSticky() const { return (mode & kStickyBit); } + void SetSticky(bool flag) + { + if (flag) { + mode |= kfsMode_t(kStickyBit); + } else { + mode &= ~kfsMode_t(kStickyBit); + } + } +}; + +} + +#endif // COMMON_KFSDECLS_H diff --git a/src/cc/common/kfstypes.h b/src/cc/common/kfstypes.h new file mode 100644 index 000000000..dca742917 --- /dev/null +++ b/src/cc/common/kfstypes.h @@ -0,0 +1,117 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// 
+// \brief Common declarations for KFS (meta/chunk/client-lib) +// +// Created 2006/10/20 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_KFSTYPES_H +#define COMMON_KFSTYPES_H + +#include +#include + +#include +#include +#include + +namespace KFS { + +typedef int64_t seq_t; //!< request sequence no. for logging +typedef int64_t seqid_t; //!< sequence number id's for file/chunks +typedef seqid_t fid_t; //!< file ID +typedef seqid_t chunkId_t; //!< chunk ID +typedef int64_t chunkOff_t; //!< chunk offset +const fid_t ROOTFID = 2; //!< special fid for "/" + +//!< Every time we change the protocol, rev. this one. We can use this value to +//!< detect clients running old binaries. 
+const int KFS_CLIENT_PROTO_VERS = 114; +const int KFS_CLIENT_MIN_STRIPED_FILE_SUPPORT_PROTO_VERS = 110; + +//!< Declarations as used in the Chunkserver/client-library +typedef int64_t kfsFileId_t; +typedef int64_t kfsChunkId_t; +typedef int64_t kfsSeq_t; + +typedef uint32_t kfsUid_t; +typedef uint32_t kfsGid_t; +typedef uint16_t kfsMode_t; + +const kfsUid_t kKfsUserRoot = 0; +const kfsUid_t kKfsUserNone = ~kfsUid_t(0); +const kfsGid_t kKfsGroupRoot = 0; +const kfsGid_t kKfsGroupNone = ~kfsGid_t(0); +const kfsMode_t kKfsModeUndef = ~kfsMode_t(0); + +const size_t CHUNKSIZE = 64u << 20; //!< (64MB) +const int MAX_RPC_HEADER_LEN = 16 << 10; //!< Max length of header in RPC req/response +const size_t MAX_FILE_NAME_LENGTH = 4 << 10; +const size_t MAX_PATH_NAME_LENGTH = MAX_FILE_NAME_LENGTH * 3; +const short int NUM_REPLICAS_PER_FILE = 3; //!< default degree of replication +const short int MAX_REPLICAS_PER_FILE = 64; //!< max. replicas per chunk of file + +//!< Default lease interval of 5 mins +const int LEASE_INTERVAL_SECS = 300; + +//!< Error codes for KFS specific errors +// version # being presented by client doesn't match what the server has +const int EBADVERS = 1000; + +// lease has expired +const int ELEASEEXPIRED = 1001; + +// checksum for data on a server is bad; client should read from elsewhere +const int EBADCKSUM = 1002; + +// data lives on chunkservers that are all non-reachable +const int EDATAUNAVAIL = 1003; + +// an error to indicate a server is busy and can't take on new work +const int ESERVERBUSY = 1004; + +// an error occurring during allocation; the client will see this error +// code and retry. +const int EALLOCFAILED = 1005; + +// error to indicate that there is a cluster key mismatch between +// chunkserver and metaserver. 
+const int EBADCLUSTERKEY = 1006; + +// invalid chunk size +const int EINVALCHUNKSIZE = 1007; + +enum StripedFileType +{ + KFS_STRIPED_FILE_TYPE_UNKNOWN = 0, + KFS_STRIPED_FILE_TYPE_NONE = 1, + KFS_STRIPED_FILE_TYPE_RS = 2 +}; + +const int KFS_STRIPE_ALIGNMENT = 4096; +const int KFS_MIN_STRIPE_SIZE = KFS_STRIPE_ALIGNMENT; +const int KFS_MAX_STRIPE_SIZE = (int)CHUNKSIZE; + +} + +#endif // COMMON_KFSTYPES_H diff --git a/src/cc/common/rusage.cc b/src/cc/common/rusage.cc new file mode 100644 index 000000000..831891dbc --- /dev/null +++ b/src/cc/common/rusage.cc @@ -0,0 +1,81 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/05/03 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file rusage.cc +// \brief getrusage wrapper +// +//---------------------------------------------------------------------------- + +#ifndef RUSAGE_H +#define RUSAGE_H + +#include "rusage.h" +#include +#include +#include + +#include +#include + +namespace KFS +{ +using std::ostream; + +ostream& +showrusage(ostream& inStream, const char* inSeparatorPtr, + const char* inDelimiterPtr, bool inSelfFlag) +{ + const char* const theSepPtr = inSeparatorPtr ? inSeparatorPtr : ": "; + const char* const theDelimPtr = inDelimiterPtr ? 
inDelimiterPtr : "\n"; + struct rusage theRusage; + memset(&theRusage, 0, sizeof(theRusage)); + if (getrusage(inSelfFlag ? RUSAGE_SELF : RUSAGE_CHILDREN, &theRusage)) { + inStream.setstate(ostream::failbit); + return inStream; + } + inStream << + "utime" << theSepPtr << + theRusage.ru_utime.tv_sec * 1000 * 1000 + + theRusage.ru_utime.tv_usec << theDelimPtr << + "stime" << theSepPtr << + theRusage.ru_stime.tv_sec * 1000 * 1000 + + theRusage.ru_stime.tv_usec << theDelimPtr << + "maxrss" << theSepPtr << theRusage.ru_maxrss << theDelimPtr << + "ixrss" << theSepPtr << theRusage.ru_ixrss << theDelimPtr << + "idrss" << theSepPtr << theRusage.ru_idrss << theDelimPtr << + "isrss" << theSepPtr << theRusage.ru_isrss << theDelimPtr << + "minflt" << theSepPtr << theRusage.ru_minflt << theDelimPtr << + "majflt" << theSepPtr << theRusage.ru_majflt << theDelimPtr << + "inblock" << theSepPtr << theRusage.ru_inblock << theDelimPtr << + "oublock" << theSepPtr << theRusage.ru_oublock << theDelimPtr << + "msgsnd" << theSepPtr << theRusage.ru_msgsnd << theDelimPtr << + "msgrcv" << theSepPtr << theRusage.ru_msgrcv << theDelimPtr << + "nsignals" << theSepPtr << theRusage.ru_nsignals << theDelimPtr << + "nvscw" << theSepPtr << theRusage.ru_nvcsw << theDelimPtr << + "nivscw" << theSepPtr << theRusage.ru_nivcsw << theDelimPtr + ; + return inStream; +} + +} + +#endif /* RUSAGE_H */ diff --git a/src/cc/common/rusage.h b/src/cc/common/rusage.h new file mode 100644 index 000000000..aa366b954 --- /dev/null +++ b/src/cc/common/rusage.h @@ -0,0 +1,41 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/05/03 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file rusage.h +// \brief getrusage wrapper +// +//---------------------------------------------------------------------------- + +#ifndef RUSAGE_H +#define RUSAGE_H + +#include + +namespace KFS +{ +using std::ostream; + +ostream& showrusage(ostream& inStream, const char* inSeparatorPtr, + const char* inDelimiterPtr, bool inSelfFlag); +} + +#endif /* RUSAGE_H */ diff --git a/src/cc/common/time.cc b/src/cc/common/time.cc new file mode 100644 index 000000000..e07866244 --- /dev/null +++ b/src/cc/common/time.cc @@ -0,0 +1,59 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/10/27 +// Author: Dan Adkins +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \file time.cc +// \brief time related functions +// +//---------------------------------------------------------------------------- + +#include <sys/time.h> +#include <sys/resource.h> +#include "time.h" + +namespace KFS { + +int64_t +microseconds(void) +{ + struct timeval tv; + + if (gettimeofday(&tv, 0) < 0) + return -1; + + return (int64_t)tv.tv_sec*1000*1000 + tv.tv_usec; +} + +int64_t +cputime(int64_t *user, int64_t *sys) +{ + struct rusage ru; + + if (getrusage(RUSAGE_SELF, &ru) < 0) + return -1; + + *user = (int64_t)ru.ru_utime.tv_sec*1000*1000 + ru.ru_utime.tv_usec; + *sys = (int64_t)ru.ru_stime.tv_sec*1000*1000 + ru.ru_stime.tv_usec; + + return *user + *sys; +} + +} // namespace KFS diff --git a/src/cc/common/time.h b/src/cc/common/time.h new file mode 100644 index 000000000..5339076f3 --- /dev/null +++ b/src/cc/common/time.h @@ -0,0 +1,40 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/10/27 +// Author: Dan Adkins +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \file time.h +// \brief time related functions +// +//---------------------------------------------------------------------------- + +#ifndef COMMON_TIME_H +#define COMMON_TIME_H + +#include <stdint.h> + +namespace KFS { + +extern int64_t microseconds(void); +extern int64_t cputime(int64_t *user, int64_t *sys); + +} // namespace KFS + +#endif // COMMON_TIME_H diff --git a/src/cc/devtools/CMakeLists.txt b/src/cc/devtools/CMakeLists.txt new file mode 100644 index 000000000..e1ed3bda5 --- /dev/null +++ b/src/cc/devtools/CMakeLists.txt @@ -0,0 +1,55 @@ +# +# $Id$ +# +# Created 2006/10/20 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +set (exe_files +checksum +dirtree_creator +logger +rand-sfmt +requestparser +sortedhash +stlset +) + +# +# Every executable depends on its namesake source with _main.cc +# +foreach (exe_file ${exe_files}) + add_executable (${exe_file} ${exe_file}_main.cc) + if (USE_STATIC_LIB_LINKAGE) + add_dependencies (${exe_file} kfsClient) + target_link_libraries (${exe_file} tools kfsClient qcdio pthread) + else (USE_STATIC_LIB_LINKAGE) + add_dependencies (${exe_file} kfsClient-shared) + target_link_libraries (${exe_file} tools-shared kfsClient-shared qcdio-shared pthread) + endif (USE_STATIC_LIB_LINKAGE) + +endforeach (exe_file) + +# +install (TARGETS ${exe_files} + RUNTIME DESTINATION bin/devtools) + + diff --git a/src/cc/devtools/README b/src/cc/devtools/README new file mode 100644 index 000000000..7a4fac620 --- /dev/null +++ b/src/cc/devtools/README @@ -0,0 +1,20 @@ +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// + +The files in this directory are meant for filesystem developers' internal use. +Please refrain from executing the corresponding binaries without understanding +the code. 
diff --git a/src/cc/devtools/checksum_main.cc b/src/cc/devtools/checksum_main.cc new file mode 100644 index 000000000..11532bad5 --- /dev/null +++ b/src/cc/devtools/checksum_main.cc @@ -0,0 +1,83 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/05/25 +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Kfs checksum (adler32) unit test. +// +//---------------------------------------------------------------------------- + +#include "kfsio/checksum.cc" + +#include +#include +#include +#include + +int main(int argc, char** argv) +{ + if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { + printf("Usage: %s [flags]\n" + " flags can be any combination of 'c', 'n', 'd'.\n" + " c: test adler32 combine.\n" + " n: pad with 0.\n" + " d: debug.\n" + " The test reads input from STDIN ended by Ctrl+D.\n", + argv[0]); + return 0; + } + + static char buf[KFS::CHECKSUM_BLOCKSIZE * 4]; + char* p = buf; + ssize_t n = 0; + unsigned long o = 0; + const bool padd = argc <= 1 || strchr(argv[1], 'n') == 0; + const bool tcomb = argc > 1 && strchr(argv[1], 'c'); + const bool debug = argc > 1 && strchr(argv[1], 'd'); + char* const e = p + (tcomb ? 
sizeof(buf) : KFS::CHECKSUM_BLOCKSIZE); + + do { + while (p < e && (n = read(0, p, e - p)) > 0) { + p += n; + } + if (p <= buf) { + break; + } + const size_t len = (padd ? e : p) - buf; + if (padd && p < e) { + memset(p, 0, e - p); + } + const uint32_t cksum = KFS::ComputeBlockChecksum(buf, len); + if (tcomb) { + uint32_t cck = 0; + KFS::ComputeChecksums(buf, len, &cck); + if (cck != cksum) { + printf("mismatch %lu %lu %u %u\n", o, (unsigned long)len, + (unsigned int)cksum, (unsigned int)cck); + abort(); + } + } + if (! tcomb || debug) { + printf("%lu %lu %u\n", o, (unsigned long)len, (unsigned int)cksum); + } + o += len; + p = buf; + } while (n > 0); + return 0; +} diff --git a/src/cc/devtools/dirtree_creator_main.cc b/src/cc/devtools/dirtree_creator_main.cc new file mode 100644 index 000000000..555908e48 --- /dev/null +++ b/src/cc/devtools/dirtree_creator_main.cc @@ -0,0 +1,123 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/05/09 +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief This program can be used to evaluate the memory use on the metaserver +// by creating a directory hierarchy. For input, provide a file that +// lists the directory hierarchy to be created with the path to a +// complete file, one per line. 
+// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include "libclient/KfsClient.h" + +using std::ios_base; +using std::cout; +using std::ifstream; +using std::string; + +using namespace KFS; + +int +main(int argc, char **argv) +{ + char optchar; + const char* kfsPropsFile = 0; + const char* dataFile = 0; + bool help = false; + ifstream ifs; + + while ((optchar = getopt(argc, argv, "f:p:")) != -1) { + switch (optchar) { + case 'f': + dataFile = optarg; + break; + case 'p': + kfsPropsFile = optarg; + break; + default: + cout << "Unrecognized flag: " << optchar << "\n"; + help = true; + break; + } + } + + if (help || ! kfsPropsFile || ! dataFile) { + cout << "Usage: " << argv[0] << " -p " + << " -f " << "\n"; + return 0; + } + + ifs.open(dataFile, ios_base::in); + if (!ifs) { + cout << "Unable to open: " << dataFile << "\n"; + return 1; + } + + KfsClient* const kfsClient = Connect(kfsPropsFile); + if (! 
kfsClient ) { + cout << "kfs client failed to initialize...exiting" << "\n"; + return 1; + } + + int fd = 0; + int count = 0; + string kfspathname; + while (getline(ifs, kfspathname)) { + string kfsdirname, kfsfilename; + string::size_type slash = kfspathname.rfind('/'); + + if (slash == string::npos) { + cout << "Bad kfs path: " << kfsdirname << "\n"; + fd = 1; + break; + } + kfsdirname.assign(kfspathname, 0, slash); + kfsfilename.assign(kfspathname, slash + 1, kfspathname.size()); + if (kfsfilename.rfind(".crc") != string::npos) { + continue; + } + ++count; + if ((count % 10000) == 0) { + cout << "Done with " << count << " non-crc files" << "\n"; + } + if ((fd = kfsClient->Mkdirs(kfsdirname.c_str())) < 0) { + cout << "Mkdir failed: " << kfsdirname << + ": " << ErrorCodeToStr(fd) << "\n"; + break; + } + fd = kfsClient->Create(kfspathname.c_str()); + if (fd < 0) { + cout << "Create failed for path: " << kfspathname << " error: " << + ErrorCodeToStr(fd) << "\n"; + break; + } + fd = kfsClient->Close(fd); + } + delete kfsClient; + + return (fd != 0 ? 1 : 0); +} diff --git a/src/cc/devtools/logger_main.cc b/src/cc/devtools/logger_main.cc new file mode 100644 index 000000000..156addf0c --- /dev/null +++ b/src/cc/devtools/logger_main.cc @@ -0,0 +1,90 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/03/07 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "common/BufferedLogWriter.h" +#include "qcdio/QCUtils.h" +#include "common/Properties.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace KFS; +using namespace std; + +static uint64_t +Now() +{ + struct timeval theTime; + if (gettimeofday(&theTime, 0)) { + QCUtils::FatalError("gettimeofday", errno); + } + return (int64_t(theTime.tv_sec) * 1000000 + theTime.tv_usec); +} + +int +main(int argc, char** argv) +{ + if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { + return 0; + } + + BufferedLogWriter theLogWriter(fileno(stderr), 0, 1 << 20); + if (argc > 1) { + Properties theProps; + if (theProps.loadProperties(argv[1], '=', true) != 0) { + return 1; + } + theLogWriter.SetParameters(theProps, "logWriter."); + } + if (argc > 2) { + int theMax = (int)atof(argv[2]); + uint64_t theStart = Now(); + uint64_t thePrev = theStart; + for (int i = 1; i <= theMax; i++) { + theLogWriter.Append(BufferedLogWriter::kLogLevelDEBUG, "%d", i); + if ((i & 0xFFFFF) == 0) { + const uint64_t theNow = Now(); + cout << i * 1e6 / (double(theNow - theStart) + 1e-7) << + " rec/sec avg " << + 1e6 * 0xFFFFF / (double(theNow - thePrev) + 1e-7) << + " rec/sec\n"; + thePrev = theNow; + } + } + } else { + string theLine; + while (getline(cin, theLine)) { + theLogWriter.Append(BufferedLogWriter::kLogLevelDEBUG, + "%s", theLine.c_str()); + } + } + return 0; +} diff --git a/src/cc/devtools/rand-sfmt_main.cc b/src/cc/devtools/rand-sfmt_main.cc new file mode 100644 index 000000000..9d2edbcc3 --- /dev/null +++ b/src/cc/devtools/rand-sfmt_main.cc @@ -0,0 +1,72 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/05/20 +// Author: Mike Ovsainnikov +// +// Copyright 2011-2012 
Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Pseudo random number generator for various test purposes. +// +//---------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include + +using std::cerr; +typedef boost::mt19937 RandomGenerator; +typedef RandomGenerator::result_type ResultType; + +int +main(int argc, char **argv) +{ + if (argc <= 1 || strncmp(argv[1], "-g", 2) != 0) { + cerr << argv[0] << " -g [size] [seed]\n"; + return 1; + } + int64_t len = + argc > 2 ? (int64_t)atof(argv[2]) : ((int64_t)1 << 63); + const ResultType seed = + argc > 3 ? (ResultType)atol(argv[3]) : (ResultType)time(0); + static RandomGenerator::result_type buf[100000]; + RandomGenerator gen(seed); + const int64_t bs = (int64_t)sizeof(buf); + for (; len > 0; len -= bs) { + size_t n = (size_t)(len > bs ? 
bs : len); + size_t r = (n + sizeof(buf[0]) - 1) / sizeof(buf[0]); + while (r > 0) { + buf[--r] = gen(); + } + ssize_t w = 0; + const char* p = (const char*)buf; + while ((w = write(1, p, n)) < (ssize_t)n) { + if (w < 0) { + perror("write"); + return 1; + } + n -= w; + p += w; + } + } + return 0; +} diff --git a/src/cc/devtools/requestparser_main.cc b/src/cc/devtools/requestparser_main.cc new file mode 100644 index 000000000..98dd71bba --- /dev/null +++ b/src/cc/devtools/requestparser_main.cc @@ -0,0 +1,273 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/05/14 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "common/RequestParser.h" +#include "common/Properties.h" + +#include +#include +#include + +using namespace KFS; + +class AbstractTest +{ +public: + int64_t seq; + + AbstractTest() + : seq(-1) + {} + virtual ~AbstractTest() + {} + bool Validate() const { return true; } + virtual std::ostream& Show( + std::ostream& inStream) + { return inStream; } + bool ValidateRequestHeader( + const char* /* name */, + size_t /* nameLen */, + const char* /* header */, + size_t /* headerLen */, + bool /* hasChecksum */, + uint32_t /* checksum */) + { return true; } + template static T& ParserDef( + T& inParser) + { + return inParser + .Def("Cseq", &AbstractTest::seq, int64_t(-1)) + ; + } +}; + +class AbstractTest1 +{ +public: + int vers; + + AbstractTest1() + : vers(-1) + {} + virtual ~AbstractTest1() + {} + template static T& ParserDef( + T& inParser) + { + return inParser + .Def("Client-Protocol-Version", &AbstractTest1::vers, -1) + ; + } +}; + +class Test : public AbstractTest, public AbstractTest1 +{ +public: + StringBufT<64> host; + StringBufT<256> path; + std::string hostStr; + std::string pathStr; + int64_t fid; + int64_t offset; + int64_t reserve; + bool append; + int maxAppenders; + double doubleTest; + + Test() + : AbstractTest(), + host(), + path(), + hostStr(), + pathStr(), + fid(-1), + offset(-1), + reserve(-1), + append(false), + maxAppenders(64), + doubleTest(-1) + {} + + // bool Validate() const { return false; } + virtual ~Test() + {} + template static T& ParserDef( + T& inParser) + { + return + AbstractTest1::ParserDef( + AbstractTest::ParserDef( + inParser + )) + // .Def("Cseq", &Test::seq, int64_t(-1)) + // .Def("Client-Protocol-Version", &Test::vers, -1 ) + .Def("Client-host", &Test::host ) + .Def("Pathname", &Test::path ) + .Def("File-handle", &Test::fid, int64_t(-1)) + .Def("Chunk-offset", &Test::offset, int64_t(-1)) + .Def("Chunk-append", &Test::append, false 
) + .Def("Space-reserve", &Test::reserve, int64_t(-1)) + .Def("Max-appenders", &Test::maxAppenders, 64 ) + .Def("Double test", &Test::doubleTest, -1. ) + ; + } + template void Load( + const Properties& props, + const char* name, + T& val, + T def=T()) + { val = props.getValue(name, def); } + void Load( + const Properties& props) + { + Load(props, "Cseq", seq, int64_t(-1)); + Load(props, "Client-Protocol-Version", vers, -1 ); + Load(props, "Client-host", hostStr ); + Load(props, "Pathname", pathStr ); + Load(props, "File-handle", fid, int64_t(-1)); + Load(props, "Chunk-offset", offset, int64_t(-1)); + Load(props, "Chunk-append", append, false ); + Load(props, "Space-reserve", reserve, int64_t(-1)); + Load(props, "Max-appenders", maxAppenders, 64 ); + Load(props, "Double test", doubleTest, -1. ); + } + static AbstractTest* Load( + std::istream& is) + { + Test* const res = new Test; + const char separator = ':'; + Properties props; + props.loadProperties(is, separator, false); + res->Load(props); + return res; + } + virtual std::ostream& Show( + std::ostream& inStream) + { + return inStream << + "Cseq: " << seq << "\r\n" + "Version: " "KFS/1.0" "\r\n" + "Client-Protocol-Version: " << vers << "\r\n" + "Client-host: " << host << hostStr << "\r\n" + "Pathname: " << path << pathStr << "\r\n" + "File-handle: " << fid << "\r\n" + "Chunk-offset: " << offset << "\r\n" + "Chunk-append: " << append << "\r\n" + "Space-reserve: " << reserve << "\r\n" + "Max-appenders: " << maxAppenders << "\r\n" + "Double test: " << doubleTest << "\r\n" + ; + } +}; + +/* +ALLOCATE\r +Cseq: $seq\r +Version: KFS/1.0\r +Client-Protocol-Version: 100\r +Client-host: somehostname\r +Pathname: /sort/job/1/fanout/27/file.27\r +File-handle: $fid\r +Chunk-offset: 0\r +Chunk-append: 1\r +Space-reserve: 0\r +Max-appenders: 640000000\r +*/ +/* + To benchmark: + ../src/test-scripts/allocatesend.pl 1e6 | ( time src/cc/devtools/requestparser_test q ) +*/ + +typedef RequestHandler ReqHandler; +static const 
ReqHandler& MakeRequestHandler() +{ + static ReqHandler sHandler; + return sHandler + .MakeParser("ALLOCATE") + .MakeParser("xALLOCATE") + ; +} +static const ReqHandler& sReqHandler = MakeRequestHandler(); + +int +main(int argc, char** argv) +{ + if (argc <= 1 || (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { + return 0; + } + + static char buf[1 << 20]; + char* ptr = buf; + char* end = buf + sizeof(buf); + BufferInputStream myis; + ssize_t nrd; + const bool quiet = argc > 1 && strchr(argv[1], 'q'); + const bool noparse = argc > 1 && strchr(argv[1], 'n'); + const bool alloc = argc > 1 && strchr(argv[1], 'a'); + const bool useprop = argc > 1 && strchr(argv[1], 'p'); + + while ((nrd = read(0, ptr, end - ptr)) > 0) { + end = ptr + nrd; + ptr = buf; + if (*ptr == '\n') { + ptr++; + } + char* re = ptr; + while ((re = (char*)memchr(re, '\n', end - re))) { + if (re + 2 >= end) { + break; + } + if (re[-1] != '\r' || re[1] != '\r' || re[2] != '\n') { + re++; + continue; + } + re += 3; + if (! quiet) { + std::cout << "Request:\n"; + std::cout.write(ptr, re - ptr); + } + if (! noparse || alloc) { + AbstractTest* const tst = useprop ? + Test::Load(myis.Set(ptr, noparse ? 0 : re - ptr)) : + sReqHandler.Handle(ptr, noparse ? 0 : re - ptr); + if (tst) { + if (! quiet) { + std::cout << "Parsed request:\n"; + tst->Show(std::cout); + } + delete tst; + } else { + std::cout << "parse failure\nRequest:\n"; + std::cout.write(ptr, re - ptr); + } + } + ptr = re; + } + memmove(buf, ptr, end - ptr); + ptr = buf + (end - ptr); + end = buf + sizeof(buf); + } + return 0; +} diff --git a/src/cc/devtools/sortedhash_main.cc b/src/cc/devtools/sortedhash_main.cc new file mode 100644 index 000000000..f3e196cf7 --- /dev/null +++ b/src/cc/devtools/sortedhash_main.cc @@ -0,0 +1,281 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/01/29 +// +// Copyright 2011-2012 Quantcast Corp. 
+// Author: Mike Ovsainnikov +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Sorted linear hash table unit and performance tests. +// +//---------------------------------------------------------------------------- + +#include "common/LinearHash.h" +#include "common/PoolAllocator.h" + +#include +#include +#include + +#include +#include +#include + +#ifdef _USE_STD_DEQUE +#include +#endif + +typedef int64_t MyKey; +struct MyKVPair +{ + typedef MyKey Key; + typedef MyKey Val; + + MyKey key; + + MyKVPair(const Key& key, const Val& /* val */) + : key(key) + {} + Key& GetKey() { return key; } + const Key& GetKey() const { return key; } + Val& GetVal() { return key; } + const Val& GetVal() const { return key; } +}; + +#ifdef _USE_STD_DEQUE +struct MyDynArray : public std::deque*> +{ + value_type& PopBack() + { + if (! 
empty()) { + pop_back(); + } + return back(); + } + bool IsEmpty() const + { return empty(); } + size_t GetSize() const + { return size(); } + size_t PushBack(const value_type& v) + { + push_back(v); + return size(); + } + value_type& Front() + { return front(); } + value_type& Back() + { return back(); } +}; +#else +typedef KFS::DynamicArray*, 22> MyDynArray; +#endif + +template +class Allocator +{ +public: + T* allocate(size_t n) + { + if (n != 1) { + abort(); + } + return reinterpret_cast(mAlloc.Allocate()); + } + void deallocate(T* ptr, size_t n) + { + if (n != 1) { + abort(); + } + mAlloc.Deallocate(ptr); + } + static void construct(T* ptr, const T& other) + { new (ptr) T(other); } + static void destroy(T* ptr) + { ptr->~T(); } + template + struct rebind { + typedef Allocator other; + }; +private: + typedef KFS::PoolAllocator< + sizeof(T), // size_t TItemSize, + size_t(8) << 20, // size_t TMinStorageAlloc, + size_t(512) << 20, // size_t TMaxStorageAlloc, + true // bool TForceCleanupFlag + > Alloc; + Alloc mAlloc; +}; + +typedef KFS::LinearHash< + MyKVPair, + KFS::KeyCompare, + MyDynArray, + Allocator +> MySLH; + +using namespace std; +typedef set MySet; + +static void +TestFailed() +{ + abort(); +} + +static void +Verify(const MySet& set, MySLH& ht) +{ + if (set.size() != ht.GetSize()) { + TestFailed(); + } + for (MySet::const_iterator it = set.begin(); it != set.end(); ++it) { + const MyKey* const p = ht.Find(*it); + if (! p || *p != *it) { + TestFailed(); + } + } + ht.First(); + for (const MyKVPair* p; (p = ht.Next()); ) { + if (set.find(p->GetKey()) == set.end()) { + TestFailed(); + } + } +} + +int +main(int argc, char** argv) +{ + if (argc <= 1 || (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { + return 0; + } + + MySLH ht; + bool inserted = false; + // Unit test. + if (! ht.Insert(500, 500, inserted) || ! inserted) { + TestFailed(); + } + if (! ht.Insert(100, 100, inserted) || ! 
inserted) { + TestFailed(); + } + if (ht.Find(3)) { + TestFailed(); + } + ht.Clear(); + + MySet myset; + const int kRandTestSize = 100 * 1000; + const unsigned int kSeed = 10; + srandom(kSeed); + for (int i = 0; i < kRandTestSize; i++) { + const MyKey r = (MyKey)random(); + inserted = 0; + if (*(ht.Insert(r, r, inserted)) != r) { + TestFailed(); + } + if (myset.insert(r).second != inserted) { + TestFailed(); + } + // Verify(myset, ht); + } + Verify(myset, ht); + cout << "inserted: " << ht.GetSize() << " of " << kRandTestSize << "\n"; + + srandom(kSeed); + for (int i = 0; i < kRandTestSize; i++) { + const MyKey r = (MyKey)random(); + if (i % 3 != 0) { + continue; + } + const size_t rht = ht.Erase(r); + if (! rht) { + const MyKey* const p = ht.Find(r); + if (p) { + TestFailed(); + } + } + const size_t rset = myset.erase(r); + if (rht != rset) { + TestFailed(); + } + // Verify(myset, ht); + } + Verify(myset, ht); + cout << "removed: size: " << ht.GetSize() << + " of " << kRandTestSize << "\n"; + + srandom(kSeed); + for (int i = 0; i < kRandTestSize; i++) { + const MyKey r = (MyKey)random(); + const MyKey* const res = ht.Find(r); + if (res && *res != r) { + TestFailed(); + } + if ((myset.find(r) != myset.end()) != (res != 0)) { + TestFailed(); + } + } + + // Performance test. + ht.Clear(); + clock_t s = clock(); + int k = 0; + inserted = false; + const int nk = argc > 1 ? (int)atof(argv[1]) : (1<<27) - (1<<16); + for (MyKey i = 1000 * 1000 + 345; k < nk; i += 33, k++) { + if (! ht.Insert(i, i, inserted) || ! inserted) { + abort(); + } + } + clock_t e = clock(); + cout << k << " " << double(e - s)/CLOCKS_PER_SEC << "\n"; + s = clock(); + k = 0; + for (MyKey i = 1000 * 1000 + 345; k < nk; i += 33, k++) { + if (! 
ht.Find(i)) { + abort(); + } + } + e = clock(); + cout << k << " " << double(clock() - s)/CLOCKS_PER_SEC << "\n"; + s = clock(); + k = 0; + ht.First(); + int64_t t = 0; + for (const MyKVPair* p; (p = ht.Next()); k++) { + // cout << p->Key() << "\n"; + t += p->GetKey(); + } + e = clock(); + cout << k << " " << double(clock() - s)/CLOCKS_PER_SEC << " " << t << "\n"; + cout << "press any key and then enter to continue\n"; + string str; + cin >> str; + s = clock(); + k = 0; + for (MyKey i = 1000 * 1000 + 345; k < nk; i += 33, k++) { + if (ht.Erase(i) != 1) { + abort(); + } + } + e = clock(); + cout << k << " " << double(clock() - s)/CLOCKS_PER_SEC << "\n"; + if (! ht.IsEmpty()) { + abort(); + } + return 0; +} diff --git a/src/cc/devtools/stlset_main.cc b/src/cc/devtools/stlset_main.cc new file mode 100644 index 000000000..5a7bdd9bb --- /dev/null +++ b/src/cc/devtools/stlset_main.cc @@ -0,0 +1,84 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/01/29 +// +// Copyright 2011-2012 Quantcast Corp. +// Author: Mike Ovsainnikov +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Stl set (red and black tree) performance test. 
+// +//---------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include + +#include +#include +#include + +typedef int64_t MyKey; + +typedef std::set< + MyKey, + std::less, + boost::fast_pool_allocator +> MySet; + +using namespace std; + +int main(int argc, char** argv) +{ + if (argc <= 1 || (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { + return 0; + } + + clock_t s = clock(); + MySet ht; + int k = 0; + const int nk = argc > 1 ? (int)atof(argv[1]) : (1<<27)-(1<<16); + for (MyKey i = 1000 * 1000 + 345; k < nk; i += 33, k++) { + ht.insert(i); + } + clock_t e = clock(); + cout << k << " " << double(e - s)/CLOCKS_PER_SEC << "\n"; + s = e; + k = 0; + for (MyKey i = 1000 * 1000 + 345; k < nk; i += 33, k++) { + if (ht.find(i) == ht.end()) { + abort(); + } + } + e = clock(); + cout << k << " " << double(clock() - s)/CLOCKS_PER_SEC << "\n"; + s = e; + k = 0; + int64_t t = 0; + for (MySet::const_iterator it = ht.begin(); it != ht.end(); ++it, k++) { + // cout << *it << "\n"; + t += *it; + } + e = clock(); + cout << k << " " << double(clock() - s)/CLOCKS_PER_SEC << " " << t << "\n"; + cout << "enter number to exit\n"; + cin >> k; + return 0; +} diff --git a/src/cc/doxkfs b/src/cc/doxkfs new file mode 100644 index 000000000..8226a0afc --- /dev/null +++ b/src/cc/doxkfs @@ -0,0 +1,1259 @@ +# Doxyfile 1.4.5 +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. 
See the License for the specific language governing +# permissions and limitations under the License. +# + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = KFS + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. 
Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, +# Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese, +# Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian, +# Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, +# Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# This tag can be used to specify the encoding used in the generated output. +# The encoding is not always determined by the language that is chosen, +# but also whether or not the output is meant for Windows or non-Windows users. +# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES +# forces the Windows encoding (this is the default for the Windows binary), +# whereas setting the tag to NO uses a Unix-style encoding (the default for +# all platforms other than Windows). + +USE_WINDOWS_ENCODING = NO + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. 
Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. 
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like the Qt-style comments (thus requiring an
+# explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for Java. +# For instance, namespaces will be presented as packages, qualified scopes +# will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to +# include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. 
Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. 
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is YES.
+
+SHOW_DIRECTORIES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used. 
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr. 
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT =
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. 
Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain image that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command , where +# is the value of the INPUT_FILTER tag, and is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. 
+ +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES (the default) +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES (the default) +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). 
If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. 
+ +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. 
+ +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = letter + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = NO + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. 
Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = NO + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. 
You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. 
+ +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. 
+ +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. 
+ +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. 
+ +CLASS_DIAGRAMS = YES + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# the CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. 
+ +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will +# generate a call dependency graph for every global function or class method. +# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable call graphs for selected +# functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. 
Beware that most browsers cannot cope with very +# large images. + +MAX_DOT_GRAPH_WIDTH = 1024 + +# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. Beware that most browsers cannot cope with very +# large images. + +MAX_DOT_GRAPH_HEIGHT = 1024 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that a graph may be further truncated if the graph's +# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH +# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), +# the graph is not depth-constrained. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, which results in a white background. +# Warning: Depending on the platform used, enabling this option may lead to +# badly anti-aliased labels on the edges of a graph (i.e. they become hard to +# read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. 
+ +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored. + +SEARCHENGINE = NO diff --git a/src/cc/emulator/CMakeLists.txt b/src/cc/emulator/CMakeLists.txt new file mode 100644 index 000000000..beecf3ae5 --- /dev/null +++ b/src/cc/emulator/CMakeLists.txt @@ -0,0 +1,73 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +# +# For the library take everything except the *_main.cc files +# +set (lib_srcs +ChunkServerEmulator.cc +LayoutEmulator.cc +emulator_setup.cc +) + +add_library (kfsEmulator STATIC ${lib_srcs}) +add_library (kfsEmulator-shared SHARED ${lib_srcs}) +set_target_properties (kfsEmulator PROPERTIES OUTPUT_NAME "kfs_emulator") +set_target_properties (kfsEmulator-shared PROPERTIES OUTPUT_NAME "kfs_emulator") +set_target_properties (kfsEmulator PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties (kfsEmulator-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +set (exe_files rebalanceplanner rebalanceexecutor replicachecker) +foreach (exe_file ${exe_files}) + add_executable (${exe_file} ${exe_file}_main.cc) + if (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsEmulator kfsMeta kfsIO kfsCommon qcdio pthread crypto) + add_dependencies (${exe_file} kfsEmulator kfsCommon kfsIO qcdio kfsMeta) + else (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsEmulator-shared kfsMeta-shared kfsIO-shared kfsCommon-shared qcdio-shared pthread crypto) + add_dependencies (${exe_file} kfsEmulator-shared kfsCommon-shared kfsIO-shared kfsMeta-shared qcdio-shared) + endif (USE_STATIC_LIB_LINKAGE) +endforeach (exe_file) + +if (APPLE OR CYGWIN) + target_link_libraries(kfsEmulator-shared kfsMeta-shared kfsCommon-shared kfsIO-shared) +endif (APPLE OR CYGWIN) + +if (NOT APPLE) + target_link_libraries(kfsEmulator rt) +endif (NOT APPLE) + +if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + target_link_libraries(kfsEmulator mtmalloc) +endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +# +# Install them +# +install (TARGETS ${exe_files} kfsEmulator kfsEmulator-shared + RUNTIME DESTINATION bin/emulator + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) + diff --git a/src/cc/emulator/ChunkServerEmulator.cc b/src/cc/emulator/ChunkServerEmulator.cc new file mode 100644 index 000000000..130c90f92 --- /dev/null +++ b/src/cc/emulator/ChunkServerEmulator.cc @@ -0,0 +1,132 @@ 
+//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Chunk server "emulator" / stub implementation. +// +//---------------------------------------------------------------------------- + +#include "common/MsgLogger.h" +#include "common/kfsdecls.h" +#include "meta/MetaRequest.h" +#include "meta/util.h" +#include "ChunkServerEmulator.h" +#include "LayoutEmulator.h" + +namespace KFS +{ +using std::numeric_limits; + +ChunkServerEmulator::ChunkServerEmulator( + const ServerLocation& loc, int rack, const string& peerName) + : TcpSocket(numeric_limits::max()), // Fake fd, for IsGood() + ChunkServer(NetConnectionPtr( + new NetConnection(this, this, false, false)), peerName), + mPendingReqs(), + mOut(0) +{ + SetServerLocation(loc); + SetRack(rack); +} + +ChunkServerEmulator::~ChunkServerEmulator() +{ + if (mNetConnection) { + mNetConnection->Close(); + } + TcpSocket& sock = *this; + sock = TcpSocket(); // Reset socket's fake fd. 
+ ChunkServerEmulator::FailPendingOps(); +} + +void +ChunkServerEmulator::EnqueueSelf(MetaChunkRequest* r) +{ + mPendingReqs.push_back(r); +} + +size_t +ChunkServerEmulator::Dispatch() +{ + // Use index instead of iterator in order handle correctly Enqueue() + // while iterating though the queue (though this isn't needed at the + // time of writing). + size_t i; + for (i = 0; i < mPendingReqs.size(); i++) { + MetaRequest* const r = mPendingReqs[i]; + MetaChunkRequest* const op = FindMatchingRequest(r->opSeqno); + if (op != r) { + panic("invalid request: not in the queue"); + } + if (r->op == META_CHUNK_REPLICATE) { + MetaChunkReplicate* const mcr = static_cast(r); + if (gLayoutEmulator.ChunkReplicationDone(mcr)) { + KFS_LOG_STREAM_DEBUG << + "moved chunk: " << mcr->chunkId << + " to " << mcr->server->GetServerLocation() << + KFS_LOG_EOM; + if (mOut) { + (*mOut) << + mcr->chunkId << " " << mcr->server->GetServerLocation() << + "\n"; + } + } + } else if (r->op == META_CHUNK_DELETE) { + MetaChunkDelete* const mcd = static_cast(r); + if (mNumChunks > 0) { + mNumChunks--; + mUsedSpace -= gLayoutEmulator.GetChunkSize(mcd->chunkId); + if (mUsedSpace < 0 || mNumChunks <= 0) { + mUsedSpace = 0; + } + mAllocSpace = mUsedSpace; + } + } else { + KFS_LOG_STREAM_ERROR << "unexpected op: " << r->Show() << + KFS_LOG_EOM; + } + delete r; + } + mPendingReqs.clear(); + return i; +} + +void +ChunkServerEmulator::FailPendingOps() +{ + for (size_t i = 0; i < mPendingReqs.size(); i++) { + MetaRequest* const r = mPendingReqs[i]; + MetaChunkRequest* const op = FindMatchingRequest(r->opSeqno); + if (op != r) { + panic("invalid request: not in the queue"); + } + if (r->op == META_CHUNK_REPLICATE) { + MetaChunkReplicate* const mcr = static_cast(r); + mcr->status = -EIO; + gLayoutEmulator.ChunkReplicationDone(mcr); + } + delete r; + } + mPendingReqs.clear(); +} + +} // namespace KFS diff --git a/src/cc/emulator/ChunkServerEmulator.h b/src/cc/emulator/ChunkServerEmulator.h new file mode 100644 
index 000000000..b85cf18e5 --- /dev/null +++ b/src/cc/emulator/ChunkServerEmulator.h @@ -0,0 +1,96 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief An emulator for a chunk server, that only "emulates" chunk replication +// and recovery to make layout emulator work. 
+// +//---------------------------------------------------------------------------- + +#ifndef EMULATOR_CHUNKSERVEREMULATOR_H +#define EMULATOR_CHUNKSERVEREMULATOR_H + +#include +#include + +#include "meta/ChunkServer.h" +#include "kfsio/TcpSocket.h" + +namespace KFS +{ +using std::vector; +using std::ostream; + +class ChunkServerEmulator : private TcpSocket, public ChunkServer +{ +public: + ChunkServerEmulator( + const ServerLocation& loc, int rack, const string& peerName); + virtual ~ChunkServerEmulator(); + + size_t Dispatch(); + // when this emulated server goes down, fail the pending ops + // that were destined to this node + void FailPendingOps(); + void HostingChunk(kfsChunkId_t /* chunkId */, size_t chunksize) + { + mNumChunks++; + mUsedSpace += chunksize; + mAllocSpace = mUsedSpace; + } + void SetRebalancePlanOutFd(ostream* os) + { + mOut = os; + } + void InitSpace(int64_t totalSpace, int64_t usedSpace, + bool useFsTotalSpaceFlag) + { + if (useFsTotalSpaceFlag) { + mTotalFsSpace = totalSpace; + if (totalSpace > usedSpace) { + mTotalSpace = totalSpace - usedSpace; + } else { + mTotalSpace = 0; + } + } else { + mTotalFsSpace = totalSpace; + mTotalSpace = totalSpace; + } + mAllocSpace = 0; + mUsedSpace = 0; + } + +protected: + virtual void EnqueueSelf(MetaChunkRequest* r); + +private: + typedef vector PendingReqs; + PendingReqs mPendingReqs; + ostream* mOut; +private: + ChunkServerEmulator(const ChunkServerEmulator&); + ChunkServerEmulator& operator=(const ChunkServerEmulator&); +}; + +} + +#endif // EMULATOR_CHUNKSERVEREMULATOR_H diff --git a/src/cc/emulator/LayoutEmulator.cc b/src/cc/emulator/LayoutEmulator.cc new file mode 100644 index 000000000..e646ff420 --- /dev/null +++ b/src/cc/emulator/LayoutEmulator.cc @@ -0,0 +1,926 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. 
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Emulator for the layout manager: read in a chunk->location +// map; we can then migrate blocks around to experiment with placement algorithms. +// +//---------------------------------------------------------------------------- + +#include "LayoutEmulator.h" +#include "ChunkServerEmulator.h" +#include "common/kfstypes.h" +#include "common/MsgLogger.h" +#include "common/RequestParser.h" +#include "common/StBuffer.h" +#include "meta/kfstree.h" +#include "meta/util.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace KFS +{ + +using std::string; +using std::ifstream; +using std::for_each; +using std::ofstream; +using boost::bind; + +static inline ChunkServerEmulator& +GetCSEmulator(ChunkServer& server) +{ + return static_cast(server); +} + +int +LayoutEmulator::LoadChunkmap( + const string& chunkLocationFn, bool addChunksToReplicationChecker) +{ + ifstream file(chunkLocationFn.c_str()); + if (! file) { + const int err = errno; + KFS_LOG_STREAM_INFO << chunkLocationFn << ": " << strerror(err) << + KFS_LOG_EOM; + return (err > 0 ? 
-err : -1); + } + const size_t kMaxLineSize = 256 << 10; + StBufferT buf; + char* const line = buf.Resize(kMaxLineSize); + size_t lineno = 1; + size_t len = 0; + ServerLocation loc; + line[0] = 0; + while (file.getline(line, kMaxLineSize) && + (len = file.gcount()) < kMaxLineSize - 1 && + Parse(line, len, addChunksToReplicationChecker, loc)) { + lineno++; + } + const bool badFlag = file.bad(); + if (! badFlag && file.eof()) { + return 0; + } + const int err = badFlag ? errno : EINVAL; + KFS_LOG_STREAM_ERROR << chunkLocationFn << ":" << lineno << + (badFlag ? " " : " malformed: ") << + (badFlag ? strerror(err) : line ) << + KFS_LOG_EOM; + return (err > 0 ? -err : -1); +} + +bool +LayoutEmulator::Parse( + const char* line, + size_t size, + bool addChunksToReplicationChecker, + ServerLocation& loc) +{ + // format of the file: + // <# of servers> [server location] + // \n + // where, : replica-size name port rack# + // and we have as many server locations as # of servers + + kfsChunkId_t cid; + fid_t fid; + int numServers; + const char* p = line; + const char* const end = p + size; + if (! DecIntParser::Parse(p, end - p, cid) || + ! DecIntParser::Parse(p, end - p, fid) || + ! DecIntParser::Parse(p, end - p, numServers)) { + return false; + } + CSMap::Entry* const ci = mChunkToServerMap.Find(cid); + if (! ci) { + KFS_LOG_STREAM_ERROR << "no such chunk: " << cid << KFS_LOG_EOM; + return true; + } + for (int i = 0; i < numServers; i++) { + while (p < end && (*p & 0xFF) <= ' ') { + p++; + } + if (p >= end) { + return false; + } + const char* const host = p; + while (p < end && (*p & 0xFF) > ' ') { + p++; + } + if (p >= end) { + return false; + } + loc.hostname.assign(host, p - host); + if (! 
DecIntParser::Parse(p, end - p, loc.port)) { + return false; + } + while (p < end && (*p & 0xFF) <= ' ') { + p++; + } + if (p >= end) { + return false; + } + // const char* const rack = p; + while (p < end && (*p & 0xFF) > ' ') { + p++; + } + Loc2Server::const_iterator const it = mLoc2Server.find(loc); + if (it == mLoc2Server.end()) { + KFS_LOG_STREAM_ERROR << + "chunk: " << cid << + " no such server: " << loc << + KFS_LOG_EOM; + continue; + } + if (! AddReplica(*ci, it->second)) { + KFS_LOG_STREAM_ERROR << + "chunk: " << cid << + " add server: " << loc << + " failed" << + KFS_LOG_EOM; + continue; + } + GetCSEmulator(*(it->second)).HostingChunk(cid, GetChunkSize(*ci)); + } + if (addChunksToReplicationChecker) { + CheckChunkReplication(*ci); + } + return true; +} + +// override what is in the layout manager (only for the emulator code) +bool +LayoutEmulator::ChunkReplicationDone(MetaChunkReplicate* req) +{ + mOngoingReplicationStats->Update(-1); + // Book-keeping.... + if (mNumOngoingReplications > 0) { + mNumOngoingReplications--; + } + req->server->ReplicateChunkDone(req->chunkId); + if (req->srcLocation.IsValid() && req->dataServer) { + req->dataServer->UpdateReplicationReadLoad(-1); + } + req->dataServer.reset(); + if (req->status != 0) { + // Replication failed...we will try again later + KFS_LOG_STREAM_ERROR << + "replication failed" + " chunk: " << req->chunkId << + " status: " << req->status << + " server: " << req->server->GetServerLocation() << + KFS_LOG_EOM; + mFailedReplicationStats->Update(1); + return false; + } + mNumBlksRebalanced++; + // replication succeeded: book-keeping + CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! 
ci) { + KFS_LOG_STREAM_ERROR << + "replication completion: no such chunk: " << req->chunkId << + KFS_LOG_EOM; + return false; + } + const bool addedFlag = AddReplica(*ci, req->server); + if (addedFlag) { + GetCSEmulator(*(req->server)).HostingChunk( + req->chunkId, GetChunkSize(*ci)); + } else { + KFS_LOG_STREAM_ERROR << + "chunk: " << req->chunkId << + " add server: " << req->server->GetServerLocation() << + " failed" << + KFS_LOG_EOM; + } + CheckChunkReplication(*ci); + return addedFlag; +} + +void +LayoutEmulator::MarkServerDown(const ServerLocation& loc) +{ + Loc2Server::iterator const it = mLoc2Server.find(loc); + if (it == mLoc2Server.end()) { + KFS_LOG_STREAM_ERROR << + "server down: no such server: " << loc << + KFS_LOG_EOM; + return; + } + ServerDown(it->second); + mLoc2Server.erase(it); + KFS_LOG_STREAM_INFO << "server down: " << loc << + KFS_LOG_EOM; +} + +seq_t +LayoutEmulator::GetChunkversion(chunkId_t cid) const +{ + const CSMap::Entry* const ci = mChunkToServerMap.Find(cid); + return (ci ? ci->GetChunkInfo()->chunkVersion : -1); +} + +size_t +LayoutEmulator::GetChunkSize(chunkId_t cid) const +{ + const CSMap::Entry* const ci = mChunkToServerMap.Find(cid); + if (! ci) { + return 0; + } + return GetChunkSize(*ci); +} + +size_t +LayoutEmulator::GetChunkSize(const CSMap::Entry& ci) const +{ + // Assume that the file isn't sparse. + const MetaFattr* const fa = ci.GetFattr(); + if (fa->chunkcount() <= 0) { + return 0; + } + const chunkOff_t pos = ci.GetChunkInfo()->offset; + const chunkOff_t fsize = metatree.getFileSize(fa); + if (! 
fa->IsStriped()) { + if (fsize < pos) { + return CHUNKSIZE; + } + return min(CHUNKSIZE, size_t(fsize - pos)); + } + + const chunkOff_t blkSize = fa->numStripes * (chunkOff_t)CHUNKSIZE; + const chunkOff_t blkPos = fa->ChunkPosToChunkBlkFileStartPos(pos); + const chunkOff_t size = fsize - blkPos; + if (size <= 0 || size >= blkSize) { + return CHUNKSIZE; + } + const int chunkStripeIdx = (int)(pos / (chunkOff_t)CHUNKSIZE % + (fa->numStripes + fa->numRecoveryStripes)); + const chunkOff_t strideSize = fa->stripeSize * fa->numStripes; + const chunkOff_t strideCount = size / strideSize; + const chunkOff_t strideHead = size % strideSize; + const chunkOff_t stripeIdx = strideHead / fa->stripeSize; + const int idx = + chunkStripeIdx < fa->numStripes ? chunkStripeIdx : 0; + chunkOff_t chunkSize = strideCount * fa->stripeSize; + if (idx < stripeIdx) { + chunkSize += fa->stripeSize; + } else if (idx == stripeIdx) { + chunkSize += strideHead % fa->stripeSize; + } + return (size_t)chunkSize; +} + +void +LayoutEmulator::AddServer( + const ServerLocation& loc, + int rack, + uint64_t totalSpace, + uint64_t usedSpace) +{ + ostringstream os; + os << loc.hostname << ":" << loc.port; + const string peerName = os.str(); + ChunkServerPtr c; + ChunkServerEmulator& srv = *(new ChunkServerEmulator(loc, rack, peerName)); + c.reset(&srv); + + srv.InitSpace(totalSpace, usedSpace, mUseFsTotalSpaceFlag); + + mChunkToServerMap.AddServer(c); + mLoc2Server.insert(make_pair(loc, c)); + mChunkServers.push_back(c); + RackInfos::iterator const it = find_if( + mRacks.begin(), mRacks.end(), + bind(&RackInfo::id, _1) == rack); + if (it != mRacks.end()) { + it->addServer(c); + } else if (rack >= 0) { + mRacks.push_back(RackInfo(rack, 1.0, c)); + } + UpdateSrvLoadAvg(srv, 0); + UpdateReplicationsThreshold(); + + KFS_LOG_STREAM_INFO << + "added:" + " server: " << srv.GetServerLocation() << + " rack: " << srv.GetRack() << + " space:" + " total: " << srv.GetTotalSpace(mUseFsTotalSpaceFlag) << + " utilization: " 
<< srv.GetSpaceUtilization(mUseFsTotalSpaceFlag) << + KFS_LOG_EOM; +} + +int +LayoutEmulator::SetRebalancePlanOutFile(const string &rebalancePlanFn) +{ + mPlanFile.close(); + mPlanFile.open(rebalancePlanFn.c_str(), ofstream::out); + mPlanFile.setf(istream::hex); + if (! mPlanFile) { + const int err = errno; + KFS_LOG_STREAM_ERROR << rebalancePlanFn << ": " << strerror(err) << + KFS_LOG_EOM; + return -1; + } + for (Servers::iterator i = mChunkServers.begin(); + i != mChunkServers.end(); + i++) { + GetCSEmulator(**i).SetRebalancePlanOutFd(&mPlanFile); + } + return 0; +} + +size_t +LayoutEmulator::RunChunkserverOps() +{ + size_t opsCount = 0; + for (size_t i = 0; i < mChunkServers.size(); i++) { + ChunkServer& srv = *mChunkServers[i]; + opsCount += GetCSEmulator(srv).Dispatch(); + // Handle the case where the chunk server might go away as result of + // executing op. + if (mChunkServers.size() <= i) { + break; + } + if (&*mChunkServers[i] == &srv) { + UpdateSrvLoadAvg(srv, 0); + } + } + return opsCount; +} + +void +LayoutEmulator::CalculateRebalaceThresholds() +{ + double avgSpaceUtil = 0; + for (Servers::iterator it = mChunkServers.begin(); + it != mChunkServers.end(); + it++) { + avgSpaceUtil += (*it)->GetSpaceUtilization(mUseFsTotalSpaceFlag); + } + const size_t cnt = mChunkServers.size(); + if (cnt > 0) { + avgSpaceUtil /= cnt; + } + if (mVariationFromMean > 0) { + // Take the average utilizaiton in the cluster; any node that has + // utilizaiton outside the average is candidate for rebalancing + mMinRebalanceSpaceUtilThreshold = + max(0.0, avgSpaceUtil - mVariationFromMean * 0.5); + mMaxRebalanceSpaceUtilThreshold = + min(1.0, avgSpaceUtil + mVariationFromMean * 0.5); + } + KFS_LOG_STREAM_INFO << + "chunk servers: " << cnt << + " racks: " << mRacks.size() << + " rebalance thresholds:" + " average: " << avgSpaceUtil << + " min: " << mMinRebalanceSpaceUtilThreshold << + " max: " << mMaxRebalanceSpaceUtilThreshold << + KFS_LOG_EOM; +} + +void 
+LayoutEmulator::PrepareRebalance(bool enableRebalanceFlag) +{ + ToggleRebalancing(enableRebalanceFlag); + + if (enableRebalanceFlag) { + CalculateRebalaceThresholds(); + } + const int64_t kMicroseconds = int64_t(1000) * 1000; + const int64_t kMaxRunTime = kMicroseconds * 15; + mRebalanceRunInterval = -1; + mMinChunkReplicationCheckInterval = -1; + mMaxRebalanceRunTime = max(mMaxRebalanceRunTime, kMaxRunTime); + mMaxTimeForChunkReplicationCheck = + max(mMaxTimeForChunkReplicationCheck, kMaxRunTime); + + if (mMaxConcurrentWriteReplicationsPerNode < 1) { + mMaxConcurrentWriteReplicationsPerNode = 1; + UpdateReplicationsThreshold(); + } + if (mMaxConcurrentReadReplicationsPerNode < 1) { + mMaxConcurrentReadReplicationsPerNode = 1; + } +} + +void +LayoutEmulator::BuildRebalancePlan() +{ + PrepareRebalance(true); + + RebalanceCtrs::Counter round = mRebalanceCtrs.GetRoundCount() + 1; + RebalanceCtrs::Counter nextScanned = 0; + const size_t kThreshUpdateInterval = 4 << 10; + size_t updateThreshOpsCnt = kThreshUpdateInterval; + for (int prev = mNumBlksRebalanced; ;) { + if (mCleanupScheduledFlag) { + ScheduleCleanup(); + } + ChunkReplicationChecker(); + const size_t opsCount = RunChunkserverOps(); + if (prev != mNumBlksRebalanced) { + prev = mNumBlksRebalanced; + round = mRebalanceCtrs.GetRoundCount() + 1; + } + const bool doneFlag = + mStopFlag || + mChunkServers.empty() || + (opsCount <= 0 && + ! mCleanupScheduledFlag && + mRebalanceCtrs.GetRoundCount() > round && + ! 
mChunkToServerMap.Front(CSMap::Entry::kStateCheckReplication)); + RebalanceCtrs::Counter const scanned = mRebalanceCtrs.GetTotalScanned(); + if (doneFlag || nextScanned < scanned) { + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << "=== rebalance counters: "; + mRebalanceCtrs.Show(os, ": ", " "); + KFS_LOG_STREAM_END; + nextScanned = scanned + 1000 * 1000; + } + if (doneFlag) { + break; + } + if (mVariationFromMean > 0) { + if (updateThreshOpsCnt <= opsCount) { + CalculateRebalaceThresholds(); + updateThreshOpsCnt = kThreshUpdateInterval; + } else { + updateThreshOpsCnt -= opsCount; + } + } + } +} + +void +LayoutEmulator::ExecuteRebalancePlan() +{ + // the plan has already been worked out; we just execute + PrepareRebalance(false); + + RebalanceCtrs::Counter nextScanned = 0; + for (; ;) { + if (mCleanupScheduledFlag) { + ScheduleCleanup(); + } + ChunkReplicationChecker(); + const bool doneFlag = + mStopFlag || + mChunkServers.empty() || + (RunChunkserverOps() <= 0 && + ! mChunkToServerMap.Front(CSMap::Entry::kStateCheckReplication) && + ! mIsExecutingRebalancePlan && + ! 
mCleanupScheduledFlag); + RebalanceCtrs::Counter const scanned = mRebalanceCtrs.GetTotalScanned(); + if (doneFlag || nextScanned < scanned) { + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << "=== rebalance counters: "; + mRebalanceCtrs.Show(os, ": ", " "); + KFS_LOG_STREAM_END; + nextScanned = scanned + 1000 * 1000; + } + if (doneFlag) { + break; + } + } +} + +class PrintBlockCount +{ + ostream& mOs; + const bool mUseFsTotalSpaceFlag; +public: + PrintBlockCount(ostream& os, bool f) + : mOs(os), + mUseFsTotalSpaceFlag(f) + {} + void operator()(const ChunkServerPtr &c) const + { + ChunkServer& cse = *c; + mOs << cse.GetServerLocation() << + ' ' << cse.GetNumChunks() << + ' ' << cse.GetUsedSpace() << + ' ' << cse.GetSpaceUtilization(mUseFsTotalSpaceFlag) << + '\n'; + } +}; + +void +LayoutEmulator::PrintChunkserverBlockCount(ostream& os) const +{ + for_each(mChunkServers.begin(), mChunkServers.end(), + PrintBlockCount(os, mUseFsTotalSpaceFlag)); +} + +int +LayoutEmulator::ReadNetworkDefn(const string& networkFn) +{ + ifstream file(networkFn.c_str()); + if (! file) { + const int err = errno; + KFS_LOG_STREAM_ERROR << networkFn << ": " << strerror(err) << + KFS_LOG_EOM; + return (err > 0 ? -err : -1); + } + + ServerLocation loc; + BufferInputStream bis; + const size_t kMaxLineSize = 4 << 10; + char line[kMaxLineSize]; + size_t lineno = 1; + size_t len; + line[0] = 0; + while (file.getline(line, kMaxLineSize) && + (len = file.gcount()) < kMaxLineSize - 1) { + istream& is = bis.Set(line, len); + int rack; + uint64_t totalSpace; + uint64_t usedSpace; + if (! (is >> loc.hostname >> loc.port >> rack >> + totalSpace >> usedSpace)) { + KFS_LOG_STREAM_ERROR << networkFn << ":" << lineno << + " malformed: " << line << + KFS_LOG_EOM; + return -EINVAL; + } + lineno++; + AddServer(loc, rack, totalSpace, usedSpace); + } + const bool badFlag = file.bad(); + if (badFlag || ! file.eof()) { + const int err = badFlag ? 
errno : EINVAL; + KFS_LOG_STREAM_ERROR << networkFn << ":" << lineno << + (badFlag ? " " : " malformed: ") << + (badFlag ? strerror(err) : line ) << + KFS_LOG_EOM; + return (err > 0 ? -err : -1); + } + + const size_t cnt = mChunkServers.size(); + KFS_LOG_STREAM_INFO << + "chunk servers: " << cnt << + " racks: " << mRacks.size() << + KFS_LOG_EOM; + for (RackInfos::iterator it = mRacks.begin(); + it != mRacks.end(); + ++it) { + KFS_LOG_STREAM_INFO << + "rack: " << it->id() << + " weight: " << it->getWeight() << + " servers: " << it->getServers().size() << + " candidates: " << it->getPossibleCandidatesCount() << + " weighted candidates: " << + it->getWeightedPossibleCandidatesCount() << + KFS_LOG_EOM; + } + + return 0; +} + +class LayoutEmulator::PlacementVerifier +{ +public: + int sameRack; + int underReplicated; + int overReplicated; + int missing; + int sameNode; + int stripeSameNode; + const int64_t startTime; + + PlacementVerifier() + : sameRack(0), + underReplicated(0), + overReplicated(0), + missing(0), + sameNode(0), + stripeSameNode(0), + startTime(microseconds()) + {} + ostream& report(ostream& os, size_t chunkCount) + { + os << + "************************************************\n" + " Total chunks : " << chunkCount << "\n" + " Total chunks missing : " << missing << "\n" + " Total chunks on same rack : " << sameRack << "\n" + " Total chunks under replicated : " << underReplicated << "\n" + " Total chunks over replicated : " << overReplicated << "\n" + " Total stripes on same node : " << stripeSameNode << "\n" + " Run time : " << ((microseconds() - startTime) * 1e-6) << "\n" + "************************************************\n" + ; + return os; + } + bool IsHealthy() const + { return (missing <= 0); } +}; + +inline const string& +GetFileName(const MetaFattr* fa, string& fileName) +{ + if (fileName.empty()) { + fileName = metatree.getPathname(fa); + } + return fileName; +} + +class DisplayFileType +{ +public: + DisplayFileType(const MetaFattr* a) + : fa(a) + 
{} + ostream& Display(ostream& os) const + { + if (! fa->IsStriped()) { + return (os << "r " << fa->numReplicas); + } + return (os << "rs " << fa->numReplicas << "," << + fa->numStripes << "+" << fa->numRecoveryStripes); + } +private: + const MetaFattr* const fa; +}; + +ostream& +operator<<(ostream& os, const DisplayFileType& d) { + return d.Display(os); +} + +void +LayoutEmulator::ShowPlacementError( + ostream& os, + const CSMap::Entry& c, + const ChunkServer* srv, + string& fileName, + size_t replicas, + const char* reason) +{ + const MetaFattr* const fa = c.GetFattr(); + const chunkOff_t pos = c.GetChunkInfo()->offset; + os << + reason << + " chunk: " << c.GetChunkId() << + " pos: " << pos << + " size: " << GetChunkSize(c) << + " block: " << fa->ChunkPosToChunkBlkIndex(pos) << + " file: " << c.GetFileId() << + " type: " << DisplayFileType(fa) << + " node: " << (srv ? srv->GetServerLocation() : ServerLocation()) << + " rack: " << (srv ? srv->GetRack() : -1) << + " replicas: " << fa->numReplicas << + " actual: " << replicas << + " " << GetFileName(fa, fileName) << + "\n"; +} + +void +LayoutEmulator::VerifyPlacement( + const CSMap::Entry& c, + const LayoutEmulator::Servers& servers, + const vector& cblk, + LayoutEmulator::ChunkPlacement& placement, + ostream& os, + bool verboseFlag, + bool reportAllFlag, + LayoutEmulator::PlacementVerifier& verifier) +{ + const MetaFattr* const fa = c.GetFattr(); + string fileName; + + if (servers.empty()) { + if (verboseFlag) { + ShowPlacementError(os, c, 0, fileName, servers.size(), + "no replicas"); + } + verifier.missing++; + return; + } + for (Servers::const_iterator it = servers.begin(); + it != servers.end(); + ++it) { + ChunkServer& srv = **it; + if (placement.IsServerExcluded(srv)) { + if (! 
fa->IsStriped() || + find(it + 1, servers.end(), *it) != + servers.end()) { + verifier.sameNode++; + ShowPlacementError(os, c, &srv, fileName, servers.size(), + "duplicate server"); + } else if (reportAllFlag || + placement.GetExcludedServersCount() < + mChunkServers.size()) { + verifier.stripeSameNode++; + if (verboseFlag) { + ShowPlacementError(os, c, &srv, fileName, servers.size(), + "same node"); + } + } + placement.ExcludeServerAndRack(srv, c.GetChunkId()); + } else if (! placement.ExcludeServerAndRack(srv, c.GetChunkId()) && + (reportAllFlag || placement.HasCandidateRacks())) { + verifier.sameRack++; + if (verboseFlag) { + ShowPlacementError(os, c, &srv, fileName, servers.size(), + "same rack"); + } + } + } + if (! servers.empty() && + servers.size() != (size_t)fa->numReplicas) { + const bool underReplicatedFlag = + servers.size() < (size_t)fa->numReplicas; + if (underReplicatedFlag) { + verifier.underReplicated++; + } else { + verifier.overReplicated++; + } + if (verboseFlag) { + ShowPlacementError(os, c, 0, fileName, servers.size(), + (underReplicatedFlag ? " under replicated" : + " over replicated")); + } + } +} + +int +LayoutEmulator::VerifyRackAwareReplication( + bool reportAllFlag, bool verboseFlag, ostream& os) +{ + os << + "************************************************\n" + " KFS Replica Checker\n" + "************************************************\n" + ; + StTmp serversTmp(mServersTmp); + StTmp placementTmp(mChunkPlacementTmp); + StTmp > cinfoTmp(mChunkInfosTmp); + PlacementVerifier verifier; + const bool kIncludeThisChunkFlag = false; + const bool kStopIfHasAnyReplicationsInFlight = false; + mChunkToServerMap.First(); + for (const CSMap::Entry* p; (p = mChunkToServerMap.Next()); ) { + ChunkPlacement& placement = placementTmp.Get(); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(*p, servers); + vector& cblk = cinfoTmp.Get(); + if (! 
servers.empty()) { + GetPlacementExcludes(*p, placement, kIncludeThisChunkFlag, + kStopIfHasAnyReplicationsInFlight, &cblk); + } + VerifyPlacement(*p, servers, cblk, placement, + os, verboseFlag, reportAllFlag, verifier); + } + verifier.report(os, mChunkToServerMap.Size()); + return (verifier.IsHealthy() ? 0 : 1); +} + +int +LayoutEmulator::RunFsck( + const string& fileName) +{ + const string kStdout("-"); + const int outfd = fileName == kStdout ? + fileno(stdout) : + open(fileName.c_str(), O_WRONLY | O_TRUNC | O_CREAT, 0644); + if (outfd < 0) { + const int err = errno; + KFS_LOG_STREAM_ERROR << "failed to create temporary file: " << + fileName << ": " << strerror(err) << + KFS_LOG_EOM; + return (err > 0 ? -err : -1); + } + const bool kReportAbandonedFilesFlag = true; + const int cnt = FsckStreamCount( + kReportAbandonedFilesFlag); + const char* const suffix = ".XXXXXX"; + const size_t suffixLen = strlen(suffix); + ofstream* const streams = new ofstream[cnt]; + StBufferT buf; + StBufferT osbuf; + ostream** const osptr = osbuf.Resize(cnt + 1); + vector fd; + fd.reserve(cnt); + for (int i = 0; i < cnt; i++) { + char* const ptr = buf.Resize(fileName.length() + suffixLen + 1); + memcpy(ptr, fileName.data(), fileName.size()); + strcpy(ptr + fileName.size(), suffix); + int tfd = mkstemp(ptr); + int err = errno; + if (tfd > 0) { + streams[i].open(ptr); + err = errno; + unlink(ptr); + if (! streams[i]) { + close(tfd); + tfd = -1; + } + } + if (tfd < 0) { + KFS_LOG_STREAM_ERROR << "failed to create temporary file: " << + ptr << ": " << strerror(err) << + KFS_LOG_EOM; + while (--i >= 0) { + streams[i].close(); + close(fd[i]); + } + delete [] streams; + close(outfd); + return (err > 0 ? -err : -1); + } + fd.push_back(tfd); + osptr[i] = streams + i; + } + osptr[cnt] = 0; + Fsck(osptr, kReportAbandonedFilesFlag); + int err = 0; + for (int i = 0; i < cnt; i++) { + streams[i].close(); + if (! 
streams[i]) { + err = errno; + KFS_LOG_STREAM_ERROR << "failed to close temporary file: " << + strerror(err) << + KFS_LOG_EOM; + err = err > 0 ? -err : -1; + } + } + int i = 0; + if (err == 0) { + const size_t sz = 1 << 20; + char* const ptr = buf.Resize(sz); + for (i = 0; err == 0 && i < cnt; i++) { + ssize_t nrd; + while (err == 0 && (nrd = read(fd[i], ptr, sz)) > 0) { + const char* p = ptr; + const char* const e = p + nrd; + ssize_t nwr; + while (p < e && (nwr = write(outfd, p, e - p)) > 0) { + p += nwr; + } + if (p < e) { + err = errno; + KFS_LOG_STREAM_ERROR << fileName << ": " << + strerror(err) << + KFS_LOG_EOM; + err = err > 0 ? -err : -1; + break; + } + } + if (nrd < 0) { + err = errno; + KFS_LOG_STREAM_ERROR << "read failure: " << + strerror(err) << + KFS_LOG_EOM; + err = err > 0 ? -err : -1; + } + close(fd[i]); + } + } + for ( ; i < cnt; i++) { + close(fd[i]); + } + if (fileName != kStdout && close(outfd)) { + err = errno; + KFS_LOG_STREAM_ERROR << fileName << ": " << + strerror(err) << + KFS_LOG_EOM; + err = err > 0 ? -err : -1; + } + delete [] streams; + return err; +} + +LayoutEmulator gLayoutEmulator; +LayoutManager& gLayoutManager = gLayoutEmulator; + +} // namespace KFS diff --git a/src/cc/emulator/LayoutEmulator.h b/src/cc/emulator/LayoutEmulator.h new file mode 100644 index 000000000..1774224f1 --- /dev/null +++ b/src/cc/emulator/LayoutEmulator.h @@ -0,0 +1,140 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/08 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Emulator for the layout manager: read in a chunk->location +// map; we can then migrate blocks around to test, debug chunk placement and +// re-balancing, and experiment with chunk placement algorithms. +// +//---------------------------------------------------------------------------- + +#ifndef EMULATOR_LAYOUTEMULATOR_H +#define EMULATOR_LAYOUTEMULATOR_H + +#include +#include +#include +#include +#include "meta/LayoutManager.h" + +namespace KFS +{ +using std::ofstream; +using std::map; +using std::vector; +using std::string; + +class LayoutEmulator : public LayoutManager +{ +public: + LayoutEmulator() + : mVariationFromMean(0), + mNumBlksRebalanced(0), + mStopFlag(false), + mPlanFile(), + mLoc2Server() + { + SetMinChunkserversToExitRecovery(0); + ToggleRebalancing(true); + } + ~LayoutEmulator() + { + mPlanFile.close(); + } + // Given a chunk->location data in a file, rebuild the chunk->location map. 
+ // + int LoadChunkmap(const string& chunkLocationFn, + bool addChunksToReplicationChecker = false); + void AddServer(const ServerLocation& loc, + int rack, uint64_t totalSpace, uint64_t usedSpace); + void SetupForRebalancePlanning( + double utilizationPercentVariationFromMean) + { + mVariationFromMean = + min(1., max(0., utilizationPercentVariationFromMean * 1e-2)); + } + int SetRebalancePlanOutFile(const string& rebalancePlanFn); + void BuildRebalancePlan(); + bool ChunkReplicationDone(MetaChunkReplicate* req); + void ExecuteRebalancePlan(); + void PrintChunkserverBlockCount(ostream& os) const; + int ReadNetworkDefn(const string& networkFn); + int VerifyRackAwareReplication( + bool reportAllFlag, bool verbose, ostream& os); + seq_t GetChunkversion(chunkId_t cid) const; + size_t GetChunkSize(chunkId_t cid) const; + void MarkServerDown(const ServerLocation& loc); + int GetNumBlksRebalanced() const + { + return mNumBlksRebalanced; + } + void Stop() + { + mStopFlag = true; + } + int RunFsck(const string& fileName); +private: + typedef map Loc2Server; + class PlacementVerifier; + + size_t RunChunkserverOps(); + void CalculateRebalaceThresholds(); + void PrepareRebalance(bool enableRebalanceFlag); + bool Parse(const char* line, size_t size, + bool addChunksToReplicationChecker, ServerLocation& loc); + void ShowPlacementError( + ostream& os, + const CSMap::Entry& c, + const ChunkServer* srv, + string& fileName, + size_t replicas, + const char* reason); + void VerifyPlacement( + const CSMap::Entry& c, + const Servers& servers, + const vector& cblk, + ChunkPlacement& placement, + ostream& os, + bool verboseFlag, + bool reportAllFlag, + PlacementVerifier& verifier); + size_t GetChunkSize(const CSMap::Entry& ci) const; + + // for the purposes of rebalancing, we compute the cluster + // wide average space utilization; then we take into the + // desired variation from mean to compute thresholds that determine + // which nodes are candidates for migration. 
+ double mVariationFromMean; + int mNumBlksRebalanced; + bool mStopFlag; + ofstream mPlanFile; + Loc2Server mLoc2Server; +private: + // No copy. + LayoutEmulator(const LayoutEmulator&); + LayoutEmulator& operator=(const LayoutEmulator&); +}; + +extern LayoutEmulator gLayoutEmulator; +} + +#endif // EMULATOR_LAYOUTEMULATOR_H diff --git a/src/cc/emulator/emulator_setup.cc b/src/cc/emulator/emulator_setup.cc new file mode 100644 index 000000000..bcb160c29 --- /dev/null +++ b/src/cc/emulator/emulator_setup.cc @@ -0,0 +1,97 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/29 +// + +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \brief Code to setup the emulator: load checkpoint, replay transaction logs, +// and load "network definition" -- chunk to chunk server assigment, rack +// assignment, and chunk server space utilization. 
+//---------------------------------------------------------------------------- + +#include "LayoutEmulator.h" +#include "emulator_setup.h" + +#include "meta/kfstree.h" +#include "meta/Checkpoint.h" +#include "meta/Replay.h" +#include "meta/Restorer.h" +#include "meta/Logger.h" +#include "meta/util.h" +#include "common/MsgLogger.h" + +#include + +namespace KFS +{ +using std::string; + +int +EmulatorSetup( + string& logdir, + string& cpdir, + string& networkFn, + string& chunkmapFn, + int16_t minReplicasPerFile, + bool addChunksToReplicationChecker) +{ + logger_setup_paths(logdir); + checkpointer_setup_paths(cpdir); + + KFS_LOG_STREAM_INFO << "restoring from checkpoint: " << LASTCP << + KFS_LOG_EOM; + int status; + if (file_exists(LASTCP)) { + Restorer r; + status = r.rebuild(LASTCP, minReplicasPerFile) ? 0 : -EIO; + // gLayoutEmulator.InitRecoveryStartTime(); + } else { + status = metatree.new_tree(); + } + if (status != 0) { + return status; + } + KFS_LOG_STREAM_INFO << "replaying logs from: " << logdir << + KFS_LOG_EOM; + status = replayer.playAllLogs(); + if (status != 0) { + return status; + } + KFS_LOG_STREAM_INFO << "updating meta tree" << + KFS_LOG_EOM; + metatree.setUpdatePathSpaceUsage(true); + metatree.enableFidToPathname(); + KFS_LOG_STREAM_INFO << "reading network defn: " << networkFn << + KFS_LOG_EOM; + status = gLayoutEmulator.ReadNetworkDefn(networkFn); + if (status != 0) { + return status; + } + KFS_LOG_STREAM_INFO << "loading chunkmap: " << chunkmapFn << + KFS_LOG_EOM; + status = gLayoutEmulator.LoadChunkmap( + chunkmapFn, addChunksToReplicationChecker); + KFS_LOG_STREAM_INFO << "fs layout emulator setup complete." 
<< KFS_LOG_EOM; + return status; +} + +} diff --git a/src/cc/emulator/emulator_setup.h b/src/cc/emulator/emulator_setup.h new file mode 100644 index 000000000..a2eb6fa22 --- /dev/null +++ b/src/cc/emulator/emulator_setup.h @@ -0,0 +1,49 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/29 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Setup code to get an emulator up. +// +//---------------------------------------------------------------------------- + +#ifndef EMULATOR_EMULATORSETUP_H +#define EMULATOR_EMULATORSETUP_H + +#include +#include + +namespace KFS +{ +using std::string; +using std::ostream; + +// pass an optional argument that enables changing the degree of replication for a file. 
+int EmulatorSetup( + string& logdir, + string& cpdir, + string& networkFn, + string& chunkmapFn, + int16_t minReplicasPerFile = 1, + bool addChunksToReplicationChecker = false); +} + +#endif // EMULATOR_EMULATORSETUP_H diff --git a/src/cc/emulator/rebalanceexecutor_main.cc b/src/cc/emulator/rebalanceexecutor_main.cc new file mode 100644 index 000000000..a27a5382c --- /dev/null +++ b/src/cc/emulator/rebalanceexecutor_main.cc @@ -0,0 +1,142 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Driver program to run the meta server in emulator mode and +// executes the plan for re-balancing blocks. +// Might be used for "off-line" debugging meta server layout emulation code, +// and / or the re-balance plan / off-line re-balancer. 
+// +//---------------------------------------------------------------------------- + +#include "LayoutEmulator.h" +#include "emulator_setup.h" + +#include "common/MsgLogger.h" +#include "common/Properties.h" +#include "common/MdStream.h" +#include "meta/AuditLog.h" + +#include + +using std::string; +using std::cout; + +using namespace KFS; + +int +main(int argc, char** argv) +{ + string rebalancePlanFn("rebalanceplan.txt"); + string logdir("kfslog"); + string cpdir("kfscp"); + string networkFn("network.def"); + string chunkmapFn("chunkmap.txt"); + string propsFn; + string chunkMapDir; + int optchar; + bool helpFlag = false; + bool debugFlag = false; + + while ((optchar = getopt(argc, argv, "c:l:n:b:r:hdp:o:")) != -1) { + switch (optchar) { + case 'l': + logdir = optarg; + break; + case 'c': + cpdir = optarg; + break; + case 'n': + networkFn = optarg; + break; + case 'b': + chunkmapFn = optarg; + break; + case 'r': + rebalancePlanFn = optarg; + break; + case 'h': + helpFlag = true; + break; + case 'd': + debugFlag = true; + break; + case 'p': + propsFn = optarg; + break; + case 'o': + chunkMapDir = optarg; + break; + default: + helpFlag = true; + break; + } + } + + if (helpFlag || rebalancePlanFn.empty()) { + cout << "Usage: " << argv[0] << "\n" + "[-l (default " << logdir << ")]\n" + "[-c (default " << cpdir << ")]\n" + "[-n (default " << + networkFn << ")]\n" + "[-b (default " << chunkmapFn << ")]\n" + "[-r ] (default" << rebalancePlanFn << ")\n" + " 0 - use default / configured re-balance thresholds]\n" + "[-p <[meta server] configuration file> (default none)]\n" + "[-o (default none)]\n" + "[-d debug -- print chunk into stdout layout before and after]\n" + ; + return 1; + } + + MdStream::Init(); + MsgLogger::Init(0, MsgLogger::kLogLevelINFO); + + Properties props; + int status = 0; + if ((propsFn.empty() || + (status = props.loadProperties(propsFn.c_str(), char('='), false)) + == 0)) { + gLayoutEmulator.SetParameters(props); + if ((status = 
EmulatorSetup(logdir, cpdir, networkFn, chunkmapFn)) + == 0 && + (status = gLayoutEmulator.LoadRebalancePlan(rebalancePlanFn)) + == 0) { + if (debugFlag) { + gLayoutEmulator.PrintChunkserverBlockCount(cout); + } + gLayoutEmulator.ExecuteRebalancePlan(); + if (! chunkMapDir.empty()) { + gLayoutEmulator.DumpChunkToServerMap(chunkMapDir); + } + if (debugFlag) { + gLayoutEmulator.PrintChunkserverBlockCount(cout); + } + } + } + + AuditLog::Stop(); + MdStream::Cleanup(); + + return (status == 0 ? 0 : 1); +} + diff --git a/src/cc/emulator/rebalanceplanner_main.cc b/src/cc/emulator/rebalanceplanner_main.cc new file mode 100644 index 000000000..ef86f4b21 --- /dev/null +++ b/src/cc/emulator/rebalanceplanner_main.cc @@ -0,0 +1,185 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Driver program to run the meta server on-line re-balancing off-line +// and create re-balance plan for moving / re-assigning chunks (fs blocks). +// The plan can be executed by the meta server or layout emulator, or can be +// used for testing or debugging meta server re-balancing logic off-line. 
+// +//---------------------------------------------------------------------------- + +#include "LayoutEmulator.h" +#include "emulator_setup.h" + +#include "common/MsgLogger.h" +#include "common/Properties.h" +#include "common/MdStream.h" +#include "qcdio/QCUtils.h" +#include "meta/AuditLog.h" + +#include +#include + +using std::string; +using std::cout; +using std::cerr; + +using namespace KFS; + +static void HandleStop(int) { gLayoutEmulator.Stop(); } + +int +main(int argc, char** argv) +{ + string rebalancePlanFn("rebalanceplan.txt"); + string logdir("kfslog"); + string cpdir("kfscp"); + string networkFn("network.def"); + string chunkmapFn("chunkmap.txt"); + string propsFn; + string chunkMapDir; + int optchar; + int16_t minReplication = -1; + double variationFromAvg = 0; + bool helpFlag = false; + bool debugFlag = false; + + while ((optchar = getopt(argc, argv, "c:l:n:b:r:hp:o:dm:t:")) != -1) { + switch (optchar) { + case 'l': + logdir = optarg; + break; + case 'c': + cpdir = optarg; + break; + case 'n': + networkFn = optarg; + break; + case 'b': + chunkmapFn = optarg; + break; + case 'r': + rebalancePlanFn = optarg; + break; + case 'h': + helpFlag = true; + break; + case 't': + variationFromAvg = atof(optarg); + break; + case 'p': + propsFn = optarg; + break; + case 'o': + chunkMapDir = optarg; + break; + case 'd': + debugFlag = true; + break; + case 'm': + minReplication = atoi(optarg); + break; + default: + cerr << "Unrecognized flag: " << (char)optchar << "\n"; + helpFlag = true; + break; + } + } + if (helpFlag || rebalancePlanFn.empty()) { + cout << + "Usage: " << argv[0] << "\n" + "[-l (default " << logdir << ")]\n" + "[-c (default " << cpdir << ")]\n" + "[-n (default " << + networkFn << ")]\n" + "[-b (default " << chunkmapFn << ")]\n" + "[-r ] (default" << rebalancePlanFn << ")\n" + "[-t <% variation from average utilization> (default " << + variationFromAvg << "%)" + " 0 - use default / configured re-balance thresholds]\n" + "[-p <[meta server] 
configuration file> (default none)]\n" + "[-o (default none)]\n" + "[-d debug -- print chunk layout before and after]\n" + "[-m (default -1 -- no change)]\n" + "To create network defininiton file and chunk map files:\n" + "telnet to the meta server, and issue DUMP_CHUNKTOSERVERMAP\n" + "followed by an empty line.\n" + "Copy produced chunkmap.txt and network.def files as well as,\n" + "latest checkpoint and transacton logs.\n" + "The closer in time the checkpoint and logs to the chumk map\n" + "the better -- less descrepancies due to fs modifications.\n" + ; + return 1; + } + + MdStream::Init(); + MsgLogger::Init(0, MsgLogger::kLogLevelINFO); + + if (signal(SIGINT, &HandleStop) == SIG_ERR) { + KFS_LOG_STREAM_ERROR << + QCUtils::SysError(errno, "signal(SIGINT):") << + KFS_LOG_EOM; + return 1; + } + if (signal(SIGQUIT, &HandleStop) == SIG_ERR) { + KFS_LOG_STREAM_ERROR << + QCUtils::SysError(errno, "signal(SIGQUIT):") << + KFS_LOG_EOM; + return 1; + } + + Properties props; + int status = 0; + if (propsFn.empty() || + (status = props.loadProperties(propsFn.c_str(), char('='), false)) + == 0) { + gLayoutEmulator.SetParameters(props); + gLayoutEmulator.SetupForRebalancePlanning(variationFromAvg); + status = EmulatorSetup(logdir, cpdir, networkFn, chunkmapFn, + minReplication, minReplication > 1); + if (status == 0 && + (status = gLayoutEmulator.SetRebalancePlanOutFile( + rebalancePlanFn)) == 0) { + if (debugFlag) { + gLayoutEmulator.PrintChunkserverBlockCount(cout); + } + KFS_LOG_STREAM_NOTICE << "creating re-balance plan: " << + rebalancePlanFn << + KFS_LOG_EOM; + gLayoutEmulator.BuildRebalancePlan(); + if (! chunkMapDir.empty()) { + gLayoutEmulator.DumpChunkToServerMap(chunkMapDir); + } + if (debugFlag) { + gLayoutEmulator.PrintChunkserverBlockCount(cout); + } + KFS_LOG_STREAM_NOTICE << "replicated chunks: " << + gLayoutEmulator.GetNumBlksRebalanced() << + KFS_LOG_EOM; + } + } + AuditLog::Stop(); + MdStream::Cleanup(); + return (status == 0 ? 
0 : 1); +} + diff --git a/src/cc/emulator/replicachecker_main.cc b/src/cc/emulator/replicachecker_main.cc new file mode 100644 index 000000000..6266e62a8 --- /dev/null +++ b/src/cc/emulator/replicachecker_main.cc @@ -0,0 +1,131 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/08/27 +// +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Read in a network map, a block location map and verify that +// the three copies of a replica are on three different racks and server, and +// report chunk placement problems. 
+// +//---------------------------------------------------------------------------- + +#include "LayoutEmulator.h" +#include "emulator_setup.h" + +#include "common/MdStream.h" +#include "common/MsgLogger.h" +#include "common/Properties.h" +#include "meta/AuditLog.h" + +using std::string; +using std::cout; +using std::cerr; + +using namespace KFS; + +int +main(int argc, char** argv) +{ + string logdir("kfslog"); + string cpdir("kfscp"); + string networkFn("network.def"); + string chunkmapFn("chunkmap.txt"); + string fsckFn("-"); + string propsFn; + int optchar; + bool helpFlag = false; + bool reportAllFlag = false; + bool verboseFlag = false; + + while ((optchar = getopt(argc, argv, "avc:l:n:b:r:hf:p:")) != -1) { + switch (optchar) { + case 'l': + logdir = optarg; + break; + case 'c': + cpdir = optarg; + break; + case 'n': + networkFn = optarg; + break; + case 'b': + chunkmapFn = optarg; + break; + case 'h': + helpFlag = true; + break; + case 'a': + reportAllFlag = true; + break; + case 'f': + fsckFn = optarg; + break; + case 'v': + verboseFlag = true; + break; + case 'p': + propsFn = optarg; + break; + default: + cerr << "Unrecognized flag " << (char)optchar << "\n"; + helpFlag = true; + break; + } + } + + if (helpFlag) { + cout << "Usage: " << argv[0] << "\n" + "[-l (default " << logdir << ")]\n" + "[-c (default " << cpdir << ")]\n" + "[-n (default " << + networkFn << ")]\n" + "[-b (default " << chunkmapFn << ")]\n" + "[-p <[meta server] configuration file> (default none)]\n" + "[-f (- stdout) (default " << + fsckFn << ")]\n" + "[-v verbose replica check output]\n" + "[-a report all placement problems]\n" + ; + return 1; + } + + MdStream::Init(); + MsgLogger::Init(0, MsgLogger::kLogLevelINFO); + Properties props; + int fsckStatus = 0; + int status = 0; + if (propsFn.empty() || + (status = props.loadProperties(propsFn.c_str(), char('='), false)) + == 0) { + gLayoutEmulator.SetParameters(props); + if ((status = EmulatorSetup(logdir, cpdir, networkFn, chunkmapFn)) == 
+ 0) { + if (! fsckFn.empty()) { + fsckStatus = gLayoutEmulator.RunFsck(fsckFn); + } + status = gLayoutEmulator.VerifyRackAwareReplication( + reportAllFlag, verboseFlag, cout); + } + } + AuditLog::Stop(); + MdStream::Cleanup(); + return (status ? 1 : (fsckStatus == 0 ? 0 : 1)); +} diff --git a/src/cc/fuse/CMakeLists.txt b/src/cc/fuse/CMakeLists.txt new file mode 100644 index 000000000..84f2999fd --- /dev/null +++ b/src/cc/fuse/CMakeLists.txt @@ -0,0 +1,43 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +# +# Build the fuse binary; build it only on demand +# +add_executable (kfs_fuse kfs_fuse_main.cc) +link_directories(${Fuse_LIBRARY_DIR}) + +IF (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SIZEOF_VOID_P STREQUAL "8" AND EXISTS "/usr/local/lib/libfuse_ino64.dylib") + message(STATUS "Using fuse_ino64 dylib") + target_link_libraries (kfs_fuse kfsClient qcdio fuse_ino64 pthread) +ELSE (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SIZEOF_VOID_P STREQUAL "8" AND EXISTS "/usr/local/lib/libfuse_ino64.dylib") + target_link_libraries (kfs_fuse kfsClient qcdio fuse pthread) +ENDIF (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SIZEOF_VOID_P STREQUAL "8" AND EXISTS "/usr/local/lib/libfuse_ino64.dylib") + +# +install (TARGETS kfs_fuse + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib-static) diff --git a/src/cc/fuse/TODO b/src/cc/fuse/TODO new file mode 100644 index 000000000..b2e797abe --- /dev/null +++ b/src/cc/fuse/TODO @@ -0,0 +1,3 @@ +* Reads & writes are 4k. There's supposed to be a way to enable large + reads and writes in fuse. +* Permissions come out --------- when you cp from kfs to local. diff --git a/src/cc/fuse/kfs_fuse_main.cc b/src/cc/fuse/kfs_fuse_main.cc new file mode 100644 index 000000000..3ec4ee6e1 --- /dev/null +++ b/src/cc/fuse/kfs_fuse_main.cc @@ -0,0 +1,549 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/11/01 +// Author: Blake Lewis (Kosmix Corp.) +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Kfs fuse. +// Default is to mount read only, as non sequential write isn't supported with +// files created with Reed-Solomon recovery, as well as simultaneous read and +// write (O_RDWR) into the same file by a single writer. +// +//---------------------------------------------------------------------------- + +#include "libclient/KfsClient.h" + +#define FUSE_USE_VERSION 26 +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include + +using std::string; +using std::vector; +using KFS::KfsClient; +using KFS::KfsFileAttr; +using KFS::kfsMode_t; +using KFS::kfsUid_t; +using KFS::kfsGid_t; +using KFS::kKfsUserNone; +using KFS::kKfsGroupNone; +using KFS::kKfsModeUndef; +using KFS::Permissions; +using KFS::KFS_STRIPED_FILE_TYPE_NONE; + +static KfsClient *client; + +static inline kfsMode_t +mode2kfs_mode(mode_t mode) +{ + kfsMode_t km = (kfsMode_t)mode & Permissions::kAccessModeMask; +#ifdef S_ISVTX + if ((mode & S_ISVTX) != 0) { + km |= Permissions::kStickyBit; + } +#endif + return km; +} + +static int +fuse_getattr(const char *path, struct stat *s) +{ + KfsFileAttr attr; + int status = client->Stat(path, attr); + if (status < 0) + return status; + attr.ToStat(*s); + return 0; +} + +static int +fuse_fgetattr(const char *path, struct stat *s, struct fuse_file_info *finfo) +{ + return fuse_getattr(path, s); +} + +static int +fuse_mkdir(const char *path, mode_t mode) +{ + return client->Mkdir(path, mode2kfs_mode(mode)); +} + +static int +fuse_unlink(const char *path) +{ + return client->Remove(path); +} + +static int 
+fuse_rmdir(const char *path) +{ + return client->Rmdir(path); +} + +static int +fuse_rename(const char *src, const char *dst) +{ + return client->Rename(src, dst, false); +} + +static int +fuse_ftruncate(const char *path, off_t size, struct fuse_file_info *finfo) +{ + return client->Truncate(finfo->fh, size); +} + +static int +fuse_truncate(const char *path, off_t size) +{ + return client->Truncate(path, size); +} + +static int +fuse_open(const char *path, struct fuse_file_info *finfo) +{ + int fd = client->Open(path, finfo->flags); + if (fd < 0) + return fd; + finfo->fh = fd; + return 0; +} + +static int +fuse_create(const char *path, mode_t mode, struct fuse_file_info *finfo) +{ + const int numReplicas = 3; + const bool exclusive = false; + const int numStripes = 0; + const int numRecoveryStripes = 0; + const int stripeSize = 0; + const int stripedType = KFS_STRIPED_FILE_TYPE_NONE; + const bool forceTypeFlag = true; + const kfsMode_t kfs_mode = + (kfsMode_t)mode & Permissions::kAccessModeMask; + int fd = client->Create(path, + numReplicas, + exclusive, + numStripes, + numRecoveryStripes, + stripeSize, + stripedType, + forceTypeFlag, + kfs_mode + ); + if (fd < 0) + return fd; + finfo->fh = fd; + return 0; +} + +static int +fuse_read(const char *path, char *buf, size_t nbytes, off_t off, + struct fuse_file_info *finfo) +{ + return (int)client->PRead(finfo->fh, off, buf, nbytes); +} + +static int +fuse_write(const char *path, const char *buf, size_t nbytes, off_t off, + struct fuse_file_info *finfo) +{ + return (int)client->PWrite(finfo->fh, off, buf, nbytes); +} + +static int +fuse_flush(const char *path, struct fuse_file_info *finfo) +{ + // NO! 
+ return 0; +} + +static int +fuse_release(const char *path, struct fuse_file_info *finfo) +{ + return client->Close(finfo->fh); +} + +static int +fuse_fsync(const char *path, int flags, struct fuse_file_info *finfo) +{ + return client->Sync(finfo->fh); +} + +static int +fuse_opendir(const char *path, struct fuse_file_info *finfo) +{ + if (!client->IsDirectory(path)) + return -ENOTDIR; + return 0; +} + +static int +fuse_readdir(const char *path, void *buf, + fuse_fill_dir_t filler, off_t offset, + struct fuse_file_info *finfo) +{ + vector contents; + int status = client->ReaddirPlus(path, contents); + if (status < 0) + return status; + int n = contents.size(); + for (int i = 0; i < n; i++) { + struct stat s; + contents[i].ToStat(s); + if (filler(buf, contents[i].filename.c_str(), &s, 0) != 0) { + break; + } + } + return 0; +} + +static int +fuse_releasedir(const char *path, struct fuse_file_info *finfo) +{ + return 0; +} + +static int +fuse_access(const char *path, int mode) +{ + KfsFileAttr attr; + int status = client->Stat(path, attr); + if (status != 0) { + return status; + } + if (attr.mode == kKfsModeUndef || mode == F_OK) { + return 0; + } + if (((mode & R_OK) != 0 && (attr.mode & 0400) == 0) || + ((mode & W_OK) != 0 && (attr.mode & 0200) == 0) || + ((mode & X_OK) != 0 && (attr.mode & 0100) == 0)) { + return -EACCES; + } + return 0; +} + +static int +fuse_chmod(const char *path, mode_t mode) +{ + return client->Chmod(path, mode2kfs_mode(mode)); +} + +static int +fuse_chown(const char *path, uid_t user, gid_t group) +{ + return client->Chown(path, + user == (uid_t)-1 ? kKfsUserNone : (kfsUid_t)user, + group == (gid_t)-1 ? 
kKfsGroupNone : (kfsGid_t)group + ); +} + +struct fuse_operations ops = { + fuse_getattr, + NULL, /* readlink */ + NULL, /* getdir */ + NULL, /* mknod */ + fuse_mkdir, + fuse_unlink, + fuse_rmdir, + NULL, /* symlink */ + fuse_rename, + NULL, /* link */ + fuse_chmod, /* chmod */ + fuse_chown, /* chown */ + fuse_truncate, + NULL, /* utime */ + fuse_open, + fuse_read, + fuse_write, + NULL, /* statfs */ + fuse_flush, /* flush */ + fuse_release, /* release */ + fuse_fsync, /* fsync */ + NULL, /* setxattr */ + NULL, /* getxattr */ + NULL, /* listxattr */ + NULL, /* removexattr */ + fuse_opendir, + fuse_readdir, + fuse_releasedir, + NULL, /* fsyncdir */ + NULL, /* init */ + NULL, /* destroy */ + fuse_access, /* access */ + fuse_create, /* create */ + fuse_ftruncate, /* ftruncate */ + fuse_fgetattr, /* fgetattr */ +}; + +struct fuse_operations ops_readonly = { + fuse_getattr, + NULL, /* readlink */ + NULL, /* getdir */ + NULL, /* mknod */ + NULL, /* mkdir */ + NULL, /* unlink */ + NULL, /* rmdir */ + NULL, /* symlink */ + NULL, /* rename */ + NULL, /* link */ + NULL, /* chmod */ + NULL, /* chown */ + NULL, /* truncate */ + NULL, /* utime */ + fuse_open, + fuse_read, + NULL, /* write */ + NULL, /* statfs */ + NULL, /* flush */ + fuse_release, /* release */ + NULL, /* fsync */ + NULL, /* setxattr */ + NULL, /* getxattr */ + NULL, /* listxattr */ + NULL, /* removexattr */ + fuse_opendir, + fuse_readdir, + NULL, /* releasedir */ + NULL, /* fsyncdir */ + NULL, /* init */ + NULL, /* destroy */ + fuse_access, /* access */ + NULL, /* create */ + NULL, /* ftruncate */ + fuse_fgetattr, /* fgetattr */ +}; + +void +fatal(const char *fmt, ...) 
+{ + va_list arg; + + fflush(stdout); + + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); + + if (errno != 0) + fprintf(stderr, " %s", strerror(errno)); + fprintf(stderr, "\n"); + + exit(2); +} + +void +initkfs(char *addr) +{ + char *cp; + + if ((cp = strchr(addr, ':')) == NULL) + fatal("bad address: %s", addr); + string host(addr, cp - addr); + int port = atoi(cp + 1); + if ((client = KFS::Connect(host, port)) == NULL) + fatal("connect: %s:%d", host.c_str(), port); +} + +static struct fuse_args* +get_fs_args(struct fuse_args* args) +{ +#ifdef KFS_OS_NAME_DARWIN + return NULL; +#else + if (!args) { + return NULL; + } + args->argc = 2; + args->argv = (char**)calloc(sizeof(char*), args->argc + 1); + args->argv[0] = strdup("kfs_fuse"); + args->argv[1] = strdup("-obig_writes"); + args->allocated = 1; + return args; +#endif +} + +static struct fuse_args* +get_mount_args(struct fuse_args* args, const char* options) +{ + if (!args) { + return NULL; + } + + args->argc = 2; + args->argv = (char**)calloc(sizeof(char*), args->argc + 1); + args->argv[0] = strdup("unused_arg0"); + args->argv[1] = strdup(options); + args->allocated = 1; + return args; +} + +/* + * Run through the -o OPTIONS and interpret it as writable only if 'rrw' is + * explicitly specified. We use 'rrw' instead of 'rw' because a 'default' + * mount option entry in the fstab calls us with 'rw', but we want the default + * behavior to be readonly. + */ +static int +massage_options(char** opt_argv, int opt_argc, string* options, bool* readonly) +{ + if (!opt_argv || !readonly || !options) { + return -1; + } + if (opt_argc <= 0 || opt_argc > 2 || strncmp(opt_argv[0], "-o", 2)) { + return -1; + } + *readonly = true; + string cmdline = opt_argc == 1 ? 
opt_argv[0] + 2 : opt_argv[1]; + + size_t start = 0; + size_t end = 0; + vector opts; + string delim = " ,"; + while (true) { + start = cmdline.find_first_not_of(delim, start); + if (start == string::npos){ + break; + } + + end = cmdline.find_first_of(delim, start); + if (end == string::npos) { + string token = cmdline.substr(start); + if (token == "rrw") { + *readonly = false; + opts.push_back("rw"); + } else if (token != "rw") { + opts.push_back(token); + } + break; + } + string token = cmdline.substr(start, end - start); + if (token == "rrw") { + *readonly = false; + opts.push_back("rw"); + } else if (token != "rw") { + opts.push_back(token); + } + start = end; + } + + if (*readonly) { + *options = "-oro"; + } else { + *options = "-orw"; + } + + while (!opts.empty()) { + string token = opts.back(); + opts.pop_back(); + if (token == "rw" || token == "ro") { + continue; + } + options->append(","); + options->append(token); + } + return 0; +} + +/* + * Fork and do the work in the child so that init will reap the process. + * Do the KfsClient connection, fuse mount, and so on in the child process. + */ +void +initfuse(char* kfs_host_address, const char* mountpoint, + const char* options, bool readonly) +{ + int pid = fork(); + if (pid < 0) { + fatal("fork:"); + } + if (pid == 0) { + initkfs(kfs_host_address); + + struct fuse_args fs_args; + struct fuse_args mnt_args; + + struct fuse_chan* ch = NULL; + ch = fuse_mount(mountpoint, get_mount_args(&mnt_args, options)); + if (ch == NULL) { + delete client; + fatal("fuse_mount: %s:", mountpoint); + } + + struct fuse* fuse = NULL; + fuse = fuse_new(ch, get_fs_args(&fs_args), + (readonly ? &ops_readonly : &ops), + (readonly ? 
sizeof(ops_readonly) : sizeof(ops)), + NULL); + if (fuse == NULL) { + fuse_unmount(mountpoint, ch); + delete client; + fatal("fuse_new:"); + } + + fuse_loop_mt(fuse); + fuse_unmount(mountpoint, ch); + fuse_destroy(fuse); + delete client; + } + return; +} + +void +usage(int e) +{ + //Undocumented option: 'rrw'. See massage_options() above. + fprintf(stderr, "usage: kfs_fuse kfshost mountpoint [-o opt1[,opt2..]]\n" + " eg: kfs_fuse 127.0.0.1:20000 " + "/mnt/kfs -o allow_other,ro\n"); + exit(e); +} + +int +main(int argc, char **argv) +{ + argc--; argv++; + + if (argc >= 1 && ( + !strncmp("-h", argv[0], 2) || + !strncmp("-help", argv[0], 5) || + !strncmp("--help", argv[0], 6))) + usage(0); + + if (argc < 2) + usage(1); + + // Default is readonly mount,private mount. + string options("-oro"); + bool readonly = true; + if (argc > 2) { + if (massage_options(argv + 2, argc - 2, &options, &readonly) < 0) { + usage(1); + } + } + + //setsid(); // detach from console + + initfuse(argv[0], argv[1], options.c_str(), readonly); + + return 0; +} diff --git a/src/cc/kfsio/Acceptor.cc b/src/cc/kfsio/Acceptor.cc new file mode 100644 index 000000000..2a2349b72 --- /dev/null +++ b/src/cc/kfsio/Acceptor.cc @@ -0,0 +1,154 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/23 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "Acceptor.h" +#include "NetManager.h" +#include "Globals.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include + +namespace KFS +{ +using namespace KFS::libkfsio; +/// +/// Create a TCP socket, bind it to the port, and listen for incoming connections. +/// +Acceptor::Acceptor(NetManager& netManager, int port, IAcceptorOwner *owner) + : mPort(port), + mAcceptorOwner(owner), + mConn(), + mNetManager(netManager) +{ + SET_HANDLER(this, &Acceptor::RecvConnection); + Acceptor::Listen(); +} + +Acceptor::Acceptor(int port, IAcceptorOwner *owner) + : mPort(port), + mAcceptorOwner(owner), + mConn(), + mNetManager(globalNetManager()) +{ + SET_HANDLER(this, &Acceptor::RecvConnection); + Acceptor::Listen(); +} + +Acceptor::~Acceptor() +{ + if (mConn) { + mConn->Close(); + mConn.reset(); + } +} + +void +Acceptor::Listen() +{ + if (! mNetManager.IsRunning()) { + return; + } + if (mConn) { + mConn->Close(); + mConn.reset(); + } + TcpSocket* const sock = new TcpSocket(); + const bool kNonBlockingAcceptFlag = true; + const int res = sock->Listen(mPort, kNonBlockingAcceptFlag); + if (res < 0) { + KFS_LOG_STREAM_ERROR << + "Unable to bind to port: " << mPort << + " error: " << QCUtils::SysError(-res) << + KFS_LOG_EOM; + delete sock; + return; + } + mConn.reset(new NetConnection(sock, this, true)); + mConn->EnableReadIfOverloaded(); + mNetManager.AddConnection(mConn); +} + +/// +/// Event handler that gets called back whenever a new connection is +/// received. In response, the AcceptorOwner object is first notified of +/// the new connection and then, the new connection is added to the +/// list of connections owned by the NetManager. 
@see NetManager +/// +int +Acceptor::RecvConnection(int code, void* data) +{ + switch (code) { + case EVENT_NEW_CONNECTION: + break; + case EVENT_NET_ERROR: + KFS_LOG_STREAM_INFO << + "acceptor on port: " << mPort << + " error: " << + QCUtils::SysError(mConn ? mConn->GetSocketError() : 0) << + (mNetManager.IsRunning() ? ", restarting" : ", exiting") << + KFS_LOG_EOM; + if (mConn) { + mConn->Close(); + mConn.reset(); + } + if (mNetManager.IsRunning()) { + Listen(); + if (! IsAcceptorStarted()) { + abort(); + } + } + return 0; + case EVENT_INACTIVITY_TIMEOUT: + KFS_LOG_STREAM_DEBUG << + "acceptror inactivity timeout event ignored" << + KFS_LOG_EOM; + return 0; + default: + KFS_LOG_STREAM_FATAL << + "Unexpected event code: " << code << + KFS_LOG_EOM; + abort(); + break; + } + if (! data) { + KFS_LOG_STREAM_FATAL << + "Unexpected null argument, event code: " << code << + KFS_LOG_EOM; + abort(); + } + NetConnectionPtr& conn = *reinterpret_cast(data); + KfsCallbackObj* const obj = mAcceptorOwner->CreateKfsCallbackObj(conn); + if (conn) { + if (obj) { + conn->SetOwningKfsCallbackObj(obj); + mNetManager.AddConnection(conn); + } else { + conn->Close(); + } + } + return 0; +} +} diff --git a/src/cc/kfsio/Acceptor.h b/src/cc/kfsio/Acceptor.h new file mode 100644 index 000000000..b199e74c9 --- /dev/null +++ b/src/cc/kfsio/Acceptor.h @@ -0,0 +1,116 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/22 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_ACCEPTOR_H +#define _LIBIO_ACCEPTOR_H + +#include "KfsCallbackObj.h" +#include "NetConnection.h" + +namespace KFS +{ + +/// +/// \file Acceptor.h +/// \brief Mechanism for accepting TCP connections. +/// +/// Accepting a TCP connection consists of two pieces: +/// +/// 1. Setting up a TCP socket and marking it for listen: This is +/// handled by the Acceptor. +/// +/// 2. Once a connection is received, "doing something" with it: +/// This is handled by the IAcceptorOwner. +/// + +/// +/// \class IAcceptorOwner +/// Abstract class defines the interface that must be implemented by an +/// owner of an acceptor object. +/// + +class IAcceptorOwner { +public: + virtual ~IAcceptorOwner() { }; + + /// + /// Callback that will be invoked whenever a new connection is + /// received. The callback is expected to create a continuation + /// and return that as the result. + /// @param conn A smart pointer that encapsulates the connection + /// that was received. @see NetConnectionPtr + /// + virtual KfsCallbackObj *CreateKfsCallbackObj(NetConnectionPtr &conn) = 0; +}; + +class NetManager; + +/// +/// \class Acceptor +/// A continuation for receiving connections on a TCP port. Calls +/// back the associated IAcceptorOwner whenever a connection is received. +/// +class Acceptor : public KfsCallbackObj { +public: + + /// + /// @param port Port number used to listen for incoming + /// connections. 
+ /// @param owner The IAcceptorOwner object that "owns" this Acceptor. + /// + Acceptor(int port, IAcceptorOwner *owner); + Acceptor(NetManager& netManager, int port, IAcceptorOwner *owner); + ~Acceptor(); + + /// Return true if we were able to bind to the acceptor port + bool IsAcceptorStarted() const { + return (mConn && mConn->IsGood()); + } + /// + /// Event handler to handle incoming connections. @see KfsCallbackObj + /// @param code Unused argument + /// @param data NetConnectionPtr object that encapsulates the + /// accepted connection. + /// @result Returns 0. + /// + int RecvConnection(int code, void *data); + +private: + /// + /// The encapsulated connection object that corresponds to the TCP + /// port on which the Acceptor is listening for connections. + /// + const int mPort; + IAcceptorOwner* const mAcceptorOwner; + NetConnectionPtr mConn; + NetManager& mNetManager; + + void Listen(); +}; + +} + +#endif // _LIBIO_ACCEPTOR_H diff --git a/src/cc/kfsio/BufferedSocket.cc b/src/cc/kfsio/BufferedSocket.cc new file mode 100644 index 000000000..d6f41b493 --- /dev/null +++ b/src/cc/kfsio/BufferedSocket.cc @@ -0,0 +1,139 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/07/03 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \brief Code for doing buffered reads from socket. +// +//---------------------------------------------------------------------------- + +#include +#include + +#include "BufferedSocket.h" + +namespace KFS +{ +using std::string; + +int +BufferedSocket::ReadLine(string &result) +{ + int navail, nread = 0; + char *lineEnd; + + lineEnd = index(mHead, '\n'); + if (lineEnd != NULL) { + nread = lineEnd - mHead + 1; + result.append(mHead, nread); + Consume(nread); + return nread; + } + + // no line-end...so, copy out what is in the buffer + if (mAvail > 0) { + nread = mAvail; + result.append(mHead, nread); + Consume(nread); + } + + // read until we get a new-line + while (1) { + navail = Recv(mBuffer, BUF_SIZE); + if (navail == 0) { + // socket is down + return nread; + } + if ((navail < 0) && (errno == EAGAIN)) + continue; + if (navail < 0) + break; + + Fill(navail); + + lineEnd = index(mBuffer, '\n'); + if (lineEnd == NULL) { + // haven't hit a line boundary...so, keep going + result.append(mBuffer, navail); + nread += navail; + Consume(navail); + continue; + } + navail = (lineEnd - mBuffer + 1); + nread += navail; + result.append(mBuffer, navail); + Consume(navail); + break; + } + return nread; +} + +int +BufferedSocket::DoSynchRecv(char *buf, int bufLen, struct timeval &timeout) +{ + int nread = 0, res; + + // Copy out of the buffer and then, if needed, get from the socket. + if (mAvail > 0) { + nread = bufLen < mAvail ? bufLen : mAvail; + memcpy(buf, mHead, nread); + Consume(nread); + } + + if ((bufLen - nread) <= 0) + return nread; + + assert(mAvail == 0); + + res = TcpSocket::DoSynchRecv(buf + nread, bufLen - nread, timeout); + if (res > 0) + nread += res; + return nread; + +} + +int +BufferedSocket::Recv(char *buf, int bufLen) +{ + int nread = 0, res; + + // Copy out of the buffer and then, if needed, get from the socket. + if (mAvail > 0) { + nread = bufLen < mAvail ? 
bufLen : mAvail; + memcpy(buf, mHead, nread); + Consume(nread); + } + + if ((bufLen - nread) <= 0) + return nread; + + assert(mAvail == 0); + + res = TcpSocket::Recv(buf + nread, bufLen - nread); + if (res > 0) { + nread += res; + return nread; + } + if (nread == 0) + return res; + return nread; +} +} diff --git a/src/cc/kfsio/BufferedSocket.h b/src/cc/kfsio/BufferedSocket.h new file mode 100644 index 000000000..20cae8136 --- /dev/null +++ b/src/cc/kfsio/BufferedSocket.h @@ -0,0 +1,113 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/07/03 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Helper class that has a socket with a buffer. This enables +// API such as readLine() where we read N bytes from the socket, +// return a line and leave the rest buffered for subsequent access. +// NOTE: This class only does buffered I/O for reads; for writes, it +// is all pass-thru to the TcpSocket class. 
+//---------------------------------------------------------------------------- + +#ifndef LIBKFSIO_BUFFEREDSOCKET_H +#define LIBKFSIO_BUFFEREDSOCKET_H + +#include "TcpSocket.h" + +#include +#include +#include +#include + +namespace KFS +{ + +class BufferedSocket : public TcpSocket +{ +public: + BufferedSocket() { + Reset(); + } + BufferedSocket(int fd) : TcpSocket(fd) { + Reset(); + } + /// Read a line of data which is terminated by a '\n' from the + /// socket. + /// @param[out] result The string that corresponds to data read + /// from the network + /// @retval # of bytes read; -1 on error + int ReadLine(std::string &result); + + /// Synchronously (blocking) receive for the desired # of bytes. + /// Note that we first pull data out the buffer and if there is + /// more to be read, we get it from the socket. + /// @param[out] buf The buffer to be filled with data from the + /// socket. + /// @param[in] bufLen The desired # of bytes to be read in + /// @param[in] timeout The max amount of time to wait for data + /// @retval # of bytes read; -1 on error; -ETIMEOUT if timeout + /// expires and no data is read in + int DoSynchRecv(char *buf, int bufLen, struct timeval &timeout); + + /// Read at most the specified # of bytes from the socket. + /// Note that we first pull data out the buffer and if there is + /// more to be read, we get it from the socket. The read is + /// non-blocking: if recv() returns EAGAIN (to indicate that no + /// data is available), we return how much ever data we have read + /// so far. + /// @param[out] buf The buffer to be filled with data from the + /// socket. + /// @param[in] bufLen The desired # of bytes to be read in + /// @retval # of bytes read; -1 on error + int Recv(char *buf, int bufLen); + +private: + const static int BUF_SIZE = 4096; + /// The buffer into which data has been read from the socket. 
+ char mBuffer[BUF_SIZE]; + /// Since we have read from the buffer, head tracks where the next + /// character is available for read from mBuffer[] + char *mHead; + /// How much data is in the buffer + int mAvail; + + void Reset() { + mHead = mBuffer; + mAvail = 0; + memset(mBuffer, '\0', BUF_SIZE); + } + void Fill(int nbytes) { + mAvail += nbytes; + assert(mAvail <= BUF_SIZE); + } + void Consume(int nbytes) { + mHead += nbytes; + mAvail -= nbytes; + if (mAvail == 0) + Reset(); + assert(mAvail >= 0); + } +}; + +} + +#endif // LIBKFSIO_BUFFEREDSOCKET_H diff --git a/src/cc/kfsio/CMakeLists.txt b/src/cc/kfsio/CMakeLists.txt new file mode 100644 index 000000000..af5d73ed1 --- /dev/null +++ b/src/cc/kfsio/CMakeLists.txt @@ -0,0 +1,69 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2008-2012 Quantcast Corp. +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +# Take all the .cc files and build a library out of them +set (sources + Acceptor.cc + BufferedSocket.cc + checksum.cc + Globals.cc + IOBuffer.cc + NetConnection.cc + NetErrorSimulator.cc + NetManager.cc + TcpSocket.cc + requestio.cc +) + +add_library (kfsIO STATIC ${sources}) +add_library (kfsIO-shared SHARED ${sources}) +set_target_properties (kfsIO PROPERTIES OUTPUT_NAME "kfs_io") +set_target_properties (kfsIO-shared PROPERTIES OUTPUT_NAME "kfs_io") + +set_target_properties (kfsIO PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties (kfsIO-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +add_dependencies (kfsIO qcdio kfsCommon) +add_dependencies (kfsIO-shared qcdio-shared kfsCommon-shared) +target_link_libraries (kfsIO qcdio kfsCommon pthread z ${Boost_LIBRARIES}) +target_link_libraries (kfsIO-shared qcdio-shared kfsCommon-shared pthread z) + +if (APPLE OR CYGWIN) + target_link_libraries (kfsIO-shared ${Boost_LIBRARIES}) +endif (APPLE OR CYGWIN) + +if (NOT APPLE) + target_link_libraries (kfsIO rt) + target_link_libraries (kfsIO-shared rt) +endif(NOT APPLE) + +if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + target_link_libraries (kfsIO nsl socket resolv) + target_link_libraries (kfsIO-shared nsl socket resolv) +endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +install (TARGETS kfsIO kfsIO-shared + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) diff --git a/src/cc/kfsio/Counter.h b/src/cc/kfsio/Counter.h new file mode 100644 index 000000000..c31b1a0c5 --- /dev/null +++ b/src/cc/kfsio/Counter.h @@ -0,0 +1,165 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/07/20 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \brief Counter for statistics gathering. +// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSIO_COUNTER_H +#define LIBKFSIO_COUNTER_H + +#include + +#include +#include +#include +#include + +#include "common/kfsatomic.h" + +namespace KFS +{ +using std::ostream; +using std::string; +using std::for_each; + +/// Counters in KFS are currently setup to track a single "thing". +/// If we need to track multiple related items (such as, network +/// connections and how much I/O is done on them), then we need to +/// have multiple counters one for each and then display the +/// accumulated statistics in some way. 
+class Counter { +public: + // XXX: add threshold values for counts + + Counter() : mName(""), mCount(0), mTimeSpent(0) { } + Counter(const char *name) : mName(name), mCount(0), mTimeSpent(0) { } + virtual ~Counter() { } + + /// Print out some information about this counter + virtual void Show(ostream &os) { + os << mName << ": " << mCount << "," << (mTimeSpent * 1e-6) << "\r\n"; + } + + void SetName(const char *name) { + mName = name; + } + + /// Update the counter + virtual void Update(int64_t amount) { + SyncAddAndFetch(mCount, amount); + } + + virtual void UpdateTime(int64_t timeSpentMicroSec) { + SyncAddAndFetch(mTimeSpent, timeSpentMicroSec); + } + + virtual void Set(int64_t c) { mCount = c; } + + /// Reset the state of this counter + virtual void Reset() { mCount = 0; mTimeSpent = 0; } + + const string& GetName() const { + return mName; + } + int64_t GetValue() const { + return mCount; + } +protected: + /// Name of this counter object + string mName; + /// Value of this counter + volatile int64_t mCount; + /// time related statistics + volatile int64_t mTimeSpent; +}; + +class ShowCounter { + ostream &os; +public: + ShowCounter(ostream &o) : os(o) { } + void operator() (std::tr1::unordered_map::value_type v) { + Counter *c = v.second; + + c->Show(os); + } +}; + +/// +/// Counter manager that tracks all the counters in the system. The +/// manager can be queried for statistics. 
+/// +class CounterManager { + typedef std::tr1::unordered_map CounterMap; +public: + CounterManager() + : mCounters() + {} + ~CounterManager() + {} + + /// Add a counter object + /// @param[in] counter The counter to be added + void AddCounter(Counter *counter) { + mCounters[counter->GetName()] = counter; + } + + /// Remove a counter object + /// @param[in] counter The counter to be removed + void RemoveCounter(Counter *counter) { + CounterMap::iterator const it = mCounters.find(counter->GetName()); + if (it != mCounters.end() && it->second == counter) { + mCounters.erase(it); + } + } + + /// Given a counter's name, retrieve the associated counter + /// object. + /// @param[in] name Name of the counter to be retrieved + /// @retval The associated counter object if one exists; NULL + /// otherwise. + Counter *GetCounter(const string &name) { + CounterMap::iterator const it = mCounters.find(name); + return (it == mCounters.end() ? 0 : it->second); + } + + /// Print out all the counters in the system, one per line. Each + /// line is terminated with a "\r\n". If there are no counters, + /// then we print "\r\n". + void Show(ostream &os) { + if (mCounters.empty()) { + os << "\r\n"; + return; + } + for_each(mCounters.begin(), mCounters.end(), ShowCounter(os)); + } + +private: + /// Map that tracks all the counters in the system + CounterMap mCounters; +}; + +} // namespace KFS + +#endif // LIBKFSIO_COUNTER_H diff --git a/src/cc/kfsio/FileHandle.h b/src/cc/kfsio/FileHandle.h new file mode 100644 index 000000000..b38e45ad1 --- /dev/null +++ b/src/cc/kfsio/FileHandle.h @@ -0,0 +1,57 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2008/05/13 +// +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A ref-counted file-id object. +// +//---------------------------------------------------------------------------- + +#ifndef _LIBKFSIO_FILEHANDLE_H +#define _LIBKFSIO_FILEHANDLE_H + +#include + +namespace KFS +{ +struct FileHandle_t +{ + FileHandle_t() : mFd(-1) { } + FileHandle_t(int fd) : mFd(fd) { } + ~FileHandle_t() { + if (mFd < 0) + return; + close(mFd); + mFd = -1; + } + void Close() { + if (mFd < 0) + return; + close(mFd); + mFd = -1; + } + int mFd; // the underlying file pointer +}; + +typedef boost::shared_ptr FileHandlePtr; +} + +#endif // _LIBKFSIO_FILEHANDLE_H diff --git a/src/cc/kfsio/Globals.cc b/src/cc/kfsio/Globals.cc new file mode 100644 index 000000000..34e685e33 --- /dev/null +++ b/src/cc/kfsio/Globals.cc @@ -0,0 +1,100 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/10/09 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Define the symbol for the KFS IO library global variables. + +//---------------------------------------------------------------------------- + +#include "Globals.h" + +namespace KFS +{ +namespace libkfsio +{ + +Globals_t* Globals_t::sForGdbToFindInstance = 0; + +Globals_t::Globals_t() + : counterManager(), + ctrOpenNetFds ("Open network fds"), + ctrOpenDiskFds ("Open disk fds"), + ctrNetBytesRead ("Bytes read from network"), + ctrNetBytesWritten ("Bytes written to network"), + ctrDiskBytesRead ("Bytes read from disk"), + ctrDiskBytesWritten("Bytes written to disk"), + ctrDiskIOErrors ("Disk I/O errors"), + mInitedFlag(false), + mDestructedFlag(false), + mForGdbToFindNetManager(0) +{ + counterManager.AddCounter(&ctrOpenNetFds); + counterManager.AddCounter(&ctrOpenDiskFds); + counterManager.AddCounter(&ctrNetBytesRead); + counterManager.AddCounter(&ctrNetBytesWritten); + counterManager.AddCounter(&ctrDiskBytesRead); + counterManager.AddCounter(&ctrDiskBytesWritten); + counterManager.AddCounter(&ctrDiskIOErrors); + sForGdbToFindInstance = this; +} + +Globals_t::~Globals_t() +{ + mDestructedFlag = true; +} + +Globals_t& +Globals_t::Instance() +{ + static Globals_t globalsInstance; + return globalsInstance; +} + +NetManager& +Globals_t::getNetManager() +{ + // Ensure that globals are constructed before net manager. + NetManager*& netManager = Instance().mForGdbToFindNetManager; + static NetManager netManagerInstance; + if (! netManager) { + netManager = &netManagerInstance; + } + return netManagerInstance; +} + +void +Globals_t::Init() +{ + if (mInitedFlag) { + return; + } + mInitedFlag = true; +} + +void +Globals_t::Destroy() +{ + // Rely on compiler / runtime to invoke static dtors in the reverse order. 
+} + +} +} diff --git a/src/cc/kfsio/Globals.h b/src/cc/kfsio/Globals.h new file mode 100644 index 000000000..5004d13cf --- /dev/null +++ b/src/cc/kfsio/Globals.h @@ -0,0 +1,75 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/10/09 +// Author: Sriram Rao +// +// Copyright 2008 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Define the globals needed by the KFS IO library. These +// globals are also available to any app that uses the KFS IO library. 
+//---------------------------------------------------------------------------- + +#ifndef LIBKFSIO_GLOBALS_H +#define LIBKFSIO_GLOBALS_H + +#include "NetManager.h" +#include "Counter.h" + +namespace KFS +{ +namespace libkfsio +{ + +struct Globals_t +{ + CounterManager counterManager; + // Commonly needed counters + Counter ctrOpenNetFds; + Counter ctrOpenDiskFds; + Counter ctrNetBytesRead; + Counter ctrNetBytesWritten; + Counter ctrDiskBytesRead; + Counter ctrDiskBytesWritten; + // track the # of failed read/writes + Counter ctrDiskIOErrors; + void Init(); + static NetManager& getNetManager(); + static void Destroy(); + static Globals_t& Instance(); +private: + ~Globals_t(); + Globals_t(); + bool mInitedFlag; + bool mDestructedFlag; + NetManager* mForGdbToFindNetManager; + static Globals_t* sForGdbToFindInstance; +}; + +inline static void InitGlobals() + { Globals_t::Instance().Init(); } +inline static void DestroyGlobals() + { Globals_t::Destroy(); } +inline static NetManager& globalNetManager() + { return Globals_t::getNetManager(); } +inline static Globals_t & globals() + { return Globals_t::Instance(); } +} +} + +#endif // LIBKFSIO_GLOBALS_H diff --git a/src/cc/kfsio/IOBuffer.cc b/src/cc/kfsio/IOBuffer.cc new file mode 100644 index 000000000..87a3366f2 --- /dev/null +++ b/src/cc/kfsio/IOBuffer.cc @@ -0,0 +1,1590 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/15 +// Author: Sriram Rao +// Mike Ovsiannikov -- iostream, aligned buffers support for direct IO, +// scatter / gather io with readv and writev, make IOBuffer generic scatter +// gather list with *SpaceAvailable* methods. +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Scatter / gatherer io list implementation. +// +//---------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "IOBuffer.h" +#include "Globals.h" + +namespace KFS +{ + +using std::min; +using std::max; +using std::list; + +using namespace KFS::libkfsio; + +// To conserve memory, by default, we allocate IOBufferData in 4K +// blocks. However, applications are free to change this default unit +// to what they like. +static libkfsio::IOBufferAllocator* sIOBufferAllocator = 0; +static volatile bool sIsIOBufferAllocatorUsed = false; +int IOBufferData::sDefaultBufferSize = 4 << 10; + +struct IOBufferArrayDeallocator +{ + void operator()(char* buf) { delete [] buf; } +}; + +struct IOBufferDeallocator +{ + void operator()(char* buf) { sIOBufferAllocator->Deallocate(buf); } +}; + +struct IOBufferDeallocatorCustom +{ + IOBufferDeallocatorCustom( + libkfsio::IOBufferAllocator& allocator) + : mAllocator(allocator) + {} + void operator()(char* buf) { mAllocator.Deallocate(buf); } +private: + libkfsio::IOBufferAllocator& mAllocator; +}; + +// Call this function if you want to change the default allocator. 
+bool +libkfsio::SetIOBufferAllocator(libkfsio::IOBufferAllocator* allocator) +{ + if (sIsIOBufferAllocatorUsed || + (allocator && (int)allocator->GetBufferSize() <= 0)) { + return false; + } + sIOBufferAllocator = allocator; + return true; +} + +inline int +IOBufferData::MaxAvailable(int numBytes) const +{ + return max(0, min(int(SpaceAvailable()), numBytes)); +} + +inline int +IOBufferData::MaxConsumable(int numBytes) const +{ + return max(0, min(BytesConsumable(), numBytes)); +} + +inline void +IOBufferData::Init(char* buf, int bufSize) +{ + // glibc malloc returns 2 * sizeof(size_t) aligned blocks. + const int size = max(0, bufSize); + mData.reset(buf ? buf : new char [size], IOBufferArrayDeallocator()); + mProducer = mData.get(); + mEnd = mProducer + size; + mConsumer = mProducer; +} + +inline void +IOBufferData::Init(char* buf, libkfsio::IOBufferAllocator& allocator) +{ + if (&allocator == sIOBufferAllocator) { + if (! sIsIOBufferAllocatorUsed) { + sDefaultBufferSize = sIOBufferAllocator->GetBufferSize(); + } + sIsIOBufferAllocatorUsed = true; + mData.reset(buf ? buf : allocator.Allocate(), + IOBufferDeallocator()); + } else { + mData.reset(buf ? buf : allocator.Allocate(), + IOBufferDeallocatorCustom(allocator)); + } + if (! (mProducer = mData.get())) { + abort(); + } + mEnd = mProducer + allocator.GetBufferSize(); + mConsumer = mProducer; +} + +// setup a new IOBufferData for access by block sharing. +IOBufferData::IOBufferData(const IOBufferData& other, + char* c, char* e, char* p /* = 0 */) + : mData(other.mData), + mEnd(e), + mProducer(p ? p : e), + mConsumer(c) +{ + if (! 
(mData.get() <= mConsumer && + mConsumer <= mProducer && + mProducer <= mEnd && + mEnd <= other.mEnd)) { + abort(); + } +} + +IOBufferData::IOBufferData() + : mData(), + mEnd(0), + mProducer(0), + mConsumer(0) +{ + if (sIOBufferAllocator) { + IOBufferData::Init(0, *sIOBufferAllocator); + } else { + IOBufferData::Init(0, sDefaultBufferSize); + } +} + +IOBufferData::IOBufferData(int bufsz) + : mData(), + mEnd(0), + mProducer(0), + mConsumer(0) +{ + IOBufferData::Init(0, bufsz); +} + +IOBufferData::IOBufferData(char* buf, int offset, int size, libkfsio::IOBufferAllocator& allocator) + : mData(), + mEnd(0), + mProducer(0), + mConsumer(0) +{ + IOBufferData::Init(buf, allocator); + IOBufferData::Fill(offset + size); + IOBufferData::Consume(offset); +} + +IOBufferData::IOBufferData(char* buf, int bufSize, int offset, int size) + : mData(), + mEnd(0), + mProducer(0), + mConsumer(0) +{ + IOBufferData::Init(buf, bufSize); + IOBufferData::Fill(offset + size); + IOBufferData::Consume(offset); +} + +IOBufferData::IOBufferData(const IOBufferBlockPtr& data, int bufSize, int offset, int size) + : mData(data), + mEnd(0), + mProducer(0), + mConsumer(0) +{ + char* const buf = data.get(); + mEnd = buf + bufSize; + mProducer = buf; + mConsumer = buf; + IOBufferData::Fill(offset + size); + IOBufferData::Consume(offset); +} + +IOBufferData::~IOBufferData() +{ +} + +int +IOBufferData::ZeroFill(int numBytes) +{ + const int nbytes = MaxAvailable(numBytes); + memset(mProducer, '\0', nbytes); + mProducer += nbytes; + return nbytes; +} + +int +IOBufferData::Fill(int numBytes) +{ + const int nbytes = MaxAvailable(numBytes); + mProducer += nbytes; + return nbytes; +} + +int +IOBufferData::Consume(int numBytes) +{ + const int nbytes = MaxConsumable(numBytes); + mConsumer += nbytes; + assert(mConsumer <= mProducer); + return nbytes; +} + +int +IOBufferData::Trim(int numBytes) +{ + const int nbytes = MaxConsumable(numBytes); + mProducer = mConsumer + nbytes; + return nbytes; +} + +int 
+IOBufferData::Read(int fd, int maxReadAhead /* = -1 */) +{ + int numBytes = mEnd - mProducer; + int nread; + + if (maxReadAhead >= 0 && numBytes > maxReadAhead) { + numBytes = maxReadAhead; + } + assert(numBytes >= 0); + + if (numBytes <= 0) { + return -1; + } + nread = read(fd, mProducer, numBytes); + + if (nread > 0) { + mProducer += nread; + globals().ctrNetBytesRead.Update(nread); + } + + return (nread >= 0 ? nread : (errno > 0 ? -errno : nread)); +} + +int +IOBufferData::Write(int fd) +{ + int numBytes = mProducer - mConsumer; + int nwrote; + + assert(numBytes >= 0); + + if (numBytes <= 0) { + return -1; + } + nwrote = write(fd, mConsumer, numBytes); + + if (nwrote > 0) { + mConsumer += nwrote; + globals().ctrNetBytesWritten.Update(nwrote); + } + + return (nwrote >= 0 ? nwrote : (errno > 0 ? -errno : nwrote)); +} + +int +IOBufferData::CopyIn(const char *buf, int numBytes) +{ + const int nbytes = MaxAvailable(numBytes); + if (buf != mProducer) { + memmove(mProducer, buf, nbytes); + } + mProducer += nbytes; + return nbytes; +} + +int +IOBufferData::CopyIn(const IOBufferData *other, int numBytes) +{ + const int nbytes = MaxAvailable(min(numBytes, other->BytesConsumable())); + memmove(mProducer, other->mConsumer, nbytes); + mProducer += nbytes; + return nbytes; +} + +int +IOBufferData::CopyOut(char *buf, int numBytes) const +{ + const int nbytes = MaxConsumable(numBytes); + memmove(buf, mConsumer, nbytes); + return nbytes; +} + +#ifdef DEBUG_IOBuffer + +#include + +inline void +IOBuffer::DebugChecksum(const char* buf, int len) +{ + DebugVerify(); + if (len > 0) { + mDebugChecksum = adler32(mDebugChecksum, (const Bytef *)buf, len); + } +} + +inline void +IOBuffer::DebugChecksum(const IOBufferData& buf) +{ + DebugVerify(); + const int nb = buf.BytesConsumable(); + if (nb > 0) { + mDebugChecksum = + adler32(mDebugChecksum, (const Bytef *)buf.Consumer(), nb); + } +} + +inline void +IOBuffer::DebugChecksum(const IOBuffer& buf, int numBytes) +{ + buf.DebugVerify(); + 
DebugVerify(); + int rem = numBytes; + for (iterator i = buf.begin(); rem > 0 && i != buf.end(); ++i) { + const int nb = min(rem, i->BytesConsumable()); + if (nb <= 0) { + continue; + } + mDebugChecksum = + adler32(mDebugChecksum, (const Bytef*)i->Consumer(), nb); + rem -= nb; + } +} + +inline void +IOBuffer::DebugVerify(bool updateChecksum) +{ + int byteCount = 0; + unsigned int checksum = adler32(0L, Z_NULL, 0); + for (iterator i = begin(); i != end(); ++i) { + const int nb = i->BytesConsumable(); + if (nb <= 0) { + continue; + } + checksum = adler32(checksum, (const Bytef*)i->Consumer(), nb); + byteCount += nb; + } + if (updateChecksum) { + mDebugChecksum = checksum; + } + if (checksum != mDebugChecksum || byteCount != mByteCount) { + abort(); + } +} + +inline void IOBuffer::DebugVerify() const +{ const_cast(this)->DebugVerify(false); } + +#else +inline void IOBuffer::DebugChecksum(const char* buf, int len) {} +inline void IOBuffer::DebugChecksum(const IOBufferData& buf) {} +inline void IOBuffer::DebugChecksum(const IOBuffer& buf, int numBytes) {} +inline void IOBuffer::DebugVerify(bool updateChecksum) {} +inline void IOBuffer::DebugVerify() const {} +#endif + +IOBuffer::IOBuffer() + : mBuf(), mByteCount(0) +#ifdef DEBUG_IOBuffer + , mDebugChecksum(0) +#endif +{ + DebugVerify(true); +} + +IOBuffer::~IOBuffer() +{ + DebugVerify(); +} + +void +IOBuffer::Append(const IOBufferData &buf) +{ + DebugChecksum(buf); + mBuf.push_back(buf); + assert(mByteCount >= 0); + mByteCount += buf.BytesConsumable(); + DebugVerify(); +} + +int +IOBuffer::Append(IOBuffer *ioBuf) +{ + DebugChecksum(*ioBuf, ioBuf->mByteCount); + int nBytes = 0; + BList::iterator it; + for (it = ioBuf->mBuf.begin(); it != ioBuf->mBuf.end(); ) { + const int nb = it->BytesConsumable(); + if (nb > 0) { + mBuf.splice(mBuf.end(), ioBuf->mBuf, it++); + nBytes += nb; + } else { + it = ioBuf->mBuf.erase(it); + } + } + assert(mByteCount >= 0 && + ioBuf->mByteCount == nBytes && ioBuf->mBuf.empty()); + 
ioBuf->mByteCount = 0; + mByteCount += nBytes; + ioBuf->DebugVerify(true); + DebugVerify(); + return nBytes; +} + +inline IOBuffer::BList::iterator +IOBuffer::BeginSpaceAvailable(int* nBytes /* = 0 */) +{ + BList::iterator it = mBuf.end(); + while (it != mBuf.begin() && (--it)->IsEmpty()) { + if (it->IsFull()) { + it = mBuf.erase(it); + } else if (nBytes) { + *nBytes += it->SpaceAvailable(); + } + } + if (it != mBuf.end() && it->IsFull()) { + assert(! it->IsEmpty()); + ++it; + } + return it; +} + +int +IOBuffer::MoveSpaceAvailable(IOBuffer *other, int numBytes) +{ + other->DebugVerify(); + DebugVerify(); + if (numBytes <= 0) { + return 0; + } + BList& buf = other->mBuf; + BList::iterator it = other->BeginSpaceAvailable(); + int nBytes = numBytes; + while (it != buf.end() && nBytes > 0) { + IOBufferData& d = *it; + const int n = (int)d.SpaceAvailable(); + if (n <= 0) { + ++it; + continue; + } + if (n <= nBytes) { + if (d.IsEmpty()) { + mBuf.splice(mBuf.end(), buf, it++); + } else { + char* const p = d.Producer(); + mBuf.push_back(IOBufferData(d, p, p + n, p)); + d = IOBufferData(d, d.Consumer(), p); + ++it; + } + nBytes -= n; + } else { + ++it; + char* const p = d.Producer(); + mBuf.push_back(IOBufferData(d, p, p + nBytes, p)); + const IOBufferData nd(d, p + nBytes, p + n, p + nBytes); + if (d.IsEmpty()) { + d = nd; + } else { + d = IOBufferData(d, d.Consumer(), p); + buf.insert(it, nd); + } + nBytes = 0; + } + } + other->DebugVerify(); + DebugVerify(); + return (numBytes - nBytes); +} + +int +IOBuffer::EnsureSpaceAvailable(int numBytes) +{ + int nBytes = 0; + BeginSpaceAvailable(&nBytes); + while (nBytes < numBytes) { + IOBufferData buf; + const int nb = buf.SpaceAvailable(); + assert(nb > 0); + if (nBytes + nb > numBytes) { + char* const p = buf.Producer(); + mBuf.push_back(IOBufferData(buf, p, p + numBytes - nBytes, p)); + nBytes = numBytes; + } else { + mBuf.push_back(buf); + nBytes += nb; + } + } + return nBytes; +} + +void +IOBuffer::RemoveSpaceAvailable() +{ 
+ if (IsEmpty()) { + Clear(); + return; + } + DebugVerify(); + while (! mBuf.empty() && mBuf.back().IsEmpty()) { + mBuf.pop_back(); + } + if (! mBuf.empty()) { + IOBufferData& d = mBuf.back(); + if (! d.IsFull()) { + d = IOBufferData(d, d.Consumer(), d.Producer()); + } + } + DebugVerify(); +} + +int +IOBuffer::UseSpaceAvailable(const IOBuffer* other, int numBytes) +{ + other->DebugVerify(); + DebugVerify(); + if (numBytes <= 0) { + return 0; + } + const BList& obuf = other->mBuf; + BList::const_iterator oit = + other->mByteCount <= 0 ? obuf.begin() : obuf.end(); + while (oit != obuf.begin() && (--oit)->IsEmpty()) + {} + BList::iterator it = mBuf.begin(); + int nBytes = numBytes; + while (oit != obuf.end() && nBytes > 0) { + int nb = min(nBytes, (int)oit->SpaceAvailable()); + if (nb > 0) { + char* const p = const_cast(oit->Producer()); + IOBufferData d(*oit, p, p + nb, p); + while (it != mBuf.end()) { + if (it->IsEmpty()) { + it = mBuf.erase(it); + continue; + } + if (nb <= 0) { + break; + } + const int n = it->Consume(d.CopyIn(&(*it), nb)); + nb -= n; + nBytes -= n; + } + mBuf.insert(it, d); + nBytes -= nb; + } + ++oit; + } + while (it != mBuf.end()) { + if (it->IsEmpty()) { + it = mBuf.erase(it); + } else { + ++it; + } + } + assert(0 <= nBytes && nBytes <= numBytes); + other->DebugVerify(); + DebugVerify(); + return (numBytes - nBytes); +} + +int +IOBuffer::ZeroFillSpaceAvailable(int numBytes) +{ + DebugVerify(); + if (numBytes <= 0) { + return 0; + } + BList::iterator it = BeginSpaceAvailable(); + int nBytes = numBytes; + while (nBytes > 0 && it != mBuf.end()) { + nBytes -= it->ZeroFill(nBytes); + ++it; + } + assert(0 <= nBytes && nBytes <= numBytes); + nBytes = numBytes - nBytes; + mByteCount += nBytes; + assert(mByteCount >= 0); + DebugVerify(true); + return nBytes; +} + +int +IOBuffer::Move(IOBuffer* other, int numBytes) +{ + int nBytes = other->mByteCount; + if (numBytes >= nBytes) { + Move(other); + return nBytes; + } + DebugChecksum(*other, numBytes); + 
nBytes = numBytes; + while (! other->mBuf.empty() && nBytes > 0) { + IOBufferData& s = other->mBuf.front(); + const int nb = s.BytesConsumable(); + if (nBytes >= nb) { + if (nb > 0) { + mBuf.splice(mBuf.end(), other->mBuf, other->mBuf.begin()); + nBytes -= nb; + } else { + other->mBuf.pop_front(); + } + } else { + // this is the last buffer being moved; only partial data + // from the buffer needs to be moved. do the move by + // sharing the block (and therby avoid data copy) + mBuf.push_back(IOBufferData( + s, s.Consumer(), s.Consumer() + nBytes)); + nBytes -= s.Consume(nBytes); + assert(nBytes == 0); + } + } + while (! other->mBuf.empty() && other->mBuf.front().IsEmpty()) { + other->mBuf.pop_front(); + } + nBytes = numBytes - nBytes; + assert(mByteCount >= 0 && other->mByteCount >= 0); + mByteCount += nBytes; + other->mByteCount -= nBytes; + other->DebugVerify(true); + DebugVerify(); + return nBytes; +} + +void +IOBuffer::Move(IOBuffer* other) +{ + DebugChecksum(*other, other->mByteCount); + assert(mByteCount >= 0 && other->mByteCount >= 0); + mBuf.splice(mBuf.end(), other->mBuf); + mByteCount += other->mByteCount; + other->mByteCount = 0; + other->DebugVerify(true); + DebugVerify(); +} + +int +IOBuffer::MoveSpace(IOBuffer* other, int numBytes) +{ + if (numBytes <= 0 || other->mBuf.empty()) { + return 0; + } + int nBytes; + if (other->mByteCount <= numBytes && + ! other->mBuf.back().IsEmpty() && + (nBytes = other->mByteCount + + other->mBuf.back().SpaceAvailable()) <= numBytes) { + Move(other); + return nBytes; + } + DebugChecksum(*other, numBytes); + nBytes = numBytes; + while (! 
other->mBuf.empty() && nBytes > 0) { + IOBufferData& s = other->mBuf.front(); + const int nb = s.BytesConsumable(); + const int sa = (int)s.SpaceAvailable(); + const int st = nb + sa; + if (nBytes >= st) { + if (st > 0) { + mBuf.splice(mBuf.end(), other->mBuf, other->mBuf.begin()); + mByteCount += nb; + other->mByteCount -= nb; + nBytes -= st; + } else { + other->mBuf.pop_front(); + } + } else { + // this is the last buffer being moved; only partial data + // from the buffer needs to be moved. do the move by + // sharing the block (and therby avoid data copy) + char* const c = s.Consumer(); + mBuf.push_back(IOBufferData(s, c, c + nBytes, c + min(nBytes, nb))); + s = IOBufferData(s, c + nBytes, c + st, c + max(nBytes, nb)); + const int n = mBuf.back().BytesConsumable(); + other->mByteCount -= n; + mByteCount += n; + nBytes = 0; + } + } + assert(mByteCount >= 0 && other->mByteCount >= 0); + other->DebugVerify(true); + DebugVerify(); + return (numBytes - nBytes); +} + +inline IOBuffer::BList::iterator +IOBuffer::SplitBufferListAt(IOBuffer::BList& buf, int& nBytes) +{ + IOBuffer::BList::iterator iter = buf.begin(); + while (nBytes > 0 && iter != buf.end()) { + IOBufferData& data = *iter; + const int nb = data.BytesConsumable(); + if (nb <= 0) { + iter = buf.erase(iter); + continue; + } + if (nb > nBytes) { + buf.insert(iter, IOBufferData( + data, data.Consumer(), data.Consumer() + nBytes)); + nBytes -= data.Consume(nBytes); + assert(nBytes == 0); + } else { + nBytes -= nb; + ++iter; + } + } + return iter; +} + +void +IOBuffer::Replace(IOBuffer* other, int offset, int numBytes) +{ + other->DebugVerify(); + DebugVerify(); + // find the insertion point + int nBytes = offset; + BList::iterator iter = SplitBufferListAt(mBuf, nBytes); + // extend buffer if needed + if (nBytes > 0) { + ZeroFill(nBytes); + } + // split "other" at numBytes + nBytes = numBytes; + BList::iterator const otherEnd = + SplitBufferListAt(other->mBuf, nBytes); + + // remove min(numBytes, 
other->BytesCounsumable()) starting from offset: + // [offset, offset + min(numBytes, other->BytesCounsumable()) + nBytes = numBytes - nBytes; + other->mByteCount -= nBytes; + assert(other->mByteCount >= 0); + while (iter != mBuf.end() && nBytes > 0) { + IOBufferData& data = *iter; + nBytes -= data.Consume(nBytes); + if (data.IsEmpty()) { + iter = mBuf.erase(iter); + } else { + assert(nBytes == 0); + break; + } + } + mByteCount += nBytes; + + // now, put the thing at insertPt + mBuf.splice(iter, other->mBuf, other->mBuf.begin(), otherEnd); + assert(mByteCount >= 0); + other->DebugVerify(true); + DebugVerify(true); +} + +void +IOBuffer::ReplaceKeepBuffersFull(IOBuffer* srcBuf, int inOffset, int numBytes) +{ + srcBuf->DebugVerify(); + DebugVerify(); + const int offset = max(0, inOffset); + const int moveLen = min(max(0, numBytes), srcBuf->mByteCount); + const int dstLen = max(mByteCount, offset + moveLen); + assert(moveLen >= 0 && dstLen >= 0 && + mByteCount >= 0 && srcBuf->mByteCount >= moveLen); + + BList& dst = mBuf; + BList& src = srcBuf->mBuf; + BList::iterator di = offset == mByteCount ? dst.end() : dst.begin(); + int off = offset == mByteCount ? offset : 0; + while (di != dst.end()) { + const int nb = di->BytesConsumable(); + if (nb <= 0) { + di = dst.erase(di); + } else { + off += nb; + if (off >= offset) { + break; + } + ++di; + } + } + int rem = numBytes; + if (offset > off) { + int nFill = offset - off; + if (! dst.empty()) { + nFill -= dst.back().ZeroFill(nFill); + } + while (nFill > 0) { + dst.push_back(IOBufferData()); + IOBufferData& d = dst.back(); + nFill -= d.ZeroFill(nFill); + } + assert(nFill == 0); + // Fill the last buffer. + IOBufferData& d = dst.back(); + while (rem > 0 && ! src.empty() && ! 
d.IsFull()) { + IOBufferData& s = src.front(); + rem -= s.Consume(d.CopyIn(&s, rem)); + if (s.IsEmpty()) { + src.pop_front(); + } + } + assert(rem == 0 || d.IsFull()); + di = dst.end(); + off = 0; + } else if ((off -= offset) != 0) { + assert(di != dst.end()); + off = di->BytesConsumable() - off; + assert(off >= 0); + } else { + // Find the last buffer, and make sure it is full. + IOBufferData* d = 0; + if (di != dst.end()) { + while (di != dst.end() && di->IsEmpty()) { + di = dst.erase(di); + } + if (di != dst.end()) { + d = &*di; + ++di; + } + } else { + while (! dst.empty() && dst.back().IsEmpty()) { + dst.pop_back(); + } + if (! dst.empty()) { + d = &dst.back(); + } + } + if (d) { + while (rem > 0 && ! src.empty() && ! d->IsFull()) { + IOBufferData& s = src.front(); + rem -= s.Consume(d->CopyIn(&s, rem)); + if (s.IsEmpty()) { + src.pop_front(); + } + } + } + // Move whole buffers from src to dst if possible. + while (rem > 0 && ! src.empty()) { + IOBufferData& s = src.front(); + const int nb = s.BytesConsumable(); + if (nb <= 0) { + src.pop_front(); + continue; + } + if (rem < nb || ! s.HasCompleteBuffer() || + (! s.IsFull() && &s != &src.back())) { + break; + } + if (di != dst.end() && nb != di->BytesConsumable()) { + break; + } + dst.splice(di, src, src.begin()); + if (di != dst.end()) { + di = dst.erase(di); + while (di != dst.end() && di->IsEmpty()) { + di = dst.erase(di); + } + } + rem -= nb; + } + } + // Replace. + while (rem > 0 && ! src.empty() && di != dst.end()) { + IOBufferData* s = &src.front(); + if (s->IsEmpty()) { + src.pop_front(); + continue; + } + int dl; + while ((dl = di->BytesConsumable()) <= 0 && + (di = dst.erase(di)) != dst.end()) + {} + if (dl <= 0) { + break; + } + // Un-share if needed. + if (di->IsShared()) { + BList::iterator in = di; + IOBufferData fp(*in); // Make a shallow copy. + *in = IOBufferData(); // Replace with new buffer. 
+ while ((dl -= fp.Consume(in->CopyIn(fp.Consumer(), dl))) > 0) { + in = dst.insert(++in, IOBufferData()); + } + // If more than one buffer was created, then postion to the one + // at the requested offset. + while ((dl = di->BytesConsumable()) < off) { + off -= dl; + ++di; + assert(di != dst.end()); + } + assert(dl > 0); + } + assert(dl >= off); + dl -= off; + char* d = di->Consumer() + off; + off = 0; + if (rem < dl) { + dl = rem; + } + rem -= dl; + while (dl > 0) { + const int n = s->Consume(s->CopyOut(d, dl)); + d += n; + dl -= n; + while (s->IsEmpty()) { + src.pop_front(); + if (src.empty()) { + dl = 0; + break; + } + s = &src.front(); + } + } + ++di; + } + // Append. + while (rem > 0 && ! src.empty()) { + IOBufferData& s = src.front(); + if (s.IsEmpty()) { + src.pop_front(); + continue; + } + if (dst.empty() || dst.back().IsFull()) { + dst.push_back(IOBufferData()); + } + rem -= s.Consume(dst.back().CopyIn(&s, rem)); + } + // Clean up left over empty buffers if any. + for ( ; ! src.empty() && src.front().IsEmpty(); src.pop_front()) + {} + mByteCount = dstLen; + srcBuf->mByteCount -= moveLen; + srcBuf->DebugVerify(true); + DebugVerify(true); +} + +void +IOBuffer::ZeroFill(int numBytes) +{ + DebugVerify(); + while (! mBuf.empty() && mBuf.back().IsEmpty()) { + mBuf.pop_back(); + } + int nBytes = numBytes; + if (nBytes > 0 && ! mBuf.empty()) { + nBytes -= mBuf.back().ZeroFill(nBytes); + } + while (nBytes > 0) { + mBuf.push_back(IOBufferData()); + nBytes -= mBuf.back().ZeroFill(nBytes); + } + assert(mByteCount >= 0); + if (numBytes > 0) { + mByteCount += numBytes; + } + DebugVerify(true); +} + +inline static void* +AllocBuffer(size_t allocSize) +{ + return (sIOBufferAllocator ? + sIOBufferAllocator->Allocate() : new char[allocSize]); +} + +int +IOBuffer::Read(int fd, int maxReadAhead /* = -1 */) +{ + DebugVerify(); + if (sIOBufferAllocator && ! sIsIOBufferAllocatorUsed) { + IOBufferData initWithAllocator; + } + // Read into available space at the end, if any. 
+ BList::iterator it = BeginSpaceAvailable(); + const size_t bufSize = + sIOBufferAllocator ? sIOBufferAllocator->GetBufferSize() : + IOBufferData::GetDefaultBufferSize(); + if (maxReadAhead > 0 && maxReadAhead <= int(bufSize)) { + const bool addBufFlag = it == mBuf.end(); + if (addBufFlag) { + it = mBuf.insert(mBuf.end(), IOBufferData()); + } + if (it->SpaceAvailable() >= size_t(maxReadAhead)) { + const int nRd = it->Read(fd, maxReadAhead); + if (nRd > 0) { + mByteCount += nRd; + } else if (addBufFlag) { + mBuf.erase(it); + } + DebugVerify(true); + return nRd; + } + } + + const ssize_t kMaxReadv = 64 << 10; + const int kMaxReadvBufs(kMaxReadv / (4 << 10) + 1); + const int maxReadvBufs = min(IOV_MAX, + min(kMaxReadvBufs, int(kMaxReadv / bufSize + 1))); + struct iovec readVec[kMaxReadvBufs]; + ssize_t totRead = 0; + ssize_t maxRead(maxReadAhead >= 0 ? + maxReadAhead : std::numeric_limits::max()); + + while (maxRead > 0) { + assert(it == mBuf.end() || ! it->IsFull()); + int nVec = 0; + ssize_t numRead = maxRead; + size_t nBytes(numRead); + for (BList::iterator i = it; + i != mBuf.end() && nBytes > 0 && nVec < maxReadvBufs; + ++i) { + IOBufferData& buf = *i; + const size_t nb = min(nBytes, buf.SpaceAvailable()); + if (nb > 0) { + readVec[nVec].iov_len = nb; + readVec[nVec].iov_base = buf.Producer(); + nVec++; + nBytes -= nb; + } + } + const int allocBegin = nVec; + for ( ; nBytes > 0 && nVec < maxReadvBufs; nVec++) { + const size_t nb = min(nBytes, bufSize); + readVec[nVec].iov_len = nb; + if (! (readVec[nVec].iov_base = AllocBuffer(bufSize))) { + if (totRead <= 0 && nVec <= 0) { + abort(); // Allocation falure. 
+ } + break; + } + nBytes -= nb; + } + numRead -= nBytes; + const ssize_t nRd = readv(fd, readVec, nVec); + if (nRd < numRead) { + maxRead = 0; // short read, eof, or error: we're done + } else if (maxRead > 0) { + maxRead -= nRd; + assert(maxRead >= 0); + } + numRead = max(ssize_t(0), nRd); + for ( ; it != mBuf.end() && numRead > 0; ++it) { + numRead -= it->Fill(numRead); + if (numRead <= 0) { + if (it->IsFull()) { + ++it; + } + break; + } + } + for (int i = allocBegin; i < nVec; i++) { + char* const buf = reinterpret_cast(readVec[i].iov_base); + if (numRead > 0) { + if (sIOBufferAllocator) { + mBuf.push_back( + IOBufferData(buf, 0, numRead, *sIOBufferAllocator)); + } else { + mBuf.push_back( + IOBufferData(buf, bufSize, 0, numRead)); + } + numRead -= mBuf.back().BytesConsumable(); + } else { + if (sIOBufferAllocator) { + sIOBufferAllocator->Deallocate(buf); + } else { + delete [] buf; + } + } + } + assert(numRead == 0); + if (nRd > 0) { + totRead += nRd; + globals().ctrNetBytesRead.Update(nRd); + } else if (totRead == 0 && nRd < 0 && + (totRead = -(errno == 0 ? EAGAIN : errno)) > 0) { + totRead = -totRead; + } + } + assert(mByteCount >= 0); + if (totRead > 0) { + mByteCount += totRead; + } + DebugVerify(true); + return totRead; +} + +int +IOBuffer::Write(int fd) +{ + DebugVerify(); + const int kMaxWritevBufs = 32; + const int maxWriteBufs = min(IOV_MAX, kMaxWritevBufs); + const int kPreferredWriteSize = 64 << 10; + struct iovec writeVec[kMaxWritevBufs]; + ssize_t totWr = 0; + + while (! 
mBuf.empty()) { + BList::iterator it; + int nVec; + ssize_t toWr; + for (it = mBuf.begin(), nVec = 0, toWr = 0; + it != mBuf.end() && nVec < maxWriteBufs && + toWr < kPreferredWriteSize; + ) { + const int nBytes = it->BytesConsumable(); + if (nBytes <= 0) { + it = mBuf.erase(it); + continue; + } + writeVec[nVec].iov_base = it->Consumer(); + writeVec[nVec].iov_len = (size_t)nBytes; + toWr += nBytes; + nVec++; + ++it; + } + if (nVec <= 0) { + assert(it == mBuf.end()); + mBuf.clear(); + break; + } + const ssize_t nWr = writev(fd, writeVec, nVec); + if (nWr == toWr && it == mBuf.end()) { + mBuf.clear(); + } else { + ssize_t nBytes = nWr; + int nb; + while ((nb = mBuf.front().BytesConsumable()) <= nBytes) { + nBytes -= nb; + mBuf.pop_front(); + } + if (nBytes > 0) { + nBytes -= mBuf.front().Consume(nBytes); + assert(nBytes == 0); + } + } + if (nWr > 0) { + totWr += nWr; + globals().ctrNetBytesWritten.Update(nWr); + } else if (totWr <= 0 && (totWr = -(errno == 0 ? EAGAIN : errno)) > 0) { + totWr = -totWr; + } + if (nWr != toWr) { + break; + } + } + assert(mByteCount >= 0); + if (totWr > 0) { + assert(mByteCount >= totWr); + mByteCount -= totWr; + } + DebugVerify(true); + return totWr; +} + +void +IOBuffer::Verify() const +{ +#ifdef DEBUG_IOBuffer + DebugVerify(); +#else + BList::const_iterator it; + int numBytes = 0; + for (it = mBuf.begin(); it != mBuf.end(); ++it) { + numBytes += it->BytesConsumable(); + } + if (numBytes != mByteCount) { + abort(); + } +#endif +} + +int +IOBuffer::ZeroFillLast() +{ + DebugVerify(); + int nBytes = 0; + while (! 
mBuf.empty()) { + IOBufferData& b = mBuf.back(); + if (b.IsEmpty()) { + mBuf.pop_back(); + } else { + nBytes = b.ZeroFill(b.SpaceAvailable()); + break; + } + } + assert(mByteCount >= 0); + mByteCount += nBytes; + DebugVerify(true); + return nBytes; +} + +int +IOBuffer::Consume(int numBytes) +{ + DebugVerify(); + if (numBytes >= mByteCount) { + mBuf.clear(); + const int nBytes = mByteCount; + mByteCount = 0; + DebugVerify(true); + return nBytes; + } + int nBytes = numBytes; + BList::iterator it = mBuf.begin(); + while (numBytes > 0 && it != mBuf.end()) { + nBytes -= it->Consume(nBytes); + if (it->IsEmpty()) { + it = mBuf.erase(it); + } else { + ++it; + } + } + nBytes = numBytes - nBytes; + assert(mByteCount >= 0); + mByteCount -= nBytes; + DebugVerify(true); + return nBytes; +} + +void +IOBuffer::Trim(int numBytes) +{ + DebugVerify(); + if (mByteCount <= numBytes) { + return; + } + if (numBytes <= 0) { + mBuf.clear(); + mByteCount = 0; + DebugVerify(true); + return; + } + int nBytes = numBytes; + BList::iterator iter = mBuf.begin(); + while (iter != mBuf.end()) { + const int nb = iter->BytesConsumable(); + if (nb <= 0) { + iter = mBuf.erase(iter); + } else { + if (nb > nBytes) { + nBytes -= iter->Trim(nBytes); + if (! iter->IsEmpty()) { + ++iter; + } + break; + } + nBytes -= nb; + ++iter; + } + } + iter = mBuf.erase(iter, mBuf.end()); + assert(mByteCount >= 0); + mByteCount = numBytes; + DebugVerify(true); +} + +int +IOBuffer::CopyIn(const char *buf, int numBytes) +{ + DebugChecksum(buf, numBytes); + if (numBytes <= 0) { + return 0; + } + int defaultBufSz; + if (! sIOBufferAllocator && mBuf.empty() && numBytes > + (defaultBufSz = IOBufferData::GetDefaultBufferSize()) * 32) { + IOBufferData bd( + (numBytes + defaultBufSz - 1) / defaultBufSz * defaultBufSz); + mByteCount += bd.CopyIn(buf, numBytes); + assert(mByteCount == numBytes); + mBuf.push_back(bd); + DebugVerify(true); + return mByteCount; + } + // Copy into available space at the end, if any. 
+ BList::iterator it = BeginSpaceAvailable(); + if (it == mBuf.end()) { + it = mBuf.insert(it, IOBufferData()); + } + int nBytes = numBytes; + const char* cur = buf; + for (; ;) { + const int nb = it->CopyIn(cur, nBytes); + cur += nb; + nBytes -= nb; + if (nBytes <= 0) { + break; + } + assert(it->IsFull()); + if (++it == mBuf.end()) { + it = mBuf.insert(it, IOBufferData()); + } + } + nBytes = numBytes - nBytes; + assert(mByteCount >= 0); + mByteCount += nBytes; + DebugVerify(true); + return nBytes; +} + +int +IOBuffer::CopyOut(char *buf, int numBytes) const +{ + BList::const_iterator it; + char* cur = buf; + int nBytes = numBytes; + if (nBytes > 0) { + *cur = '\0'; + } + for (it = mBuf.begin(); nBytes > 0 && it != mBuf.end(); ++it) { + const int nb = it->CopyOut(cur, nBytes); + cur += nb; + nBytes -= nb; + } + DebugVerify(); + return (cur - buf); +} + +int +IOBuffer::Copy(const IOBuffer* buf, int numBytes) +{ + DebugChecksum(*buf, numBytes); + int rem = numBytes; + BList::const_iterator it; + for (it = buf->mBuf.begin(); it != buf->mBuf.end() && rem > 0; ++it) { + const int nb = min(rem, it->BytesConsumable()); + if (nb <= 0) { + continue; + } + char* const c = const_cast(it->Consumer()); + mBuf.push_back(IOBufferData(*it, c, c + nb)); + rem -= nb; + } + rem = numBytes - rem; + mByteCount += rem; + assert(mByteCount >= 0); + buf->DebugVerify(); + DebugVerify(); + return rem; +} + +// +// Clone the contents of an IOBuffer by block sharing +// +IOBuffer* +IOBuffer::Clone() const +{ + DebugVerify(); + IOBuffer* const clone = new IOBuffer(); + BList::const_iterator it; + for (it = mBuf.begin(); it != mBuf.end(); ++it) { + if (! 
it->IsEmpty()) { + clone->mBuf.push_back(IOBufferData(*it, + const_cast(it->Consumer()), + const_cast(it->Producer()))); + } + } + assert(mByteCount >= 0); + clone->mByteCount = mByteCount; +#ifdef DEBUG_IOBuffer + clone->mDebugChecksum = mDebugChecksum; +#endif + clone->DebugVerify(); + DebugVerify(); + return clone; +} + +void +IOBuffer::MakeBuffersFull() +{ + DebugVerify(); + if (mBuf.empty()) { + return; + } + // Move write data to the start of the buffers, to make it aligned. + BList buf; + buf.swap(mBuf); + while (! buf.empty()) { + IOBufferData& s = buf.front(); + const int nb = s.BytesConsumable(); + if (nb <= 0) { + buf.pop_front(); + continue; + } + if (mBuf.empty() || mBuf.back().IsFull()) { + if (s.HasCompleteBuffer() && (s.IsFull() || &s == &buf.back())) { + mBuf.splice(mBuf.end(), buf, buf.begin()); + continue; + } + mBuf.push_back(IOBufferData()); + } + s.Consume(mBuf.back().CopyIn(&s, nb)); + } + DebugVerify(); +} + +void +IOBuffer::TrimAtBufferBoundaryLeaveOnly(int& offset, int& numBytes) +{ + // Trim data at the buffer boundary at the beginning. + DebugVerify(); + int nBytes = offset; + while (! mBuf.empty()) { + const int nb = mBuf.front().BytesConsumable(); + if (nb > nBytes) { + break; + } + nBytes -= nb; + mBuf.pop_front(); + } + offset -= nBytes; + // Trim data at the buffer boundary from the end. + nBytes = max(0, nBytes) + numBytes; + numBytes = 0; + for (BList::iterator i = mBuf.begin(); i != mBuf.end(); ) { + if (nBytes > numBytes) { + numBytes += i->BytesConsumable(); + ++i; + } else { + i = mBuf.erase(i, mBuf.end()); + } + } + assert(mByteCount >= 0 && numBytes > 0); + mByteCount = numBytes; + DebugVerify(true); +} + +int +IOBuffer::IndexOf(int offset, const char* str) const +{ + DebugVerify(); + const char* const ss = str ? 
str : ""; + const int soff = max(0, offset); + int nBytes = soff; + BList::const_iterator it; + for (it = mBuf.begin(); it != mBuf.end(); ++it) { + const int nb = it->BytesConsumable(); + if (nb > nBytes) { + break; + } + nBytes -= nb; + } + if (*ss == 0) { + // Nothing to search for. + return (it != mBuf.end() ? soff : -1); + } + int off = soff - nBytes; + const char* s = ss; + int idx = -1; + int pbo = -1; + BList::const_iterator pit; + while (it != mBuf.end()) { + const int nb = it->BytesConsumable(); + const char* const c = it->Consumer(); + const char* n = c + nBytes; + const char* const e = c + nb; + nBytes = 0; + if (idx >= 0) { + while (n < e && *s == *n) { + if (*++s == 0) { + // Found. + DebugVerify(); + return idx; + } + n++; + } + if (n < e) { + // Start over, from prefix start index plus one. + s = ss; + assert(pbo >= 0); + it = pit; + off = idx - pbo; + nBytes = pbo + 1; + pbo = -1; + idx = -1; + continue; + } + } else { + while (n < e && (n = (const char*)memchr(n, *s, e - n))) { + const char* const f = n; + while (*++s != 0 && ++n < e && *n == *s) + {} + if (*s == 0) { + // Found. + DebugVerify(); + return (off + int(f - c)); + } + if (n < e) { + // Start over, from prefix start index plus one. + s = ss; + n = f + 1; + } else { + // Prefix start, end of buffer: remember the prefix position. + pbo = int(f - c); + pit = it; + idx = off + pbo; + } + } + } + off += nb; + ++it; + } + DebugVerify(); + return -1; +} + +int +IOBuffer::StreamBuffer::underflow() +{ + if (mMaxReadLength <= 0 || mCur == mIoBuf->end()) { + return EOF; + } + int nb; + while ((nb = mCur->BytesConsumable()) <= 0) { + if (++mCur == mIoBuf->end()) { + return EOF; + } + } + char* const c = const_cast(mCur->Consumer()); + nb = min(mMaxReadLength, nb); + setg(c, c, c + nb); + mMaxReadLength -= nb; + ++mCur; + return (*c & 0xFF); +} + +int +IOBuffer::StreamBuffer::overflow(int c) +{ + if (c == EOF || ! 
mIoBuf || mWriteRem <= 0) { + return EOF; + } + char ch(c); + const int ret = mIoBuf->CopyIn(&ch, 1); + if (ret <= 0) { + return EOF; + } + mWriteRem -= ret; + return c; +} + +std::streamsize +IOBuffer::StreamBuffer::xsputn(const char *s, std::streamsize n) +{ + if (! mIoBuf || mWriteRem < (int)n) { + return 0; + } + const int ret = mIoBuf->CopyIn(s, int(n)); + if (ret > 0) { + mWriteRem -= ret; + } + return ret; +} + +} diff --git a/src/cc/kfsio/IOBuffer.h b/src/cc/kfsio/IOBuffer.h new file mode 100644 index 000000000..e84eb3bb4 --- /dev/null +++ b/src/cc/kfsio/IOBuffer.h @@ -0,0 +1,634 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// Mike Ovsiannikov -- iostream, aligned buffer support for direct IO, +// scatter / gather io with readv and writev, make IOBuffer generic scatter +// gather list with *SpaceAvailable* methods. +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Scatter / gather KFS I/O. 
+// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_IOBUFFER_H +#define _LIBIO_IOBUFFER_H + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/StdAllocator.h" + +namespace KFS +{ +using std::ostream; +using std::istream; +using std::streambuf; +using std::streamsize; +using std::list; +using std::numeric_limits; +using boost::shared_ptr; + +namespace libkfsio +{ +// IO buffer allocator. Typically used with io buffer pool. +class IOBufferAllocator +{ +protected: + IOBufferAllocator() + {} + virtual ~IOBufferAllocator() + {} + IOBufferAllocator& operator=(const IOBufferAllocator&) + { return *this; } +public: + virtual size_t GetBufferSize() const = 0; + virtual char* Allocate() = 0; + virtual void Deallocate(char* buf) = 0; +}; +/// API to set the default allocation when allocating +/// IOBufferData(). The default allocation unit is 4K unless +/// changed by this API call. +/// Can only be called once, prior to any buffer allocation. +bool SetIOBufferAllocator(IOBufferAllocator* allocator); +} + +/// +/// \class IOBufferData +/// \brief An IOBufferData contains a buffer and associated +/// producer/consumer points. +class IOBufferData +{ +public: + /// Data buffer that is ref-counted for sharing. + typedef shared_ptr IOBufferBlockPtr; + + IOBufferData(); + IOBufferData(int bufsz); + IOBufferData(char* buf, int offset, int size, + libkfsio::IOBufferAllocator& allocator); + IOBufferData(char* buf, int bufSize, int offset, int size); + IOBufferData(const IOBufferBlockPtr& data, int bufSize, int offset, int size); + + /// Create an IOBufferData blob by sharing data block from other; + /// set the producer/consumer based on the start/end positions + /// that are passed in + IOBufferData(const IOBufferData &other, char *s, char *e, char* p = 0); + ~IOBufferData(); + + /// + /// Read data from file descriptor into the buffer. 
+ /// @param[in] fd file descriptor to be used for reading. + /// @result Returns the # of bytes read + /// + int Read(int fd, int maxReadAhead /* = -1 */); + + /// + /// Write data from the buffer to the file descriptor. + /// @param[in] fd file descriptor to be used for writing. + /// @result Returns the # of bytes written + /// + int Write(int fd); + + /// + /// Copy data into the buffer. For doing a copy, data is appended + /// to the buffer starting at the offset corresponding to + /// mProducer. # of bytes copied is min (# of bytes, space + /// avail), where space avail = mEnd - mProducer. + /// + /// NOTE: As a result of copy, the "producer" pointer is not + /// advanced. + /// + /// @param[out] buf A containing the data to be copied in. + /// @param[in] numBytes # of bytes to be copied. + /// @retval Returns the # of bytes copied. + /// + int CopyIn(const char *buf, int numBytes); + int CopyIn(const IOBufferData *other, int numBytes); + /// + /// Copy data out the buffer. For doing a copy, data is copied + /// out of the buffer starting at the offset corresponding to + /// mConsumer. # of bytes copied is min (# of bytes, bytes + /// avail), where bytes avail = mProducer - mConsumer. + /// + /// NOTE: As a result of copy, the "consumer" pointer is not + /// advanced. + /// + /// @param[out] buf A containing the data to be copied in. + /// @param[in] numBytes # of bytes to be copied. + /// @retval Returns the # of bytes copied. + /// + int CopyOut(char *buf, int numBytes) const; + + char *Producer() { return mProducer; } + char *Consumer() { return mConsumer; } + const char *Producer() const { return mProducer; } + const char *Consumer() const { return mConsumer; } + + /// + /// Some data has been filled in the buffer. So, advance + /// mProducer. + /// @param[in] nbytes # of bytes of data filled + /// @retval # of bytes filled in this buffer. + /// + int Fill(int nbytes); + int ZeroFill(int nbytes); + + /// + /// Some data has been consumed from the buffer. 
So, advance + /// mConsumer. + /// @param[in] nbytes # of bytes of data consumed + /// @retval # of bytes consumed from this buffer. + /// + int Consume(int nbytes); + + /// + /// Remove some data from the end of the buffer. So, pull back + /// mProducer + /// @param[in] nbytes # of bytes of data to be trimmed + /// @retval # of bytes in this buffer. + /// + int Trim(int nbytes); + + /// Returns the # of bytes available for consumption. + int BytesConsumable() const { return mProducer - mConsumer; } + + /// Return the space available in the buffer + size_t SpaceAvailable() const { return mEnd - mProducer; } + int IsFull() const { return mProducer >= mEnd; } + int IsEmpty() const { return mProducer <= mConsumer; } + /// Returns true if has whole data buffer. + bool HasCompleteBuffer() const { + return (mData.get() == mConsumer && + mConsumer + sDefaultBufferSize == mEnd); + } + bool IsShared() const { + return (! mData.unique()); + } + static int GetDefaultBufferSize() { + return sDefaultBufferSize; + } + +private: + IOBufferBlockPtr mData; + /// Pointers that correspond to the start/end of the buffer + char* mEnd; + /// Pointers into mData that correspond to producer/consumer + char* mProducer; + char* mConsumer; + + /// Allocate memory and init the pointers. + inline void Init(char* buf, int bufSize); + inline void Init(char* buf, + libkfsio::IOBufferAllocator& allocator); + + inline int MaxAvailable(int numBytes) const; + inline int MaxConsumable(int numBytes) const; + + static int sDefaultBufferSize; +}; + + +/// +/// \class IOBuffer -- scatter gather list. +/// An IOBuffer consists of a list of IOBufferData. It provides +/// API's for reading/writing data to/from the buffer. Operations on +/// IOBuffer translates to operations on appropriate IOBufferData. 
+/// +class IOBuffer +{ +private: + typedef list< + IOBufferData, + StdFastAllocator + > BList; +public: + typedef BList::const_iterator iterator; + + IOBuffer(); + ~IOBuffer(); + + IOBuffer *Clone() const; + + /// Append the IOBufferData block to the list stored in this buffer. + /// Unlike methods with IOBuffer as argument, this method will not + /// Consume() or change buf in any way, the underlying buffer will be + /// shared. + void Append(const IOBufferData& buf); + + /// Append the contents of ioBuf to this buffer. + int Append(IOBuffer *ioBuf); + + /// Move data buffers with space available at the end of ioBuf. + /// @param[in] other Buffer from which the available space to move + /// @param[in] numBytes # of bytes of available space to be used + /// @retval Returns the # of bytes moved. + /// + int MoveSpaceAvailable(IOBuffer* other, int numBytes); + /// Remove space available at the end of ioBuf. + /// + void RemoveSpaceAvailable(); + /// Use available buffer space at the end of "other" buffer. + /// Copy data, if any, into "other"'s available space, but + /// do not advance / modify IOBufferData buffer pointers of "other". + /// @param[in] other Buffer from which the available space to be used + /// @param[in] numBytes # of bytes of available space to be used + /// @retval Returns the # of bytes used. + /// + int UseSpaceAvailable(const IOBuffer* other, int numBytes); + /// Zero fill the buffer for length + /// min(numBytes, ). + /// @param[in] numBytes # of bytes to be zero-filled. + /// @retval Returns the # of bytes filled. + /// + int ZeroFillSpaceAvailable(int numBytes); + /// Ensure that at least numBytes, is available. + /// If more than numBytes is always available do nothing, + /// otherwise add buffer space to make exactly numBytes available. + /// @param[in] numBytes size of the available space. + /// @retval Returns actual available space size. 
+ /// + int EnsureSpaceAvailable(int numBytes); + + + int Read(int fd, int maxReadAhead = -1); + int Write(int fd); + + /// Move data from one buffer to another. This involves (mostly) + /// shuffling pointers without incurring data copying. + /// The requirement is that "other" better have as much bytes as + /// we are trying to move. + /// @param[in] other Buffer from which data has to be moved + /// @param[in] numBytes # of bytes of data to be moved over + /// @retval Returns the # of bytes moved. + /// + int Move(IOBuffer* other, int numBytes); + /// Move whole buffer. + /// + void Move(IOBuffer *other); + /// Move data and available space from one buffer to another. + /// @param[in] other Buffer from which space has to be moved + /// @param[in] numBytes # of bytes of space to be moved over + /// @retval Returns the # of space moved. + /// + int MoveSpace(IOBuffer* other, int numBytes); + + /// Replace data in the range + /// [offset, offset + min(numBytes, other->BytesConsumable()) + /// The range [BytesConsumable(), offset) is zero filled. + /// In addition this method has the same effect as other->Consume(numBytes). + /// @param[in] other Buffer from which data has to be spliced + /// @param[in] offset The offset at which data has to be spliced in + /// @param[in] numBytes # of bytes of data to be moved over + /// + void Replace(IOBuffer* other, int offset, int numBytes); + /// Same as Replace, except it ensures that all buffers in the destination + /// fully utilized: IsFull() && HasCompleteBuffer() + /// It copies over min(srcBuf->BytesConsumable(), numBytes) into this. + /// If offset > this->BytesConsumable(), the this is zero filled. + /// This method "consumes" min(srcBuf->BytesConsumable(), numBytes) from + /// srcBuf. + void ReplaceKeepBuffersFull(IOBuffer* srcBuf, int offset, int numBytes); + + /// Zero fill the buffer for length numBytes. + /// @param[in] numBytes # of bytes to be zero-filled. 
+ void ZeroFill(int numBytes); + + /// + /// Copy data into the buffer. For doing a copy, data is appended + /// to the last buffer in mBuf. If the amount of data to be + /// copied exceeds space in the last buffer, additional buffers + /// are allocated and copy operation runs to finish. + /// + /// NOTE: As a result of copy, the "producer" portion of an + /// IOBufferData is not advanced. + /// + /// @param[in] buf A containing the data to be copied in. + /// @param[in] numBytes # of bytes to be copied in. + /// @retval Returns the # of bytes copied. + /// + int CopyIn(const char* buf, int numBytes); + + int Copy(const IOBuffer* buf, int numBytes); + + /// + /// Copy data out of the buffer. For doing a copy, data is copied + /// from the first buffer in mBuf. If the amount of data to be + /// copied exceeds what is available in the first buffer, the list + /// of buffers is walked to copy out data. + /// + /// NOTE: As a result of copy, the "consumer" portion of an + /// IOBufferData is not advanced. + /// + /// @param[out] buf A null-terminated buffer containing the data + /// copied out. + /// @param[in] bufLen Length of buf passed in. At most bufLen + /// bytes are copied out. + /// @retval Returns the # of bytes copied. + /// + int CopyOut(char* buf, int bufLen) const; + + /// Copy the data into buf, or get buffer pointer if the data is + /// contiguous in one buffer. + const char* CopyOutOrGetBufPtr(char* buf, int& nbytes) const + { + if (nbytes > mByteCount) { + nbytes = mByteCount; + } + if (! mBuf.empty() && mBuf.front().BytesConsumable() >= nbytes) { + return mBuf.front().Consumer(); + } + nbytes = CopyOut(buf, nbytes); + return buf; + } + + /// + /// Consuming data in the IOBuffer translates to advancing the + /// "consumer" point on underlying IOBufferData. From the head + /// of the list, the consumer point will be advanced on sufficient + /// # of buffers. + /// @retval Returns the # of bytes consumed. 
+ /// + int Consume(int nbytes); + + /// Returns the # of bytes that are available for consumption. + int BytesConsumable() const + { return mByteCount; } + + /// Trim data from the end of the buffer to nbytes. This is the + /// converse of consume, where data is removed from the front of + /// the buffer. + void Trim(int nbytes); + + /// Ensures HasCompleteBuffer() returns true for all buffers, + /// and all buffers possibly except the last one are full. + void MakeBuffersFull(); + + /// Trim at buffer boundary + void TrimAtBufferBoundaryLeaveOnly(int& offset, int& numBytes); + + /// Searches for a string in the buffer, strstr() equivalent. + /// @param[in] offset to start search from. + /// @param[in] str string to search for. + /// @retval Returns position of the beginning of the "str" if found, + /// or -1 if not. + int IndexOf(int offset, const char* str) const; + + /// Returns true if buffer has no data. + bool IsEmpty() const + { return mByteCount <= 0; } + + /// Zero fill, if needed the last buffer to make it full. + /// @retval Returns number of bytes added. + int ZeroFillLast(); + + /// Returns bytes available for consumption in the last buffer + /// @retval # of bytes consumable in the last buffer. + int BytesConsumableLast() const + { return (mBuf.empty() ? 0 : mBuf.back().BytesConsumable()); } + + /// Returns available space in the last buffer. + /// @retval available space in the last buffer. + int SpaceAvailableLast() const + { return (mBuf.empty() ? 0 : mBuf.back().SpaceAvailable()); } + + /// Retruns true if the last the buffer is full + bool IsLastFull() const + { return mBuf.empty() ? true : mBuf.back().IsFull(); } + + /// Remove all data. + void Clear() + { + mBuf.clear(); + mByteCount = 0; + } + + /// Buffer list iterator. + /// Do not modify IOBufferData pointed by the iterator, or its content. 
+ iterator begin() const { return mBuf.begin(); } + iterator end() const { return mBuf.end(); } + + /// Debug + void Verify() const; + + /// This is to create istream ostream with StreamBuffer(iobuffer); + class StreamBuffer : public streambuf + { + public: + StreamBuffer( + IOBuffer& iobuf, + int maxReadLength = numeric_limits::max(), + int maxWriteLength = numeric_limits::max()) + : streambuf(), + mMaxReadLength(maxReadLength), + mWriteRem(maxWriteLength), + mCur(iobuf.begin()), + mIoBuf(&iobuf) + {} + StreamBuffer() + : streambuf(), + mMaxReadLength(0), + mWriteRem(0), + mCur(), + mIoBuf(0) + {} + void Reset(int maxReadLength, int maxWriteLength) + { + if (mIoBuf) { + mCur = mIoBuf->begin(); + mMaxReadLength = maxReadLength; + mWriteRem = maxWriteLength; + } else { + mMaxReadLength = 0; + mWriteRem = 0; + } + } + void SetReadOnly(IOBuffer* iobuf, int maxReadLength) + { + // Make sure that overflow() will always return EOF. + mMaxReadLength = iobuf ? maxReadLength : 0; + mWriteRem = 0; + mIoBuf = iobuf; + if (mIoBuf) { + mCur = mIoBuf->begin(); + } + } + void SetWriteOnly(IOBuffer* iobuf, int maxWriteLength) + { + // Make sure that underflow() will always return EOF. + mMaxReadLength = 0; + mWriteRem = iobuf ? maxWriteLength : 0; + mIoBuf = iobuf; + } + protected: + virtual int underflow(); + virtual int overflow(int c = EOF); + virtual streamsize xsputn(const char * s, streamsize n); + private: + int mMaxReadLength; + int mWriteRem; + iterator mCur; + IOBuffer* mIoBuf; + private: + StreamBuffer(const StreamBuffer&); + StreamBuffer& operator=(const StreamBuffer&); + }; + class OStream; + class IStream; + class WOStream; + class ByteIterator + { + public: + ByteIterator(const IOBuffer& buf) + : mBuf(buf), + mIt(mBuf.begin()), + mCur(mIt != mBuf.end() ? mIt->Consumer() : 0), + mEnd(mIt != mBuf.end() ? mIt->Producer() : 0) + {} + const char* Next() + { + for (; ;) { + if (mCur < mEnd) { + return mCur++; + } + if (! 
mCur || ++mIt == mBuf.end()) { + mCur = 0; + mEnd = 0; + return mCur; + } + mCur = mIt->Consumer(); + mEnd = mIt->Producer(); + } + } + private: + const IOBuffer& mBuf; + iterator mIt; + const char* mCur; + const char* mEnd; + }; +private: + BList mBuf; + int mByteCount; +#ifdef DEBUG_IOBuffer + unsigned int mDebugChecksum; +#endif + inline void DebugChecksum(const char* buf, int len); + inline void DebugChecksum(const IOBufferData& buf); + inline void DebugChecksum(const IOBuffer& buf, int numBytes); + inline void DebugVerify() const; + inline void DebugVerify(bool updateChecksum); + + inline static BList::iterator SplitBufferListAt(BList& buf, int& nBytes); + inline BList::iterator BeginSpaceAvailable(int* nBytes = 0); + IOBuffer(const IOBuffer& buf); + IOBuffer& operator=(const IOBuffer& buf); +}; + +class IOBuffer::OStream : + public IOBuffer, + private IOBuffer::StreamBuffer, + public ostream +{ +public: + OStream() + : IOBuffer(), + IOBuffer::StreamBuffer(*this, 0), + ostream(this) + {} +}; + +class IOBuffer::WOStream : + private IOBuffer::StreamBuffer, + public ostream +{ +public: + WOStream() + : IOBuffer::StreamBuffer(), + ostream(this) + {} + ostream& Set( + IOBuffer* iobuf, + int maxWriteLength = numeric_limits::max()) + { + SetWriteOnly(iobuf, maxWriteLength); + ostream::clear(); + ostream::flags(ostream::dec | ostream::skipws); + ostream::precision(6); + ostream::width(0); + ostream::fill(' '); + return *this; + } + ostream& Set( + IOBuffer& iobuf, + int maxWriteLength = numeric_limits::max()) + { return Set(&iobuf, maxWriteLength); } + ostream& Reset() + { return Set(0, 0); } +}; + +class IOBuffer::IStream : + private IOBuffer::StreamBuffer, + public istream +{ +public: + IStream( + IOBuffer& iobuf, + int maxReadLength = numeric_limits::max()) + : IOBuffer::StreamBuffer(iobuf, maxReadLength, 0), + istream(this) + {} + IStream() + : IOBuffer::StreamBuffer(), + istream(this) + {} + void Rewind(int maxReadLength) + { + StreamBuffer::Reset(maxReadLength, 
0); + istream::clear(); + rdbuf(this); + } + istream& Set( + IOBuffer* iobuf, + int maxReadLength = numeric_limits::max()) + { + StreamBuffer::SetReadOnly(iobuf, maxReadLength); + istream::clear(); + rdbuf(this); + return *this; + } + istream& Set( + IOBuffer& iobuf, + int maxReadLength = numeric_limits::max()) + { return Set(&iobuf, maxReadLength); } + istream& Reset() + { return Set(0, 0); } +}; + +} + +#endif // _LIBIO_IOBUFFER_H diff --git a/src/cc/kfsio/ITimeout.h b/src/cc/kfsio/ITimeout.h new file mode 100644 index 000000000..0677aa214 --- /dev/null +++ b/src/cc/kfsio/ITimeout.h @@ -0,0 +1,112 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/25 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef LIBIO_I_TIMEOUT_H +#define LIBIO_I_TIMEOUT_H + +#include "common/time.h" +#include "qcdio/QCDLList.h" + +#include +#include + +namespace KFS +{ + +/// +/// \file ITimeout.h +/// \brief Define the ITimeout interface. +/// + +/// +/// \class ITimeout +/// Abstract class that defines a Timeout interface. Whenever a +/// timeout occurs, the Timeout() method will be invoked. 
An optional +/// setting, interval can be specified, which signifies the time +/// interval between successive invocations of Timeout(). +/// +/// NOTE: Timeout interface supports only a pseudo-real-time timers. +/// There is no guarantee that the desired interval will hold between +/// successive invocations of Timeout(). +/// +class ITimeout +{ +public: + ITimeout() + : mIntervalMs(0), mDisabled(false), mLastCall(0) + { List::Init(*this); } + virtual ~ITimeout() { assert(! List::IsInList(*this)); } + void Disable() { + mDisabled = true; + } + /// Specify the interval in milli-seconds at which the timeout + /// should occur. + void SetTimeoutInterval(int intervalMs, bool resetTimer = false) { + mDisabled = false; + mIntervalMs = intervalMs; + if (resetTimer) { + ResetTimer(); + } + } + int GetTimeElapsed() { + return (NowMs() - mLastCall); + } + void ResetTimer() { + mLastCall = NowMs(); + } + static int64_t NowMs() { + return microseconds() / 1000; + } + /// Whenever a timer expires (viz., a call to select returns), + /// this method gets invoked. Depending on the time-interval + /// specified, the timeout is appropriately invoked. + void TimerExpired(int64_t nowMs) { + if (mDisabled) { + return; + } + if (mIntervalMs <= 0 || nowMs >= mLastCall + mIntervalMs) { + mLastCall = nowMs; + Timeout(); + } + } + /// This method will be invoked when a timeout occurs. 
+ virtual void Timeout() = 0; +protected: + int mIntervalMs; + bool mDisabled; +private: + typedef QCDLListOp List; + int64_t mLastCall; + ITimeout* mPrevPtr[1]; + ITimeout* mNextPtr[1]; + + friend class NetManager; + friend class QCDLListOp; +}; + +} + +#endif // LIBIO_I_TIMEOUT_H diff --git a/src/cc/kfsio/KfsCallbackObj.h b/src/cc/kfsio/KfsCallbackObj.h new file mode 100644 index 000000000..00ad6c849 --- /dev/null +++ b/src/cc/kfsio/KfsCallbackObj.h @@ -0,0 +1,149 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_KFSCALLBACKOBJ_H +#define _LIBIO_KFSCALLBACKOBJ_H + +#include + +namespace KFS +{ +/// +/// \file KfsCallbackObj.h +/// \brief Callback/Continuations based programming model +/// +/// A KfsCallback object is based on a Continuation programming +/// model: The object executes until it makes a blocking call, at +/// which point control switches over to another object. +/// +/// A continuation consists of two parts: (1) state, (2) an event +/// handler that will be called when an event occurs. The +/// KfsCallbackObj class defined here is only a base class. 
+///
+
+//
+// For KfsCallbackObj object, we want the virtual function table to be the
+// first element of the object. This will make debugging easier on
+// optimized builds---from the virtual table, we can tell what type of
+// object we are looking at.
+//
+struct _force_vfp_to_top {
+    virtual ~_force_vfp_to_top() { };
+};
+
+
+// abstract base class for ObjectMethod template
+class ObjectMethodBase {
+public:
+    virtual ~ObjectMethodBase() {}
+    virtual int execute(int code, void *data) = 0;
+
+};
+
+//
+// A derived sub-class of the KfsCallbackObj class defines its own event
+// handlers. We need to store a pointer to such a handler so that
+// the callback can be invoked. This is an implementation problem
+// because: we can store a pointer in a derived class to something in
+// the base class, but not vice-versa.
+//
+// SOOO..., create an object that holds two things: (1) the object on
+// which a callback is defined, and (2) a pointer to the method in
+// that object. By doing this with templates, we preserve type-safety
+// and work the magic without using any type-casting.
+//
+template <class T>
+class ObjectMethod : public ObjectMethodBase {
+
+public:
+    typedef int (T::*MethodPtr)(int code, void *data);
+
+    // save pointer to object and method
+    ObjectMethod( T* optr, MethodPtr mptr )
+        : mOptr(optr), mMptr(mptr) {}
+    int execute(int code, void *data) {
+        return (mOptr->*mMptr)(code, data); // execute the method
+    }
+
+private:
+    T*        mOptr; // pointer to the object
+    MethodPtr mMptr; // pointer to the method
+};
+
+///
+/// \brief Sets the event handler for a callback object.
+/// @param pobj Pointer to the KfsCallback object +/// @param meth Pointer to the handler method in the KfsCallbackObj +/// +template +void SET_HANDLER( T* pobj, typename ObjectMethod::MethodPtr meth ) +{ + pobj->SetHandler(pobj, meth); +} + +/// +/// \class KfsCallbackObj +/// A callback object has state and an event handler that will be invoked +/// whenever an event occurs for this callback object. +/// +class KfsCallbackObj : public _force_vfp_to_top { +public: + KfsCallbackObj() : mObjMeth(0) { + } + + virtual ~KfsCallbackObj() { + if (mObjMeth) { + mObjMeth->~ObjectMethodBase(); + } + } + + /// + /// Signature for an event handler: + /// @param code An integer about the event that occurred + /// @param data A pointer to the data associated with the event + /// + int HandleEvent(int code, void *data) { + return mObjMeth->execute(code, data); + } + + template + void SetHandler(T* pobj, typename ObjectMethod::MethodPtr meth) { + BOOST_STATIC_ASSERT(sizeof(ObjectMethod) <= sizeof(mObjMethodStorage)); + if (mObjMeth) { + mObjMeth->~ObjectMethodBase(); + } + mObjMeth = ::new (&mObjMethodStorage) ObjectMethod(pobj, meth); + } +private: + struct { + char mStorage[sizeof(ObjectMethod)]; + } mObjMethodStorage; + ObjectMethodBase *mObjMeth; +}; + +} + +#endif // _LIBIO_KFSCALLBACKOBJ_H diff --git a/src/cc/kfsio/NetConnection.cc b/src/cc/kfsio/NetConnection.cc new file mode 100644 index 000000000..87b2ed2ba --- /dev/null +++ b/src/cc/kfsio/NetConnection.cc @@ -0,0 +1,153 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+//
+// Network connection implementation.
+//
+//----------------------------------------------------------------------------
+
+#include "Globals.h"
+#include "NetConnection.h"
+#include "common/MsgLogger.h"
+#include "qcdio/QCUtils.h"
+
+#include <cerrno>
+
+namespace KFS
+{
+
+using namespace KFS::libkfsio;
+
+#ifndef NET_CONNECTION_LOG_STREAM_DEBUG
+#define NET_CONNECTION_LOG_STREAM_DEBUG \
+    KFS_LOG_STREAM_DEBUG << "netconn: " << (mSock ? mSock->GetFd() : -1) << " "
+#endif
+
+void
+NetConnection::HandleReadEvent(int maxAcceptsPerRead /* = 1 */)
+{
+    if (! IsGood()) {
+        NET_CONNECTION_LOG_STREAM_DEBUG << "read event ignored: fd closed" <<
+        KFS_LOG_EOM;
+    } else if (mListenOnly) {
+        int i = 0;
+        do {
+            int err = 0;
+            TcpSocket* const sock = mSock->Accept(&err);
+            if (sock) {
+                NetConnectionPtr conn(new NetConnection(sock, 0));
+                conn->mTryWrite = true; // Connected, and good to write.
+                mCallbackObj->HandleEvent(EVENT_NEW_CONNECTION, &conn);
+                if (conn) {
+                    conn->Update();
+                }
+            } else {
+                if (i == 0 || (err != EAGAIN && err != EWOULDBLOCK)) {
+                    // NOTE(review): "net:" prints ctrOpenDiskFds and "disk:"
+                    // prints ctrOpenNetFds -- the labels appear swapped
+                    // relative to the counter names; confirm against Globals.
+                    NET_CONNECTION_LOG_STREAM_DEBUG <<
+                        " accept failure: " << QCUtils::SysError(err) <<
+                        " open fd:"
+                        " net: " << globals().ctrOpenDiskFds.GetValue() <<
+                        " disk: " << globals().ctrOpenNetFds.GetValue() <<
+                    KFS_LOG_EOM;
+                }
+                break;
+            }
+        } while (++i < maxAcceptsPerRead && IsGood());
+    } else if (IsReadReady()) {
+        const int nread = mInBuffer.Read(mSock->GetFd(), maxReadAhead);
+        if (nread <= 0 && nread != -EAGAIN && nread != -EINTR) {
+            NET_CONNECTION_LOG_STREAM_DEBUG <<
+                "read: " << (nread == 0 ?
"EOF" : QCUtils::SysError(-nread)) << + KFS_LOG_EOM; + if (nread != 0) { + Close(); + } + mCallbackObj->HandleEvent(EVENT_NET_ERROR, NULL); + } else if (nread > 0) { + mCallbackObj->HandleEvent(EVENT_NET_READ, &mInBuffer); + } + } + Update(); +} + +void +NetConnection::HandleWriteEvent() +{ + const bool wasConnectPending = mNetManagerEntry.IsConnectPending(); + mNetManagerEntry.SetConnectPending(false); + int nwrote = 0; + if (IsGood()) { + nwrote = IsWriteReady() ? mOutBuffer.Write(mSock->GetFd()) : 0; + if (nwrote < 0 && nwrote != -EAGAIN && nwrote != -EINTR) { + NET_CONNECTION_LOG_STREAM_DEBUG << + "write: error: " << QCUtils::SysError(-nwrote) << + KFS_LOG_EOM; + Close(); + mCallbackObj->HandleEvent(EVENT_NET_ERROR, NULL); + } else if (nwrote > 0 || wasConnectPending) { + mCallbackObj->HandleEvent(EVENT_NET_WROTE, &mOutBuffer); + } + } + mTryWrite = mOutBuffer.IsEmpty(); + Update(nwrote != 0); +} + +void +NetConnection::HandleErrorEvent() +{ + if (IsGood()) { + NET_CONNECTION_LOG_STREAM_DEBUG << "connection error, closing" << + KFS_LOG_EOM; + Close(); + mCallbackObj->HandleEvent(EVENT_NET_ERROR, NULL); + } else { + Update(); + } +} + +void +NetConnection::HandleTimeoutEvent() +{ + const int timeOut = GetInactivityTimeout(); + if (timeOut < 0) { + NET_CONNECTION_LOG_STREAM_DEBUG << + "ignoring timeout event, time out value: " << timeOut << + KFS_LOG_EOM; + } else { + NET_CONNECTION_LOG_STREAM_DEBUG << "inactivity timeout:" << + " read-ahead: " << maxReadAhead << + " in: " << mInBuffer.BytesConsumable() << + " out: " << mOutBuffer.BytesConsumable() << + KFS_LOG_EOM; + mCallbackObj->HandleEvent(EVENT_INACTIVITY_TIMEOUT, NULL); + } + Update(); +} + +void +NetConnection::Update(bool resetTimer) +{ + NetManager::Update( + mNetManagerEntry, IsGood() ? 
mSock->GetFd() : -1, resetTimer); +} + +} diff --git a/src/cc/kfsio/NetConnection.h b/src/cc/kfsio/NetConnection.h new file mode 100644 index 000000000..045e12824 --- /dev/null +++ b/src/cc/kfsio/NetConnection.h @@ -0,0 +1,384 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_NETCONNECTION_H +#define _LIBIO_NETCONNECTION_H + +#include "KfsCallbackObj.h" +#include "event.h" +#include "IOBuffer.h" +#include "TcpSocket.h" +#include "common/StdAllocator.h" + +#include +#include + +namespace KFS +{ +using std::list; +using std::string; + +class NetManager; +/// +/// \file NetConnection.h +/// \brief A network connection uses TCP sockets for doing I/O. +/// +/// A network connection contains a socket and data in buffers. +/// Whenever data is read from the socket it is held in the "in" +/// buffer; whenever data needs to be written out on the socket, that +/// data should be dropped into the "out" buffer and it will +/// eventually get sent out. +/// + +/// +/// \class NetConnection +/// A net connection contains an underlying socket and is associated +/// with a KfsCallbackObj. 
Whenever I/O is done on the socket (either +/// for read or write) or when an error occurs (such as the remote +/// peer closing the connection), t;he associated KfsCallbackObj is +/// called back with an event notification. +/// +class NetConnection +{ +public: + typedef boost::shared_ptr NetConnectionPtr; + + /// @param[in] sock TcpSocket on which I/O can be done + /// @param[in] c KfsCallbackObj associated with this connection + /// @param[in] listenOnly boolean that specifies whether this + /// connection is setup only for accepting new connections. + NetConnection(TcpSocket* sock, KfsCallbackObj* c, + bool listenOnly = false, bool ownsSocket = true) + : mNetManagerEntry(), + mListenOnly(listenOnly), + mOwnsSocket(ownsSocket), + mTryWrite(false), + mCallbackObj(c), + mSock(sock), + mInBuffer(), + mOutBuffer(), + mInactivityTimeoutSecs(-1), + maxReadAhead(-1), + mPeerName() { + assert(mSock); + } + + ~NetConnection() { + NetConnection::Close(); + } + + void SetOwningKfsCallbackObj(KfsCallbackObj* c) { + mCallbackObj = c; + } + + void EnableReadIfOverloaded() { + mNetManagerEntry.EnableReadIfOverloaded(); + Update(false); + } + + void SetDoingNonblockingConnect() { + mNetManagerEntry.SetConnectPending(true); + mTryWrite = false; + Update(false); + } + + /// If there is no activity on this socket for nsecs, then notify + /// the owning object; maybe time to close the connection + /// Setting new timeout resets the timer. + void SetInactivityTimeout(int nsecs) { + if (mInactivityTimeoutSecs != nsecs) { + mInactivityTimeoutSecs = nsecs; + Update(); + } + } + + int GetInactivityTimeout() const { + return mInactivityTimeoutSecs; + } + + /// Callback for handling a read. That is, select() thinks that + /// data is available for reading. So, do something. If system is + /// overloaded and we don't have a special pass, leave the data in + /// the buffer alone. + void HandleReadEvent(int maxAcceptsPerRead = 1); + + /// Callback for handling a writing. 
That is, select() thinks that + /// data can be sent out. So, do something. + void HandleWriteEvent(); + + /// Callback for handling errors. That is, select() thinks that + /// an error occurred. So, do something. + void HandleErrorEvent(); + + /// Timeout call back. + void HandleTimeoutEvent(); + + /// Do we expect data to be read in? + bool IsReadReady() const { + return (maxReadAhead != 0); + }; + + /// Is data available for reading? + bool HasPendingRead() const { + return (! mInBuffer.IsEmpty()); + } + + /// Is data available for writing? + bool IsWriteReady() const { + return (! mOutBuffer.IsEmpty()); + } + + /// # of bytes available for writing(false), + int GetNumBytesToWrite() const { + return mOutBuffer.BytesConsumable(); + } + + /// Is the connection still good? + bool IsGood() const { + return (mSock && mSock->IsGood()); + } + + string GetPeerName() const { + if (IsGood()) { + if (mPeerName.empty()) { + // mutable + const_cast(this)->mPeerName = + mSock->GetPeerName(); + } + return mPeerName; + } else { + return (mPeerName.empty() ? string("not connected") : + ("was connected to " + mPeerName)); + } + } + + string GetSockName() const { + return (IsGood() ? mSock->GetSockName() : string("not connected")); + } + + /// Enqueue data to be sent out. + void Write(const IOBufferData &ioBufData, bool resetTimerFlag = true) { + if (! ioBufData.IsEmpty()) { + const bool resetTimer = resetTimerFlag && mOutBuffer.IsEmpty(); + mOutBuffer.Append(ioBufData); + Update(resetTimer); + } + } + + /// Enqueue data to be sent out. + void Write(IOBuffer* ioBuf) { + const int numBytes = ioBuf ? ioBuf->BytesConsumable() : 0; + if (numBytes > 0) { + const bool resetTimer = mOutBuffer.IsEmpty(); + mOutBuffer.Move(ioBuf); + Update(resetTimer); + } + } + + /// Enqueue data to be sent out. 
+ void Write(IOBuffer* ioBuf, int numBytes, bool resetTimerFlag = true) { + const bool resetTimer = resetTimerFlag &&mOutBuffer.IsEmpty(); + if (ioBuf && numBytes > 0 && mOutBuffer.Move(ioBuf, numBytes) > 0) { + Update(resetTimer); + } + } + + /// Enqueue data to be sent out. + void WriteCopy(const IOBuffer* ioBuf, int numBytes, + bool resetTimerFlag = true) { + const bool resetTimer = resetTimerFlag && mOutBuffer.IsEmpty(); + if (ioBuf && numBytes > 0 && mOutBuffer.Copy(ioBuf, numBytes) > 0) { + Update(resetTimer); + } + } + + /// Enqueue data to be sent out. + void Write(const char *data, int numBytes, bool resetTimerFlag = true) { + const bool resetTimer = resetTimerFlag && mOutBuffer.IsEmpty(); + if (mOutBuffer.CopyIn(data, numBytes) > 0) { + Update(resetTimer); + } + } + + bool CanStartFlush() const { + return (mTryWrite && IsWriteReady() && IsGood()); + } + + /// If there is any data to be sent out, start the send. + void StartFlush() { + if (CanStartFlush()) { + HandleWriteEvent(); + } + } + + int GetSocketError() const { + return (mSock ? mSock->GetSocketError() : 0); + } + + /// Close the connection. + void Close(bool clearOutBufferFlag = true) { + if (! mSock) { + return; + } + // To avoid race with file descriptor number re-use by the OS, + // remove the socket from poll set first, then close the socket. + TcpSocket* const sock = mOwnsSocket ? mSock : 0; + mSock = 0; + // Clear data that can not be sent, but keep input data if any. + if (clearOutBufferFlag) { + mOutBuffer.Clear(); + } + Update(); + if (sock) { + sock->Close(); + delete sock; + } + } + + int GetNumBytesToRead() const { + return mInBuffer.BytesConsumable(); + } + + /// Set max read ahead. + /// @param[in] read ahead amount, < 0 -- unlimited. + void SetMaxReadAhead(int readAhead) { + const bool update = (maxReadAhead != 0) != (readAhead != 0); + maxReadAhead = readAhead; + if (update) { + Update(false); + } + } + + void DiscardRead() { + const bool resetTimer = ! 
mInBuffer.IsEmpty(); + mInBuffer.Clear(); + Update(resetTimer); + } + + void DiscardWrite() { + mOutBuffer.Clear(); + Update(); + } + + IOBuffer& GetInBuffer() { + return mInBuffer; + } + + // StartFlush() or Flush() must called, to initial buffer send, if something + // gets written into the buffer externally, rather than using Write() + // methods the above. + IOBuffer& GetOutBuffer() { + return mOutBuffer; + } + + void Flush(bool resetTimerFlag = true) { + if (CanStartFlush()) { + Update(resetTimerFlag); + } + } + + class NetManagerEntry + { + public: + typedef list > List; + + NetManagerEntry() + : mIn(false), + mOut(false), + mAdded(false), + mEnableReadIfOverloaded(false), + mConnectPending(false), + mFd(-1), + mWriteByteCount(0), + mTimerWheelSlot(-1), + mExpirationTime(-1), + mNetManager(0), + mListIt() + {} + void EnableReadIfOverloaded() { mEnableReadIfOverloaded = true; } + void SetConnectPending(bool flag) { mConnectPending = flag; } + bool IsConnectPending() const { return mConnectPending; } + + private: + bool mIn:1; + bool mOut:1; + bool mAdded:1; + /// should we add this connection to the poll vector for reads + /// even when the system is overloaded? + bool mEnableReadIfOverloaded:1; + bool mConnectPending:1; + int mFd; + int mWriteByteCount; + int mTimerWheelSlot; + time_t mExpirationTime; + NetManager* mNetManager; + List::iterator mListIt; + + friend class NetManager; + + private: + NetManagerEntry(const NetManagerEntry&); + NetManagerEntry operator=(const NetManagerEntry&); + }; + NetManagerEntry* GetNetManagerEntry() { + return &mNetManagerEntry; + } + const NetManagerEntry* GetNetManagerEntry() const { + return &mNetManagerEntry; + } + void Update(bool resetTimer = true); + +private: + NetManagerEntry mNetManagerEntry; + const bool mListenOnly:1; + const bool mOwnsSocket:1; + bool mTryWrite:1; + /// KfsCallbackObj that will be notified whenever "events" occur. + KfsCallbackObj* mCallbackObj; + /// Socket on which I/O will be done. 
+ TcpSocket* mSock; + /// Buffer that contains data read from the socket + IOBuffer mInBuffer; + /// Buffer that contains data that should be sent out on the socket. + IOBuffer mOutBuffer; + /// When was the last activity on this connection + /// # of bytes from the out buffer that should be sent out. + int mInactivityTimeoutSecs; + int maxReadAhead; + string mPeerName; + +private: + // No copies. + NetConnection(const NetConnection&); + NetConnection& operator=(const NetConnection&); +}; + +typedef NetConnection::NetConnectionPtr NetConnectionPtr; + + +} +#endif // LIBIO_NETCONNECTION_H diff --git a/src/cc/kfsio/NetErrorSimulator.cc b/src/cc/kfsio/NetErrorSimulator.cc new file mode 100644 index 000000000..199ee1335 --- /dev/null +++ b/src/cc/kfsio/NetErrorSimulator.cc @@ -0,0 +1,485 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/10/03 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Network error simulator implementation. 
+// +//---------------------------------------------------------------------------- + +#include "NetErrorSimulator.h" + +#ifdef KFS_DONT_USE_BOOST_REGEX_LIB +// Paper over missing boost regex lib + +#include + +namespace KFS +{ +bool +NetErrorSimulatorConfigure( + NetManager& /* inNetManager */, + const char* inConfigPtr) +{ + if (inConfigPtr && *inConfigPtr) { + std::cerr << + "NetErrorSimulatorConfigure is not supported" << + std::endl; + } + return true; +} + +} + +#else + +#include "common/MsgLogger.h" +#include "common/StdAllocator.h" +#include "common/Properties.h" +#include "qcdio/QCFdPoll.h" +#include "NetManager.h" +#include "Globals.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace KFS +{ + +class NetErrorSimulator : public NetManager::PollEventHook +{ +public: + NetErrorSimulator( + NetManager& inNetManager) + : NetManager::PollEventHook(), + mSpecs(), + mConnMap(), + mRandom(/* seed */), + mRandMax(mRandom.max()), + mNetManager(inNetManager) + {} + virtual ~NetErrorSimulator() + { + PollEventHook* const theHookPtr = mNetManager.SetPollEventHook(); + if (theHookPtr && theHookPtr != this) { + mNetManager.SetPollEventHook(theHookPtr); + } + } + static bool Set( + NetManager& inNetManager, + std::string inSpecs) + { + NetErrorSimulator* theSimPtr = inSpecs.empty() ? + 0 : new NetErrorSimulator(inNetManager); + if (theSimPtr && ! theSimPtr->SetParameters(inSpecs)) { + delete theSimPtr; + theSimPtr = 0; + } + PollEventHook* const thePrevHookPtr = + inNetManager.SetPollEventHook(theSimPtr); + if (! 
thePrevHookPtr) { + return true; + } + NetErrorSimulator* const thePrevSimPtr = + dynamic_cast(thePrevHookPtr); + if (thePrevSimPtr) { + delete thePrevSimPtr; + return true; + } + inNetManager.SetPollEventHook(thePrevHookPtr); + delete theSimPtr; + return false; + } + bool SetParameters( + std::string inSpecs) + { + mSpecs.clear(); + mConnMap.clear(); + // sn=^[^:]*:30000$,pn=^[^:]*:20000$,a=rn+rd+wd+err+rst+rand+log,int=1; + for (size_t theNextPos = 0; ;) { + const size_t theEndPos = inSpecs.find(';', theNextPos); + std::string theSpec = inSpecs.substr( + theNextPos, + theEndPos == std::string::npos ? + theEndPos : theEndPos - theNextPos + ); + for (size_t thePos = 0; + (thePos = theSpec.find(',', thePos)) != + std::string::npos; ) { + theSpec[thePos] = '\n'; + } + std::istringstream theInStream(theSpec); + Properties theProps; + theProps.loadProperties(theInStream, '=', false); + int theActionFlags = 0; + std::string theActions = theProps.getValue("a", ""); + if (theActions.find("rn") != std::string::npos) { + theActionFlags |= SimSpec::kActionDisableRead; + } + if (theActions.find("rd") != std::string::npos) { + theActionFlags |= SimSpec::kActionDiscardRead; + } + if (theActions.find("wd") != std::string::npos) { + theActionFlags |= SimSpec::kActionDiscardWrite; + } + if (theActions.find("err") != std::string::npos) { + theActionFlags |= SimSpec::kActionSetError; + } + if (theActions.find("rst") != std::string::npos) { + theActionFlags |= SimSpec::kActionClose; + } + if (theActions.find("rand") != std::string::npos) { + theActionFlags |= SimSpec::kActionRandomInterval; + } + if (theActions.find("log") != std::string::npos) { + theActionFlags |= SimSpec::kActionLog; + } + if (theActions.find("exit") != std::string::npos) { + theActionFlags |= SimSpec::kActionExit; + } + if (theActions.find("abort") != std::string::npos) { + theActionFlags |= SimSpec::kActionAbort; + } + if (theActions.find("erd") != std::string::npos) { + theActionFlags |= 
SimSpec::kActionSetErrorOnRead; + } + if (theActions.find("ewr") != std::string::npos) { + theActionFlags |= SimSpec::kActionSetErrorOnWrite; + } + double theSleepSec = theProps.getValue("sleep", (double)-1); + if (theSleepSec > 0) { + theActionFlags |= SimSpec::kActionSleep; + } else { + if ((theSleepSec = + theProps.getValue("rsleep", (double)-1)) > 0) { + theActionFlags |= SimSpec::kActionRandomSleep; + } + } + if (theActionFlags != 0) { + mSpecs.push_back(SimSpec( + theProps.getValue("sn", ""), + theProps.getValue("pn", ""), + theActionFlags, + (uint32_t)theProps.getValue("int", (uint64_t)0), + (float)theSleepSec + )); + } + if (theEndPos == std::string::npos) { + break; + } + theNextPos = theEndPos + 1; + } + return (! mSpecs.empty()); + } + virtual void Remove( + NetManager& inMgr, + NetConnection& inConn) + { mConnMap.erase(&inConn); } + virtual void Event( + NetManager& inMgr, + NetConnection& inConn, + int& ioPollEvent) + { + if (! inConn.IsGood()) { + return; + } + const NetConnection* const theConnPtr = &inConn; + std::pair + theRange = mConnMap.equal_range(theConnPtr); + if (theRange.first == theRange.second) { + bool theInsertedFlag = false; + const std::string theSockName = inConn.GetSockName(); + const std::string thePeerName = inConn.GetPeerName(); + for (SimSpecs::const_iterator theIt = mSpecs.begin(); + theIt != mSpecs.end(); + ++theIt) { + const SimSpec& theSpec = *theIt; + if ((theSpec.mSockNameRegex.empty() || regex_match( + theSockName, + theSpec.mSockNameRegex)) && + (theSpec.mPeerNameRegex.empty() || regex_match( + thePeerName, + theSpec.mPeerNameRegex))) { + mConnMap.insert(std::make_pair(theConnPtr, ConnEntry( + theIt, + theSockName + "/" + thePeerName, + GetCount(*theIt) + ))); + theInsertedFlag = true; + } + } + if (! 
theInsertedFlag) { + mConnMap.insert(std::make_pair(theConnPtr, mSpecs.end())); + } + theRange = mConnMap.equal_range(theConnPtr); + } + for ( ; theRange.first != theRange.second; ++theRange.first) { + if (theRange.first->second.mSpecIt == mSpecs.end()) { + continue; + } + ConnEntry& theEntry = theRange.first->second; + if (theEntry.mCount > 0) { + theEntry.mCount--; + continue; + } + const SimSpec& theSpec = *theEntry.mSpecIt; + if (theSpec.mActionFlags == SimSpec::kActionNone) { + continue; + } + theEntry.mCount = GetCount(theSpec); + const int theOrigPollEvent = ioPollEvent; + std::string theActions; + if ((theSpec.mActionFlags & SimSpec::kActionDisableRead) != 0) { + ioPollEvent &= ~int( + QCFdPoll::kOpTypeIn | + QCFdPoll::kOpTypePri | + QCFdPoll::kOpTypeHup + ); + inConn.SetMaxReadAhead(0); + ListAdd(theActions, "rn"); + } + int theRdDiscarded = 0; + if ((theSpec.mActionFlags & SimSpec::kActionDiscardRead) != 0) { + theRdDiscarded = inConn.GetNumBytesToRead(); + inConn.DiscardRead(); + ListAdd(theActions, "rd"); + } + int theWrDiscarded = 0; + if ((theSpec.mActionFlags & + SimSpec::kActionDiscardWrite) != 0) { + ioPollEvent &= ~int(QCFdPoll::kOpTypeOut); + theWrDiscarded = inConn.GetNumBytesToWrite(); + inConn.DiscardWrite(); + ListAdd(theActions, "wd"); + } + if ((theSpec.mActionFlags & SimSpec::kActionSetError) != 0) { + ioPollEvent = int(QCFdPoll::kOpTypeError); + ListAdd(theActions, "err"); + } + if ((theSpec.mActionFlags & SimSpec::kActionSetErrorOnRead) != 0 && + (ioPollEvent & QCFdPoll::kOpTypeIn) != 0) { + ioPollEvent = int(QCFdPoll::kOpTypeError); + ListAdd(theActions, "erd"); + } + if ((theSpec.mActionFlags & SimSpec::kActionSetErrorOnWrite) != 0 && + (ioPollEvent & QCFdPoll::kOpTypeOut) != 0) { + ioPollEvent = int(QCFdPoll::kOpTypeError); + ListAdd(theActions, "ewr"); + } + if ((theSpec.mActionFlags & SimSpec::kActionClose) != 0) { + inConn.Close(); + ioPollEvent = 0; + ListAdd(theActions, "rst"); + } + if ((theSpec.mActionFlags & 
SimSpec::kActionLog) != 0) { + ListAdd(theActions, "log"); + KFS_LOG_STREAM_DEBUG << theEntry.mConnId << + " " << theActions << + " poll: " << DisplayPollFlags(theOrigPollEvent) << + " -> " << DisplayPollFlags(ioPollEvent) << + " discarded:" + " rd: " << theRdDiscarded << + " wr: " << theWrDiscarded << + KFS_LOG_EOM; + } + if ((theSpec.mActionFlags & SimSpec::kActionSleep) != 0) { + Sleep(theSpec.mSleepSec); + } + if ((theSpec.mActionFlags & SimSpec::kActionRandomSleep) != 0) { + RandomSleep(theSpec.mSleepSec); + } + if ((theSpec.mActionFlags & SimSpec::kActionAbort) != 0) { + abort(); + } + if ((theSpec.mActionFlags & SimSpec::kActionExit) != 0) { + _exit(1); + } + } + } +private: + typedef boost::regex Regex; + struct SimSpec + { + enum + { + kActionNone = 0, + kActionDisableRead = 1, + kActionDiscardRead = 1 << 1, + kActionDiscardWrite = 1 << 2, + kActionSetError = 1 << 3, + kActionClose = 1 << 4, + kActionRandomInterval = 1 << 5, + kActionLog = 1 << 6, + kActionSleep = 1 << 7, + kActionRandomSleep = 1 << 8, + kActionExit = 1 << 9, + kActionAbort = 1 << 10, + kActionSetErrorOnRead = 1 << 11, + kActionSetErrorOnWrite = 1 << 12 + }; + SimSpec() + : mSockNameRegex(), + mPeerNameRegex(), + mActionFlags(kActionNone), + mInterval(0), + mSleepSec(0) + {} + SimSpec( + const std::string& inSockNameRegexStr, + const std::string& inPeerNameRegexStr, + int inActionFlags, + uint32_t inInterval, + float inSleepSec) + : mSockNameRegex(inSockNameRegexStr, + Regex::perl + Regex::icase + Regex::no_except), + mPeerNameRegex(inPeerNameRegexStr, + Regex::perl + Regex::icase + Regex::no_except), + mActionFlags(inActionFlags), + mInterval(inInterval), + mSleepSec(inSleepSec) + {} + Regex mSockNameRegex; + Regex mPeerNameRegex; + int mActionFlags; + uint32_t mInterval; + float mSleepSec; + }; + typedef std::vector SimSpecs; + typedef boost::mt19937 Random; + struct ConnEntry + { + ConnEntry( + SimSpecs::const_iterator inSpecIt = SimSpecs::const_iterator(), + std::string inConnId = 
std::string(), + Random::result_type inCount = 0) + : mSpecIt(inSpecIt), + mConnId(inConnId), + mCount(inCount) + {} + SimSpecs::const_iterator mSpecIt; + std::string mConnId; + Random::result_type mCount; + }; + typedef std::multimap< + const NetConnection*, + ConnEntry, + std::less, + StdFastAllocator< + std::pair + > + > ConnMap; + + SimSpecs mSpecs; + ConnMap mConnMap; + Random mRandom; + const Random::result_type mRandMax; + NetManager& mNetManager; + + static void ListAdd( + std::string& inList, + const char* inElemPtr, + const char* inDelimPtr = "+") + { + if (! inList.empty()) { + inList += inDelimPtr; + } + inList += inElemPtr; + } + static std::string DisplayPollFlags( + int inFlags) + { + std::string theRet; + if ((inFlags & QCFdPoll::kOpTypeIn) != 0) { + ListAdd(theRet, "in"); + } + if ((inFlags & QCFdPoll::kOpTypeOut) != 0) { + ListAdd(theRet, "out"); + } + if ((inFlags & QCFdPoll::kOpTypePri) != 0) { + ListAdd(theRet, "pri"); + } + if ((inFlags & QCFdPoll::kOpTypeHup) != 0) { + ListAdd(theRet, "hup"); + } + if ((inFlags & QCFdPoll::kOpTypeError) != 0) { + ListAdd(theRet, "err"); + } + return theRet; + } + Random::result_type GetCount( + const SimSpec& inSpec) + { + // Don't use modulo, low order bits might be "less random". + // Though this shouldn't be a problem with Mersenne twister. + const uint64_t theInterval = inSpec.mInterval; + return Random::result_type( + ((inSpec.mActionFlags == + SimSpec::kActionRandomInterval) != 0 && + theInterval > 0) ? 
+ (uint64_t)mRandom() * theInterval / mRandMax : + theInterval + ); + } + void Sleep( + float inSec) + { + if (inSec <= 0) { + return; + } + struct timespec theTs; + theTs.tv_sec = time_t(inSec); + long kMaxNsec = 999999999; + theTs.tv_nsec = std::min(kMaxNsec, long((inSec - theTs.tv_sec) * 1e9)); + while ( + (theTs.tv_sec > 0 || theTs.tv_nsec > 0) && + nanosleep(&theTs, &theTs) != 0 && + errno == EINTR) + {} + } + void RandomSleep( + float inSec) + { Sleep(mRandom() * inSec / mRandMax); } + +private: + NetErrorSimulator( + const NetErrorSimulator&); + NetErrorSimulator& operator=( + const NetErrorSimulator&); +}; + +bool +NetErrorSimulatorConfigure( + NetManager& inNetManager, + const char* inConfigPtr) +{ + return NetErrorSimulator::Set(inNetManager, inConfigPtr ? inConfigPtr : ""); +} + +} /* namespace KFS */ +#endif diff --git a/src/cc/kfsio/NetErrorSimulator.h b/src/cc/kfsio/NetErrorSimulator.h new file mode 100644 index 000000000..d96cd1e35 --- /dev/null +++ b/src/cc/kfsio/NetErrorSimulator.h @@ -0,0 +1,59 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/10/03 +// Author: Mike Ovsiannikov +// +// Copyright 2009 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef NET_ERROR_SIMULATOR_H +#define NET_ERROR_SIMULATOR_H + +namespace KFS +{ +// Network error simulator. Used for testing and debugging. It is instrumental in +// discovering protocol and implementation bugs, and in particular "timing holes". +// Use in "endurance test". +// +// Error simulation spec syntax: +// sn=^[^:]*:30000$,pn=^[^:]*:20000$,a=rn+rd+wd+err+rst+rand+log,int=1,sleep|rsleep=2;;... +// sn -- perl regex to match : socket name returned by getsockname() +// pn -- perl regex to match : peer name returned by getpeername() +// a -- action: +// rn -- read none, disable read +// rd -- read discard, discard input buffer data, doesn't disable read +// wd -- write discard, disable write, discard output buffer data +// err -- set poll error flag, reset all other flags +// erd -- when read flag set, set poll error flag, reset all other flags +// ewr -- when write flag set, set poll error flag, reset all other flags +// rst -- close connection, call NetConnection::Close() +// rand -- use random inteval from 0 to "int=" +// exit -- call _exit(1) +// abort -- call abort() +// log -- emit log message when action performed +// int=x -- action interval +// sleep=x -- sleep for x seconds +// rsleep=x -- sleep for random # of seconds in the range [0:x] +// +class NetManager; +bool NetErrorSimulatorConfigure(NetManager& inNetManager, const char* inConfigPtr = 0); +} + +#endif /* NET_ERROR_SIMULATOR_H */ diff --git a/src/cc/kfsio/NetManager.cc b/src/cc/kfsio/NetManager.cc new file mode 100644 index 000000000..665d460f0 --- /dev/null +++ b/src/cc/kfsio/NetManager.cc @@ -0,0 +1,710 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// Mike Ovsiannikov -- re-implement by separating poll/select/epoll +// os specific logic, implement timer wheel to get rid of linear +// connection list scans 
on every event, add Timer class.
+//
+// Copyright 2008-2012 Quantcast Corp.
+// Copyright 2006-2008 Kosmix Corp.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+//
+// Generic network io event loop implementation.
+//
+//----------------------------------------------------------------------------
+
+// NOTE(review): the system include targets were lost in extraction;
+// reconstructed below from the symbols this file uses (errno/EINTR,
+// fcntl/O_NONBLOCK, pipe/read/write/close, time, std::min/max,
+// std::numeric_limits) -- confirm against the upstream file.
+#include <algorithm>
+#include <cerrno>
+#include <ctime>
+#include <fcntl.h>
+#include <limits>
+#include <unistd.h>
+
+#include "NetManager.h"
+#include "TcpSocket.h"
+#include "ITimeout.h"
+
+#include "common/MsgLogger.h"
+#include "qcdio/QCFdPoll.h"
+#include "qcdio/QCUtils.h"
+#include "qcdio/QCMutex.h"
+#include "qcdio/qcstutils.h"
+
+namespace KFS
+{
+using std::min;
+using std::max;
+using std::numeric_limits;
+
+class NetManager::Waker
+{
+public:
+    Waker()
+        : mMutex(),
+          mWritten(0),
+          mSleepingFlag(false),
+          mWakeFlag(false)
+    {
+        const int res = pipe(mPipeFds);
+        if (res < 0) {
+            perror("pipe");
+            mPipeFds[0] = -1;
+            mPipeFds[1] = -1;
+            abort();
+            return;
+        }
+        fcntl(mPipeFds[0], F_SETFL, O_NONBLOCK);
+        fcntl(mPipeFds[1], F_SETFL, O_NONBLOCK);
+        fcntl(mPipeFds[0], FD_CLOEXEC, 1);
+        fcntl(mPipeFds[1], FD_CLOEXEC, 1);
+    }
+    ~Waker() { Waker::Close(); }
+    bool Sleep()
+    {
+        QCStMutexLocker lock(mMutex);
+        mSleepingFlag = !
mWakeFlag; + mWakeFlag = false; + return mSleepingFlag; + } + int Wake() + { + QCStMutexLocker lock(mMutex); + mSleepingFlag = false; + while (mWritten > 0) { + char buf[64]; + const int res = read(mPipeFds[0], buf, sizeof(buf)); + if (res > 0) { + mWritten -= min(mWritten, res); + } else { + break; + } + } + return (mWritten); + } + void Wakeup() + { + QCStMutexLocker lock(mMutex); + mWakeFlag = true; + if (mSleepingFlag && mWritten <= 0) { + mWritten++; + const char buf = 'k'; + write(mPipeFds[1], &buf, sizeof(buf)); + } + } + int GetFd() const { return mPipeFds[0]; } + void Close() + { + for (int i = 0; i < 2; i++) { + if (mPipeFds[i] >= 0) { + close(mPipeFds[i]); + mPipeFds[i] = -1; + } + } + } +private: + QCMutex mMutex; + int mWritten; + int mPipeFds[2]; + bool mSleepingFlag; + bool mWakeFlag; + +private: + Waker(const Waker&); + Waker& operator=(const Waker&); +}; + +NetManager::NetManager(int timeoutMs) + : mRemove(), + mTimerWheelBucketItr(mRemove.end()), + mCurConnection(0), + mCurTimerWheelSlot(0), + mConnectionsCount(0), + mDiskOverloaded(false), + mNetworkOverloaded(false), + mIsOverloaded(false), + mRunFlag(true), + mShutdownFlag(false), + mTimerRunningFlag(false), + mTimeoutMs(timeoutMs), + mStartTime(time(0)), + mNow(mStartTime), + mMaxOutgoingBacklog(0), + mNumBytesToSend(0), + mTimerOverrunCount(0), + mTimerOverrunSec(0), + mMaxAcceptsPerRead(1), + mPoll(*(new QCFdPoll())), + mWaker(*(new Waker())), + mPollEventHook(0), + mCurTimeoutHandler(0), + mEpollError() +{ + TimeoutHandlers::Init(mTimeoutHandlers); +} + +NetManager::~NetManager() +{ + NetManager::CleanUp(); + delete &mPoll; + delete &mWaker; +} + +void +NetManager::AddConnection(const NetConnectionPtr& conn) +{ + if (mShutdownFlag) { + return; + } + NetConnection::NetManagerEntry* const entry = + conn->GetNetManagerEntry(); + if (! 
entry) { + return; + } + if (entry->mNetManager && entry->mNetManager != this) { + KFS_LOG_STREAM_FATAL << + "attempt to add connection to different net manager" << + KFS_LOG_EOM; + MsgLogger::Stop(); + abort(); + } + if (! entry->mAdded) { + entry->mTimerWheelSlot = kTimerWheelSize; + entry->mListIt = mTimerWheel[kTimerWheelSize].insert( + mTimerWheel[kTimerWheelSize].end(), conn); + mConnectionsCount++; + assert(mConnectionsCount > 0); + entry->mAdded = true; + entry->mNetManager = this; + if (mPollEventHook) { + mPollEventHook->Add(*this, *conn); + } + } + conn->Update(); +} + +void +NetManager::RegisterTimeoutHandler(ITimeout* handler) +{ + if (handler) { + TimeoutHandlers::PushBack(mTimeoutHandlers, *handler); + } +} + +void +NetManager::UnRegisterTimeoutHandler(ITimeout* handler) +{ + if (! handler) { + return; + } + if (mCurTimeoutHandler == handler) { + mCurTimeoutHandler = &ITimeout::List::GetNext(*handler); + if (mCurTimeoutHandler == TimeoutHandlers::Front(mTimeoutHandlers)) { + mCurTimeoutHandler = 0; + } + } + TimeoutHandlers::Remove(mTimeoutHandlers, *handler); +} + +inline void +NetManager::UpdateTimer(NetConnection::NetManagerEntry& entry, int timeOut) +{ + assert(entry.mAdded); + + if (mShutdownFlag) { + return; + } + int timerWheelSlot; + if (timeOut < 0) { + timerWheelSlot = kTimerWheelSize; + } else if ((timerWheelSlot = mCurTimerWheelSlot + + // When the timer is running the effective wheel size "grows" by 1: + // leave (move) entries with timeouts >= kTimerWheelSize in (to) the + // current slot. + min((kTimerWheelSize - (mTimerRunningFlag ? 0 : 1)), timeOut)) >= + kTimerWheelSize) { + timerWheelSlot -= kTimerWheelSize; + } + // This method can be invoked from timeout handler. + // Make sure that the entry doesn't get moved to the end of the current + // list, which can be traversed by the timer. 
+ if (timerWheelSlot != entry.mTimerWheelSlot) { + if (mTimerWheelBucketItr == entry.mListIt) { + ++mTimerWheelBucketItr; + } + mTimerWheel[timerWheelSlot].splice( + mTimerWheel[timerWheelSlot].end(), + mTimerWheel[entry.mTimerWheelSlot], entry.mListIt); + entry.mTimerWheelSlot = timerWheelSlot; + } +} + +void +NetManager::Update(NetConnection::NetManagerEntry& entry, int fd, + bool resetTimer) +{ + if (entry.mNetManager) { + entry.mNetManager->UpdateSelf(entry, fd, resetTimer, false); + } +} + +inline static int +CheckFatalPollSysError(int err, const char* msg) +{ + if (! err) { + return err; + } + if ((err & QCFdPoll::kEpollFailureAfterFork) == 0) { + KFS_LOG_STREAM_FATAL << QCUtils::SysError(err, msg) << KFS_LOG_EOM; + MsgLogger::Stop(); + abort(); + return err; + } + const int ret = err & ~QCFdPoll::kEpollFailureAfterFork; + KFS_LOG_STREAM_ERROR << "epoll error: " << QCUtils::SysError(ret, msg) << + KFS_LOG_EOM; + return ret; +} + +void +NetManager::PollRemove(int fd) +{ + CheckFatalPollSysError( + fd < 0 ? EINVAL : mPoll.Remove(fd), + "failed to remove fd from poll set" + ); +} + +void +NetManager::UpdateSelf(NetConnection::NetManagerEntry& entry, int fd, + bool resetTimer, bool epollError) +{ + if (! entry.mAdded) { + return; + } + assert(*entry.mListIt); + NetConnection& conn = **entry.mListIt; + assert(fd >= 0 || ! conn.IsGood()); + // Always check if connection has to be removed: this method always + // called before socket fd gets closed. + if (! 
conn.IsGood() || fd < 0 || epollError) { + if (entry.mFd >= 0) { + PollRemove(entry.mFd); + entry.mFd = -1; + } + if (mTimerWheelBucketItr == entry.mListIt) { + ++mTimerWheelBucketItr; + } + if (epollError) { + assert(conn.IsGood()); + mEpollError.splice(mEpollError.end(), + mTimerWheel[entry.mTimerWheelSlot], entry.mListIt); + return; + } + assert(mConnectionsCount > 0 && + entry.mWriteByteCount >= 0 && + entry.mWriteByteCount <= mNumBytesToSend); + entry.mAdded = false; + mConnectionsCount--; + mNumBytesToSend -= entry.mWriteByteCount; + mRemove.splice(mRemove.end(), + mTimerWheel[entry.mTimerWheelSlot], entry.mListIt); + // Do not reset entry->mNetManager, it is an error to add connection to + // a different net manager even after close. + if (mPollEventHook) { + mPollEventHook->Remove(*this, **entry.mListIt); + } + return; + } + if (&conn == mCurConnection) { + // Defer all updates for the currently dispatched connection until the + // end of the event dispatch loop. + return; + } + // Update timer. + if (resetTimer) { + const int timeOut = conn.GetInactivityTimeout(); + if (timeOut >= 0) { + entry.mExpirationTime = mNow + timeOut; + } + UpdateTimer(entry, timeOut); + } + // Update pending send. + assert(entry.mWriteByteCount >= 0 && + entry.mWriteByteCount <= mNumBytesToSend); + mNumBytesToSend -= entry.mWriteByteCount; + entry.mWriteByteCount = max(0, conn.GetNumBytesToWrite()); + mNumBytesToSend += entry.mWriteByteCount; + // Update poll set. + const bool in = conn.IsReadReady() && + (! mIsOverloaded || entry.mEnableReadIfOverloaded); + const bool out = conn.IsWriteReady() || entry.mConnectPending; + if (in != entry.mIn || out != entry.mOut) { + assert(fd >= 0); + const int op = + (in ? QCFdPoll::kOpTypeIn : 0) + (out ? 
QCFdPoll::kOpTypeOut : 0); + if ((fd != entry.mFd || op == 0) && entry.mFd >= 0) { + PollRemove(entry.mFd); + entry.mFd = -1; + } + if (entry.mFd < 0) { + if (op) { + if (CheckFatalPollSysError( + mPoll.Add(fd, op, &conn), + "failed to add fd to poll set") == 0) { + entry.mFd = fd; + } else { + UpdateSelf(entry, fd, false, true); + return; // Tail recursion + } + } + } else { + if (CheckFatalPollSysError( + mPoll.Set(fd, op, &conn), + "failed to change poll flags" + ) != 0) { + UpdateSelf(entry, fd, false, true); + return; // Tail recursion + } + } + entry.mIn = in && entry.mFd >= 0; + entry.mOut = out && entry.mFd >= 0; + } +} + +void +NetManager::Wakeup() +{ + mWaker.Wakeup(); +} + +void +NetManager::MainLoop(QCMutex* mutex /* = 0 */) +{ + QCStMutexLocker locker(mutex); + + mNow = time(0); + time_t lastTimerTime = mNow; + CheckFatalPollSysError( + mPoll.Add(mWaker.GetFd(), QCFdPoll::kOpTypeIn), + "failed to add net waker's fd to the poll set" + ); + const int timerOverrunWarningTime(mTimeoutMs / (1000/2)); + while (mRunFlag) { + const bool wasOverloaded = mIsOverloaded; + CheckIfOverloaded(); + if (mIsOverloaded != wasOverloaded) { + KFS_LOG_STREAM_INFO << + (mIsOverloaded ? + "System is now in overloaded state" : + "Clearing system overload state") << + " " << mNumBytesToSend << " bytes to send" << + KFS_LOG_EOM; + // Turn on read only if returning from overloaded state. + // Turn off read in the event processing loop if overloaded, and + // read event is pending. + // The "lazy" processing here is to reduce number of system calls. + if (! mIsOverloaded) { + for (int i = 0; i <= kTimerWheelSize; i++) { + for (List::iterator c = mTimerWheel[i].begin(); + c != mTimerWheel[i].end(); ) { + assert(*c); + NetConnection& conn = **c; + ++c; + conn.Update(false); + } + } + } + } + { + const int timeout = mWaker.Sleep() ? 
mTimeoutMs : 0;
+            QCStMutexUnlocker unlocker(mutex);
+            const int ret = mPoll.Poll(mConnectionsCount + 1, timeout);
+            if (ret < 0 && ret != -EINTR && ret != -EAGAIN) {
+                KFS_LOG_STREAM_ERROR <<
+                    QCUtils::SysError(-ret, "poll error") <<
+                KFS_LOG_EOM;
+            }
+        }
+        mWaker.Wake();
+        const int64_t nowMs = ITimeout::NowMs();
+        mNow = time_t(nowMs / 1000);
+        mCurTimeoutHandler = TimeoutHandlers::Front(mTimeoutHandlers);
+        while (mCurTimeoutHandler) {
+            ITimeout& cur = *mCurTimeoutHandler;
+            mCurTimeoutHandler = &ITimeout::List::GetNext(cur);
+            if (mCurTimeoutHandler == TimeoutHandlers::Front(mTimeoutHandlers)) {
+                mCurTimeoutHandler = 0;
+            }
+            cur.TimerExpired(nowMs);
+        }
+        /// Process poll events.
+        int   op;
+        void* ptr;
+        while (mPoll.Next(op, ptr)) {
+            if (op == 0 || ! ptr) {
+                continue;
+            }
+            NetConnection& conn = *reinterpret_cast<NetConnection*>(ptr);
+            if (! conn.GetNetManagerEntry()->mAdded) {
+                // Skip stale event, the connection should be in mRemove list.
+                continue;
+            }
+            // Defer update for this connection.
+            mCurConnection = &conn;
+            if (mPollEventHook) {
+                mPollEventHook->Event(*this, conn, op);
+            }
+            const bool hupError = op == QCFdPoll::kOpTypeHup &&
+                ! conn.IsReadReady() && ! conn.IsWriteReady();
+            if ((op & (QCFdPoll::kOpTypeIn | QCFdPoll::kOpTypeHup)) != 0 &&
+                    conn.IsGood() && (! mIsOverloaded ||
+                    conn.GetNetManagerEntry()->mEnableReadIfOverloaded)) {
+                conn.HandleReadEvent(mMaxAcceptsPerRead);
+            }
+            if ((op & (QCFdPoll::kOpTypeOut | QCFdPoll::kOpTypeHup)) != 0 &&
+                    conn.IsGood()) {
+                conn.HandleWriteEvent();
+            }
+            if (((op & QCFdPoll::kOpTypeError) != 0 || hupError) &&
+                    conn.IsGood()) {
+                conn.HandleErrorEvent();
+            }
+            // Try to write, if the last write was successful.
+            conn.StartFlush();
+            // Update the connection.
+            mCurConnection = 0;
+            conn.Update();
+        }
+        while (!
mEpollError.empty()) { + assert(mEpollError.front()); + NetConnection& conn = *mEpollError.front(); + assert(conn.IsGood()); + conn.HandleErrorEvent(); + } + mRemove.clear(); + mNow = time(0); + int slotCnt = min(int(kTimerWheelSize), int(mNow - lastTimerTime)); + if (lastTimerTime + timerOverrunWarningTime < mNow) { + KFS_LOG_STREAM_INFO << + "timer overrun " << (mNow - lastTimerTime) << + " seconds detected" << + KFS_LOG_EOM; + mTimerOverrunCount++; + mTimerOverrunSec += mNow - lastTimerTime; + } + mTimerRunningFlag = true; + while (slotCnt-- > 0) { + List& bucket = mTimerWheel[mCurTimerWheelSlot]; + mTimerWheelBucketItr = bucket.begin(); + while (mTimerWheelBucketItr != bucket.end()) { + assert(*mTimerWheelBucketItr); + NetConnection& conn = **mTimerWheelBucketItr; + assert(conn.IsGood()); + ++mTimerWheelBucketItr; + NetConnection::NetManagerEntry& entry = + *conn.GetNetManagerEntry(); + const int timeOut = conn.GetInactivityTimeout(); + if (timeOut < 0) { + // No timeout, move it to the corresponding list. + UpdateTimer(entry, timeOut); + } else if (entry.mExpirationTime <= mNow) { + conn.HandleTimeoutEvent(); + } else { + // Not expired yet, move to the new slot, taking into the + // account possible timer overrun. + UpdateTimer(entry, + slotCnt + int(entry.mExpirationTime - mNow)); + } + } + if (++mCurTimerWheelSlot >= kTimerWheelSize) { + mCurTimerWheelSlot = 0; + } + mRemove.clear(); + } + mTimerRunningFlag = false; + lastTimerTime = mNow; + mTimerWheelBucketItr = mRemove.end(); + } + CheckFatalPollSysError( + mPoll.Remove(mWaker.GetFd()), + "failed to removed net kicker's fd from poll set" + ); + CleanUp(); +} + +void +NetManager::CheckIfOverloaded() +{ + if (mMaxOutgoingBacklog > 0) { + if (! 
mNetworkOverloaded) { + mNetworkOverloaded = mNumBytesToSend > mMaxOutgoingBacklog; + } else if (mNumBytesToSend <= mMaxOutgoingBacklog / 2) { + // network was overloaded and that has now cleared + mNetworkOverloaded = false; + } + } else { + mNetworkOverloaded = false; + } + mIsOverloaded = mDiskOverloaded || mNetworkOverloaded; +} + +void +NetManager::ChangeDiskOverloadState(bool v) +{ + mDiskOverloaded = v; +} + +void +NetManager::CleanUp(bool childAtForkFlag, bool onlyCloseFdFlag) +{ + mShutdownFlag = true; + while (! TimeoutHandlers::IsEmpty(mTimeoutHandlers)) { + TimeoutHandlers::PopFront(mTimeoutHandlers); + } + if (childAtForkFlag) { + mPoll.Close(); + mWaker.Close(); + } + for (int i = 0; i <= kTimerWheelSize; i++) { + for (mTimerWheelBucketItr = mTimerWheel[i].begin(); + mTimerWheelBucketItr != mTimerWheel[i].end(); ) { + NetConnection* const conn = mTimerWheelBucketItr->get(); + ++mTimerWheelBucketItr; + if (conn) { + if (childAtForkFlag) { + conn->GetNetManagerEntry()->mAdded = false; + if (onlyCloseFdFlag) { + const bool kClearOutBufferFlag = false; + conn->Close(kClearOutBufferFlag); + } + } + if (conn->IsGood()) { + conn->HandleErrorEvent(); + } + } + } + assert((childAtForkFlag && onlyCloseFdFlag) || mTimerWheel[i].empty()); + mRemove.clear(); + } + mTimerWheelBucketItr = mRemove.end(); +} + +void +NetManager::ChildAtFork(bool onlyCloseFdFlag) +{ + CleanUp(true, onlyCloseFdFlag); +} + +inline const NetManager* +NetManager::GetNetManager(const NetConnection& conn) +{ + return conn.GetNetManagerEntry()->mNetManager; +} + +inline time_t +NetManager::Timer::Handler::Now() const +{ + return GetNetManager(*mConn)->Now(); +} + +NetManager::Timer::Handler::Handler(NetManager& netManager, KfsCallbackObj& obj, int tmSec) + : KfsCallbackObj(), + mObj(obj), + mStartTime(tmSec >= 0 ? 
netManager.Now() : 0),
+      mSock(numeric_limits<int>::max()), // Fake fd, for IsGood()
+      mConn(new NetConnection(&mSock, this, false, false))
+{
+    SET_HANDLER(this, &Handler::EventHandler);
+    mConn->SetMaxReadAhead(0); // Do not add this to poll.
+    mConn->SetInactivityTimeout(tmSec);
+    netManager.AddConnection(mConn);
+}
+
+void
+NetManager::Timer::Handler::SetTimeout(int tmSec)
+{
+    const int prevTm = mConn->GetInactivityTimeout();
+    mStartTime = Now();
+    if (prevTm != tmSec) {
+        mConn->SetInactivityTimeout(tmSec); // Reset timer.
+    } else {
+        mConn->Update(); // Reset timer.
+    }
+}
+
+time_t
+NetManager::Timer::Handler::GetRemainingTime() const
+{
+    const int tmSec = mConn->GetInactivityTimeout();
+    if (tmSec < 0) {
+        return tmSec;
+    }
+    const time_t next = mStartTime + tmSec;
+    const time_t now  = Now();
+    return (next > now ? next - now : 0);
+}
+
+int
+NetManager::Timer::Handler::EventHandler(int type, void* /* data */)
+{
+    switch (type) {
+        case EVENT_NET_ERROR: // Invoked from net manager cleanup code.
+            Cleanup();
+            // Fall through
+        case EVENT_INACTIVITY_TIMEOUT:
+            mStartTime = Now();
+            return mObj.HandleEvent(EVENT_INACTIVITY_TIMEOUT, 0);
+        default:
+            assert(! "unexpected event type");
+    }
+    return 0;
+}
+
+void
+NetManager::Timer::Handler::Cleanup()
+{
+    mConn->Close();
+    // Reset fd to prevent calling close().
+ mSock = TcpSocket(); +} + +void +NetManager::Timer::Handler::ResetTimeout() +{ + if (mConn->GetInactivityTimeout() >= 0) { + mStartTime = Now(); + mConn->Update(); + } +} + +void +NetManager::Timer::Handler::ScheduleTimeoutNoLaterThanIn(int tmSec) +{ + if (tmSec < 0) { + return; + } + const int curTimeout = mConn->GetInactivityTimeout(); + const time_t now = Now(); + if (curTimeout < 0 || now + tmSec < mStartTime + curTimeout) { + mStartTime = now; + if (curTimeout != tmSec) { + mConn->SetInactivityTimeout(tmSec); + } else { + mConn->Update(); + } + } +} +} diff --git a/src/cc/kfsio/NetManager.h b/src/cc/kfsio/NetManager.h new file mode 100644 index 000000000..de43938cd --- /dev/null +++ b/src/cc/kfsio/NetManager.h @@ -0,0 +1,237 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/14 +// Author: Sriram Rao +// Mike Ovsiannikov -- re-implement by separating poll/select/epoll +// os specific logic, implement timer wheel to get rid of linear +// connection list scans on every event, add Timer class. +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_NETMANAGER_H +#define _LIBIO_NETMANAGER_H + +#include "NetConnection.h" +#include "ITimeout.h" + +class QCFdPoll; +class QCMutex; + +namespace KFS +{ +using std::list; + +/// +/// \file NetManager.h +/// The net manager provides facilities for multiplexing I/O on network +/// connections. It keeps a list of connections on which it has to +/// call select. Whenever an "event" occurs on a connection (viz., +/// read/write/error), it calls back the connection to handle the +/// event. +/// +/// The net manager also provides support for timeout notification, and +/// connection inactivity timeout. +/// In the present implemenation the worst case timeout resolution is +/// mSelectTimeout. +/// +// + +class NetManager +{ +public: + NetManager(int timeoutMs = 1000); + ~NetManager(); + /// Add a connection to the net manager's list of connections that + /// are used for building poll vector. + /// @param[in] conn The connection that should be added. + void AddConnection(const NetConnectionPtr &conn); + void RegisterTimeoutHandler(ITimeout *handler); + void UnRegisterTimeoutHandler(ITimeout *handler); + + void SetBacklogLimit(int64_t v) + { mMaxOutgoingBacklog = v; } + void ChangeDiskOverloadState(bool v); + + /// + /// This function never returns. It builds a poll vector, calls + /// select(), and then evaluates the result of select(): for + /// connections on which data is I/O is possible---either for + /// reading or writing are called back. In the callback, the + /// connections should take appropriate action. + /// + /// NOTE: When a connection is closed (such as, via a call to + /// NetConnection::Close()), then it automatically falls out of + /// the net manager's list of connections that are polled. 
+ /// + void MainLoop(QCMutex* mutex = 0); + void Wakeup(); + + void Shutdown() + { mRunFlag = false; } + time_t GetStartTime() const + { return mStartTime; } + time_t Now() const + { return mNow; } + time_t UpTime() const + { return (mNow - mStartTime); } + bool IsRunning() const + { return mRunFlag; } + int64_t GetTimerOverrunCount() const + { return mTimerOverrunCount; } + int64_t GetTimerOverrunSec() const + { return mTimerOverrunSec; } + int GetMaxAcceptsPerRead() const + { return mMaxAcceptsPerRead; } + void SetMaxAcceptsPerRead(int maxAcceptsPerRead) + { mMaxAcceptsPerRead = maxAcceptsPerRead <= 0 ? 1 : maxAcceptsPerRead; } + void ChildAtFork(bool onlyCloseFdFlag = true); + + // Primarily for debugging, to simulate network failures. + class PollEventHook + { + public: + virtual void Add(NetManager& netMgr, NetConnection& conn) {} + virtual void Remove(NetManager& netMgr, NetConnection& conn) {} + virtual void Event( + NetManager& netMgr, NetConnection& conn, int& pollEvent) = 0; + protected: + PollEventHook() {} + virtual ~PollEventHook() {} + }; + PollEventHook* SetPollEventHook(PollEventHook* hook = 0) + { + PollEventHook* const prev = mPollEventHook; + mPollEventHook = hook; + return prev; + } + // Use net manager's timer wheel, with no fd/socket. + // Has about 100 bytes overhead. 
+ class Timer + { + public: + Timer(NetManager& netManager, KfsCallbackObj& obj, int tmSec = -1) + : mHandler(netManager, obj, tmSec) + {} + void RemoveTimeout() + { SetTimeout(-1); } + void SetTimeout(int tmSec) + { mHandler.SetTimeout(tmSec); } + void ResetTimeout() + { mHandler.ResetTimeout(); } + time_t GetRemainingTime() const + { return mHandler.GetRemainingTime(); } + time_t GetStartTime() const + { return mHandler.mStartTime; } + int GetTimeout() const + { return mHandler.mConn->GetInactivityTimeout(); } + void ScheduleTimeoutNoLaterThanIn(int tmSec) + { mHandler.ScheduleTimeoutNoLaterThanIn(tmSec); } + // Negative timeouts are infinite, always greater than non negative. + static int MinTimeout(int tmL, int tmR) + { return ((tmR < 0 || (tmL < tmR && tmL >= 0)) ? tmL : tmR); } + + private: + struct Handler : public KfsCallbackObj + { + Handler(NetManager& netManager, KfsCallbackObj& obj, int tmSec); + ~Handler() + { Handler::Cleanup(); } + void SetTimeout(int tmSec); + time_t GetRemainingTime() const; + int EventHandler(int type, void* data); + void Cleanup(); + void ResetTimeout(); + void ScheduleTimeoutNoLaterThanIn(int tmSec); + inline time_t Now() const; + + KfsCallbackObj& mObj; + time_t mStartTime; + TcpSocket mSock; + NetConnectionPtr mConn; + private: + Handler(const Handler&); + Handler& operator=(const Handler&); + }; + Handler mHandler; + private: + Timer(const Timer&); + Timer& operator=(const Timer&); + }; + + /// Method used by NetConnection only. 
+    static void Update(NetConnection::NetManagerEntry& entry, int fd,
+        bool resetTimer);
+    static inline const NetManager* GetNetManager(const NetConnection& conn);
+private:
+    class Waker;
+    typedef NetConnection::NetManagerEntry::List List;
+    // NOTE(review): template arguments were lost in extraction; restored as
+    // QCDLList<ITimeout> to match the ITimeout::List usage and the
+    // mTimeoutHandlers[1] list-head array below -- confirm against upstream.
+    typedef QCDLList<ITimeout> TimeoutHandlers;
+    enum { kTimerWheelSize = (1 << 8) };
+
+    List            mRemove;
+    List::iterator  mTimerWheelBucketItr;
+    NetConnection*  mCurConnection;
+    int             mCurTimerWheelSlot;
+    int             mConnectionsCount;
+    /// when the system is overloaded--either because of disk or we
+    /// have too much network I/O backlogged---we avoid polling fd's for
+    /// read. this causes back-pressure and forces the clients to
+    /// slow down
+    bool            mDiskOverloaded;
+    bool            mNetworkOverloaded;
+    bool            mIsOverloaded;
+    volatile bool   mRunFlag;
+    bool            mShutdownFlag;
+    bool            mTimerRunningFlag;
+    /// timeout interval specified in the call to select().
+    const int       mTimeoutMs;
+    const time_t    mStartTime;
+    time_t          mNow;
+    int64_t         mMaxOutgoingBacklog;
+    int64_t         mNumBytesToSend;
+    int64_t         mTimerOverrunCount;
+    int64_t         mTimerOverrunSec;
+    int             mMaxAcceptsPerRead;
+    QCFdPoll&       mPoll;
+    Waker&          mWaker;
+    PollEventHook*  mPollEventHook;
+    /// Handlers that are notified whenever a call to select()
+    /// returns. To the handlers, the notification is a timeout signal.
+ ITimeout* mCurTimeoutHandler; + ITimeout* mTimeoutHandlers[1]; + List mEpollError; + List mTimerWheel[kTimerWheelSize + 1]; + + void CheckIfOverloaded(); + void CleanUp(bool childAtForkFlag = false, bool onlyCloseFdFlag = false); + inline void UpdateTimer(NetConnection::NetManagerEntry& entry, int timeOut); + void UpdateSelf(NetConnection::NetManagerEntry& entry, int fd, + bool resetTimer, bool epollError); + void PollRemove(int fd); +private: + NetManager(const NetManager&); + NetManager& operator=(const NetManager&); +}; + +} + +#endif // _LIBIO_NETMANAGER_H diff --git a/src/cc/kfsio/TcpSocket.cc b/src/cc/kfsio/TcpSocket.cc new file mode 100644 index 000000000..f5d630e4e --- /dev/null +++ b/src/cc/kfsio/TcpSocket.cc @@ -0,0 +1,526 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/10 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Tcp socket class implementation. 
+// +//---------------------------------------------------------------------------- + +#include "TcpSocket.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include "qcdio/QCMutex.h" +#include "qcdio/qcstutils.h" + +#include "Globals.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace KFS { + +using std::min; +using std::max; +using std::string; +using KFS::libkfsio::globals; + +static inline void +UpdateSocketCount(int inc) +{ + globals().ctrOpenNetFds.Update(inc); +} + +int TcpSocket::sRecvBufSize = 64 << 10; +int TcpSocket::sSendBufSize = 64 << 10; + +TcpSocket::~TcpSocket() +{ + Close(); +} + +int +TcpSocket::Listen(int port, bool nonBlockingAccept /* = false */) +{ + mSockFd = socket(PF_INET, SOCK_STREAM, 0); + if (mSockFd == -1) { + return Perror("socket"); + } + if (fcntl(mSockFd, FD_CLOEXEC, 1)) { + Perror("set FD_CLOEXEC"); + } + + Address ourAddr; + memset(&ourAddr, 0, sizeof(ourAddr)); + ourAddr.sin_family = AF_INET; + ourAddr.sin_addr.s_addr = htonl(INADDR_ANY); + ourAddr.sin_port = htons(port); + + int reuseAddr = 1; + if (setsockopt(mSockFd, SOL_SOCKET, SO_REUSEADDR, + (char *) &reuseAddr, sizeof(reuseAddr))) { + Perror("setsockopt"); + } + + if (bind(mSockFd, (struct sockaddr *) &ourAddr, sizeof(ourAddr)) || + listen(mSockFd, 8192) || + (nonBlockingAccept && fcntl(mSockFd, F_SETFL, O_NONBLOCK))) { + const int ret = Perror(ourAddr); + close(mSockFd); + mSockFd = -1; + return ret; + } + + UpdateSocketCount(1); + + return 0; + +} + +TcpSocket* +TcpSocket::Accept(int* status /* = 0 */) +{ + int fd; + Address cliAddr; + TcpSocket *accSock; + socklen_t cliAddrLen = sizeof(cliAddr); + + if ((fd = accept(mSockFd, (struct sockaddr *) &cliAddr, &cliAddrLen)) < 0) { + const int err = errno; + if (err != EAGAIN && err != EWOULDBLOCK) { + Perror("accept", err); + } + if (status) { + *status = err; + } + return 0; 
+ } + if (fcntl(fd, FD_CLOEXEC, 1)) { + Perror("set FD_CLOEXEC"); + } + accSock = new TcpSocket(fd); + accSock->SetupSocket(); + UpdateSocketCount(1); + if (status) { + *status = 0; + } + return accSock; +} + +int +TcpSocket::Connect(const TcpSocket::Address *remoteAddr, bool nonblockingConnect) +{ + Close(); + + mSockFd = socket(PF_INET, SOCK_STREAM, 0); + if (mSockFd < 0) { + return (errno > 0 ? -errno : mSockFd); + } + if (fcntl(mSockFd, FD_CLOEXEC, 1)) { + Perror("set FD_CLOEXEC"); + } + + if (nonblockingConnect) { + // when we do a non-blocking connect, we mark the socket + // non-blocking; then call connect and it wil return + // EINPROGRESS; the fd is added to the select loop to check + // for completion + fcntl(mSockFd, F_SETFL, O_NONBLOCK); + } + + int res = connect(mSockFd, (struct sockaddr *) remoteAddr, sizeof(*remoteAddr)); + if (res < 0 && errno != EINPROGRESS) { + res = Perror(*remoteAddr); + close(mSockFd); + mSockFd = -1; + return res; + } + if (res && nonblockingConnect) { + res = -errno; + } + SetupSocket(); + + UpdateSocketCount(1); + + return res; +} + +static QCMutex sLookupMutex; + +int +TcpSocket::Connect(const ServerLocation& location, bool nonblockingConnect) +{ + Address remoteAddr = { 0 }; + + const char* const name = location.hostname.c_str(); + if (! inet_aton(name, &remoteAddr.sin_addr)) { + QCStMutexLocker lock(sLookupMutex); + // do the conversion if we weren't handed an IP address + struct hostent * const hostInfo = gethostbyname(name); + KFS_LOG_STREAM_DEBUG << + "connect: " << location << + " hostent: " << (const void*)hostInfo << + " type: " << (hostInfo ? hostInfo->h_addrtype : -1) << + " size: " << (hostInfo ? hostInfo->h_length : -1) << + " " << h_errno << + KFS_LOG_EOM; + if (! hostInfo || hostInfo->h_addrtype != AF_INET || + hostInfo->h_length < (int)sizeof(remoteAddr.sin_addr)) { + const char* const err = hstrerror(h_errno); + KFS_LOG_STREAM_ERROR << + location.hostname << + ": " << ((err && *err) ? 
err : "unspecified error") << + KFS_LOG_EOM; + return -1; + } + memcpy(&remoteAddr.sin_addr, hostInfo->h_addr, + sizeof(remoteAddr.sin_addr)); + } + remoteAddr.sin_port = htons(location.port); + remoteAddr.sin_family = AF_INET; + return Connect(&remoteAddr, nonblockingConnect); +} + +void +TcpSocket::SetupSocket() +{ + int bufSize = sRecvBufSize; + if (bufSize > 0 && + setsockopt(mSockFd, SOL_SOCKET, SO_SNDBUF, + (char *) &bufSize, sizeof(bufSize))) { + Perror("setsockopt SO_SNDBUF"); + } + bufSize = sSendBufSize; + if (bufSize > 0 && + setsockopt(mSockFd, SOL_SOCKET, SO_RCVBUF, + (char *) &bufSize, sizeof(bufSize)) < 0) { + Perror("setsockopt SO_RCVBUF"); + } + int flag = 1; + // enable keep alive so we can socket errors due to detect network partitions + if (setsockopt(mSockFd, SOL_SOCKET, SO_KEEPALIVE, + (char *) &flag, sizeof(flag)) < 0) { + Perror("setsockopt SO_KEEPALIVE"); + } + if (fcntl(mSockFd, F_SETFL, O_NONBLOCK)) { + Perror("set O_NONBLOCK"); + } + // turn off NAGLE + if (setsockopt(mSockFd, IPPROTO_TCP, TCP_NODELAY, + (char *) &flag, sizeof(flag)) < 0) { + Perror("setsockopt TCP_NODELAY"); + } + +} + +int +TcpSocket::GetPeerName(struct sockaddr *peerAddr, int len) const +{ + socklen_t peerLen = (socklen_t)len; + if (getpeername(mSockFd, peerAddr, &peerLen) < 0) { + return Perror("getpeername"); + } + return 0; +} + +string +TcpSocket::GetPeerName() const +{ + Address saddr = { 0 }; + if (GetPeerName((struct sockaddr*) &saddr, (int)sizeof(saddr)) < 0) { + return "unknown"; + } + return ToString(saddr); +} + +string +TcpSocket::GetSockName() const +{ + Address saddr = { 0 }; + socklen_t len = (socklen_t)sizeof(saddr); + if (getsockname(mSockFd, (struct sockaddr*) &saddr, &len) < 0) { + return "unknown"; + } + return ToString(saddr); +} + +int +TcpSocket::Send(const char *buf, int bufLen) +{ + int nwrote; + + nwrote = bufLen > 0 ? 
send(mSockFd, buf, bufLen, 0) : 0; + if (nwrote > 0) { + globals().ctrNetBytesWritten.Update(nwrote); + } + return nwrote; +} + +int TcpSocket::Recv(char *buf, int bufLen) +{ + int nread; + + nread = bufLen > 0 ? recv(mSockFd, buf, bufLen, 0) : 0; + if (nread > 0) { + globals().ctrNetBytesRead.Update(nread); + } + + return nread; +} + +int +TcpSocket::Peek(char *buf, int bufLen) +{ + return (bufLen > 0 ? recv(mSockFd, buf, bufLen, MSG_PEEK) : 0); +} + +void +TcpSocket::Close() +{ + if (mSockFd < 0) { + return; + } + close(mSockFd); + mSockFd = -1; + UpdateSocketCount(-1); +} + +int +TcpSocket::DoSynchSend(const char *buf, int bufLen) +{ + int numSent = 0; + int res = 0, nfds; + struct pollfd pfd; + // 1 second in ms units + const int kTimeout = 1000; + + while (numSent < bufLen) { + if (mSockFd < 0) + break; + if (res < 0) { + pfd.fd = mSockFd; + pfd.events = POLLOUT; + pfd.revents = 0; + nfds = poll(&pfd, 1, kTimeout); + if (nfds == 0) + continue; + } + + res = Send(buf + numSent, bufLen - numSent); + if (res == 0) + return 0; + if ((res < 0) && + ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR))) + continue; + if (res < 0) + break; + numSent += res; + res = -1; + } + if (numSent > 0) { + globals().ctrNetBytesWritten.Update(numSent); + } + return numSent; +} + +// +// Receive data within a certain amount of time. If the server is too slow in responding, bail +// +int +TcpSocket::DoSynchRecv(char *buf, int bufLen, struct timeval &timeout) +{ + int numRecd = 0; + int res = 0, nfds; + struct pollfd pfd; + struct timeval startTime, now; + + gettimeofday(&startTime, 0); + + while (numRecd < bufLen) { + if (mSockFd < 0) + break; + + if (res < 0) { + pfd.fd = mSockFd; + pfd.events = POLLIN; + pfd.revents = 0; + nfds = poll(&pfd, 1, timeout.tv_sec * 1000); + // get a 0 when timeout expires + if (nfds == 0) { + KFS_LOG_STREAM_DEBUG << "Timeout in synch recv" << KFS_LOG_EOM; + return numRecd > 0 ? 
numRecd : -ETIMEDOUT; + } + } + + gettimeofday(&now, 0); + if (now.tv_sec - startTime.tv_sec >= timeout.tv_sec) { + return numRecd > 0 ? numRecd : -ETIMEDOUT; + } + + res = Recv(buf + numRecd, bufLen - numRecd); + if (res == 0) + return 0; + if ((res < 0) && + ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR))) + continue; + if (res < 0) + break; + numRecd += res; + } + if (numRecd > 0) { + globals().ctrNetBytesRead.Update(numRecd); + } + + return numRecd; +} + + +// +// Receive data within a certain amount of time and discard them. If +// the server is too slow in responding, bail +// +int +TcpSocket::DoSynchDiscard(int nbytes, struct timeval &timeout) +{ + int numRecd = 0, ntodo, res; + const int bufSize = 4096; + char buf[bufSize]; + + while (numRecd < nbytes) { + ntodo = min(nbytes - numRecd, bufSize); + res = DoSynchRecv(buf, ntodo, timeout); + if (res == -ETIMEDOUT) + return numRecd; + if (res == 0) + break; + assert(numRecd >= 0); + if (numRecd < 0) + break; + numRecd += res; + } + return numRecd; +} + +// +// Peek data within a certain amount of time. 
If the server is too slow in responding, bail +// +int +TcpSocket::DoSynchPeek(char *buf, int bufLen, struct timeval &timeout) +{ + int numRecd = 0; + int res, nfds; + struct pollfd pfd; + struct timeval startTime, now; + + gettimeofday(&startTime, 0); + + for (; ;) { + pfd.fd = mSockFd; + pfd.events = POLLIN; + pfd.revents = 0; + nfds = poll(&pfd, 1, timeout.tv_sec * 1000); + // get a 0 when timeout expires + if (nfds == 0) { + return -ETIMEDOUT; + } + + gettimeofday(&now, 0); + if (now.tv_sec - startTime.tv_sec >= timeout.tv_sec) { + return -ETIMEDOUT; + } + + res = Peek(buf + numRecd, bufLen - numRecd); + if (res == 0) + return 0; + if ((res < 0) && (errno == EAGAIN)) + continue; + if (res < 0) + break; + numRecd += res; + if (numRecd > 0) + break; + } + return numRecd; +} + +int +TcpSocket::GetSocketError() const +{ + if (mSockFd < 0) { + return EBADF; + } + int err = 0; + socklen_t len = sizeof(err); + if (getsockopt(mSockFd, SOL_SOCKET, SO_ERROR, &err, &len)) { + return (errno != 0 ? errno : EINVAL); + } + assert(len == sizeof(err)); + return err; +} + +string +TcpSocket::ToString(const Address& saddr) +{ + char ipname[INET_ADDRSTRLEN + 16]; + if (! inet_ntop(AF_INET, &(saddr.sin_addr), ipname, INET_ADDRSTRLEN)) { + return "unknown"; + } + ipname[INET_ADDRSTRLEN] = 0; + sprintf(ipname + strlen(ipname), ":%d", (int)htons(saddr.sin_port)); + return ipname; +} + +int +TcpSocket::Perror(const char* msg, int err) const +{ + KFS_LOG_STREAM_ERROR << QCUtils::SysError(err, msg) << KFS_LOG_EOM; + return (err > 0 ? -err : (err == 0 ? 
-1 : err)); +} + +int +TcpSocket::Perror(const char* msg) const +{ + return Perror(msg, errno); +} + +int +TcpSocket::Perror(const Address& saddr) const +{ + const int err = errno; + const string name = ToString(saddr); + return Perror(name.c_str(), err); +} + +} diff --git a/src/cc/kfsio/TcpSocket.h b/src/cc/kfsio/TcpSocket.h new file mode 100644 index 000000000..7d2427225 --- /dev/null +++ b/src/cc/kfsio/TcpSocket.h @@ -0,0 +1,142 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/10 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +/// \brief Class that hides the internals of doing socket I/O. +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_TCP_SOCKET_H +#define _LIBIO_TCP_SOCKET_H + +#include +#include + +struct timeval; +struct sockaddr_in; +struct sockaddr; + +namespace KFS +{ +using std::string; + +struct ServerLocation; + +class TcpSocket +{ +public: + typedef sockaddr_in Address; + TcpSocket() + : mSockFd(-1) + {} + /// Wrap the passed in file descriptor in a TcpSocket + /// @param[in] fd file descriptor corresponding to a TCP socket. 
+ TcpSocket(int fd) + : mSockFd(fd) + {} + ~TcpSocket(); + + /// Setup a TCP socket that listens for connections + /// @param port Port on which to listen for incoming connections + int Listen(int port, bool nonBlockingAccept = false); + + /// Accept connection on a socket. + /// @retval A TcpSocket pointer that contains the accepted + /// connection. It is the caller's responsibility to free the + /// pointer returned by this method. + /// + TcpSocket* Accept(int* status = 0); + + /// Connect to the remote address. If non-blocking connect is + /// set, the socket is first marked non-blocking and then we do + /// the connect call. Then, you use select() to check for connect() completion + /// @retval 0 on success; -1 on failure; -EINPROGRESS if we do a + /// nonblockingConnect and connect returned that error code + int Connect(const Address* remoteAddr, bool nonblockingConnect = false); + int Connect(const ServerLocation& location, bool nonblockingConnect = false); + + /// Do block-IO's, where # of bytes to be send/recd is the length + /// of the buffer. + /// @retval Returns # of bytes sent or -1 if there was an error. + int DoSynchSend(const char *buf, int bufLen); + + /// For recv/peek, specify a timeout within which data should be received. + int DoSynchRecv(char *buf, int bufLen, timeval& timeout); + int DoSynchPeek(char *buf, int bufLen, timeval& timeout); + + /// Discard a bunch of bytes that are coming down the pipe. + int DoSynchDiscard(int len, timeval& timeout); + + /// Peek to see if any data is available. This call will not + /// remove the data from the underlying socket buffers. + /// @retval Returns # of bytes copied in or -1 if there was an error. + int Peek(char *buf, int bufLen); + + /// Get the file descriptor associated with this socket. + inline int GetFd() { return mSockFd; }; + + /// Return true if socket is good for read/write. false otherwise. 
+ bool IsGood() const { + return (mSockFd >= 0); + } + + /// pass in the length of the buffer pointed to by peerAddr + int GetPeerName(sockaddr *peerAddr, int len) const; + /// Return the peer's IP address as a string + string GetPeerName() const; + string GetSockName() const; + + /// Sends at-most the specified # of bytes. + /// @retval Returns the result of calling send(). + int Send(const char *buf, int bufLen); + + /// Receives at-most the specified # of bytes. + /// @retval Returns the result of calling recv(). + int Recv(char *buf, int bufLen); + + /// Close the TCP socket. + void Close(); + + /// Get and clear pending socket error: getsockopt(SO_ERROR) + int GetSocketError() const; + static string ToString(const Address& addr); + static int GetDefaultRecvBufSize() { return sRecvBufSize; } + static int GetDefaultSendBufSize() { return sSendBufSize; } + static void SetDefaultRecvBufSize(int size) { sRecvBufSize = size; } + static void SetDefaultSendBufSize(int size) { sSendBufSize = size; } +private: + int mSockFd; + + void SetupSocket(); + int Perror(const char* msg) const; + int Perror(const Address& addr) const; + int Perror(const char* msg, int err) const; + + static int sRecvBufSize; + static int sSendBufSize; +}; + +typedef boost::shared_ptr TcpSocketPtr; + +} + +#endif // _LIBIO_TCP_SOCKET_H diff --git a/src/cc/kfsio/checksum.cc b/src/cc/kfsio/checksum.cc new file mode 100644 index 000000000..1b71f8780 --- /dev/null +++ b/src/cc/kfsio/checksum.cc @@ -0,0 +1,227 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/12 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// An adaptation of the 32-bit Adler checksum algorithm +// +//---------------------------------------------------------------------------- + +#include "checksum.h" + +#include +#include +#include + +namespace KFS { + +using std::min; +using std::max; +using std::vector; +using std::list; + +static inline uint32_t +KfsChecksum(uint32_t chksum, const void* buf, size_t len) +{ + return adler32(chksum, reinterpret_cast(buf), len); +} + +#ifndef _KFS_NO_ADDLER32_COMBINE + +// Copied from adler32.c +// This is needed to make it work with versions 1.2.3 and prior that have a bug +// in adler32_combine. 
+ +#define BASE 65521 /* largest prime smaller than 65536 */ +#define MOD(a) a %= BASE +#define MOD63(a) a %= BASE + +static inline uint32_t +bug_fix_for_adler32_combine(uint32_t adler1, uint32_t adler2, int64_t len2) +{ + unsigned long sum1; + unsigned long sum2; + unsigned rem; + + /* for negative len, return invalid adler32 as a clue for debugging */ + if (len2 < 0) + return 0xffffffffUL; + + /* the derivation of this formula is left as an exercise for the reader */ + MOD63(len2); /* assumes len2 >= 0 */ + rem = (unsigned)len2; + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; + MOD(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 >= BASE) sum1 -= BASE; + if (sum1 >= BASE) sum1 -= BASE; + if (sum2 >= (BASE << 1)) sum2 -= (BASE << 1); + if (sum2 >= BASE) sum2 -= BASE; + return sum1 | (sum2 << 16); +} + +#endif + +static inline uint32_t +KfsChecksumCombine(uint32_t chksum1, uint32_t chksum2, size_t len2) +{ +#ifndef _KFS_NO_ADDLER32_COMBINE + return bug_fix_for_adler32_combine(chksum1, chksum2, (int64_t)len2); +#else + return adler32_combine(chksum1, chksum2, len2); +#endif +} + +uint32_t +OffsetToChecksumBlockNum(off_t offset) +{ + return offset / CHECKSUM_BLOCKSIZE; +} + +uint32_t +OffsetToChecksumBlockStart(off_t offset) +{ + return (offset / CHECKSUM_BLOCKSIZE) * + CHECKSUM_BLOCKSIZE; +} + +uint32_t +OffsetToChecksumBlockEnd(off_t offset) +{ + return ((offset / CHECKSUM_BLOCKSIZE) + 1) * + CHECKSUM_BLOCKSIZE; +} + +uint32_t +ComputeBlockChecksum(const char* buf, size_t len) +{ + return KfsChecksum(kKfsNullChecksum, buf, len); +} + +uint32_t +ComputeBlockChecksum(uint32_t ckhsum, const char* buf, size_t len) +{ + return KfsChecksum(ckhsum, buf, len); +} + +vector +ComputeChecksums(const char *buf, size_t len, uint32_t* chksum) +{ + vector cksums; + + if (len <= CHECKSUM_BLOCKSIZE) { + uint32_t cks = ComputeBlockChecksum(buf, len); + if (chksum) { + *chksum = cks; + } + 
cksums.push_back(cks); + return cksums; + } + if (chksum) { + *chksum = kKfsNullChecksum; + } + cksums.reserve((len + CHECKSUM_BLOCKSIZE - 1) / CHECKSUM_BLOCKSIZE); + size_t curr = 0; + while (curr < len) { + const size_t tlen = min((size_t) CHECKSUM_BLOCKSIZE, len - curr); + const uint32_t cks = ComputeBlockChecksum(buf + curr, tlen); + if (chksum) { + *chksum = KfsChecksumCombine(*chksum, cks, tlen); + } + cksums.push_back(cks); + curr += tlen; + } + return cksums; +} + +uint32_t +ComputeBlockChecksum(const IOBuffer* data, size_t len, uint32_t chksum) +{ + uint32_t res = chksum; + for (IOBuffer::iterator iter = data->begin(); + len > 0 && (iter != data->end()); ++iter) { + const size_t tlen = min((size_t) iter->BytesConsumable(), len); + if (tlen == 0) { + continue; + } + res = KfsChecksum(res, iter->Consumer(), tlen); + len -= tlen; + } + return res; +} + +vector +ComputeChecksums(const IOBuffer* data, size_t len, uint32_t* chksum) +{ + vector cksums; + + len = min(len, size_t(max(0, data->BytesConsumable()))); + if (len <= CHECKSUM_BLOCKSIZE) { + const uint32_t cks = ComputeBlockChecksum(data, len); + if (chksum) { + *chksum = cks; + } + cksums.push_back(cks); + return cksums; + } + if (chksum) { + *chksum = kKfsNullChecksum; + } + IOBuffer::iterator iter = data->begin(); + if (iter == data->end()) { + return cksums; + } + cksums.reserve((len + CHECKSUM_BLOCKSIZE - 1) / CHECKSUM_BLOCKSIZE); + const char *buf = iter->Consumer(); + /// Compute checksum block by block + while (len > 0 && iter != data->end()) { + size_t currLen = 0; + uint32_t res = kKfsNullChecksum; + while (currLen < CHECKSUM_BLOCKSIZE) { + size_t navail = min((size_t) (iter->Producer() - buf), len); + if (currLen + navail > CHECKSUM_BLOCKSIZE) { + navail = CHECKSUM_BLOCKSIZE - currLen; + } + if (navail == 0) { + iter++; + if (iter == data->end()) { + break; + } + buf = iter->Consumer(); + continue; + } + currLen += navail; + len -= navail; + res = KfsChecksum(res, buf, navail); + buf += navail; 
+ } + if (chksum) { + *chksum = KfsChecksumCombine(*chksum, res, currLen); + } + cksums.push_back(res); + } + return cksums; +} + +} + diff --git a/src/cc/kfsio/checksum.h b/src/cc/kfsio/checksum.h new file mode 100644 index 000000000..27d121612 --- /dev/null +++ b/src/cc/kfsio/checksum.h @@ -0,0 +1,61 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/09/12 +// Author: Sriram Rao +// +// Copyright 2008-2011 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Code for computing 32-bit Adler checksums +//---------------------------------------------------------------------------- + +#ifndef CHUNKSERVER_CHECKSUM_H +#define CHUNKSERVER_CHECKSUM_H + +#include +#include +#include "kfsio/IOBuffer.h" + +namespace KFS +{ +using std::vector; + +/// Checksums are computed on 64KB block boundaries. 
We use the +/// "rolling" 32-bit Adler checksum algorithm +const uint32_t CHECKSUM_BLOCKSIZE = 65536; +const uint32_t kKfsNullChecksum = 1; + +extern uint32_t OffsetToChecksumBlockNum(off_t offset); + +extern uint32_t OffsetToChecksumBlockStart(off_t offset); + +extern uint32_t OffsetToChecksumBlockEnd(off_t offset); + +/// Call this function if you want checksum computed over CHECKSUM_BLOCKSIZE bytes +extern uint32_t ComputeBlockChecksum(const IOBuffer *data, size_t len, + uint32_t chksum = kKfsNullChecksum); +extern uint32_t ComputeBlockChecksum(const char *data, size_t len); +extern uint32_t ComputeBlockChecksum(uint32_t ckhsum, const char *buf, size_t len); + +/// Call this function if you want a checksums for a sequence of CHECKSUM_BLOCKSIZE bytes +extern vector ComputeChecksums(const IOBuffer *data, size_t len, uint32_t* chksum = 0); +extern vector ComputeChecksums(const char *data, size_t len, uint32_t* chksum = 0); + +} + +#endif // CHUNKSERVER_CHECKSUM_H diff --git a/src/cc/kfsio/event.h b/src/cc/kfsio/event.h new file mode 100644 index 000000000..f6bee5942 --- /dev/null +++ b/src/cc/kfsio/event.h @@ -0,0 +1,59 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/03/22 +// Author: Sriram Rao +// +// Copyright 2008-2010 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBKFSIO_EVENT_H +#define _LIBKFSIO_EVENT_H + +#include "KfsCallbackObj.h" + +namespace KFS +{ +/// +/// \enum EventCode_t +/// Various event codes that a KfsCallbackObj is notified with when +/// events occur. +/// +enum EventCode_t { + EVENT_NEW_CONNECTION, + EVENT_NET_READ, + EVENT_NET_WROTE, + EVENT_NET_ERROR, + EVENT_DISK_READ, + EVENT_DISK_WROTE, + EVENT_DISK_ERROR, + EVENT_SYNC_DONE, + EVENT_CMD_DONE, + EVENT_INACTIVITY_TIMEOUT, + EVENT_TIMEOUT, + EVENT_DISK_DELETE_DONE, + EVENT_DISK_RENAME_DONE, + EVENT_DISK_GET_FS_SPACE_AVAIL_DONE, + EVENT_DISK_CHECK_DIR_READABLE_DONE +}; + +} + +#endif // _LIBKFSIO_EVENT_H diff --git a/src/cc/kfsio/requestio.cc b/src/cc/kfsio/requestio.cc new file mode 100644 index 000000000..5abb888d6 --- /dev/null +++ b/src/cc/kfsio/requestio.cc @@ -0,0 +1,125 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/08/10 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief requestio.h: Common synchronous request send and receive routines +// implementation. 
+// +//---------------------------------------------------------------------------- + +#include "requestio.h" +#include "common/MsgLogger.h" +#include "kfsio/TcpSocket.h" +#include "qcdio/QCUtils.h" + +#include +#include + +#include + +#include + +namespace KFS +{ + +using std::max; + +int +SendRequest(const char* req, size_t rlen, const char* body, size_t blen, + TcpSocket* sock) +{ + if (! sock || ! sock->IsGood()) { + KFS_LOG_STREAM_DEBUG << "op send socket closed" << KFS_LOG_EOM; + return -EINVAL; + } + int numIO = sock->DoSynchSend(req, rlen); + if (blen > 0 && numIO > 0) { + numIO = sock->DoSynchSend(body, blen); + } + if (numIO > 0) { + return numIO; + } + KFS_LOG_STREAM_DEBUG << sock->GetPeerName() << + ": send failed: " << numIO << " " << QCUtils::SysError(-numIO) << + KFS_LOG_EOM; + sock->Close(); + return (numIO < 0 ? numIO : -EINVAL); +} + +int +RecvResponseHeader(char* buf, int bufSize, TcpSocket* sock, int opTimeout, + int* delims) +{ + *delims = -1; + for (int pos = 0; ;) { + struct timeval timeout = {0}; + timeout.tv_sec = opTimeout; + + int nread = sock->DoSynchPeek(buf + pos, bufSize - pos, timeout); + if (nread <= 0) { + if (nread == -ETIMEDOUT) { + return nread; + } + if (nread < 0 && (errno == EINTR || errno == EAGAIN)) { + continue; + } + return nread; + } + for (int i = max(pos, 3); i < pos + nread; i++) { + if ((buf[i - 3] == '\r') && + (buf[i - 2] == '\n') && + (buf[i - 1] == '\r') && + (buf[i] == '\n')) { + // valid stuff is from 0..i; so, length of resulting + // string is i+1. + i++; + while (pos < i) { + if ((nread = sock->Recv(buf + pos, i - pos)) <= 0) { + if (nread < 0 && (errno == EINTR || errno == EAGAIN)) { + continue; + } + return nread; + } + pos += nread; + } + *delims = i; + if (i < bufSize) { + buf[i] = 0; + } + return i; + } + } + // Unload data from socket, otherwise peek will return immediately. 
+ if ((nread = sock->Recv(buf + pos, nread)) <= 0) { + if (nread < 0 && (errno == EINTR || errno == EAGAIN)) { + continue; + } + return nread; + } + pos += nread; + } + assert(! "not reached"); + return -1; +} + +} diff --git a/src/cc/kfsio/requestio.h b/src/cc/kfsio/requestio.h new file mode 100644 index 000000000..3d655dcec --- /dev/null +++ b/src/cc/kfsio/requestio.h @@ -0,0 +1,62 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/08/10 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief requestio.h: Common synchronous request send and receive routines. +// +//---------------------------------------------------------------------------- + +#ifndef _LIBIO_REQUESTIO_H +#define _LIBIO_REQUESTIO_H + +#include + +namespace KFS +{ + +class TcpSocket; + +/// Synchronously send request -- data one or two buffers: req and body. +/// +int SendRequest(const char* req, size_t rlen, const char* body, size_t blen, + TcpSocket* sock); + +/// Get a response from the server. The response is assumed to +/// terminate with "\r\n\r\n". 
/// @param[in/out] buf that should be filled with data from server
/// @param[in] bufSize size of the buffer
///
/// @param[in] sock the socket from which data should be read
/// @retval # of bytes that were read; 0/-1 if there was an error
///
/// @param[out] delims the position in the buffer where "\r\n\r\n"
/// occurs; in particular, the length of the response string that ends
/// with last "\n" character. If the buffer got full and we couldn't
/// find "\r\n\r\n", delims is set to -1.
///
int RecvResponseHeader(char* buf, int bufSize, TcpSocket* sock, int opTimeout,
    int* delims);

}

#endif /* _LIBIO_REQUESTIO_H */

# ==== file: src/cc/libclient/CMakeLists.txt ====
#
# $Id$
#
# Created 2006
# Author: Sriram Rao (Kosmix Corp)
#
# Copyright 2008-2012 Quantcast Corp.
# Copyright 2006 Kosmix Corp.
#
# This file is part of Kosmos File System (KFS).
#
# Licensed under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
#
#

# Take all the .cc files and build a library out of them
set (sources
    KfsOps.cc
    FileOpener.cc
    KfsClient.cc
    KfsNetClient.cc
    KfsProtocolWorker.cc
    KfsRead.cc
    KfsWrite.cc
    RSStriper.cc
    Reader.cc
    Path.cc
    utils.cc
    WriteAppender.cc
    Writer.cc
    kfsglob.cc
    KfsAttr.cc
)

#
# Build static and dynamically linked libraries. Both libraries
# should have the same root name, but installed in different places
#
add_library (kfsClient STATIC ${sources})
add_library (kfsClient-shared SHARED ${sources})
set_target_properties (kfsClient PROPERTIES OUTPUT_NAME "kfs_client")
set_target_properties (kfsClient-shared PROPERTIES OUTPUT_NAME "kfs_client")

#
# Since the objects have to be built twice, set this up so they don't
# clobber each other.

set_target_properties (kfsClient PROPERTIES CLEAN_DIRECT_OUTPUT 1)
set_target_properties (kfsClient-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1)

add_dependencies (kfsClient kfsCommon kfsIO qcdio kfsrs)
# get everything into one
target_link_libraries (kfsClient kfsCommon kfsIO qcdio crypto kfsrs)
target_link_libraries (kfsClient-shared kfsCommon-shared kfsIO-shared qcdio-shared kfsrs-shared crypto)
#
install (TARGETS kfsClient kfsClient-shared
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static)

install (FILES KfsAttr.h KfsClient.h DESTINATION include/kfs)

# ==== file: src/cc/libclient/ClientPool.h ====
#
//---------------------------------------------------------- -*- Mode: C++ -*-
// $Id$
//
// Created 2009/05/20
// Author: Mike Ovsiannikov
//
// Copyright 2011-2012 Quantcast Corp.
//
// This file is part of Kosmos File System (KFS).
//
// Licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied.
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef CLIENT_POOL_H +#define CLIENT_POOL_H + +#include "common/kfsdecls.h" +#include "common/StdAllocator.h" +#include "KfsNetClient.h" + +#include +#include + +namespace KFS +{ +namespace client +{ +using std::pair; +using std::make_pair; +using std::map; +using std::less; + +// Client connection (KfsNetClient) pool. Used to reduce number of chunk +// server connections. Presently used only with radix sort with write append +// with M clients each appending to N buckets. +class ClientPool +{ +public: + typedef KfsNetClient::Stats Stats; + + ClientPool( + NetManager& inNetManager, + int inMaxRetryCount = 0, + int inTimeSecBetweenRetries = 10, + int inOpTimeoutSec = 5 * 60, + int inIdleTimeoutSec = 30 * 60, + int64_t inInitialSeqNum = 1, + const char* inLogPrefixPtr = 0, + bool inResetConnectionOnOpTimeoutFlag = true, + bool inRetryConnectOnlyFlag = true, + int inMaxContentLength = MAX_RPC_HEADER_LEN, + bool inFailAllOpsOnOpTimeoutFlag = false, + bool inMaxOneOutstandingOpFlag = false) + : mClients(), + mNetManager(inNetManager), + mMaxRetryCount(inMaxRetryCount), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + mOpTimeoutSec(inOpTimeoutSec), + mIdleTimeoutSec(inIdleTimeoutSec), + mInitialSeqNum(inInitialSeqNum), + mLogPrefixPtr(inLogPrefixPtr), + mResetConnectionOnOpTimeoutFlag(inResetConnectionOnOpTimeoutFlag), + mRetryConnectOnlyFlag(inRetryConnectOnlyFlag), + mMaxContentLength(inMaxContentLength), + mFailAllOpsOnOpTimeoutFlag(inFailAllOpsOnOpTimeoutFlag), + mMaxOneOutstandingOpFlag(inMaxOneOutstandingOpFlag) + {} + ~ClientPool() + { + for (Clients::const_iterator it = mClients.begin(); + it != mClients.end(); + ++it) { + delete it->second; + } + } + KfsNetClient& Get( + const ServerLocation& inLocation) + { + Clients::iterator it = mClients.find(inLocation); + if 
(it == mClients.end()) { + it = mClients.insert(make_pair(inLocation, new KfsNetClient( + mNetManager, + inLocation.hostname, + inLocation.port, + mMaxRetryCount, + mTimeSecBetweenRetries, + mOpTimeoutSec, + mIdleTimeoutSec, + mInitialSeqNum++, + mLogPrefixPtr, + mResetConnectionOnOpTimeoutFlag, + mMaxContentLength, + mFailAllOpsOnOpTimeoutFlag, + mMaxOneOutstandingOpFlag))).first; + it->second->SetRetryConnectOnly(mRetryConnectOnlyFlag); + } + return *(it->second); + } + void GetStats( + Stats& outStats) const + { + outStats.Clear(); + Stats theStats; + for (Clients::const_iterator it = mClients.begin(); + it != mClients.end(); + ++it) { + it->second->GetStats(theStats); + outStats.Add(theStats); + } + } + void ClearMaxOneOutstandingOpFlag( + bool inFailAllOpsOnOpTimeoutFlag) + { + if (! mMaxOneOutstandingOpFlag) { + return; + } + mMaxOneOutstandingOpFlag = false; + for (Clients::const_iterator theIt = mClients.begin(); + theIt != mClients.end(); + ++theIt) { + theIt->second->SetFailAllOpsOnOpTimeoutFlag( + inFailAllOpsOnOpTimeoutFlag); + theIt->second->ClearMaxOneOutstandingOpFlag(); + } + } +private: + typedef map< + ServerLocation, + KfsNetClient*, + less, + StdFastAllocator > + > Clients; + Clients mClients; + NetManager& mNetManager; + int mMaxRetryCount; + int mTimeSecBetweenRetries; + int mOpTimeoutSec; + int mIdleTimeoutSec; + int64_t mInitialSeqNum; + const char* mLogPrefixPtr; + bool mResetConnectionOnOpTimeoutFlag; + bool mRetryConnectOnlyFlag; + int mMaxContentLength; + bool mFailAllOpsOnOpTimeoutFlag; + bool mMaxOneOutstandingOpFlag; +private: + ClientPool( + const ClientPool& inPool); + ClientPool& operator=( + const ClientPool& inPool); +}; +} +} + +#endif /* CLIENT_POOL_H */ diff --git a/src/cc/libclient/FileOpener.cc b/src/cc/libclient/FileOpener.cc new file mode 100644 index 000000000..97765dde1 --- /dev/null +++ b/src/cc/libclient/FileOpener.cc @@ -0,0 +1,623 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// 
$Id$ +// +// Created 2010/06/25 +// Author: Mike Ovsiannikov +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "FileOpener.h" + +#include +#include +#include +#include + +#include "kfsio/NetManager.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "KfsOps.h" +#include "KfsClient.h" + +namespace KFS +{ +namespace client +{ + +// File open / create state machine implementation. 
+class FileOpener::Impl : private KfsNetClient::OpOwner +{ +public: + Impl( + FileOpener& inOuter, + MetaServer& inMetaServer, + Completion* inCompletionPtr, + std::string inLogPrefix) + : KfsNetClient::OpOwner(), + mOuter(inOuter), + mMetaServer(inMetaServer), + mPathName(), + mFileName(), + mCanceledFlag(false), + mOpenFlag(false), + mOpeningFlag(false), + mMakeDirsFlag(false), + mErrorCode(0), + mNumReplicas(0), + mPathNamePos(0), + mCurOpPtr(0), + mCompletionPtr(inCompletionPtr), + mLookupOp(0, 0, ""), + mMkdirOp(0, 0, ""), + mCreateOp(0, 0, "", mNumReplicas, false), + mLookupPathOp(0, 0, ""), + mLogPrefix(inLogPrefix), + mStats() + { Impl::Reset(); } + ~Impl() + { + mMetaServer.Cancel(mCurOpPtr, this); + Impl::Register(0); + } + int Open( + const char* inFileNamePtr, + int inNumReplicas, + bool inMakeDirsFlag) + { + if (mOpenFlag) { + if (inFileNamePtr == mPathName && + inNumReplicas == mNumReplicas) { + return mErrorCode; + } + return -EINVAL; + } + if (mErrorCode) { + return mErrorCode; + } + if (mOpeningFlag) { + return -EAGAIN; + } + mStats.Clear(); + mOpeningFlag = true; + mNumReplicas = inNumReplicas; + mPathName = inFileNamePtr; + mErrorCode = 0; + mPathNamePos = 0; + mMakeDirsFlag = inMakeDirsFlag; + LookupPath(); + return mErrorCode; + } + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr) + { + if (inFileId <= 0 || ! inFileNamePtr || ! *inFileNamePtr) { + return -EINVAL; + } + if (mOpenFlag) { + if (inFileId == mLookupOp.fattr.fileId && + inFileNamePtr == mPathName) { + return mErrorCode; + } + return -EINVAL; + } + if (mErrorCode) { + return mErrorCode; + } + if (mOpeningFlag) { + return -EAGAIN; + } + mStats.Clear(); + mPathName = inFileNamePtr; + mErrorCode = 0; + mPathNamePos = 0; + mMakeDirsFlag = false; + mNumReplicas = 0; // Do not create if doesn't exist. + mLookupOp.parentFid = -1; // Input, not known, and not needed. 
+ mLookupOp.status = 0; + if (inFileId > 0) { + mLookupOp.fattr.fileId = inFileId; + mLookupOp.fattr.isDirectory = false; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + } else { + mOpeningFlag = true; + LookupPath(); + } + return mErrorCode; + } + void Shutdown() + { + Reset(); + mMetaServer.Cancel(mCurOpPtr, this); + mOpeningFlag = false; + mOpenFlag = false; + mErrorCode = 0; + } + bool IsOpen() const + { return (mOpenFlag); } + bool IsOpening() const + { return (! mOpenFlag && mOpeningFlag); } + bool IsActive() const + { return (mOpeningFlag); } + void Register( + Completion* inCompletionPtr) + { + if (inCompletionPtr == mCompletionPtr) { + return; + } + if (mCompletionPtr) { + mCompletionPtr->Unregistered(mOuter); + } + mCompletionPtr = inCompletionPtr; + } + bool Unregister( + Completion* inCompletionPtr) + { + if (inCompletionPtr != mCompletionPtr) { + return false; + } + mCompletionPtr = 0; + return true; + } + void GetStats( + Stats& outStats) + { outStats = mStats; } + bool GetErrorCode() const + { return mErrorCode; } + +protected: + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (mCurOpPtr != inOpPtr) { + InternalError("invalid op completion"); + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "<- " << (inOpPtr ? + string("NULL operation completion?") : inOpPtr->Show()) << + (inCanceledFlag ? " canceled" : "") << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? 
inBufferPtr->BytesConsumable() : 0) << + " status: " << inOpPtr->status << + " seq: " << inOpPtr->seq << + KFS_LOG_EOM; + Dispatch(inOpPtr, inBufferPtr); + } + +private: + typedef std::string::size_type StringPos; + + FileOpener& mOuter; + MetaServer& mMetaServer; + std::string mPathName; + std::string mFileName; + bool mCanceledFlag; + bool mSleepingFlag; + bool mOpenFlag; + bool mOpeningFlag; + bool mMakeDirsFlag; + int mErrorCode; + int mNumReplicas; + StringPos mPathNamePos; + KfsOp* mCurOpPtr; + Completion* mCompletionPtr; + LookupOp mLookupOp; + MkdirOp mMkdirOp; + CreateOp mCreateOp; + LookupPathOp mLookupPathOp; + std::string const mLogPrefix; + Stats mStats; + + + void InternalError( + const char* inMsgPtr = 0) + { abort(); } + + void Dispatch( + KfsOp* inOpPtr, + IOBuffer* inBufferPtr) + { + if (&mLookupOp == inOpPtr) { + Done(mLookupOp, inBufferPtr); + } else if (&mMkdirOp == inOpPtr) { + Done(mMkdirOp, inBufferPtr); + } else if (&mCreateOp == inOpPtr) { + Done(mCreateOp, inBufferPtr); + } else if (&mLookupPathOp == inOpPtr) { + Done(mLookupPathOp, inBufferPtr); + } else { + InternalError("unknown operation dispatch"); + } + } + + void Lookup() + { + mCurOpPtr = &mLookupOp; // For HandleError() below to work. + const bool theStartFlag = mPathNamePos == 0; + if (theStartFlag) { + mFileName.clear(); + mCreateOp.status = 0; + } else if (mFileName.empty()) { + mLookupOp.status = -ENOENT; + HandleError(); + return; + } else if (mLookupOp.status == -ENOENT && mMakeDirsFlag) { + mLookupOp.status = 0; + Mkdir(); + return; + } else if (mLookupOp.status != 0) { + HandleError(); + return; + } + kfsFileId_t const theParentFid = theStartFlag ? 
+ KFS::ROOTFID : mLookupOp.fattr.fileId; + const string theFileName = mFileName; + + Reset(mLookupOp); + mLookupOp.filename = 0; + mLookupOp.parentFid = theParentFid; + StringPos theNext = std::string::npos; + StringPos const theEnd = mPathName.length(); + const char theSeparator = '/'; + while (mPathNamePos < theEnd && + (theNext = mPathName.find(theSeparator, mPathNamePos)) != + std::string::npos && + theNext == mPathNamePos) { + mPathNamePos++; + } + if (theNext == std::string::npos) { + theNext = theEnd; + } + if (mPathNamePos >= theEnd) { + mFileName.clear(); + } else { + mFileName = mPathName.substr(mPathNamePos, theNext - mPathNamePos); + } + if (theNext - mPathNamePos > KFS::MAX_FILENAME_LEN) { + mLookupOp.status = -ENAMETOOLONG; + HandleError(); + return; + } + mPathNamePos = theNext; + if (theNext == theEnd) { + if (! mFileName.empty()) { + Create(); + return; + } + if (mCreateOp.status == -EEXIST && ! theFileName.empty()) { + mCreateOp.status = 0; + mFileName = theFileName; + mLookupOp.fattr.isDirectory = true; + } + } + if (! theStartFlag && + mLookupOp.fattr.isDirectory == mFileName.empty()) { + mLookupOp.status = mFileName.empty() ? -ENOENT : -ENOTDIR; + HandleError(); + return; + } + if (mFileName.empty()) { + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + return; + } + mLookupOp.filename = mFileName.c_str(); + Enqueue(mLookupOp); + } + void Done( + LookupOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mLookupOp == &inOp && ! inBufferPtr); + Lookup(); + } + void Mkdir() + { + assert(mLookupOp.parentFid > 0 && ! mFileName.empty()); + Reset(mMkdirOp); + mMkdirOp.parentFid = mLookupOp.parentFid; + mMkdirOp.dirname = mLookupOp.filename; + Enqueue(mMkdirOp); + } + void Done( + MkdirOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mMkdirOp == &inOp && ! inBufferPtr); + if (inOp.status == -EEXIST) { + // Just re-queue the lookup op, it should succeed now. 
+ assert(mLookupOp.parentFid == mMkdirOp.parentFid && + mMkdirOp.dirname == mLookupOp.filename); + Enqueue(mLookupOp); + return; + } + if (inOp.status != 0) { + HandleError(); + return; + } + assert(mLookupOp.parentFid == mMkdirOp.parentFid); + mLookupOp.fattr.fileId = mMkdirOp.fileId; + mLookupOp.fattr.isDirectory = true; + mLookupOp.status = 0; + Lookup(); + } + void Create() + { + assert(mLookupOp.parentFid > 0 && ! mFileName.empty()); + Reset(mCreateOp); + mCreateOp.parentFid = mLookupOp.parentFid; + mCreateOp.filename = mFileName.c_str(); + mCreateOp.numReplicas = mNumReplicas; + // With false it deletes the file then creates it again. + mCreateOp.exclusive = true; + Enqueue(mCreateOp); + } + void Done( + CreateOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mCreateOp == &inOp && ! inBufferPtr); + if (inOp.status == -EEXIST) { + Lookup(); + return; + } + if (inOp.status != 0) { + HandleError(); + return; + } + mLookupOp.parentFid = inOp.parentFid; + mLookupOp.status = inOp.status; + mLookupOp.fattr.fileId = inOp.fileId; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + } + void LookupPath() + { + Reset(mLookupPathOp); + mLookupPathOp.rootFid = KFS::ROOTFID; + mLookupPathOp.filename = mPathName.c_str(); + Enqueue(mLookupPathOp); + } + void Done( + LookupPathOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mLookupPathOp == &inOp && ! inBufferPtr); + if (inOp.status == KfsNetClient::kErrorMaxRetryReached) { + HandleError(); + return; + } + if (inOp.status != 0 && mNumReplicas > 0) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "lookup path failed: " << inOp.status << + " falling back to open" << + KFS_LOG_EOM; + Lookup(); + return; + } + if (inOp.fattr.isDirectory) { + inOp.status = -EISDIR; + HandleError(); + return; + } + inOp.filename = ""; // Reset just in case. + // Copy result into lookup op. + mLookupOp.parentFid = -1; // Input, not known, and not needed. 
+ mLookupOp.status = inOp.status; + mLookupOp.fattr = inOp.fattr; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + } + void Enqueue( + KfsOp& inOp) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> " << "meta " << inOp.Show() << + KFS_LOG_EOM; + mStats.mMetaOpsQueuedCount++; + if (! mMetaServer.Enqueue(&inOp, this, 0)) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "meta enqueu failure: " << inOp.Show() << + KFS_LOG_EOM; + inOp.status = -EINVAL; + OpDone(&inOp, false, 0); + } + } + static void Reset( + KfsOp& inOp) + { + inOp.seq = 0; + inOp.status = 0; + inOp.statusMsg.clear(); + inOp.checksum = 0; + inOp.contentLength = 0; + inOp.contentBufLen = 0; + delete [] inOp.contentBuf; + inOp.contentBuf = 0; + } + void Reset() + { + if (mCurOpPtr) { + mMetaServer.Cancel(mCurOpPtr, this); + } + mCurOpPtr = 0; + } + void HandleError() + { + if (mCurOpPtr) { + std::ostringstream theOStream; + mCurOpPtr->Request(theOStream); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation" + " failure, seq: " << mCurOpPtr->seq << + " status: " << mCurOpPtr->status << + " msg: " << mCurOpPtr->statusMsg << + " op: " << mCurOpPtr->Show() << + "\nRequest:\n" << theOStream.str() << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "NULL operation failure" << + KFS_LOG_EOM; + } + if (! (mErrorCode = mCurOpPtr ? mCurOpPtr->status : -1)) { + mErrorCode = -1; + } + // Meta operations are automatically retried by MetaServer. + // Declare fatal error in the case of meta op failure. + KFS_LOG_STREAM_ERROR << mLogPrefix << + "meta operation failed, giving up" << + KFS_LOG_EOM; + FatalError(); + } + void FatalError( + int inErrorCode = 0) + { + if (inErrorCode != 0) { + mErrorCode = inErrorCode; + } + if (mErrorCode == 0) { + mErrorCode = -1; + } + mOpenFlag = false; + mOpeningFlag = false; + mCurOpPtr = 0; + ReportCompletion(); + } + void HandleCancel( + KfsOp* inOpPtr) + { + mStats.mMetaOpsCancelledCount++; + if (! 
mCurOpPtr) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "NULL operation canceled" << + KFS_LOG_EOM; + } + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation canceled " << inOpPtr->Show() << + KFS_LOG_EOM; + mCurOpPtr = 0; + mErrorCode = -ECANCELED; + } + void ReportCompletion() + { + if (mCompletionPtr) { + mCompletionPtr->Done(mOuter, mErrorCode); + } + } +private: + Impl( + const Impl& inImpl); + Impl& operator=( + const Impl& inImpl); +}; + +FileOpener::FileOpener( + FileOpener::MetaServer& inMetaServer, + FileOpener::Completion* inCompletionPtr /* = 0 */, + const char* inLogPrefixPtr /* = 0 */) + : mImpl(*new FileOpener::Impl( + *this, + inMetaServer, + inCompletionPtr, + (inLogPrefixPtr && inLogPrefixPtr[0]) ? + (inLogPrefixPtr + std::string(" ")) : std::string() + )) +{ +} + +/* virtual */ +FileOpener::~FileOpener() +{ + delete &mImpl; +} + +int +FileOpener::Open( + const char* inFileNamePtr, + int inNumReplicas /* = 3 */, + bool inMakeDirsFlag /* = false */) +{ + return mImpl.Open(inFileNamePtr, inNumReplicas, inMakeDirsFlag); +} + +int +FileOpener::Open( + kfsFileId_t inFileId, + const char* inFileNamePtr) +{ + return mImpl.Open(inFileId, inFileNamePtr); +} + +void +FileOpener::Shutdown() +{ + mImpl.Shutdown(); +} + +bool +FileOpener::IsOpen() const +{ + return mImpl.IsOpen(); +} + +bool +FileOpener::IsOpening() const +{ + return mImpl.IsOpening(); +} + +bool +FileOpener::IsActive() const +{ + return mImpl.IsActive(); +} + + void +FileOpener::Register( + FileOpener::Completion* inCompletionPtr) +{ + mImpl.Register(inCompletionPtr); +} + +bool +FileOpener::Unregister( + FileOpener::Completion* inCompletionPtr) +{ + return mImpl.Unregister(inCompletionPtr); +} + +void +FileOpener::GetStats( + FileOpener::Stats& outStats) +{ + mImpl.GetStats(outStats); +} + +} // namespace client + +} // namespace KFS diff --git a/src/cc/libclient/FileOpener.h b/src/cc/libclient/FileOpener.h new file mode 100644 index 000000000..094f0d2e5 --- /dev/null +++ 
b/src/cc/libclient/FileOpener.h @@ -0,0 +1,135 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/06/25 +// Author: Mike Ovsiannikov +// +// Copyright 2010 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef FILE_OPENER_H +#define FILE_OPENER_H + +#include "KfsNetClient.h" +#include "common/kfstypes.h" +#include "kfsio/checksum.h" + +#include +#include + +namespace KFS +{ +namespace client +{ + +// File open / create state machine creates file and all intermediate +// directories in the specified path if needed. 
+class FileOpener +{ +public: + class Completion + { + public: + virtual void Done( + FileOpener& inOpener, + int inStatusCode) = 0; + virtual void Unregistered( + FileOpener& /* inOpener */) + {} + protected: + Completion() + {} + Completion( + const Completion&) + {} + virtual ~Completion() + {} + }; + struct Stats + { + typedef int64_t Counter; + Stats() + : mMetaOpsQueuedCount(0), + mMetaOpsCancelledCount(0) + {} + void Clear() + { *this = Stats(); } + Stats& Add( + const Stats& inStats) + { + mMetaOpsQueuedCount += inStats.mMetaOpsQueuedCount; + mMetaOpsCancelledCount += inStats.mMetaOpsCancelledCount; + return *this; + } + std::ostream& Display( + std::ostream& inStream, + const char* inSeparatorPtr = 0, + const char* inDelimiterPtr = 0) const + { + const char* const theSeparatorPtr = + inSeparatorPtr ? inSeparatorPtr : " "; + const char* const theDelimiterPtr = + inDelimiterPtr ? inDelimiterPtr : ": "; + inStream << + "MetaOpsQueued" << theDelimiterPtr << + mMetaOpsQueuedCount << theSeparatorPtr << + "MetaOpsCancelled" << theDelimiterPtr << + mMetaOpsCancelledCount + ; + return inStream; + } + Counter mMetaOpsQueuedCount; + Counter mMetaOpsCancelledCount; + }; + typedef KfsNetClient MetaServer; + FileOpener( + MetaServer& inMetaServer, + Completion* inCompletionPtr = 0, + const char* inLogPrefixPtr = 0); + virtual ~FileOpener(); + int Open( + const char* inFileNamePtr, + int inNumReplicas = 3, + bool inMakeDirsFlag = false); + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr); + void Shutdown(); + bool IsOpen() const; + bool IsOpening() const; + bool IsActive() const; + int GetErrorCode() const; + void Register( + Completion* inCompletionPtr); + bool Unregister( + Completion* inCompletionPtr); + void GetStats( + Stats& outStats); +private: + class Impl; + Impl& mImpl; +private: + FileOpener( + const FileOpener& inFileOpener); + FileOpener& operator=( + const FileOpener& inFileOpener); +}; +}} + +#endif /* FILE_OPENER_H */ diff --git 
a/src/cc/libclient/KfsAttr.cc b/src/cc/libclient/KfsAttr.cc new file mode 100644 index 000000000..f7dd22ea4 --- /dev/null +++ b/src/cc/libclient/KfsAttr.cc @@ -0,0 +1,81 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/08/22 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \file KfsAttr.cc +// \brief Kfs i-node attributes class. +// +//---------------------------------------------------------------------------- + +#include "KfsAttr.h" + +#include +#include + +#include +#include +#include + +namespace KFS { +namespace client { + +void +FileAttr::ToStat( + struct stat& outStat) const +{ + memset(&outStat, 0, sizeof(outStat)); + outStat.st_ino = fileId; + // Directories are drwxrwxrwx, files are drw-rw-rw- + if (isDirectory) { + outStat.st_mode = S_IFDIR | (mode_t)((mode == kKfsModeUndef ? + (kfsMode_t)0777 : mode) & 0777); + outStat.st_size = 0; + } else { + outStat.st_mode = S_IFREG | (mode_t)((mode == kKfsModeUndef ? 
+ (kfsMode_t)0666 : mode) & 0777); + outStat.st_size = fileSize; + } +#ifdef S_ISVTX + if (IsSticky()) { + outStat.st_mode |= S_ISVTX; + } +#endif + outStat.st_blksize = CHUNKSIZE; + outStat.st_blocks = (fileSize + CHUNKSIZE - 1) / CHUNKSIZE; + outStat.st_uid = (uid_t)user; + outStat.st_gid = (gid_t)group; +#ifdef KFS_OS_NAME_DARWIN + outStat.st_atimespec.tv_sec = mtime.tv_sec; + outStat.st_atimespec.tv_nsec = mtime.tv_usec * 1000; + outStat.st_mtimespec.tv_sec = mtime.tv_sec; + outStat.st_mtimespec.tv_nsec = mtime.tv_usec * 1000; + outStat.st_ctimespec.tv_sec = ctime.tv_sec; + outStat.st_ctimespec.tv_nsec = ctime.tv_usec * 1000; +#else + outStat.st_atime = mtime.tv_sec; + outStat.st_mtime = mtime.tv_sec; + outStat.st_ctime = ctime.tv_sec; +#endif +} + +}} + diff --git a/src/cc/libclient/KfsAttr.h b/src/cc/libclient/KfsAttr.h new file mode 100644 index 000000000..7006b0450 --- /dev/null +++ b/src/cc/libclient/KfsAttr.h @@ -0,0 +1,152 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/09 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \file KfsAttr.h +// \brief Kfs i-node and chunk attribute classes. 
+// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSCLIENT_KFSATTR_H +#define LIBKFSCLIENT_KFSATTR_H + +#include + +#include +#include + +#include "common/kfstypes.h" +#include "common/kfsdecls.h" + +struct stat; + +namespace KFS +{ +namespace client +{ +using std::string; +using std::vector; + +/// +/// \brief Server attributes + chunk attributes +/// +struct FileAttr : public Permissions +{ + kfsFileId_t fileId; /// i-node number + struct timeval mtime; /// modification time + struct timeval ctime; /// attribute change time + struct timeval crtime; /// creation time + bool isDirectory; /// is this a directory? + chunkOff_t fileSize; /// logical eof + int64_t subCount1; /// number of chunks in the file or files in directory + int64_t subCount2; /// directories count + int16_t numReplicas; + int16_t numStripes; + int16_t numRecoveryStripes; + StripedFileType striperType; + int32_t stripeSize; + + FileAttr() + : Permissions(), + fileId(-1), + mtime(), + ctime(), + crtime(), + isDirectory(false), + fileSize(-1), + subCount1(0), + subCount2(-1), + numReplicas(0), + numStripes(0), + numRecoveryStripes(0), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + stripeSize(0) + {} + void Reset() + { *this = FileAttr(); } + void Init(bool isDir) + { + isDirectory = isDir; + gettimeofday(&mtime, 0); + ctime = mtime; + crtime = mtime; + } + int64_t chunkCount() const + { return (isDirectory ? 0 : subCount1); } + int64_t fileCount() const + { return (isDirectory ? subCount1 : int64_t(0)); } + int64_t dirCount() const + { return (isDirectory ? 
subCount2 : int64_t(0)); } + void ToStat(struct stat& outStat) const; +}; + +struct ChunkAttr +{ + vector chunkServerLoc; // servers hosting chunk replicas + kfsChunkId_t chunkId; + int64_t chunkVersion; + chunkOff_t chunkSize; + chunkOff_t chunkOffset; // start position in the file + + ChunkAttr() + : chunkServerLoc(), + chunkId(-1), + chunkVersion(-1), + chunkSize(0), + chunkOffset(-1) + {} +}; + +} // namespace client + +/// +/// \brief File attributes as usable by applications. +/// +/// +struct KfsFileAttr : public client::FileAttr +{ + /// the name of this file + string filename; + + KfsFileAttr() + : client::FileAttr(), + filename() + {} + void Clear() + { + Reset(); + filename.clear(); + } + KfsFileAttr& operator= (const FileAttr &other) + { + FileAttr::operator=(other); + return *this; + } + bool operator < (const KfsFileAttr & other) const + { + return filename < other.filename; + } +}; +} // namespace KFS + +#endif // LIBKFSCLIENT_KFSATTR_H diff --git a/src/cc/libclient/KfsClient.cc b/src/cc/libclient/KfsClient.cc new file mode 100644 index 000000000..6587744db --- /dev/null +++ b/src/cc/libclient/KfsClient.cc @@ -0,0 +1,5194 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/04/18 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \file KfsClient.cc +// \brief Kfs Client-library code. +// +//---------------------------------------------------------------------------- + +#include "KfsClient.h" +#include "KfsClientInt.h" + +#include "common/config.h" +#include "common/Properties.h" +#include "common/MsgLogger.h" +#include "common/RequestParser.h" +#include "common/hsieh_hash.h" +#include "common/kfsatomic.h" +#include "common/MdStream.h" +#include "qcdio/qcstutils.h" +#include "qcdio/QCUtils.h" +#include "kfsio/checksum.h" +#include "kfsio/Globals.h" +#include "kfsio/requestio.h" +#include "Path.h" +#include "utils.h" +#include "KfsProtocolWorker.h" +#include "qcrs/rs.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace KFS +{ +using std::string; +using std::ostringstream; +using std::istringstream; +using std::min; +using std::max; +using std::map; +using std::vector; +using std::sort; +using std::transform; +using std::numeric_limits; +using std::unique; +using std::find; + +using boost::scoped_array; +using boost::bind; + +const int kMaxReaddirEntries = 1 << 10; +const int kMaxReadDirRetries = 16; + +KfsClient* +Connect(const char* propFile) +{ + bool verbose = false; +#ifdef DEBUG + verbose = true; +#endif + Properties p; + if (p.loadProperties(propFile, '=', verbose) != 0) { + return 0; + } + return Connect(p.getValue("metaServer.name", ""), + p.getValue("metaServer.port", -1)); +} + +KfsClient* +Connect(const string& metaServerHost, int metaServerPort) +{ + KfsClient* const clnt = new KfsClient(); + clnt->Init(metaServerHost, metaServerPort); + if (clnt->IsInitialized()) { + return clnt; + } + delete clnt; + return 0; +} + +string +ErrorCodeToStr(int status) +{ + return (status == 0 ? 
string() : QCUtils::SysError(-status)); +} + +static inline kfsSeq_t +RandomSeqNo() +{ + kfsSeq_t ret = 0; + if (RAND_pseudo_bytes( + reinterpret_cast(&ret), int(sizeof(ret))) < 0) { + KFS_LOG_STREAM_WARN << "RAND_pseudo_bytes failure" << KFS_LOG_EOM; + size_t kMaxNameLen = 1024; + char name[kMaxNameLen + 1]; + gethostname(name, kMaxNameLen); + name[kMaxNameLen] = 0; + Hsieh_hash_fcn hf; + static int64_t cnt = 0; + ret = microseconds() + getpid() + hf(name, strlen(name)) + + SyncAddAndFetch(cnt, int64_t(1000000)); + } + return ((ret < 0 ? -ret : ret) >> 1); +} + +static int +ValidateCreateParams( + int numReplicas, int numStripes, int numRecoveryStripes, + int stripeSize, int stripedType) +{ + return ( + (numReplicas <= 0 || + (stripedType != KFS_STRIPED_FILE_TYPE_NONE && + stripedType != KFS_STRIPED_FILE_TYPE_RS) || + (stripedType == KFS_STRIPED_FILE_TYPE_RS && + (numStripes <= 0 || + stripeSize < KFS_MIN_STRIPE_SIZE || + stripeSize > KFS_MAX_STRIPE_SIZE || + stripeSize % KFS_STRIPE_ALIGNMENT != 0 || + (numRecoveryStripes != 0 && + (numRecoveryStripes != RS_LIB_MAX_RECOVERY_BLOCKS || + numStripes > RS_LIB_MAX_DATA_BLOCKS)))) + ) ? -EINVAL : 0 + ); +} + +static MsgLogger::LogLevel +GetLogLevel(const char* logLevel) +{ + if (! 
logLevel || strcmp(logLevel, "INFO") == 0) { + return MsgLogger::kLogLevelINFO; + } + if (strcmp(logLevel, "DEBUG") == 0) { + return MsgLogger::kLogLevelDEBUG; + } + if (strcmp(logLevel, "WARN") == 0) { + return MsgLogger::kLogLevelWARN; + } + return MsgLogger::kLogLevelINFO; +} + +KfsClient::KfsClient() + : mImpl(new KfsClientImpl()) +{ +} + +KfsClient::~KfsClient() +{ + delete mImpl; +} + +void +KfsClient::SetLogLevel(const string &logLevel) +{ + MsgLogger::SetLevel(GetLogLevel(logLevel.c_str())); +} + +int +KfsClient::Init(const string &metaServerHost, int metaServerPort) +{ + return mImpl->Init(metaServerHost, metaServerPort); +} + +bool +KfsClient::IsInitialized() +{ + return mImpl->IsInitialized(); +} + +int +KfsClient::Cd(const char *pathname) +{ + return mImpl->Cd(pathname); +} + +string +KfsClient::GetCwd() +{ + return mImpl->GetCwd(); +} + +int +KfsClient::Mkdirs(const char *pathname, kfsMode_t mode) +{ + return mImpl->Mkdirs(pathname, mode); +} + +int +KfsClient::Mkdir(const char *pathname, kfsMode_t mode) +{ + return mImpl->Mkdir(pathname, mode); +} + +int +KfsClient::Rmdir(const char *pathname) +{ + return mImpl->Rmdir(pathname); +} + +int +KfsClient::Rmdirs(const char *pathname) +{ + return mImpl->Rmdirs(pathname); +} + +int +KfsClient::RmdirsFast(const char *pathname) +{ + return mImpl->RmdirsFast(pathname); +} + +int +KfsClient::Readdir(const char *pathname, vector &result) +{ + return mImpl->Readdir(pathname, result); +} + +int +KfsClient::ReaddirPlus(const char *pathname, vector &result, + bool computeFilesize) +{ + return mImpl->ReaddirPlus(pathname, result, computeFilesize); +} + +int +KfsClient::OpenDirectory(const char *pathname) +{ + return mImpl->OpenDirectory(pathname); +} + +int +KfsClient::Stat(const char *pathname, KfsFileAttr &result, bool computeFilesize) +{ + return mImpl->Stat(pathname, result, computeFilesize); +} + +int +KfsClient::GetNumChunks(const char *pathname) +{ + return mImpl->GetNumChunks(pathname); +} + +int 
+KfsClient::UpdateFilesize(int fd) +{ + return mImpl->UpdateFilesize(fd); +} + +bool +KfsClient::Exists(const char *pathname) +{ + return mImpl->Exists(pathname); +} + +bool +KfsClient::IsFile(const char *pathname) +{ + return mImpl->IsFile(pathname); +} + +bool +KfsClient::IsDirectory(const char *pathname) +{ + return mImpl->IsDirectory(pathname); +} + +int +KfsClient::EnumerateBlocks(const char* pathname, KfsClient::BlockInfos& res) +{ + return mImpl->EnumerateBlocks(pathname, res); +} + +int +KfsClient::CompareChunkReplicas(const char *pathname, string &md5sum) +{ + return mImpl->CompareChunkReplicas(pathname, md5sum); +} + +int +KfsClient::VerifyDataChecksums(const char *pathname) +{ + return mImpl->VerifyDataChecksums(pathname); +} + +int +KfsClient::VerifyDataChecksums(int fd) +{ + return mImpl->VerifyDataChecksums(fd); +} + +/* static */ int +KfsClient::ParseCreateParams(const char* params, + int& numReplicas, int& numStripes, int& numRecoveryStripes, + int& stripeSize, int& stripedType) +{ + numReplicas = 2; + numStripes = 0; + numRecoveryStripes = 0; + stripeSize = 0; + stripedType = KFS_STRIPED_FILE_TYPE_NONE; + if (! params || ! 
*params) { + return 0; + } + if (params[0] == 'S' && params[1] == 0) { + numReplicas = 1; + numStripes = 6; + numRecoveryStripes = 3; + stripeSize = 64 << 10, + stripedType = KFS_STRIPED_FILE_TYPE_RS; + return 0; + } + char* p = 0; + numReplicas = (int)strtol(params, &p, 10); + if (numReplicas <= 0) { + return -EINVAL; + } + if (*p == ',') numStripes = (int)strtol(p + 1, &p, 10); + if (*p == ',') numRecoveryStripes = (int)strtol(p + 1, &p, 10); + if (*p == ',') stripeSize = (int)strtol(p + 1, &p, 10); + if (*p == ',') stripedType = (int)strtol(p + 1, &p, 10); + if (stripedType == KFS_STRIPED_FILE_TYPE_NONE) { + numStripes = 0; + numRecoveryStripes = 0; + stripeSize = 0; + } + return ValidateCreateParams(numReplicas, numStripes, numRecoveryStripes, + stripeSize, stripedType); +} + +int +KfsClient::Create(const char *pathname, int numReplicas, bool exclusive, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + bool forceTypeFlag, kfsMode_t mode) +{ + return mImpl->Create(pathname, numReplicas, exclusive, + numStripes, numRecoveryStripes, stripeSize, stripedType, forceTypeFlag, + mode); +} + + +int +KfsClient::Create(const char *pathname, bool exclusive, const char *params) +{ + int numReplicas; + int numStripes; + int numRecoveryStripes; + int stripeSize; + int stripedType; + const int ret = ParseCreateParams( + params, numReplicas, numStripes, numRecoveryStripes, + stripeSize, stripedType); + if (ret) { + return ret; + } + return mImpl->Create(pathname, numReplicas, exclusive, + numStripes, numRecoveryStripes, stripeSize, stripedType, true); +} + +int +KfsClient::Remove(const char *pathname) +{ + return mImpl->Remove(pathname); +} + +int +KfsClient::Rename(const char *oldpath, const char *newpath, bool overwrite) +{ + return mImpl->Rename(oldpath, newpath, overwrite); +} + +int +KfsClient::CoalesceBlocks(const char *srcPath, const char *dstPath, chunkOff_t *dstStartOffset) +{ + return mImpl->CoalesceBlocks(srcPath, dstPath, dstStartOffset); 
+} + +int +KfsClient::SetMtime(const char *pathname, const struct timeval &mtime) +{ + return mImpl->SetMtime(pathname, mtime); +} + +int +KfsClient::Open(const char *pathname, int openFlags, int numReplicas, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + kfsMode_t mode) +{ + return mImpl->Open(pathname, openFlags, numReplicas, + numStripes, numRecoveryStripes, stripeSize, stripedType, mode); +} + +int +KfsClient::Open(const char *pathname, int openFlags, const char *params, + kfsMode_t mode) +{ + int numReplicas; + int numStripes; + int numRecoveryStripes; + int stripeSize; + int stripedType; + const int ret = ParseCreateParams( + params, numReplicas, numStripes, numRecoveryStripes, + stripeSize, stripedType); + if (ret) { + return ret; + } + return mImpl->Open(pathname, openFlags, numReplicas, + numStripes, numRecoveryStripes, stripeSize, stripedType, mode); +} + +int +KfsClient::Close(int fd) +{ + return mImpl->Close(fd); +} + +int +KfsClient::RecordAppend(int fd, const char *buf, int reclen) +{ + return mImpl->RecordAppend(fd, buf, reclen); +} + +int +KfsClient::AtomicRecordAppend(int fd, const char *buf, int reclen) +{ + return mImpl->AtomicRecordAppend(fd, buf, reclen); +} + +void +KfsClient::EnableAsyncRW() +{ +} + +void +KfsClient::DisableAsyncRW() +{ +} + +int +KfsClient::ReadPrefetch(int fd, char *buf, size_t numBytes) +{ + return mImpl->ReadPrefetch(fd, buf, numBytes); +} + +ssize_t +KfsClient::PRead(int fd, chunkOff_t pos, char *buf, size_t numBytes) +{ + chunkOff_t cpos = pos; + return mImpl->Read(fd, buf, numBytes, &cpos); +} + +ssize_t +KfsClient::PWrite(int fd, chunkOff_t pos, const char *buf, size_t numBytes) +{ + chunkOff_t cpos = pos; + return mImpl->Write(fd, buf, numBytes, &cpos); +} + +ssize_t +KfsClient::Read(int fd, char *buf, size_t numBytes) +{ + return mImpl->Read(fd, buf, numBytes); +} + +ssize_t +KfsClient::Write(int fd, const char *buf, size_t numBytes) +{ + return mImpl->Write(fd, buf, numBytes); +} + 
+int +KfsClient::WriteAsync(int fd, const char *buf, size_t numBytes) +{ + return mImpl->WriteAsync(fd, buf, numBytes); +} + +int +KfsClient::WriteAsyncCompletionHandler(int fd) +{ + return mImpl->WriteAsyncCompletionHandler(fd); +} + +void +KfsClient::SkipHolesInFile(int fd) +{ + mImpl->SkipHolesInFile(fd); +} + +int +KfsClient::Sync(int fd) +{ + return mImpl->Sync(fd); +} + +chunkOff_t +KfsClient::Seek(int fd, chunkOff_t offset, int whence) +{ + return mImpl->Seek(fd, offset, whence); +} + +chunkOff_t +KfsClient::Seek(int fd, chunkOff_t offset) +{ + return mImpl->Seek(fd, offset, SEEK_SET); +} + +chunkOff_t +KfsClient::Tell(int fd) +{ + return mImpl->Tell(fd); +} + +int +KfsClient::Truncate(const char* pathname, chunkOff_t offset) +{ + return mImpl->Truncate(pathname, offset); +} + +int +KfsClient::Truncate(int fd, chunkOff_t offset) +{ + return mImpl->Truncate(fd, offset); +} + +int +KfsClient::PruneFromHead(int fd, chunkOff_t offset) +{ + return mImpl->PruneFromHead(fd, offset); +} + +int +KfsClient::GetDataLocation(const char *pathname, chunkOff_t start, chunkOff_t len, + vector< vector > &locations) +{ + return mImpl->GetDataLocation(pathname, start, len, locations); +} + +int +KfsClient::GetDataLocation(int fd, chunkOff_t start, chunkOff_t len, + vector< vector > &locations) +{ + return mImpl->GetDataLocation(fd, start, len, locations); +} + +int16_t +KfsClient::GetReplicationFactor(const char *pathname) +{ + return mImpl->GetReplicationFactor(pathname); +} + +int16_t +KfsClient::SetReplicationFactor(const char *pathname, int16_t numReplicas) +{ + return mImpl->SetReplicationFactor(pathname, numReplicas); +} + +ServerLocation +KfsClient::GetMetaserverLocation() const +{ + return mImpl->GetMetaserverLocation(); +} + +void +KfsClient::SetDefaultIOTimeout(int nsecs) +{ + mImpl->SetDefaultIOTimeout(nsecs); +} + +int +KfsClient::GetDefaultIOTimeout() const +{ + return mImpl->GetDefaultIOTimeout(); +} + +void +KfsClient::SetRetryDelay(int nsecs) +{ + 
mImpl->SetRetryDelay(nsecs); +} + +int +KfsClient::GetRetryDelay() const +{ + return mImpl->GetRetryDelay(); +} + +void +KfsClient::SetMaxRetryPerOp(int retryCount) +{ + mImpl->SetMaxRetryPerOp(retryCount); +} + +int +KfsClient::GetMaxRetryPerOp() const +{ + return mImpl->GetMaxRetryPerOp(); +} + +ssize_t +KfsClient::SetDefaultIoBufferSize(size_t size) +{ + return mImpl->SetDefaultIoBufferSize(size); +} + +ssize_t +KfsClient::GetDefaultIoBufferSize() const +{ + return mImpl->GetDefaultIoBufferSize(); +} + +ssize_t +KfsClient::SetIoBufferSize(int fd, size_t size) +{ + return mImpl->SetIoBufferSize(fd, size); +} + +ssize_t +KfsClient::GetIoBufferSize(int fd) const +{ + return mImpl->GetIoBufferSize(fd); +} + +ssize_t +KfsClient::SetDefaultReadAheadSize(size_t size) +{ + return mImpl->SetDefaultReadAheadSize(size); +} + +ssize_t +KfsClient::GetDefaultReadAheadSize() const +{ + return mImpl->GetDefaultReadAheadSize(); +} + +ssize_t +KfsClient::SetReadAheadSize(int fd, size_t size) +{ + return mImpl->SetReadAheadSize(fd, size); +} + +ssize_t +KfsClient::GetReadAheadSize(int fd) const +{ + return mImpl->GetReadAheadSize(fd); +} + +void +KfsClient::SetEOFMark(int fd, chunkOff_t offset) +{ + mImpl->SetEOFMark(fd, offset); +} + +int +KfsClient::GetFileOrChunkInfo(kfsFileId_t fileId, kfsChunkId_t chunkId, + KfsFileAttr& fattr, chunkOff_t& offset, int64_t& chunkVersion, + vector& servers) +{ + return mImpl->GetFileOrChunkInfo( + fileId, chunkId, fattr, offset, chunkVersion, servers); +} + +void +KfsClient::SetDefaultFullSparseFileSupport(bool flag) +{ + mImpl->SetDefaultFullSparseFileSupport(flag); +} + +int +KfsClient::SetFullSparseFileSupport(int fd, bool flag) +{ + return mImpl->SetFullSparseFileSupport(fd, flag); +} + +void +KfsClient::SetFileAttributeRevalidateTime(int secs) +{ + mImpl->SetFileAttributeRevalidateTime(secs); +} + +int +KfsClient::Chmod(int fd, kfsMode_t mode) +{ + return mImpl->Chmod(fd, mode); +} + +int +KfsClient::Chmod(const char* pathname, kfsMode_t 
mode) +{ + return mImpl->Chmod(pathname, mode); +} + +int +KfsClient::Chown(int fd, kfsUid_t user, kfsGid_t group) +{ + return mImpl->Chown(fd, user, group); +} + +int +KfsClient::Chown(int fd, const char* user, const char* group) +{ + return mImpl->Chown(fd, user, group); +} + +int +KfsClient::ChmodR(const char* pathname, kfsMode_t mode) +{ + return mImpl->ChmodR(pathname, mode); +} + +int +KfsClient::ChownR(const char* pathname, kfsUid_t user, kfsGid_t group) +{ + return mImpl->ChownR(pathname, user, group); +} + +int +KfsClient::ChownR(const char* pathname, const char* user, const char* group) +{ + return mImpl->ChownR(pathname, user, group); +} + +void +KfsClient::SetUMask(kfsMode_t mask) +{ + mImpl->SetUMask(mask); +} + +kfsMode_t +KfsClient::GetUMask() const +{ + return mImpl->GetUMask(); +} + +int +KfsClient::Chown(const char* pathname, kfsUid_t user, kfsGid_t group) +{ + return mImpl->Chown(pathname, user, group); +} + +int +KfsClient::Chown(const char* pathname, const char* user, const char* group) +{ + return mImpl->Chown(pathname, user, group); +} + +int +KfsClient::SetEUserAndEGroup(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt) +{ + return mImpl->SetEUserAndEGroup(user, group, groups, groupsCnt); +} + +int +KfsClient::GetUserAndGroupNames(kfsUid_t user, kfsGid_t group, + string& uname, string& gname) +{ + return mImpl->GetUserAndGroupNames(user, group, uname, gname); +} + +namespace client +{ + +class MatchingServer { + ServerLocation loc; +public: + MatchingServer(const ServerLocation &l) : loc(l) { } + bool operator()(KfsClient * &clnt) const { + return clnt->GetMetaserverLocation() == loc; + } + bool operator()(const ServerLocation &other) const { + return other == loc; + } +}; + +class KfsClientImpl::ClientsList +{ +public: + static void Insert(KfsClientImpl& client) + { Instance().InsertSelf(client); } + static void Remove(KfsClientImpl& client) + { Instance().RemoveSelf(client); } + static void Init(KfsClientImpl& client) + { 
Instance().InitSelf(client); } + static int SetEUserAndEGroup(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt) + { + return Instance().SetEUserAndEGroupSelf(user, group, groups, groupsCnt); + } +private: + typedef QCDLList List; + QCMutex mMutex; + KfsClientImpl* mList[1]; + + static ClientsList* sInstance; + + class Globals + { + public: + kfsMode_t mUMask; + kfsUid_t mEUser; + kfsGid_t mEGroup; + vector mGroups; + int mDefaultFileAttributeRevalidateTime; + + static const Globals& Get() + { return GetInstance(); } + static int SetEUserAndEGroup(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt) + { + return GetInstance().SetEUserAndEGroupSelf( + user, group, groups, groupsCnt); + } + private: + Globals() + : mUMask(0), + mEUser(geteuid()), + mEGroup(getegid()), + mGroups(), + mDefaultFileAttributeRevalidateTime(30) + { + signal(SIGPIPE, SIG_IGN); + libkfsio::InitGlobals(); + const int maxGroups = min((int)sysconf(_SC_NGROUPS_MAX), 1 << 16); + if (maxGroups > 0) { + gid_t* const grs = new gid_t[maxGroups]; + const int cnt = getgroups(maxGroups, grs); + if (cnt > 0) { + mGroups.reserve(cnt + 1); + for (int i = 0; i < cnt; i++) { + mGroups.push_back((kfsGid_t)grs[i]); + } + } + delete [] grs; + } + if (find(mGroups.begin(), mGroups.end(), mEGroup) == mGroups.end()) { + mGroups.push_back(mEGroup); + } + KfsOp::AddDefaultRequestHeaders(mEUser, mEGroup); + AddUserHeader((uid_t)mEUser); + const mode_t mask = umask(0); + umask(mask); + mUMask = mask & Permissions::kAccessModeMask; + const char* p = getenv("KFS_CLIENT_DEFAULT_FATTR_REVALIDATE_TIME"); + if (p) { + char* e = 0; + const long v = strtol(p, &e, 10); + if (p < e && (*e & 0xFF) <= ' ') { + mDefaultFileAttributeRevalidateTime = (int)v; + } + } + } + void AddUserHeader(uid_t uid) + { + struct passwd pwebuf = {0}; + struct passwd* pwe = 0; + char namebuf[1024]; + getpwuid_r(uid, &pwebuf, namebuf, sizeof(namebuf), &pwe); + if (pwe && pwe->pw_name) { + string hdr("User: "); + hdr += 
pwe->pw_name; + hdr += "\r\n"; + KfsOp::AddExtraRequestHeaders(hdr); + } + } + ~Globals() + { Instance().Shutdown(); } + int SetEUserAndEGroupSelf(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt) + { + mGroups.clear(); + if (groupsCnt > 0) { + mGroups.reserve(groupsCnt + 1); + for (int i = 0; i < groupsCnt; i++) { + mGroups.push_back(groups[i]); + } + } + if (group != kKfsGroupNone && + find(mGroups.begin(), mGroups.end(), group) == + mGroups.end()) { + mGroups.push_back(group); + } + mEUser = user == kKfsUserNone ? geteuid() : user; + mEGroup = group == kKfsGroupNone ? getegid() : group; + KfsOp::SetExtraRequestHeaders(string()); + KfsOp::AddDefaultRequestHeaders(mEUser, mEGroup); + AddUserHeader((uid_t)mEUser); + return 0; + } + static Globals& GetInstance() + { + static Globals globals; + return globals; + } + }; + friend class Globals; + + ClientsList() + : mMutex() + { List::Init(mList); } + ~ClientsList() + { assert(! "unexpected invocation"); } + void InsertSelf(KfsClientImpl& client) + { + QCStMutexLocker locker(mMutex); + List::Init(client); + List::PushBack(mList, client); + const Globals& globals = Globals::Get(); + client.mEUser = globals.mEUser; + client.mEGroup = globals.mEGroup; + client.mGroups = globals.mGroups; + client.mUMask = globals.mUMask; + client.mFileAttributeRevalidateTime = + globals.mDefaultFileAttributeRevalidateTime; + } + void RemoveSelf(KfsClientImpl& client) + { + QCStMutexLocker locker(mMutex); + List::Remove(mList, client); + } + void Shutdown() + { + KfsClientImpl* list[1]; + { + QCStMutexLocker locker(mMutex); + list[0] = mList[0]; + List::Init(mList); + } + while (! List::IsEmpty(list)) { + List::PopFront(list)->Shutdown(); + } + QCStMutexLocker locker(mMutex); + KfsOp::SetExtraRequestHeaders(string()); + } + void InitSelf(KfsClientImpl& /* client */) + { + QCStMutexLocker locker(mMutex); + if (! 
MsgLogger::IsLoggerInited()) { + MsgLogger::Init(0, GetLogLevel(getenv("KFS_CLIENT_LOG_LEVEL"))); + } + } + int SetEUserAndEGroupSelf(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupCnt) + { + QCStMutexLocker locker(mMutex); + if (List::IsEmpty(mList)) { + return Globals::SetEUserAndEGroup( + user, group, groups, groupCnt); + } + if (List::Front(mList) != List::Back(mList)) { + KFS_LOG_STREAM_ERROR << + "cannot change user and group -- more than one" + " KFS client instance already exist" << + KFS_LOG_EOM; + return -EINVAL; + } + KfsClientImpl& client = *List::Front(mList); + QCStMutexLocker clientLock(client.mMutex); + if (! client.mFileTable.empty()) { + KFS_LOG_STREAM_ERROR << + "setting user and group must be performed immediately" + " after KFS client created" << + KFS_LOG_EOM; + return -EINVAL; + } + const int ret = Globals::SetEUserAndEGroup( + user, group, groups, groupCnt); + if (ret != 0) { + return ret; + } + const Globals& globals = Globals::Get(); + client.mEUser = globals.mEUser; + client.mEUser = globals.mEUser; + client.mGroups = globals.mGroups; + return 0; + } + static ClientsList& Instance(); +}; + +KfsClientImpl::ClientsList& +KfsClientImpl::ClientsList::Instance() +{ + static bool sOnce = true; + if (sOnce) { + sOnce = false; + static struct { char alloc[sizeof(ClientsList)]; } sStorage; + sInstance = new (&sStorage) ClientsList(); + } + return *sInstance; +} +KfsClientImpl::ClientsList* KfsClientImpl::ClientsList::sInstance = + &KfsClientImpl::ClientsList::Instance(); + +// +// Now, the real work is done by the impl object.... 
+// + +KfsClientImpl::KfsClientImpl() + : mMutex(), + mIsInitialized(false), + mMetaServerLoc(), + mMetaServerSock(), + mCmdSeqNum(0), + mCwd("/"), + mFileTable(), + mFidNameToFAttrMap(), + mPathCache(), + mPathCacheNone(mPathCache.insert( + make_pair(string(), static_cast(0))).first), + mFAttrPool(), + mFreeFileTableEntires(), + mFattrCacheSkipValidateCnt(0), + mFileAttributeRevalidateTime(30), + mFAttrCacheGeneration(0), + mTmpPath(), + mTmpAbsPathStr(), + mTmpAbsPath(), + mTmpCurPath(), + mTmpDirName(), + mSlash("/"), + mDefaultIoBufferSize(min(CHUNKSIZE, size_t(1) << 20)), + mDefaultReadAheadSize(min(mDefaultIoBufferSize, size_t(1) << 20)), + mFailShortReadsFlag(true), + mFileInstance(0), + mProtocolWorker(0), + mMaxNumRetriesPerOp(DEFAULT_NUM_RETRIES_PER_OP), + mRetryDelaySec(RETRY_DELAY_SECS), + mDefaultOpTimeout(30), + mFreeCondVarsHead(0), + mEUser(kKfsUserNone), + mEGroup(kKfsGroupNone), + mUMask(0), + mGroups(), + mCreateId(RandomSeqNo()), + mUserNames(), + mGroupNames(), + mUserIds(), + mGroupIds() +{ + ClientsList::Insert(*this); + + QCStMutexLocker l(mMutex); + + FAttrLru::Init(mFAttrLru); + mTmpPath.reserve(32); + mResponseBuffer[kResponseBufferSize] = 0; +} + +KfsClientImpl::~KfsClientImpl() +{ + ClientsList::Remove(*this); + KfsClientImpl::Shutdown(); + + QCStMutexLocker l(mMutex); + FAttr* p; + while ((p = FAttrLru::Front(mFAttrLru))) { + Delete(p); + } + delete mProtocolWorker; + KfsClientImpl::CleanupPendingRead(); + vector ::iterator it = mFileTable.begin(); + while (it != mFileTable.end()) { + delete *it++; + } +} + +void +KfsClientImpl::Shutdown() +{ + QCStMutexLocker l(mMutex); + if (! 
mProtocolWorker) { + return; + } + l.Unlock(); + mProtocolWorker->Stop(); +} + +int KfsClientImpl::Init(const string &metaServerHost, int metaServerPort) +{ + ClientsList::Init(*this); + + mMetaServerLoc.hostname = metaServerHost; + mMetaServerLoc.port = metaServerPort; + + KFS_LOG_STREAM_DEBUG << + "connecting to metaserver at: " << + metaServerHost << ":" << metaServerPort << + KFS_LOG_EOM; + + if (!mMetaServerLoc.IsValid()) { + mIsInitialized = false; + KFS_LOG_STREAM_ERROR << + "invalid metaserver location: " << + metaServerHost << ":" << metaServerPort << + KFS_LOG_EOM; + return -1; + } + + for (int attempt = 0; ;) { + if (ConnectToMetaServer()) { + mIsInitialized = true; + break; + } + mIsInitialized = false; + if (++attempt >= mMaxNumRetriesPerOp) { + KFS_LOG_STREAM_ERROR << + "unable to connect to metaserver at: " << + metaServerHost << ":" << metaServerPort << + "; retrying..." << + KFS_LOG_EOM; + break; + } + Sleep(mRetryDelaySec); + } + if (!mIsInitialized) { + KFS_LOG_STREAM_ERROR << + "unable to connect to metaserver at: " << + metaServerHost << ":" << metaServerPort << + "; giving up" << + KFS_LOG_EOM; + return -1; + } + + mIsInitialized = true; + return 0; +} + +bool +KfsClientImpl::ConnectToMetaServer() +{ + return mMetaServerSock.Connect(mMetaServerLoc) >= 0; +} + + +/// A notion of "cwd" in KFS. +/// +int +KfsClientImpl::Cd(const char *pathname) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + string path; + const int status = StatSelf(pathname, attr, false, &path); + if (status < 0) { + KFS_LOG_STREAM_DEBUG << "Non-existent file: " << + (pathname ? pathname : "null:") << + KFS_LOG_EOM; + return status; + } + if (! attr.isDirectory) { + KFS_LOG_STREAM_DEBUG << "Non-existent dir: " << + (pathname ? 
pathname : "null:") << + KFS_LOG_EOM; + return -ENOTDIR; + } + mCwd = path; + return 0; +} + +/// +/// To allow tools to get at "pwd" +/// +string +KfsClientImpl::GetCwd() +{ + return mCwd; +} + +const char* +KfsClientImpl::GetTmpAbsPath(const char* pathname, size_t& ioLen) +{ + if (mTmpAbsPathStr.data() == pathname) { + return pathname; + } + if (ioLen <= 0) { + ioLen = mCwd.length(); + return mCwd.c_str(); + } + if (*pathname == '/') { + return pathname; + } + mTmpAbsPathStr.assign(mCwd.data(), mCwd.length()); + mTmpAbsPathStr.append("/", 1); + mTmpAbsPathStr.append(pathname, ioLen); + ioLen = mTmpAbsPathStr.length(); + return mTmpAbsPathStr.c_str(); +} + +/// +/// Make a directory hierarchy in KFS. +/// +int +KfsClientImpl::Mkdirs(const char *pathname, kfsMode_t mode) +{ + if (! pathname) { + return -EFAULT; + } + if (! *pathname) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + // + // Walk from the root down to the last part of the path making the + // directory hierarchy along the way. If any of the components of + // the path is a file, error out. + // + size_t len = strlen(pathname); + const char* ptr = GetTmpAbsPath(pathname, len); + if (! 
mTmpAbsPath.Set(ptr, len)) { + return -EINVAL; + } + const size_t sz = mTmpAbsPath.size(); + const Path::Token kRootDir = Path::Token("/", 1); + if (sz < 1 || mTmpAbsPath[0] != kRootDir) { + return -EINVAL; + } + const bool kComputeFileSize = false; + const time_t now = time(0); + bool createdFlag = false; + int res = 0; + const Path::Token kThisDir(".", 1); + const Path::Token kParentDir("..", 2); + mTmpCurPath.clear(); + mTmpCurPath.reserve(MAX_PATH_NAME_LENGTH); + mTmpPath.clear(); + mTmpPath.reserve(sz); + mTmpPath.push_back(make_pair(ROOTFID, 0)); + for (size_t i = 1; i < sz; i++) { + const Path::Token& cname = mTmpAbsPath[i]; + if (cname == kThisDir || cname.mLen <= 0) { + continue; + } + if (cname == kParentDir) { + const size_t pl = mTmpPath.size() - 1; + if (pl > 0) { + mTmpCurPath.erase(mTmpCurPath.length() - + mTmpAbsPath[mTmpPath.back().second].mLen - 1); + mTmpPath.pop_back(); + } + continue; + } + mTmpDirName.assign(cname.mPtr, cname.mLen); + if ((res = ValidateName(mTmpDirName)) != 0) { + break; + } + mTmpCurPath += mSlash; + mTmpCurPath += mTmpDirName; + if (mTmpCurPath.length() > MAX_PATH_NAME_LENGTH) { + res = -ENAMETOOLONG; + break; + } + FAttr* fa = LookupFAttr(mTmpPath.back().first, mTmpDirName); + res = (fa && IsValid(*fa, now)) ? 0 : + LookupAttr(mTmpPath.back().first, mTmpDirName, + fa, kComputeFileSize, mTmpCurPath); + if (res == 0) { + assert(fa); + if (! fa->isDirectory) { + res = -ENOTDIR; + break; + } + // Invalidate the counts, assuming that in most cases case a new sub + // directory will be created. + fa->staleSubCountsFlag = true; + mTmpPath.push_back(make_pair(fa->fileId, i)); + continue; + } + if (res != -ENOENT) { + break; + } + MkdirOp op(nextSeq(), mTmpPath.back().first, mTmpDirName.c_str(), + Permissions(mEUser, mEGroup, + mode != kKfsModeUndef ? (mode & ~mUMask) : mode), + NextCreateId() + ); + DoMetaOpWithRetry(&op); + if ((res = op.status) == 0) { + mTmpPath.push_back(make_pair(op.fileId, i)); + if (! 
createdFlag && (fa = LookupFAttr(ROOTFID, mSlash))) { + fa->staleSubCountsFlag = true; + } + createdFlag = true; + continue; + } + if (res != -EEXIST) { + break; + } + fa = 0; + if ((res = LookupAttr(mTmpPath.back().first, mTmpDirName, + fa, kComputeFileSize, mTmpCurPath)) != 0) { + break; + } + assert(fa); + if (! fa->isDirectory) { + res = -ENOTDIR; + break; + } + fa->staleSubCountsFlag = true; + mTmpPath.push_back(make_pair(fa->fileId, i)); + } + mTmpAbsPath.Clear(); + return res; +} + +/// +/// Make a directory in KFS. +/// @param[in] pathname The full pathname such as /.../dir +/// @retval 0 if mkdir is successful; -errno otherwise +int +KfsClientImpl::Mkdir(const char *pathname, kfsMode_t mode) +{ + QCStMutexLocker l(mMutex); + + kfsFileId_t parentFid; + string dirname; + string path; + const bool kEnforceLastDirFlag = false; + const bool kInvalidateSubCountsFlag = true; + int res = GetPathComponents( + pathname, &parentFid, dirname, &path, + kInvalidateSubCountsFlag, kEnforceLastDirFlag); + if (res < 0) { + return res; + } + MkdirOp op(nextSeq(), parentFid, dirname.c_str(), + Permissions(mEUser, mEGroup, + mode != kKfsModeUndef ? (mode & ~mUMask) : mode), + NextCreateId() + ); + DoMetaOpWithRetry(&op); + if (op.status < 0) { + return op.status; + } + return 0; +} + +/// +/// Remove a directory in KFS. +/// @param[in] pathname The full pathname such as /.../dir +/// @retval 0 if rmdir is successful; -errno otherwise +int +KfsClientImpl::Rmdir(const char *pathname) +{ + QCStMutexLocker l(mMutex); + + string dirname; + string path; + kfsFileId_t parentFid; + const bool kInvalidateSubCountsFlag = true; + int res = GetPathComponents(pathname, &parentFid, dirname, &path, + kInvalidateSubCountsFlag); + if (res < 0) { + return res; + } + RmdirOp op(nextSeq(), parentFid, dirname.c_str(), path.c_str()); + DoMetaOpWithRetry(&op); + Delete(LookupFAttr(parentFid, dirname)); + return op.status; +} + +/// +/// Remove a directory hierarchy in KFS. 
+/// @param[in] pathname The full pathname such as /.../dir +/// @retval 0 if rmdir is successful; -errno otherwise +int +KfsClientImpl::Rmdirs(const char *pathname) +{ + return RmdirsFast(pathname); +} + +/// +/// Remove a directory hierarchy in KFS. +/// @param[in] pathname The full pathname such as /.../dir +/// @retval 0 if rmdir is successful; -errno otherwise +int +KfsClientImpl::RmdirsFast(const char *pathname) +{ + QCStMutexLocker l(mMutex); + + if (! pathname || ! *pathname) { + return -EINVAL; + } + string dirname; + kfsFileId_t parentFid = -1; + string path; + int ret = GetPathComponents(pathname, &parentFid, dirname, &path); + if (ret < 0) { + return ret; + } + FAttr* fa = 0; + ret = LookupAttr(parentFid, dirname, fa, false, path); + if (ret < 0) { + return ret; + } + if (! fa->isDirectory) { + return -ENOTDIR; + } + const size_t pos = path.rfind('/'); + if (pos == string::npos) { + assert(! "internal error: invalid path name"); + return -EFAULT; + } + ret = RmdirsSelf( + path.substr(0, pos), + (pos == 0 && dirname == "/") ? string() : dirname, + parentFid, + fa->fileId + ); + // Invalidate cached attributes. + InvalidateAllCachedAttrs(); + return ret; +} + +void +KfsClientImpl::InvalidateAllCachedAttrs() +{ + // Invalidate cached attributes. + mFAttrCacheGeneration++; +} + +int +KfsClientImpl::RmdirsSelf(const string& path, const string& dirname, + kfsFileId_t parentFid, kfsFileId_t dirFid) +{ + vector entries; + // don't compute any filesize; don't update client cache + const string p = path + "/" + dirname; + int res = ReaddirPlus(p, dirFid, entries, false, false); + if (res < 0) { + return res; + } + for (vector::const_iterator it = entries.begin(); + it != entries.end(); + ++it) { + if (it->filename == "." 
|| it->filename == "..") { + continue; + } + if (it->isDirectory) { + if (it->fileId == ROOTFID) { + continue; + } + res = RmdirsSelf(p, it->filename, dirFid, it->fileId); + } else { + res = Remove(p, dirFid, it->filename); + } + if (res < 0) { + return res; + } + } + if (dirname.empty() || (parentFid == ROOTFID && dirname == "/")) { + return 0; + } + RmdirOp op(nextSeq(), parentFid, dirname.c_str(), p.c_str()); + DoMetaOpWithRetry(&op); + return op.status; +} + +int +KfsClientImpl::Remove(const string& dirname, kfsFileId_t dirFid, const string& filename) +{ + string pathname = dirname + "/" + filename; + RemoveOp op(nextSeq(), dirFid, filename.c_str(), pathname.c_str()); + DoMetaOpWithRetry(&op); + return op.status; +} + +/// +/// Read a directory's contents. This is analogous to READDIR in +/// NFS---just reads the directory contents and returns the names; +/// you'll need to lookup the attributes next. The resulting +/// directory entries are sorted lexicographically. +/// +/// XXX NFS READDIR also returns the file ids, and we should do +/// the same here. +/// +/// @param[in] pathname The full pathname such as /.../dir +/// @param[out] result The filenames in the directory +/// @retval 0 if readdir is successful; -errno otherwise +int +KfsClientImpl::Readdir(const char* pathname, vector& result) +{ + QCStMutexLocker l(mMutex); + + result.clear(); + KfsFileAttr attr; + const int res = StatSelf(pathname, attr, false); + if (res < 0) { + return res; + } + if (! 
attr.isDirectory) { + return -ENOTDIR; + } + + ReaddirOp op(0, attr.fileId); + for (int retryCnt = kMaxReadDirRetries; ;) { + op.seq = nextSeq(); + op.numEntries = kMaxReaddirEntries; + op.contentLength = 0; + op.hasMoreEntriesFlag = false; + + DoMetaOpWithRetry(&op); + if (op.status < 0) { + if (op.fnameStart.empty() || + (op.status != -ENOENT && op.status != -EAGAIN)) { + break; + } + if (--retryCnt <= 0) { + KFS_LOG_STREAM_ERROR << + pathname << ": id: " << attr.fileId << + " directory has changed " << + kMaxReadDirRetries << + " times while attempting to list it; giving up" << + KFS_LOG_EOM; + op.status = -EAGAIN; + break; + } + result.clear(); + op.fnameStart.clear(); + continue; + } + if (op.numEntries <= 0) { + break; + } + if (op.contentLength <= 0) { + op.status = -EIO; + break; + } + assert(op.contentBuf && op.contentBufLen >= op.contentLength); + BufferInputStream ist(op.contentBuf, op.contentLength); + result.reserve(result.size() + op.numEntries); + for (int i = 0; i < op.numEntries; i++) { + string line; + if (! getline(ist, line) || line.empty()) { + op.status = -EIO; + break; + } + result.push_back(line); + } + if (! op.hasMoreEntriesFlag || op.status != 0) { + break; + } + op.fnameStart = result.back(); + } + if (op.status == 0) { + sort(result.begin(), result.end()); + if (! op.fnameStart.empty()) { + unique(result.begin(), result.end()); + } + } else { + result.clear(); + } + return op.status; +} + +/// +/// Read a directory's contents and get the attributes. This is +/// analogous to READDIRPLUS in NFS. The resulting directory entries +/// are sort lexicographically. 
+/// +/// @param[in] pathname The full pathname such as /.../dir +/// @param[out] result The filenames in the directory and their attributes +/// @retval 0 if readdir is successful; -errno otherwise +int +KfsClientImpl::ReaddirPlus(const char* pathname, vector& result, + bool computeFilesize) +{ + QCStMutexLocker l(mMutex); + + result.clear(); + KfsFileAttr attr; + string path; + const int res = StatSelf(pathname, attr, false, &path); + if (res < 0) { + return res; + } + if (! attr.isDirectory) { + return -ENOTDIR; + } + return ReaddirPlus(path, attr.fileId, result, computeFilesize); +} + +class ReaddirPlusParser +{ +public: + ReaddirPlusParser() + : mEntry(), + mHexParserFlag(false) + {} + void SetUseHexParser() { mHexParserFlag = true; } + bool Parse( + PropertiesTokenizer& tokenizer) + { + mEntry.Reset(); + if (mHexParserFlag) { + sHexParser.Parse(tokenizer, &mEntry); + } else { + sParser.Parse(tokenizer, &mEntry); + } + return mEntry.Validate(); + } + void LastChunkInfo(ChunkAttr& info) const + { + info.chunkOffset = mEntry.lastChunkOffset; + info.chunkId = mEntry.chunkId; + info.chunkVersion = mEntry.chunkVersion; + info.chunkServerLoc.clear(); + + const int numReplicas = mEntry.lastchunkNumReplicas; + if (numReplicas <= 0 || mEntry.lastChunkReplicas.empty()) { + return; + } + BufferInputStream is( + mEntry.lastChunkReplicas.GetPtr(), + mEntry.lastChunkReplicas.GetSize() + ); + if (mHexParserFlag) { + is.flags(istream::hex | istream::skipws); + } + info.chunkServerLoc.resize(numReplicas); + for (int i = 0; i < numReplicas; ++i) { + is >> info.chunkServerLoc[i].hostname; + is >> info.chunkServerLoc[i].port; + } + } + const KfsFileAttr& GetFattr() const + { return mEntry; } +private: + class Entry : public KfsFileAttr + { + public: + enum { kCTimeUndef = 2 * 1000 * 1000 + 1 }; + chunkOff_t lastChunkOffset; + kfsFileId_t chunkId; + int64_t chunkVersion; + int lastchunkNumReplicas; + StringBufT<128> lastChunkReplicas; + StringBufT<32> type; + + Entry() + : 
KfsFileAttr(), + lastChunkOffset(0), + chunkId(-1), + chunkVersion(-1), + lastchunkNumReplicas(0), + lastChunkReplicas(), + type() + {} + void Reset() + { + Clear(); + lastChunkOffset = 0; + chunkId = -1; + chunkVersion = -1; + lastchunkNumReplicas = 0; + ctime.tv_usec = kCTimeUndef; + lastChunkReplicas.clear(); + type.clear(); + } + bool Validate() + { + isDirectory = type.Compare("dir") == 0; + if (isDirectory) { + if (fileSize < 0) { + fileSize = 0; + } + if (subCount2 < 0) { + subCount1 = -1; + } + } else if (subCount1 <= 0) { + subCount1 = 0; + } + if (ctime.tv_usec == kCTimeUndef) { + ctime = crtime; + } + return (! filename.empty()); + } + }; + template + class VParser + { + public: + // Specialization for StripedFileType + typedef ValueParserT ValueParser; + + template + static void SetValue( + const char* inPtr, + size_t inLen, + const T& inDefaultValue, + T& outValue) + { + ValueParser::SetValue(inPtr, inLen, inDefaultValue, outValue); + } + template + static bool ParseInt( + const char*& ioPtr, + size_t inLen, + T& outValue) + { + return ValueParser::ParseInt(ioPtr, inLen, outValue); + } + static void SetValue( + const char* inPtr, + size_t inLen, + const StripedFileType& inDefaultValue, + StripedFileType& outValue) + { + int theVal = 0; + if (! 
ValueParser::ParseInt(inPtr, inLen, theVal)) { + outValue = inDefaultValue; + return; + } + switch (theVal) { + case KFS_STRIPED_FILE_TYPE_NONE: + outValue = KFS_STRIPED_FILE_TYPE_NONE; + return; + case KFS_STRIPED_FILE_TYPE_RS: + outValue = KFS_STRIPED_FILE_TYPE_RS; + return; + default: + outValue = inDefaultValue; + } + } + }; + class VParserDec + { + public: + // Specialization for struct timeval + typedef struct timeval TimeVal; + typedef VParser ValueParser; + + template + static void SetValue( + const char* inPtr, + size_t inLen, + const T& inDefaultValue, + T& outValue) + { + ValueParser::SetValue(inPtr, inLen, inDefaultValue, outValue); + } + static void SetValue( + const char* inPtr, + size_t inLen, + const TimeVal& inDefaultValue, + TimeVal& outValue) + { + if (! ValueParser::ParseInt(inPtr, inLen, outValue.tv_sec) || + ! ValueParser::ParseInt(inPtr, inLen, outValue.tv_usec)) { + outValue = inDefaultValue; + } + } + }; + class VParserHex + { + public: + // Specialization for struct timeval + typedef struct timeval TimeVal; + typedef VParser ValueParser; + + template + static void SetValue( + const char* inPtr, + size_t inLen, + const T& inDefaultValue, + T& outValue) + { + ValueParser::SetValue(inPtr, inLen, inDefaultValue, outValue); + } + static void SetValue( + const char* inPtr, + size_t inLen, + const TimeVal& inDefaultValue, + TimeVal& outValue) + { + const int64_t kUses = 1000000; + int64_t theTimeUsecs = 0; + if (ValueParser::ParseInt(inPtr, inLen, theTimeUsecs)) { + outValue.tv_sec = theTimeUsecs / kUses; + outValue.tv_usec = theTimeUsecs % kUses; + } else { + outValue = inDefaultValue; + } + } + }; + typedef ObjectParser Parser; + typedef ObjectParser HexParser; + + Entry mEntry; + bool mHexParserFlag; + + static const Parser& sParser; + static const HexParser& sHexParser; + + static const Parser& MakeParser() + { + static Parser sParser; + return sParser + .Def("Name", &Entry::filename ) + .Def("File-handle", &Entry::fileId, kfsFileId_t(-1)) + 
.Def("Type", &Entry::type ) + .Def("M-Time", &Entry::mtime ) + .Def("C-Time", &Entry::ctime ) + .Def("CR-Time", &Entry::crtime ) + .Def("Replication", &Entry::numReplicas, int16_t(1)) + .Def("Chunk-count", &Entry::subCount1 ) + .Def("File-size", &Entry::fileSize, chunkOff_t(-1)) + .Def("Striper-type", &Entry::striperType, KFS_STRIPED_FILE_TYPE_UNKNOWN) + .Def("Num-stripes", &Entry::numStripes ) + .Def("Num-recovery-stripes", &Entry::numRecoveryStripes ) + .Def("Stripe-size", &Entry::stripeSize ) + .Def("Chunk-offset", &Entry::lastChunkOffset ) + .Def("Chunk-handle", &Entry::chunkId, kfsFileId_t(-1)) + .Def("Chunk-version", &Entry::chunkVersion, int64_t(-1)) + .Def("Num-replicas", &Entry::lastchunkNumReplicas ) + .Def("Replicas", &Entry::lastChunkReplicas ) + .Def("User", &Entry::user, kKfsUserNone) + .Def("Group", &Entry::group, kKfsGroupNone) + .Def("Mode", &Entry::mode, kKfsModeUndef) + .Def("File-count", &Entry::subCount1, int64_t(-1)) + .Def("Dir-count", &Entry::subCount2, int64_t(-1)) + .DefDone() + ; + }; + // Short keys to save bandwidth / memory + static const HexParser& MakeHexParser() + { + static HexParser sParser; + return sParser + .Def("N", &Entry::filename ) + .Def("H", &Entry::fileId, kfsFileId_t(-1)) + .Def("T", &Entry::type ) + .Def("M", &Entry::mtime ) + .Def("C", &Entry::ctime ) + .Def("CR", &Entry::crtime ) + .Def("R", &Entry::numReplicas, int16_t(1)) + .Def("CC", &Entry::subCount1 ) + .Def("S", &Entry::fileSize, chunkOff_t(-1)) + .Def("ST", &Entry::striperType, KFS_STRIPED_FILE_TYPE_UNKNOWN) + .Def("SC", &Entry::numStripes ) + .Def("SR", &Entry::numRecoveryStripes ) + .Def("SS", &Entry::stripeSize ) + .Def("LO", &Entry::lastChunkOffset ) + .Def("LH", &Entry::chunkId, kfsFileId_t(-1)) + .Def("LV", &Entry::chunkVersion, int64_t(-1)) + .Def("LN", &Entry::lastchunkNumReplicas ) + .Def("LR", &Entry::lastChunkReplicas ) + .Def("U", &Entry::user, kKfsUserNone) + .Def("G", &Entry::group, kKfsGroupNone) + .Def("A", &Entry::mode, kKfsModeUndef) + 
.Def("FC", &Entry::subCount1, int64_t(-1)) + .Def("DC", &Entry::subCount2, int64_t(-1)) + .DefDone() + ; + } +}; + +const ReaddirPlusParser::Parser& ReaddirPlusParser::sParser = + ReaddirPlusParser::MakeParser(); +const ReaddirPlusParser::HexParser& ReaddirPlusParser::sHexParser = + ReaddirPlusParser::MakeHexParser(); + +bool +KfsClientImpl::Cache(time_t now, const string& dirname, kfsFileId_t dirFid, + const KfsFileAttr& attr) +{ + if (attr.filename == "." || attr.filename == "..") { + return true; + } + const string path = dirname + "/" + attr.filename; + FAttr* fa = LookupFAttr(dirFid, attr.filename); + if (fa) { + UpdatePath(fa, path, false); + FAttrLru::PushBack(mFAttrLru, *fa); + } else { + fa = NewFAttr(dirFid, attr.filename, path); + if (! fa) { + return false; + } + } + *fa = attr; + fa->validatedTime = now; + fa->generation = mFAttrCacheGeneration; + fa->staleSubCountsFlag = false; + return true; +} + +int +KfsClientImpl::ReaddirPlus(const string& pathname, kfsFileId_t dirFid, + vector& result, bool computeFilesize, bool updateClientCache) +{ + assert(mMutex.IsOwned()); + + vector fileChunkInfo; + ReaddirPlusParser parser; + const PropertiesTokenizer::Token beginEntry("Begin-entry"); + const PropertiesTokenizer::Token shortBeginEntry("B"); + const bool kGetLastChunkInfoIfSizeUnknown = true; + ReaddirPlusOp op( + 0, dirFid, kGetLastChunkInfoIfSizeUnknown); + const time_t now = time(0); + bool hasDirs = false; + for (int retryCnt = kMaxReadDirRetries; ;) { + op.seq = nextSeq(); + op.numEntries = kMaxReaddirEntries; + op.contentLength = 0; + op.hasMoreEntriesFlag = false; + + DoMetaOpWithRetry(&op); + + if (op.status < 0) { + if (op.fnameStart.empty() || + (op.status != -ENOENT && op.status != -EAGAIN)) { + break; + } + if (--retryCnt <= 0) { + KFS_LOG_STREAM_ERROR << + pathname << ": id: " << dirFid << + " directory has changed " << + kMaxReadDirRetries << + " times while attempting to list it; giving up" << + KFS_LOG_EOM; + op.status = -EAGAIN; + break; + 
} + result.clear(); + fileChunkInfo.clear(); + op.fnameStart.clear(); + continue; + } + if (op.numEntries <= 0) { + break; + } + if (op.contentLength <= 0) { + op.status = -EIO; + break; + } + if (op.numEntries <= 0) { + break; + } + // The response format: + // Begin-entry Begin-entry + // The last entry doesn't have a end-marker. + result.reserve(result.size() + op.numEntries); + PropertiesTokenizer tokenizer(op.contentBuf, op.contentLength, false); + tokenizer.Next(); + const PropertiesTokenizer::Token& beginToken = + tokenizer.GetKey() == shortBeginEntry ? + shortBeginEntry : beginEntry; + if (&beginToken == &shortBeginEntry) { + parser.SetUseHexParser(); + } + for (int i = 0; i < op.numEntries; i++) { + if (tokenizer.GetKey() != beginToken) { + op.status = -EIO; + break; + } + if (! parser.Parse(tokenizer)) { + continue; // Skip empty entries. + } + result.push_back(parser.GetFattr()); + KfsFileAttr& attr = result.back(); + if (attr.filename.empty()) { + op.status = -EIO; + break; + } + if (attr.isDirectory) { + if (hasDirs) { + continue; + } + if (attr.filename != "." && attr.filename != "..") { + hasDirs = true; + } + continue; + } + if (! computeFilesize || attr.fileSize >= 0) { + continue; + } + FAttr* const fa = LookupFAttr(dirFid, attr.filename); + if (fa && ! fa->isDirectory && fa->fileSize >= 0) { + if (IsValid(*fa, now)) { + attr.fileSize = fa->fileSize; + continue; + } + } + fileChunkInfo.resize(result.size()); + parser.LastChunkInfo(fileChunkInfo.back()); + } + if (! op.hasMoreEntriesFlag || op.status != 0) { + break; + } + op.fnameStart = result.back().filename; + } + if (op.status != 0) { + result.clear(); + return op.status; + } + ComputeFilesizes(result, fileChunkInfo); + + // if there are too many entries in the dir, then the caller is + // probably scanning the directory. 
don't put it in the cache + string dirname(pathname); + for (string::size_type len = dirname.size(); + len > 0 && dirname[len - 1] == '/'; + ) { + dirname.erase(--len); + } + const size_t kMaxUpdateSize = 1 << 10; + if (updateClientCache && + result.size() <= kMaxUpdateSize && + mFidNameToFAttrMap.size() < kMaxUpdateSize) { + for (size_t i = 0; i < result.size(); i++) { + if (! Cache(now, dirname, dirFid, result[i])) { + break; + } + } + } else if (updateClientCache && hasDirs) { + size_t dirCnt = 0; + size_t kMaxDirUpdateSize = 1024; + for (size_t i = 0; i < result.size(); i++) { + if (! result[i].isDirectory) { + continue; + } + if (! Cache(now, dirname, dirFid, result[i]) || + kMaxDirUpdateSize <= ++dirCnt) { + break; + } + } + } + + sort(result.begin(), result.end()); + if (! op.fnameStart.empty()) { + // The meta server doesn't guarantee that listing restarts from the + // exact same position if there were entry names hash collisions, + // and the the name where collision occurred was used as cursor (the + // restart point), and the entry was removed and added back right + // before the next readdir rpc execution started. + // This is due to b+tree ordering where newly added entries inserted + // before the existing entries with the same keys. The name hash is + // used as part of the b+tree key. + // Remove duplicate entries, if any. + unique(result.begin(), result.end(), + bind(&KfsFileAttr::filename, _1) == + bind(&KfsFileAttr::filename, _2) + ); + } + return 0; +} + +int +KfsClientImpl::Stat(const char *pathname, KfsFileAttr &kfsattr, bool computeFilesize) +{ + QCStMutexLocker l(mMutex); + const bool kValidSubCountsRequiredFlag = true; + return StatSelf(pathname, kfsattr, computeFilesize, 0, 0, + kValidSubCountsRequiredFlag); +} + +int +KfsClientImpl::StatSelf(const char* pathname, KfsFileAttr& kfsattr, + bool computeFilesize, string* path, KfsClientImpl::FAttr** cattr, + bool validSubCountsRequiredFlag) +{ + assert(mMutex.IsOwned()); + + if (! 
pathname) { + return -EFAULT; + } + if (! *pathname) { + return -EINVAL; + } + if (pathname[0] == '/') { + mTmpAbsPathStr = pathname; + } else { + mTmpAbsPathStr.assign(mCwd.data(), mCwd.length()); + mTmpAbsPathStr.append(pathname); + } + FAttr* fa = LookupFAttr(mTmpAbsPathStr, path); + if (! fa || (computeFilesize && ! fa->isDirectory && fa->fileSize < 0) || + (validSubCountsRequiredFlag && fa->staleSubCountsFlag) || + ! IsValid(*fa, time(0))) { + kfsFileId_t parentFid; + string filename; + string tmpPath; + string& fpath = path ? *path : tmpPath; + int res = GetPathComponents( + mTmpAbsPathStr.c_str(), &parentFid, filename, &fpath); + if (res == 0) { + res = LookupAttr(parentFid, filename, fa, computeFilesize, fpath, + validSubCountsRequiredFlag); + } + if (res < 0) { + return res; + } + } + if (fa) { + kfsattr = *fa; + kfsattr.filename = fa->fidNameIt->first.second; + } + if (cattr) { + *cattr = fa; + } + KFS_LOG_STREAM_DEBUG << + pathname << ": size: " << kfsattr.fileSize << + KFS_LOG_EOM; + return 0; +} + +int +KfsClientImpl::GetNumChunks(const char *pathname) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + string path; + const int res = StatSelf(pathname, attr, false); + if (res != 0) { + return (res < 0 ? res : -res); + } + if (attr.isDirectory) { + return -EISDIR; + } + return attr.chunkCount(); +} + + +bool +KfsClientImpl::Exists(const char *pathname) +{ + KfsFileAttr attr; + return (Stat(pathname, attr, false) == 0); +} + +bool +KfsClientImpl::IsFile(const char *pathname) +{ + KfsFileAttr attr; + return (Stat(pathname, attr, false) == 0 && ! 
attr.isDirectory); +} + +bool +KfsClientImpl::IsDirectory(const char *pathname) +{ + KfsFileAttr attr; + return (Stat(pathname, attr, false) == 0 && attr.isDirectory); +} + +int +KfsClientImpl::LookupAttr(kfsFileId_t parentFid, const string& filename, + KfsClientImpl::FAttr*& fa, bool computeFilesize, const string& path, + bool validSubCountsRequiredFlag) +{ + assert(mMutex.IsOwned()); + + if (parentFid < 0) { + return -EINVAL; + } + if (! fa) { + fa = LookupFAttr(path, 0); + if (fa && (! validSubCountsRequiredFlag || ! fa->staleSubCountsFlag) && + (! computeFilesize || fa->isDirectory || fa->fileSize >= 0) && + IsValid(*fa, time(0))) { + return 0; + } + } + LookupOp op(nextSeq(), parentFid, filename.c_str()); + DoMetaOpWithRetry(&op); + if (op.status < 0) { + Delete(fa); + fa = 0; + return op.status; + } + if (! op.fattr.isDirectory && computeFilesize && op.fattr.fileSize < 0) { + op.fattr.fileSize = ComputeFilesize(op.fattr.fileId); + if (op.fattr.fileSize < 0) { + // We are asked for filesize and if we can't compute it, fail + return -EIO; + } + } + if (! fa) { + fa = LookupFAttr(parentFid, filename); + } + if (fa) { + // Update i-node cache. + if (! path.empty() && path[0] == '/') { + if (fa->nameIt == mPathCacheNone) { + if (filename != "." && filename != "..") { + fa->nameIt = + mPathCache.insert(make_pair(path, fa)).first; + assert(fa->nameIt->second == fa); + } + } else if (fa->nameIt->first != path) { + mPathCache.erase(fa->nameIt); + mPathCache[path] = fa; + } + } else if (fa->nameIt != mPathCacheNone) { + mPathCache.erase(fa->nameIt); + fa->nameIt = mPathCacheNone; + } + FAttrLru::PushBack(mFAttrLru, *fa); + } else { + fa = NewFAttr(parentFid, filename, path); + if (! 
fa) { + return -ENOMEM; + } + } + *fa = op.fattr; + fa->validatedTime = time(0); + fa->generation = mFAttrCacheGeneration; + fa->staleSubCountsFlag = false; + return 0; +} + +int +KfsClientImpl::Create(const char *pathname, int numReplicas, bool exclusive, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + bool forceTypeFlag, kfsMode_t mode) +{ + QCStMutexLocker l(mMutex); + return CreateSelf(pathname, numReplicas, exclusive, + numStripes, numRecoveryStripes, stripeSize, stripedType, forceTypeFlag, + mode); +} + +int +KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + bool forceTypeFlag, kfsMode_t mode) +{ + if (! pathname || ! *pathname) { + return -EINVAL; + } + + assert(mMutex.IsOwned()); + int res = ValidateCreateParams(numReplicas, numStripes, numRecoveryStripes, + stripeSize, stripedType); + if (res < 0) { + return res; + } + kfsFileId_t parentFid; + string filename; + string path; + const bool kInvalidateSubCountsFlag = true; + res = GetPathComponents(pathname, &parentFid, filename, &path, + kInvalidateSubCountsFlag); + Delete(LookupFAttr(parentFid, filename)); + if (res < 0) { + KFS_LOG_STREAM_DEBUG << + pathname << ": GetPathComponents: " << res << + KFS_LOG_EOM; + return res; + } + CreateOp op(nextSeq(), parentFid, filename.c_str(), numReplicas, exclusive, + Permissions(mEUser, mEGroup, + mode != kKfsModeUndef ? (mode & ~mUMask) : mode), + exclusive ? 
NextCreateId() : -1 + ); + if (stripedType == KFS_STRIPED_FILE_TYPE_RS) { + if (numStripes <= 0) { + KFS_LOG_STREAM_DEBUG << + pathname << ": invalid stripe count: " << numStripes << + KFS_LOG_EOM; + return -EINVAL; + } + if (numRecoveryStripes < 0) { + KFS_LOG_STREAM_DEBUG << + pathname << ": invalid recovery stripe count: " << + numRecoveryStripes << + KFS_LOG_EOM; + return -EINVAL; + } + if (stripeSize < KFS_MIN_STRIPE_SIZE || + stripeSize > KFS_MAX_STRIPE_SIZE || + stripeSize % KFS_STRIPE_ALIGNMENT != 0 || + CHUNKSIZE % stripeSize != 0) { + KFS_LOG_STREAM_DEBUG << + pathname << ": invalid stripe size: " << stripeSize << + KFS_LOG_EOM; + return -EINVAL; + } + op.striperType = KFS_STRIPED_FILE_TYPE_RS; + op.numStripes = numStripes; + op.numRecoveryStripes = numRecoveryStripes; + op.stripeSize = stripeSize; + } else if (stripedType != KFS_STRIPED_FILE_TYPE_NONE) { + KFS_LOG_STREAM_DEBUG << + pathname << ": invalid striped file type: " << stripedType << + KFS_LOG_EOM; + return -EINVAL; + } + DoMetaOpWithRetry(&op); + if (op.status < 0) { + KFS_LOG_STREAM_ERROR << + pathname << ": create: " << op.status << " " << op.statusMsg << + KFS_LOG_EOM; + return op.status; + } + if (op.striperType != op.metaStriperType && forceTypeFlag) { + KFS_LOG_STREAM_ERROR << + pathname << ": create: " << "striped file type " << op.striperType << + " is not supported " << " got: " << op.metaStriperType << + KFS_LOG_EOM; + // Cleanup the file. + RemoveOp rm(nextSeq(), parentFid, filename.c_str(), pathname); + DoMetaOpWithRetry(&rm); + return -ENXIO; + } + + // Do not attempt to re-use possibly existing file table entry. + // If file existed and being written into it is moved into the dumpster by + // the meta server. + // An attempt to re-use the same file table entry would route the ios to the + // previously existed file into newly created one. 
+ const int fte = AllocFileTableEntry(parentFid, filename, path); + if (fte < 0) { // XXX Too many open files + KFS_LOG_STREAM_DEBUG << + pathname << ": AllocFileTableEntry: " << fte << + KFS_LOG_EOM; + return fte; + } + + // make it the same as creat(): equivalent to open(O_CREAT|O_WRONLY|O_TRUNC). + FileTableEntry& entry = *mFileTable[fte]; + entry.openMode = O_WRONLY; + FileAttr& fa = entry.fattr; + fa.Init(false); // is an ordinary file + fa.fileId = op.fileId; + fa.numReplicas = numReplicas; + fa.fileSize = 0; // presently CreateOp always deletes file if exists. + if (op.metaStriperType != KFS_STRIPED_FILE_TYPE_NONE) { + fa.numStripes = (int16_t)numStripes; + fa.numRecoveryStripes = (int16_t)numRecoveryStripes; + fa.striperType = (StripedFileType)stripedType; + fa.stripeSize = stripeSize; + } + static_cast(fa) = op.permissions; + // Set optimal io size, like open does. + SetOptimalReadAheadSize(entry, mDefaultReadAheadSize); + SetOptimalIoBufferSize(entry, mDefaultIoBufferSize); + + return fte; +} + +int +KfsClientImpl::Remove(const char *pathname) +{ + if (! pathname || ! *pathname) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + kfsFileId_t parentFid; + string filename; + string path; + const bool kInvalidateSubCountsFlag = true; + int res = GetPathComponents(pathname, &parentFid, filename, &path, + kInvalidateSubCountsFlag); + if (res < 0) { + return res; + } + RemoveOp op(nextSeq(), parentFid, filename.c_str(), path.c_str()); + DoMetaOpWithRetry(&op); + Delete(LookupFAttr(parentFid, filename)); + return op.status; +} + +int +KfsClientImpl::Rename(const char* src, const char* dst, bool overwrite) +{ + if (! src || ! *src || ! dst || ! 
*dst) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + kfsFileId_t srcParentFid; + string srcFileName; + string srcPath; + const bool kInvalidateSubCountsFlag = true; + int res = GetPathComponents(src, &srcParentFid, srcFileName, &srcPath, + kInvalidateSubCountsFlag); + if (res < 0) { + KFS_LOG_STREAM_DEBUG << "reanme: " << + src << " " << dst << " status: " << res << + KFS_LOG_EOM; + return res; + } + string dstFileName; + string dstPath; + kfsFileId_t dstParentFid; + res = GetPathComponents(dst, &dstParentFid, dstFileName, &dstPath, + kInvalidateSubCountsFlag); + if (res < 0) { + KFS_LOG_STREAM_DEBUG << "reanme: " << + src << " " << dst << " status: " << res << + KFS_LOG_EOM; + return res; + } + if (srcParentFid == dstParentFid && dstFileName == srcFileName) { + return 0; // src and dst are the same. + } + RenameOp op(nextSeq(), srcParentFid, srcFileName.c_str(), + dstPath.c_str(), srcPath.c_str(), overwrite); + DoMetaOpWithRetry(&op); + + KFS_LOG_STREAM_DEBUG << "reanme: " << + srcPath << " " << dstPath << " status: " << op.status << + KFS_LOG_EOM; + + // Invalidate file attribute and the path cache + bool invalidateFlag = true; + for (string* pp = &srcPath; ; pp = &dstPath) { + string& path = *pp; + if (path.empty()) { + continue; + } + if (*path.rbegin() != '/') { + path += "/"; + } + const size_t len = path.length(); + int maxInval = + (int)min(size_t(256), mFidNameToFAttrMap.size() / 2 + 1); + for (NameToFAttrMap::iterator it = mPathCache.lower_bound(path); + it != mPathCache.end(); + ) { + const string& cp = it->first; + if (cp.length() < len || cp.compare(0, len, path) != 0) { + break; + } + if (--maxInval < 0) { + break; + } + FAttr* const fa = it->second; + ++it; + Delete(fa); + } + if (maxInval < 0) { + InvalidateAllCachedAttrs(); + invalidateFlag = false; + break; + } + if (pp == &dstPath) { + break; + } + } + if (invalidateFlag) { + Delete(LookupFAttr(srcParentFid, srcFileName)); + Delete(LookupFAttr(dstParentFid, dstFileName)); + } + return 
op.status; +} + +void +KfsClientImpl::InvalidateAttribute(const string& pathname, + bool countsOnlyFlag, bool deleteAttrFlag) +{ + string pathstr(pathname); + kfsFileId_t parentFid; + string name; + bool startFlag = true; + for (size_t len = pathstr.length(); len > 0; pathstr.erase(len)) { + size_t pos = len - 1; + const char* const p = pathstr.c_str(); + while (pos > 0 && p[pos] == '/') { + --pos; + } + if (pos < len - 1) { + pathstr.erase(pos); + } + len = pathstr.rfind('/'); + if (len == string::npos) { + break; + } + if (len == 0 && pos > 0) { + len++; + } + const bool deleteFlag = ! countsOnlyFlag || + (startFlag && deleteAttrFlag); + startFlag = false; + NameToFAttrMap::iterator const it = mPathCache.find(pathstr); + if (it != mPathCache.end()) { + if (deleteFlag) { + Delete(it->second); + } else { + it->second->staleSubCountsFlag = true; + } + continue; + } + if (GetPathComponents(pathstr.c_str(), &parentFid, name) != 0) { + continue; + } + FAttr* const fa = LookupFAttr(parentFid, name); + if (! fa) { + continue; + } + if (deleteFlag) { + Delete(fa); + } else { + fa->staleSubCountsFlag = true; + } + } +} + +int +KfsClientImpl::CoalesceBlocks(const char* src, const char* dst, chunkOff_t *dstStartOffset) +{ + if (! src || ! dst || ! *src || ! 
*dst) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + kfsFileId_t srcParentFid; + string srcFileName; + string srcPath; + const bool kInvalidateSubCountsFlag = true; + int res = GetPathComponents(src, &srcParentFid, srcFileName, &srcPath, + kInvalidateSubCountsFlag); + if (res < 0) { + KFS_LOG_STREAM_DEBUG << "coalesce: " << + src << " " << dst << " status: " << res << + KFS_LOG_EOM; + return res; + } + string dstFileName; + string dstPath; + kfsFileId_t dstParentFid; + res = GetPathComponents(dst, &dstParentFid, dstFileName, &dstPath, + kInvalidateSubCountsFlag); + if (res < 0) { + KFS_LOG_STREAM_DEBUG << "coalesce: " << + src << " " << dst << " status: " << res << + KFS_LOG_EOM; + return res; + } + if (srcParentFid == dstParentFid && dstFileName == srcFileName) { + return 0; // src and dst are the same. + } + CoalesceBlocksOp op(nextSeq(), srcPath.c_str(), dstPath.c_str()); + DoMetaOpWithRetry(&op); + if (dstStartOffset) { + *dstStartOffset = op.dstStartOffset; + } + Delete(LookupFAttr(srcParentFid, srcFileName)); + Delete(LookupFAttr(dstParentFid, dstFileName)); + return op.status; +} + +int +KfsClientImpl::SetMtime(const char *pathname, const struct timeval &mtime) +{ + if (! pathname || ! *pathname) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + kfsFileId_t parentFid; + string fileName; + string path; + const int res = GetPathComponents(pathname, &parentFid, fileName, &path); + if (res < 0) { + return res; + } + SetMtimeOp op(nextSeq(), path.c_str(), mtime); + DoMetaOpWithRetry(&op); + Delete(LookupFAttr(parentFid, fileName)); + return op.status; +} + +int +KfsClientImpl::OpenDirectory(const char *pathname) +{ + if (! pathname || ! *pathname) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + string path; + const int fd = OpenSelf(pathname, O_RDONLY, + 3, 0, 0, 0, KFS_STRIPED_FILE_TYPE_NONE, false, kKfsModeUndef, + &path); + if (fd < 0) { + return fd; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (! 
entry.fattr.isDirectory) { + ReleaseFileTableEntry(fd); + return -ENOTDIR; + } + assert(! entry.dirEntries); + entry.dirEntries = new vector(); + const int res = ReaddirPlus(path, entry.fattr.fileId, *entry.dirEntries, true); + if (res < 0) { + Close(fd); + return res; + } + return fd; +} + +static inline int WriteInt16(int8_t* ptr, int16_t val) +{ + ptr[1] = (int8_t)val; + val >>= 8; + ptr[0] = (int8_t)val; + return 2; +} + +static inline int WriteInt32(int8_t* ptr, int32_t val) +{ + ptr[3] = (int8_t)val; + val >>= 8; + ptr[2] = (int8_t)val; + val >>= 8; + ptr[1] = (int8_t)val; + val >>= 8; + ptr[0] = (int8_t)val; + return 4; +} + +static inline int WriteInt64(int8_t* ptr, int64_t val) +{ + WriteInt32(ptr, (int32_t)(val >> 32)); + WriteInt32(ptr + 4, (int32_t)val); + return 8; +} + +int +KfsClientImpl::ReadDirectory(int fd, char* buf, size_t numBytes) +{ + if (! valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (! entry.fattr.isDirectory) { + return -ENOTDIR; + } + if (! entry.dirEntries) { + return -EISDIR; // not opened with OpenDirectory(). 
+ } + const vector& dirEntries = *entry.dirEntries; + if (entry.currPos.fileOffset < 0) { + entry.currPos.fileOffset = 0; + } + int8_t* ptr = (int8_t*)buf; + int8_t* const end = ptr + numBytes; + kfsUid_t uid = kKfsUserNone; + kfsGid_t gid = kKfsGroupNone; + time_t now = time(0); + string uname; + string gname; + for ( ; (size_t)entry.currPos.fileOffset < dirEntries.size(); + entry.currPos.fileOffset++) { + const KfsFileAttr& attr = dirEntries[entry.currPos.fileOffset]; + int32_t unameLen; + if (attr.user != uid) { + uid = attr.user; + uname = UidToName(uid, now); + unameLen = (int32_t)uname.length(); + } else { + unameLen = 0; + } + int32_t gnameLen; + if (attr.group != gid) { + gid = attr.group; + gname = GidToName(gid, now); + gnameLen = (int32_t)gname.length(); + } else { + gnameLen = 0; + } + const int32_t nameLen = (int32_t)attr.filename.length(); + const size_t entrySize = + (64 + 64 + 32 + 32 + 8 + 32 * 6 + 16 + 32 * 2 + 64) / 8 + + nameLen + unameLen + gnameLen + + (attr.isDirectory ? 2 * 64/8 : 0); + if (nameLen <= 0) { + continue; + } + if (ptr + entrySize > end) { + break; + } + ptr += WriteInt64(ptr, (int64_t)attr.mtime.tv_sec * 1000 + + attr.mtime.tv_usec / 1000); + ptr += WriteInt64(ptr, attr.fileSize); + ptr += WriteInt32(ptr, attr.numReplicas); + ptr += WriteInt32(ptr, nameLen); + *ptr++ = (int8_t)(attr.isDirectory ? 
1 : 0); + ptr += WriteInt32(ptr, attr.numStripes); + ptr += WriteInt32(ptr, attr.numRecoveryStripes); + ptr += WriteInt32(ptr, attr.striperType); + ptr += WriteInt32(ptr, attr.stripeSize); + ptr += WriteInt32(ptr, attr.user); + ptr += WriteInt32(ptr, attr.group); + ptr += WriteInt16(ptr, attr.mode); + ptr += WriteInt64(ptr, attr.fileId); + if (attr.isDirectory) { + ptr += WriteInt64(ptr, attr.fileCount()); + ptr += WriteInt64(ptr, attr.dirCount()); + } + ptr += WriteInt32(ptr, unameLen); + ptr += WriteInt32(ptr, gnameLen); + memcpy(ptr, attr.filename.data(), nameLen); + ptr += nameLen; + memcpy(ptr, uname.data(), unameLen); + ptr += unameLen; + memcpy(ptr, gname.data(), gnameLen); + ptr += gnameLen; + } + if (ptr <= (int8_t*)buf && (size_t)entry.currPos.fileOffset < + dirEntries.size()) { + return -EINVAL; // buffer too small. + } + return (int)(ptr - (int8_t*)buf); +} + +int +KfsClientImpl::Open(const char *pathname, int openMode, int numReplicas, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + kfsMode_t mode) +{ + QCStMutexLocker l(mMutex); + const bool kCacheAttributesFlag = false; + return OpenSelf(pathname, openMode, numReplicas, + numStripes, numRecoveryStripes, stripeSize, stripedType, + kCacheAttributesFlag, mode); +} + +int +KfsClientImpl::CacheAttributes(const char *pathname) +{ + return OpenSelf(pathname, 0, 0, 0, 0, 0, KFS_STRIPED_FILE_TYPE_NONE, true); +} + +int +KfsClientImpl::OpenSelf(const char *pathname, int openMode, int numReplicas, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + bool cacheAttributesFlag, kfsMode_t mode, string* path) +{ + if ((openMode & O_TRUNC) != 0 && (openMode & (O_RDWR | O_WRONLY)) == 0) { + return -EINVAL; + } + + kfsFileId_t parentFid = -1; + string filename; + string fpath; + const int res = GetPathComponents(pathname, &parentFid, filename, &fpath); + if (res < 0) { + return res; + } + if (path) { + *path = fpath; + } + LookupOp op(0, parentFid, filename.c_str()); 
+ FAttr* const fa = LookupFAttr(parentFid, filename); + time_t const faNow = fa ? time(0) : 0; + if (fa && IsValid(*fa, faNow) && + (fa->isDirectory || fa->fileSize > 0 || + (fa->fileSize == 0 && fa->chunkCount() <= 0))) { + UpdatePath(fa, fpath); + op.fattr = *fa; + } else { + op.seq = nextSeq(); + DoMetaOpWithRetry(&op); + if (op.status < 0) { + Delete(fa); + if (! cacheAttributesFlag && (openMode & O_CREAT) != 0 && + op.status == -ENOENT) { + // file doesn't exist. Create it + const int fte = CreateSelf(pathname, numReplicas, openMode & O_EXCL, + numStripes, numRecoveryStripes, stripeSize, stripedType, false, + mode); + if (fte >= 0 && (openMode & O_APPEND) != 0) { + FileTableEntry& entry = *mFileTable[fte]; + assert(! entry.fattr.isDirectory); + entry.openMode |= O_APPEND; + } + return fte; + } + return op.status; + } + if (fa) { + UpdatePath(fa, fpath); + *fa = op.fattr; + fa->validatedTime = faNow; + fa->generation = mFAttrCacheGeneration; + fa->staleSubCountsFlag = false; + FAttrLru::PushBack(mFAttrLru, *fa); + } + } + // file exists; now fail open if: O_CREAT | O_EXCL + if ((openMode & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) { + return -EEXIST; + } + if (! cacheAttributesFlag && + op.fattr.IsAnyPermissionDefined() && ((( + (openMode & O_WRONLY) == 0 || + (openMode & (O_RDWR | O_RDONLY)) != 0) && + ! op.fattr.CanRead(mEUser, mEGroup)) || ( + (openMode & (O_WRONLY | O_RDWR)) != 0 && + ! 
op.fattr.CanWrite(mEUser, mEGroup)))) { + return -EACCES; + } + if (op.fattr.isDirectory && openMode != O_RDONLY) { + return -ENOTDIR; + } + + const int fte = AllocFileTableEntry(parentFid, filename, fpath); + if (fte < 0) { // Too many open files + return fte; + } + + FileTableEntry& entry = *mFileTable[fte]; + if (cacheAttributesFlag) { + entry.openMode = 0; + } else if ((openMode & O_RDWR) != 0) { + entry.openMode = O_RDWR; + } else if ((openMode & O_WRONLY) != 0) { + entry.openMode = O_WRONLY; + } else if ((openMode & O_RDONLY) != 0) { + entry.openMode = O_RDONLY; + } else { + entry.openMode = 0; + } + entry.fattr = op.fattr; + const bool truncateFlag = + ! cacheAttributesFlag && (openMode & O_TRUNC) != 0; + if (truncateFlag) { + if (entry.fattr.chunkCount() > 0 || entry.fattr.fileSize != 0) { + const int res = TruncateSelf(fte, 0); + if (res < 0) { + ReleaseFileTableEntry(fte); + return res; + } + } + } else if (entry.fattr.fileSize < 0 && + ! entry.fattr.isDirectory && entry.fattr.chunkCount() > 0) { + entry.fattr.fileSize = ComputeFilesize(op.fattr.fileId); + if (entry.fattr.fileSize < 0) { + ReleaseFileTableEntry(fte); + return -EIO; + } + } + if (! cacheAttributesFlag && + (openMode & O_APPEND) != 0 && + ! entry.fattr.isDirectory && + (entry.openMode & (O_RDWR | O_WRONLY)) != 0) { + entry.openMode |= O_APPEND; + } + if (! entry.fattr.isDirectory) { + SetOptimalIoBufferSize(entry, mDefaultIoBufferSize); + SetOptimalReadAheadSize(entry, mDefaultReadAheadSize); + if (fa && entry.openMode != O_RDONLY) { + Delete(fa); // Invalidate attribute cache entry if isn't read only. + } + } + return fte; +} + +int +KfsClientImpl::Close(int fd) +{ + KfsProtocolWorker::FileInstance fileInstance; + KfsProtocolWorker::FileId fileId; + KfsProtocolWorker::RequestType closeType; + bool readCloseFlag; + bool writeCloseFlag; + int status = 0; + { + QCStMutexLocker l(mMutex); + + if (! 
valid_fd(fd)) { + KFS_LOG_STREAM_DEBUG << "close: invalid fd: " << fd << KFS_LOG_EOM; + return -EBADF; + } + FileTableEntry& entry = *mFileTable[fd]; + closeType = (entry.openMode & O_APPEND) != 0 ? + KfsProtocolWorker::kRequestTypeWriteAppendClose : + KfsProtocolWorker::kRequestTypeWriteClose; + fileId = entry.fattr.fileId; + fileInstance = entry.instance; + readCloseFlag = entry.readUsedProtocolWorkerFlag && mProtocolWorker; + writeCloseFlag = entry.usedProtocolWorkerFlag && mProtocolWorker; + KFS_LOG_STREAM_DEBUG << + "closing:" + " fd: " << fd << + " fileId: " << fileId << + " instance: " << fileInstance << + " read: " << readCloseFlag << + " write: " << writeCloseFlag << + KFS_LOG_EOM; + if (writeCloseFlag) { + // Invalidate the corresponding attribute if any. + InvalidateAttributeAndCounts(entry.pathname); + Delete(LookupFAttr(entry.parentFid, entry.name)); + } + ReleaseFileTableEntry(fd); + } + if (writeCloseFlag) { + const int ret = (int)mProtocolWorker->Execute( + closeType, + fileInstance, + fileId + ); + if (status == 0) { + status = ret; + } + } + if (readCloseFlag) { + const int ret = (int)mProtocolWorker->Execute( + KfsProtocolWorker::kRequestTypeReadClose, + fileInstance + 1, // reader's instance always +1 + fileId + ); + if (! writeCloseFlag && ret != 0 && status == 0) { + status = ret; + } + } + KFS_LOG_STREAM_DEBUG << + "closed:" + " fd: " << fd << + " fileId: " << fileId << + " instance: " << fileInstance << + " read: " << readCloseFlag << + " write: " << writeCloseFlag << + " status: " << status << + KFS_LOG_EOM; + return status; +} + +void +KfsClientImpl::SkipHolesInFile(int fd) +{ + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return; + } + FileTableEntry& entry = *(mFileTable[fd]); + entry.skipHoles = true; + entry.failShortReadsFlag = false; +} + +int +KfsClientImpl::Sync(int fd) +{ + QCStMutexLocker l(mMutex); + + if (! 
valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *mFileTable[fd]; + if (entry.pending > 0 && + mProtocolWorker && entry.usedProtocolWorkerFlag) { + const KfsProtocolWorker::FileId fileId = entry.fattr.fileId; + const KfsProtocolWorker::FileInstance fileInstance = entry.instance; + entry.pending = 0; + l.Unlock(); + return (int)mProtocolWorker->Execute( + (entry.openMode & O_APPEND) != 0 ? + KfsProtocolWorker::kRequestTypeWriteAppend : + KfsProtocolWorker::kRequestTypeWrite, + fileInstance, + fileId, + 0, + 0, + 0 + ); + } + return 0; +} + +int +KfsClientImpl::Truncate(int fd, chunkOff_t offset) +{ + const int syncRes = Sync(fd); + if (syncRes < 0) { + return syncRes; + } + QCStMutexLocker l(mMutex); + return TruncateSelf(fd, offset); +} + +int +KfsClientImpl::Truncate(const char* pathname, chunkOff_t offset) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + string path; + const int res = StatSelf(pathname, attr, false, &path); + if (res != 0) { + return (res < 0 ? res : -res); + } + if (attr.isDirectory) { + return -EISDIR; + } + if (! attr.CanWrite(mEUser, mEGroup)) { + return -EACCES; + } + TruncateOp op(nextSeq(), pathname, attr.fileId, offset); + DoMetaOpWithRetry(&op); + if (op.status != 0) { + return op.status; + } + InvalidateAttributeAndCounts(path); + return 0; +} + +int +KfsClientImpl::TruncateSelf(int fd, chunkOff_t offset) +{ + assert(mMutex.IsOwned()); + + if (! valid_fd(fd)) { + return -EBADF; + } + // for truncation, file should be opened for writing + if ((mFileTable[fd]->openMode & (O_RDWR | O_WRONLY)) == 0) { + return -EINVAL; + } + FdInfo(fd)->buffer.Invalidate(); + + FileAttr *fa = FdAttr(fd); + TruncateOp op(nextSeq(), FdInfo(fd)->pathname.c_str(), fa->fileId, offset); + DoMetaOpWithRetry(&op); + int res = op.status; + + if (res == 0) { + fa->fileSize = offset; + if (fa->fileSize == 0) { + fa->subCount1 = 0; + } + // else + // chunkcount is off...but, that is ok; it is never exposed to + // the end-client. 
+ + gettimeofday(&fa->mtime, 0); + } + return res; +} + +int +KfsClientImpl::PruneFromHead(int fd, chunkOff_t offset) +{ + const int syncRes = Sync(fd); + if (syncRes < 0) { + return syncRes; + } + + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + // for truncation, file should be opened for writing + if (mFileTable[fd]->openMode == O_RDONLY) { + return -EINVAL; + } + FdInfo(fd)->buffer.Invalidate(); + + // round-down to the nearest chunk block start offset + offset = (offset / CHUNKSIZE) * CHUNKSIZE; + + FileAttr *fa = FdAttr(fd); + TruncateOp op(nextSeq(), FdInfo(fd)->pathname.c_str(), fa->fileId, offset); + op.pruneBlksFromHead = true; + DoMetaOpWithRetry(&op); + int res = op.status; + + if (res == 0) { + // chunkcount is off...but, that is ok; it is never exposed to + // the end-client. + gettimeofday(&fa->mtime, 0); + } + return res; +} + +int +KfsClientImpl::GetDataLocation(const char *pathname, chunkOff_t start, chunkOff_t len, + vector< vector > &locations) +{ + QCStMutexLocker l(mMutex); + + // Open the file and cache the attributes + const int fd = CacheAttributes(pathname); + if (fd < 0) { + return fd; + } + const int ret = GetDataLocationSelf(fd, start, len, locations); + ReleaseFileTableEntry(fd); + return ret; +} + +int +KfsClientImpl::GetDataLocation(int fd, chunkOff_t start, chunkOff_t len, + vector< vector > &locations) +{ + QCStMutexLocker l(mMutex); + return GetDataLocationSelf(fd, start, len, locations); +} + +int +KfsClientImpl::GetDataLocationSelf(int fd, chunkOff_t start, chunkOff_t len, + vector > &locations) +{ + assert(mMutex.IsOwned()); + + if (! valid_fd(fd)) { + return -EBADF; + } + int res; + ChunkAttr chunk; + // locate each chunk and get the hosts that are storing the chunk. 
+ for (chunkOff_t pos = start / (chunkOff_t)CHUNKSIZE * (chunkOff_t)CHUNKSIZE; + pos < start + len; + pos += CHUNKSIZE) { + if ((res = LocateChunk(fd, pos, chunk)) < 0) { + return res; + } + locations.push_back(vector()); + vector& hosts = locations.back(); + const size_t cnt = chunk.chunkServerLoc.size(); + for (size_t i = 0; i < cnt; i++) { + hosts.push_back(chunk.chunkServerLoc[i].hostname); + } + } + + return 0; +} + +int16_t +KfsClientImpl::GetReplicationFactor(const char *pathname) +{ + KfsFileAttr attr; + const int res = Stat(pathname, attr, false); + if (res != 0) { + return (res < 0 ? res : -res); + } + if (attr.isDirectory) { + return -EISDIR; + } + return attr.numReplicas; +} + +int16_t +KfsClientImpl::SetReplicationFactor(const char *pathname, int16_t numReplicas) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + string path; + const int res = StatSelf(pathname, attr, false, &path); + if (res != 0) { + return (res < 0 ? res : -res); + } + if (attr.isDirectory) { + return -EISDIR; + } + ChangeFileReplicationOp op(nextSeq(), attr.fileId, numReplicas); + DoMetaOpWithRetry(&op); + InvalidateAttributeAndCounts(path); + return (op.status <= 0 ? op.status : -op.status); +} + +void +KfsClientImpl::SetDefaultIOTimeout(int nsecs) +{ + QCStMutexLocker l(mMutex); + const int kMaxTimeout = numeric_limits::max() / 1000; + const int timeout = nsecs >= 0 ? 
min(kMaxTimeout, nsecs) : kMaxTimeout; + if (timeout == mDefaultOpTimeout) { + return; + } + mDefaultOpTimeout = timeout; + if (mProtocolWorker) { + mProtocolWorker->SetOpTimeoutSec(mDefaultOpTimeout); + mProtocolWorker->SetMetaOpTimeoutSec(mDefaultOpTimeout); + } +} + +int +KfsClientImpl::GetDefaultIOTimeout() const +{ + QCStMutexLocker l(const_cast(this)->mMutex); + return mDefaultOpTimeout; +} + +void +KfsClientImpl::SetRetryDelay(int nsecs) +{ + QCStMutexLocker l(mMutex); + if (mRetryDelaySec == nsecs) { + return; + } + mRetryDelaySec = nsecs; + if (mProtocolWorker) { + mProtocolWorker->SetTimeSecBetweenRetries(mRetryDelaySec); + mProtocolWorker->SetMetaTimeSecBetweenRetries(mRetryDelaySec); + } +} + +int +KfsClientImpl::GetRetryDelay() const +{ + QCStMutexLocker l(const_cast(this)->mMutex); + return mRetryDelaySec; +} + +void +KfsClientImpl::SetMaxRetryPerOp(int retryCount) +{ + QCStMutexLocker l(mMutex); + if (mMaxNumRetriesPerOp == retryCount) { + return; + } + mMaxNumRetriesPerOp = retryCount; + if (mProtocolWorker) { + mProtocolWorker->SetMaxRetryCount(mMaxNumRetriesPerOp); + mProtocolWorker->SetMetaMaxRetryCount(mMaxNumRetriesPerOp); + } +} + +void +KfsClientImpl::StartProtocolWorker() +{ + assert(mMutex.IsOwned()); + if (mProtocolWorker) { + return; + } + mProtocolWorker = new KfsProtocolWorker( + mMetaServerLoc.hostname, mMetaServerLoc.port); + mProtocolWorker->SetOpTimeoutSec(mDefaultOpTimeout); + mProtocolWorker->SetMetaOpTimeoutSec(mDefaultOpTimeout); + mProtocolWorker->SetMaxRetryCount(mMaxNumRetriesPerOp); + mProtocolWorker->SetMetaMaxRetryCount(mMaxNumRetriesPerOp); + mProtocolWorker->SetTimeSecBetweenRetries(mRetryDelaySec); + mProtocolWorker->SetMetaTimeSecBetweenRetries(mRetryDelaySec); + mProtocolWorker->Start(); +} + +int +KfsClientImpl::GetMaxRetryPerOp() const +{ + QCStMutexLocker l(const_cast(this)->mMutex); + return mMaxNumRetriesPerOp; +} + +void +KfsClientImpl::SetEOFMark(int fd, chunkOff_t offset) +{ + QCStMutexLocker l(mMutex); + + if 
(! valid_fd(fd) || FdAttr(fd)->isDirectory) { + return; + } + FdInfo(fd)->eofMark = offset; +} + +chunkOff_t +KfsClientImpl::Seek(int fd, chunkOff_t offset) +{ + return Seek(fd, offset, SEEK_SET); +} + +chunkOff_t +KfsClientImpl::Seek(int fd, chunkOff_t offset, int whence) +{ + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (entry.fattr.isDirectory) { + return -EINVAL; + } + + chunkOff_t newOff; + switch (whence) { + case SEEK_SET: + newOff = offset; + break; + case SEEK_CUR: + newOff = entry.currPos.fileOffset + offset; + break; + case SEEK_END: + newOff = entry.fattr.fileSize + offset; + break; + default: + return -EINVAL; + } + + if (newOff < 0) { + return -EINVAL; + } + entry.currPos.fileOffset = newOff; + + return newOff; +} + +chunkOff_t +KfsClientImpl::Tell(int fd) +{ + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (entry.fattr.isDirectory) { + return -EINVAL; + } + + return entry.currPos.fileOffset; +} + +void +KfsClientImpl::SetMaxNumRetriesPerOp(int maxNumRetries) +{ + QCStMutexLocker l(mMutex); + mMaxNumRetriesPerOp = maxNumRetries; +} + +/// +/// Given a chunk of file, find out where the chunk is hosted. +/// @param[in] fd The index for an entry in mFileTable[] for which +/// we are trying find out chunk location info. +/// +/// @param[in] chunkNum The index in +/// mFileTable[fd]->cattr[] corresponding to the chunk for +/// which we are trying to get location info. +/// +/// +int +KfsClientImpl::LocateChunk(int fd, chunkOff_t chunkOffset, ChunkAttr& chunk) +{ + assert(mMutex.IsOwned() && valid_fd(fd) && + ! 
mFileTable[fd]->fattr.isDirectory); + + if (chunkOffset < 0) { + return -EINVAL; + } + GetAllocOp op(nextSeq(), mFileTable[fd]->fattr.fileId, chunkOffset); + op.filename = mFileTable[fd]->pathname; + DoMetaOpWithRetry(&op); + if (op.status < 0) { + KFS_LOG_STREAM_DEBUG << + "locate chunk failure: " << op.status << + " " << ErrorCodeToStr(op.status) << + KFS_LOG_EOM; + return op.status; + } + chunk.chunkId = op.chunkId; + chunk.chunkVersion = op.chunkVersion; + chunk.chunkServerLoc = op.chunkServers; + chunk.chunkSize = -1; + chunk.chunkOffset = chunkOffset; + return 0; +} + +ssize_t +KfsClientImpl::SetDefaultIoBufferSize(size_t size) +{ + QCStMutexLocker lock(mMutex); + mDefaultIoBufferSize = min((size_t)numeric_limits::max(), + (size + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + return mDefaultIoBufferSize; +} + +ssize_t +KfsClientImpl::GetDefaultIoBufferSize() const +{ + QCStMutexLocker lock(const_cast(this)->mMutex); + return mDefaultIoBufferSize; +} + +ssize_t +KfsClientImpl::SetIoBufferSize(int fd, size_t size) +{ + QCStMutexLocker lock(mMutex); + if (! valid_fd(fd)) { + return -EBADF; + } + return SetIoBufferSize(*mFileTable[fd], size); +} + +ssize_t +KfsClientImpl::SetIoBufferSize(FileTableEntry& entry, size_t size, bool optimalFlag) +{ + int bufSize = (int)min((size_t)numeric_limits::max(), + size / CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + FileAttr& attr = entry.fattr; + if (bufSize > 0 && + attr.striperType != KFS_STRIPED_FILE_TYPE_NONE && + attr.stripeSize > 0 && + attr.numStripes > 0 && + attr.stripeSize < bufSize) { + const int stripes = + attr.numStripes + max(0, int(attr.numRecoveryStripes)); + const int stride = attr.stripeSize * stripes; + bufSize = (max(optimalFlag ? 
(1 << 20) * stripes : 0, bufSize) + + stride - 1) / stride * stride; + } + entry.ioBufferSize = max(0, bufSize); + return entry.ioBufferSize; +} + +ssize_t +KfsClientImpl::GetIoBufferSize(int fd) const +{ + QCStMutexLocker lock(const_cast(this)->mMutex); + if (! valid_fd(fd)) { + return -EBADF; + } + return mFileTable[fd]->ioBufferSize; +} + +ssize_t +KfsClientImpl::SetDefaultReadAheadSize(size_t size) +{ + QCStMutexLocker lock(mMutex); + mDefaultReadAheadSize = (size + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE; + return mDefaultReadAheadSize; +} + +ssize_t +KfsClientImpl::GetDefaultReadAheadSize() const +{ + QCStMutexLocker lock(const_cast(this)->mMutex); + return mDefaultReadAheadSize; +} + +void +KfsClientImpl::SetDefaultFullSparseFileSupport(bool flag) +{ + QCStMutexLocker lock(mMutex); + mFailShortReadsFlag = ! flag; +} + +int +KfsClientImpl::SetFullSparseFileSupport(int fd, bool flag) +{ + QCStMutexLocker lock(mMutex); + if (! valid_fd(fd)) { + return -EBADF; + } + mFileTable[fd]->failShortReadsFlag = ! flag; + return 0; +} + +void +KfsClientImpl::SetFileAttributeRevalidateTime(int secs) +{ + QCStMutexLocker lock(mMutex); + mFileAttributeRevalidateTime = secs; +} + +/// +/// Helper function that does the work for sending out an op to the +/// server. +/// +/// @param[in] op the op to be sent out +/// @param[in] sock the socket on which we communicate with server +/// @retval 0 on success; -1 on failure +/// (On failure, op->status contains error code.) +/// +int +KfsClientImpl::DoOpSend(KfsOp *op, TcpSocket *sock) +{ + if (! sock || ! 
sock->IsGood()) { + KFS_LOG_STREAM_DEBUG << "op send socket closed" << KFS_LOG_EOM; + op->status = -EHOSTUNREACH; + return -1; + } + ostringstream os; + op->Request(os); + const string str = os.str(); + const int ret = SendRequest(str.data(), str.size(), + op->contentBuf, op->contentLength, sock); + if (ret <= 0) { + op->status = -EHOSTUNREACH; + } + return ret; +} + +int +KfsClientImpl::GetResponse(char *buf, int bufSize, int *delims, TcpSocket *sock) +{ + return RecvResponseHeader(buf, bufSize, sock, mDefaultOpTimeout, delims); +} + +/// +/// From a response, extract out seq # and content-length. +/// +static void +GetSeqContentLen(const char *resp, int respLen, + kfsSeq_t *seq, int *contentLength, Properties& prop) +{ + BufferInputStream ist(resp, respLen); + const char separator = ':'; + + prop.clear(); + prop.loadProperties(ist, separator, false); + *seq = prop.getValue("Cseq", (kfsSeq_t) -1); + *contentLength = prop.getValue("Content-length", 0); +} + +/// +/// Helper function that does the work of getting a response from the +/// server and parsing it out. +/// +/// @param[in] op the op for which a response is to be gotten +/// @param[in] sock the socket on which we communicate with server +/// @retval 0 on success; -1 on failure +/// (On failure, op->status contains error code.) +/// +int +KfsClientImpl::DoOpResponse(KfsOp *op, TcpSocket *sock) +{ + if (! sock || ! sock->IsGood()) { + op->status = -EHOSTUNREACH; + KFS_LOG_STREAM_DEBUG << "op recv socket closed" << KFS_LOG_EOM; + return -1; + } + + Properties prop; + int numIO; + bool printMatchingResponse = false; + int len; + for (; ;) { + len = 0; + numIO = GetResponse( + mResponseBuffer, kResponseBufferSize, &len, sock); + if (numIO <= 0) { + KFS_LOG_STREAM_DEBUG << + sock->GetPeerName() << ": read failed: " << numIO << + " " << QCUtils::SysError(-numIO) << + KFS_LOG_EOM; + op->status = numIO == -ETIMEDOUT ? 
-ETIMEDOUT : -EHOSTUNREACH; + sock->Close(); + return -1; + } + if (len <= 0) { + KFS_LOG_STREAM_DEBUG << + sock->GetPeerName() << ": invalid response length: " << len << + KFS_LOG_EOM; + sock->Close(); + op->status = -EINVAL; + return -1; + } + + kfsSeq_t resSeq = -1; + int contentLen = 0; + GetSeqContentLen(mResponseBuffer, len, &resSeq, &contentLen, prop); + if (resSeq == op->seq) { + if (printMatchingResponse) { + KFS_LOG_STREAM_DEBUG << + sock->GetPeerName() << ": response seq: " << resSeq << + KFS_LOG_EOM; + } + break; + } + KFS_LOG_STREAM_DEBUG << + sock->GetPeerName() << ": unexpected response seq:" + " expect: " << op->seq << + " got " << resSeq << + KFS_LOG_EOM; + printMatchingResponse = true; + if (contentLen > 0) { + struct timeval timeout = {0}; + timeout.tv_sec = mDefaultOpTimeout; + int len = sock->DoSynchDiscard(contentLen, timeout); + if (len != contentLen) { + sock->Close(); + op->status = -EHOSTUNREACH; + return -1; + } + } + } + + const int contentLen = op->contentLength; + op->ParseResponseHeader(prop); + if (op->contentLength == 0) { + // restore it back: when a write op is sent out and this + // method is invoked with the same op to get the response, the + // op's status should get filled in; we shouldn't be stomping + // over content length. + op->contentLength = contentLen; + return numIO; + } + + if (! op->contentBuf || op->contentBufLen < op->contentLength + 1) { + delete [] op->contentBuf; + op->contentBuf = 0; + op->contentBuf = new char[op->contentLength + 1]; + op->contentBuf[op->contentLength] = '\0'; + op->contentBufLen = op->contentLength + 1; + } + + // len bytes belongs to the RPC reply. Whatever is left after + // stripping that data out is the data. 
+ const ssize_t navail = numIO - len; + if (navail > 0) { + assert(navail <= (ssize_t)op->contentLength); + memcpy(op->contentBuf, mResponseBuffer + len, navail); + } + ssize_t nleft = op->contentLength - navail; + + assert(nleft >= 0); + + int nread = 0; + if (nleft > 0) { + struct timeval timeout = {0}; + timeout.tv_sec = mDefaultOpTimeout; + nread = sock->DoSynchRecv(op->contentBuf + navail, nleft, timeout); + if (nread <= 0) { + KFS_LOG_STREAM_DEBUG << + sock->GetPeerName() << ": read failed: " << nread << + " " << QCUtils::SysError(-nread) << + KFS_LOG_EOM; + op->status = nread == -ETIMEDOUT ? -ETIMEDOUT : -EHOSTUNREACH; + sock->Close(); + return -1; + } + } + + return nread + numIO; +} + + +/// +/// Common work for each op: build a request; send it to server; get a +/// response; parse it. +/// +/// @param[in] op the op to be done +/// @param[in] sock the socket on which we communicate with server +/// +/// @retval # of bytes read from the server. +/// +int +KfsClientImpl::DoOpCommon(KfsOp *op, TcpSocket *sock) +{ + assert(sock); + int res = DoOpSend(op, sock); + if (res < 0) { + return res; + } + res = DoOpResponse(op, sock); + if (res < 0) { + return res; + } + if (op->status < 0) { + KFS_LOG_STREAM_DEBUG << op->Show() << + " failed: " << op->status << " " << ErrorCodeToStr(op->status) << + KFS_LOG_EOM; + } + return res; +} + +/// +/// To compute the size of a file, determine what the last chunk in +/// the file happens to be (from the meta server); then, for the last +/// chunk, find its size and then add the size of remaining chunks +/// (all of which are assumed to be full). The reason for asking the +/// meta server about the last chunk (and simply using chunkCount) is +/// that random writes with seeks affect where the "last" chunk of the +/// file happens to be: for instance, a file could have chunkCount = 1, but +/// that chunk could be the 10th chunk in the file---the first 9 +/// chunks are just holes. 
+// +struct RespondingServer { + KfsClientImpl& client; + const ChunkLayoutInfo& layout; + int& status; + chunkOff_t& size; + RespondingServer(KfsClientImpl& cli, const ChunkLayoutInfo& lay, + chunkOff_t& sz, int& st) + : client(cli), layout(lay), status(st), size(sz) + {} + bool operator() (ServerLocation loc) + { + status = -EIO; + size = -1; + + TcpSocket sock; + if (sock.Connect(loc) < 0) { + size = 0; + return false; + } + SizeOp sop(client.nextSeq(), layout.chunkId, layout.chunkVersion); + sop.status = -1; + const int numIO = client.DoOpCommon(&sop, &sock); + if (numIO < 0 || ! sock.IsGood()) { + return false; + } + status = sop.status; + if (status >= 0) { + size = sop.size; + } + return status >= 0; + } +}; + +struct RespondingServer2 { + KfsClientImpl& client; + const ChunkLayoutInfo& layout; + RespondingServer2(KfsClientImpl& cli, const ChunkLayoutInfo& lay) + : client(cli), layout(lay) + {} + ssize_t operator() (const ServerLocation& loc) + { + TcpSocket sock; + if (sock.Connect(loc) < 0) { + return -1; + } + + SizeOp sop(client.nextSeq(), layout.chunkId, layout.chunkVersion); + int numIO = client.DoOpCommon(&sop, &sock); + if ((numIO < 0 && ! sock.IsGood()) || sop.status < 0) { + return -1; + } + return sop.size; + } +}; + +int +KfsClientImpl::UpdateFilesize(int fd) +{ + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + FAttr* const fa = LookupFAttr(entry.parentFid, entry.name); + if (entry.fattr.isDirectory || + entry.fattr.striperType != KFS_STRIPED_FILE_TYPE_NONE) { + LookupOp op(nextSeq(), entry.parentFid, entry.name.c_str()); + DoMetaOpWithRetry(&op); + if (op.status < 0) { + Delete(fa); + return op.status; + } + if (op.fattr.fileId != entry.fattr.fileId) { + Delete(fa); + return 0; // File doesn't exists anymore, or in the dumpster. 
+ } + entry.fattr = op.fattr; + if (fa) { + *fa = op.fattr; + fa->validatedTime = time(0); + fa->generation = mFAttrCacheGeneration; + fa->staleSubCountsFlag = false; + FAttrLru::PushBack(mFAttrLru, *fa); + } + if (entry.fattr.fileSize >= 0 || entry.fattr.isDirectory) { + return 0; + } + } + const chunkOff_t res = ComputeFilesize(entry.fattr.fileId); + if (res >= 0) { + FdAttr(fd)->fileSize = res; + if (fa) { + fa->fileSize = res; + } + } + return 0; +} + +chunkOff_t +KfsClientImpl::ComputeFilesize(kfsFileId_t kfsfid) +{ + GetLayoutOp lop(nextSeq(), kfsfid); + lop.lastChunkOnlyFlag = true; + DoMetaOpWithRetry(&lop); + if (lop.status < 0) { + KFS_LOG_STREAM_ERROR << + "failed to compute filesize fid: " << kfsfid << + " status: " << lop.status << + KFS_LOG_EOM; + return -1; + } + if (lop.ParseLayoutInfo()) { + KFS_LOG_STREAM_ERROR << + "failed to parse layout info fid: " << kfsfid << + KFS_LOG_EOM; + return -1; + } + if (lop.chunks.empty()) { + return 0; + } + const ChunkLayoutInfo& last = *lop.chunks.rbegin(); + chunkOff_t filesize = last.fileOffset; + chunkOff_t endsize = 0; + int rstatus = 0; + for (int retry = 0; retry < max(1, mMaxNumRetriesPerOp); retry++) { + if (retry > 0) { + Sleep(mRetryDelaySec); + } + if (find_if(last.chunkServers.begin(), last.chunkServers.end(), + RespondingServer(*this, last, endsize, rstatus)) != + last.chunkServers.end()) { + break; + } + KFS_LOG_STREAM_INFO << + "failed to connect to any server to get size of" + " fid: " << kfsfid << + " chunk: " << last.chunkId << + " retry: " << retry << + " max: " << mMaxNumRetriesPerOp << + KFS_LOG_EOM; + } + if (rstatus < 0) { + KFS_LOG_STREAM_ERROR << + "failed to get size for" + " fid: " << kfsfid << + " status: " << rstatus << + KFS_LOG_EOM; + return -1; + } + + if (filesize == 0 && endsize == 0 && ! lop.chunks.empty()) { + // Make sure that the filesize is really 0: the file has one + // chunk, but the size of that chunk is 0. 
Sanity check with + // all the servers that is really the case + vector chunksize; + chunksize.resize(last.chunkServers.size(), -1); + transform(last.chunkServers.begin(), last.chunkServers.end(), + chunksize.begin(), RespondingServer2(*this, last)); + for (size_t i = 0; i < chunksize.size(); i++) { + if (chunksize[i] > 0) { + endsize = chunksize[i]; + break; + } + } + } + filesize += endsize; + + return filesize; +} + +void +KfsClientImpl::ComputeFilesizes(vector& fattrs, + const vector& lastChunkInfo) +{ + const size_t cnt = lastChunkInfo.size(); + assert(cnt <= fattrs.size()); + for (size_t i = 0; i < cnt; i++) { + KfsFileAttr& fa = fattrs[i]; + if (fa.isDirectory || fa.fileSize >= 0) { + continue; + } + if (fa.chunkCount() == 0) { + fa.fileSize = 0; + continue; + } + const ChunkAttr& cattr = lastChunkInfo[i]; + const size_t locCnt = cattr.chunkServerLoc.size(); + for (size_t j = 0; j < locCnt; j++) { + // get all the filesizes we can from this server + ComputeFilesizes(fattrs, lastChunkInfo, i, cattr.chunkServerLoc[j]); + } + if (fa.fileSize < 0) { + // If no replicas available return max possible size. 
+ fa.fileSize = cattr.chunkOffset + (chunkOff_t)CHUNKSIZE; + } + } +} + +void +KfsClientImpl::ComputeFilesizes(vector& fattrs, + const vector& lastChunkInfo, size_t startIdx, + const ServerLocation& loc) +{ + TcpSocket sock; + if (sock.Connect(loc) < 0) { + return; + } + const size_t cnt = lastChunkInfo.size(); + for (size_t i = startIdx; i < cnt; i++) { + KfsFileAttr& fa = fattrs[i]; + if (fa.isDirectory || fa.fileSize >= 0) { + continue; + } + if (fa.chunkCount() == 0) { + fa.fileSize = 0; + continue; + } + const ChunkAttr& cattr = lastChunkInfo[i]; + vector::const_iterator const iter = find_if( + cattr.chunkServerLoc.begin(), + cattr.chunkServerLoc.end(), + MatchingServer(loc) + ); + if (iter == cattr.chunkServerLoc.end()) { + continue; + } + SizeOp sop(nextSeq(), cattr.chunkId, cattr.chunkVersion); + const int numIO = DoOpCommon(&sop, &sock); + if (numIO < 0 && ! sock.IsGood()) { + return; + } + fa.fileSize = lastChunkInfo[i].chunkOffset; + if (sop.status >= 0) { + fa.fileSize += sop.size; + } + } +} + +/// +/// Wrapper for retrying ops with the metaserver. +/// +void +KfsClientImpl::DoMetaOpWithRetry(KfsOp *op) +{ + time_t start = time(0); + for (int attempt = -1; ;) { + if (! mMetaServerSock.IsGood()) { + ConnectToMetaServer(); + } + op->status = 0; + const int res = DoOpCommon(op, &mMetaServerSock); + if (res < 0 && op->status == 0) { + op->status = res; + } + if (op->status != -EHOSTUNREACH && op->status != -ETIMEDOUT) { + break; + } + mMetaServerSock.Close(); + const time_t now = time(0); + if (++attempt == 0) { + if (now <= start + 1) { + continue; // Most likely idle connection timeout. 
+ } + ++attempt; + } + if (attempt >= mMaxNumRetriesPerOp) { + break; + } + start += mRetryDelaySec; + if (now < start) { + Sleep(start - now); + } else { + start = now; + } + // re-issue the op with a new sequence # + op->seq = nextSeq(); + } + KFS_LOG_STREAM_DEBUG << + op->Show() << " status: " << op->status << + KFS_LOG_EOM; +} + +int +KfsClientImpl::FindFreeFileTableEntry() +{ + if (! mFreeFileTableEntires.empty()) { + const int fte = mFreeFileTableEntires.back(); + assert(mFileTable[fte] == 0); + mFreeFileTableEntires.pop_back(); + return fte; + } + const int last = (int)mFileTable.size(); + if (last < MAX_FILES) { // Grow vector up to max. size + mFileTable.push_back(0); + return last; + } + return -EMFILE; // No luck +} + +void +KfsClientImpl::ValidateFAttrCache(time_t now, int maxScan) +{ + FAttr* p; + const time_t expire = now - mFileAttributeRevalidateTime; + int rem = maxScan; + while ((p = FAttrLru::Front(mFAttrLru)) && + (p->validatedTime < expire || + p->generation != mFAttrCacheGeneration)) { + Delete(p); + if (--rem < 0) { + break; + } + } +} + +KfsClientImpl::FAttr* +KfsClientImpl::LookupFAttr(kfsFileId_t parentFid, const string& name) +{ + FidNameToFAttrMap::const_iterator const it = mFidNameToFAttrMap.find( + make_pair(parentFid, name)); + return (it == mFidNameToFAttrMap.end() ? 
0 : it->second); +} + +KfsClientImpl::FAttr* +KfsClientImpl::LookupFAttr(const string& pathname, string* path) +{ + if (pathname.empty() || pathname[0] != '/') { + return 0; + } + NameToFAttrMap::const_iterator const it = mPathCache.find(pathname); + if (it != mPathCache.end()) { + if (path) { + *path = it->first; + } + return it->second; + } + return 0; +} + +KfsClientImpl::FAttr* +KfsClientImpl::NewFAttr(kfsFileId_t parentFid, const string& name, + const string& pathname) +{ + if (mFidNameToFAttrMap.size() > 128 && ++mFattrCacheSkipValidateCnt > 512) { + mFattrCacheSkipValidateCnt = 0; + ValidateFAttrCache(time(0), 64); + } + const size_t kMaxInodeCacheSize = 16 << 10; + for (size_t sz = mFidNameToFAttrMap.size(); + kMaxInodeCacheSize <= sz; + sz--) { + Delete(FAttrLru::Front(mFAttrLru)); + } + FAttr* const fa = new (mFAttrPool.Allocate()) FAttr(mFAttrLru); + pair const res = + mFidNameToFAttrMap.insert(make_pair(make_pair(parentFid, name), fa)); + if (! res.second) { + KFS_LOG_STREAM_FATAL << "fattr entry already exists: " << + " fid: " << parentFid << + " name: " << name << + KFS_LOG_EOM; + abort(); + } + fa->fidNameIt = res.first; + if (! pathname.empty() && pathname[0] == '/' && + name != ".." && name != ".") { + pair const + res = mPathCache.insert(make_pair(pathname, fa)); + if (! res.second) { + KFS_LOG_STREAM_FATAL << "fattr path entry already exists: " << + " fid: " << parentFid << + " name: " << name << + KFS_LOG_EOM; + abort(); + } + fa->nameIt = res.first; + } else { + fa->nameIt = mPathCacheNone; + } + return fa; +} + +void +KfsClientImpl::Delete(KfsClientImpl::FAttr* fa) +{ + if (! 
fa) { + return; + } + mFidNameToFAttrMap.erase(fa->fidNameIt); + if (fa->nameIt != mPathCacheNone) { + mPathCache.erase(fa->nameIt); + } + FAttrLru::Remove(mFAttrLru, *fa); + fa->~FAttr(); + mFAttrPool.Deallocate(fa); +} + +int +KfsClientImpl::AllocFileTableEntry(kfsFileId_t parentFid, const string& name, + const string& pathname) +{ + const int fte = FindFreeFileTableEntry(); + if (fte < 0) { + return fte; + } + mFileInstance += 2; + FileTableEntry* const entry = + new FileTableEntry(parentFid, name, mFileInstance); + mFileTable[fte] = entry; + InitPendingRead(*entry); + entry->pathname = pathname; + entry->ioBufferSize = mDefaultIoBufferSize; + entry->failShortReadsFlag = mFailShortReadsFlag; + return fte; +} + +void +KfsClientImpl::ReleaseFileTableEntry(int fte) +{ + assert(valid_fd(fte)); + FileTableEntry& entry = *(mFileTable[fte]); + mFileTable[fte] = 0; + mFreeFileTableEntires.push_back(fte); + KFS_LOG_STREAM_DEBUG << + "closing filetable entry: " << fte << + " mode: " << entry.openMode << + " path: " << entry.pathname << + KFS_LOG_EOM; + CancelPendingRead(entry); + delete &entry; +} + +void +KfsClientImpl::UpdatePath(KfsClientImpl::FAttr* fa, const string& path, + bool copyPathFlag) +{ + if (! fa || fa->nameIt->first == path) { + return; + } + if (fa->nameIt != mPathCacheNone) { + mPathCache.erase(fa->nameIt); + } + fa->nameIt = mPathCache.insert( + make_pair(copyPathFlag ? string(path.data(), path.length()) : path, fa) + ).first; +} + +/// +/// Given a parentFid and a file in that directory, return the +/// corresponding entry in the file table. If such an entry has not +/// been seen before, download the file attributes from the server and +/// save it in the file table. +/// +int +KfsClientImpl::Lookup(kfsFileId_t parentFid, const string& name, + KfsClientImpl::FAttr*& fa, time_t now, const string& path) +{ + assert(! path.empty() && *path.begin() == '/' && + name != "." 
&& name != ".."); + + fa = LookupFAttr(parentFid, name); + if (fa && IsValid(*fa, now)) { + UpdatePath(fa, path); + return 0; + } + LookupOp op(nextSeq(), parentFid, name.c_str()); + DoMetaOpWithRetry(&op); + if (op.status < 0) { + if (fa) { + Delete(fa); + } + return op.status; + } + // Update i-node cache. + // This method presently called only from the path traversal. + // Force new path string allocation to keep "path" buffer mutable, + // assuming string class implementation with ref. counting, of course. + if (fa) { + UpdatePath(fa, path); + FAttrLru::PushBack(mFAttrLru, *fa); + } else if (! (fa = NewFAttr(parentFid, name, + string(path.data(), path.length())))) { + return -ENOMEM; + } + fa->validatedTime = now; + fa->generation = mFAttrCacheGeneration; + fa->staleSubCountsFlag = false; + *fa = op.fattr; + return 0; +} + +int +KfsClientImpl::ValidateName(const string& name) +{ + const size_t len = name.length(); + if (len <= 0) { + return -EINVAL; + } + if (len > MAX_FILENAME_LEN) { + return -ENAMETOOLONG; + } + // For now do not allow leading or trailing spaces and \r \n anywhere, + // as these aren't escaped yet. + const char* nm = name.c_str(); + if (nm[0] <= ' ' || nm[len - 1] <= ' ') { + return -ENOENT; + } + if (name.find_first_of("/\n\r") != string::npos) { + return -ENOENT; + } + return 0; +} + +/// +/// Given a path, break it down into: parentFid and filename. If the +/// path does not begin with "/", the current working directory is +/// inserted in front of it. +/// @param[in] path The path string that needs to be extracted +/// @param[out] parentFid The file-id corresponding to the parent dir +/// @param[out] name The filename following the final "/". +/// @retval 0 on success; -errno on failure +/// +int +KfsClientImpl::GetPathComponents(const char* pathname, kfsFileId_t* parentFid, + string& name, string* path, bool invalidateSubCountsFlag, + bool enforceLastDirFlag) +{ + if (! 
pathname) { + return -EFAULT; + } + size_t len = strlen(pathname); + const char* ptr = GetTmpAbsPath(pathname, len); + if (! mTmpAbsPath.Set(ptr, len)) { + return -EINVAL; + } + const size_t sz = mTmpAbsPath.size(); + if (sz < 1 || mTmpAbsPath[0] != Path::Token("/", 1)) { + return -EINVAL; + } + string& npath = path ? *path : mTmpCurPath; + npath.clear(); + npath.reserve(min(MAX_PATH_NAME_LENGTH, len)); + mTmpPath.clear(); + mTmpPath.reserve(sz); + mTmpPath.push_back(make_pair(ROOTFID, 0)); + *parentFid = ROOTFID; + name = mSlash; + FAttr* fa; + if (invalidateSubCountsFlag && (fa = LookupFAttr(*parentFid, name))) { + fa->staleSubCountsFlag = true; + } + const Path::Token kThisDir(".", 1); + const Path::Token kParentDir("..", 2); + const time_t now = sz <= 1 ? 0 : time(0); + int res = 0; + for (size_t i = 1; i < sz; i++) { + const Path::Token& dname = mTmpAbsPath[i]; + if (dname == kThisDir || dname.mLen <= 0) { + continue; + } + if (dname == kParentDir) { + const size_t psz = mTmpPath.size(); + if (psz <= 1) { + assert(psz == 1); + continue; + } + npath.erase(npath.length() - + mTmpAbsPath[mTmpPath.back().second].mLen - 1); + mTmpPath.pop_back(); + const TmpPath::value_type& back = mTmpPath.back(); + *parentFid = back.first; + const Path::Token& nm = mTmpAbsPath[back.second]; + name.assign(nm.mPtr, nm.mLen); + continue; + } + name.assign(dname.mPtr, dname.mLen); + if ((res = ValidateName(name)) != 0) { + break; + } + npath += mSlash; + npath.append(dname.mPtr, dname.mLen); + if (npath.length() > MAX_PATH_NAME_LENGTH) { + res = -ENAMETOOLONG; + break; + } + const bool lastFlag = i + 1 == sz; + if (lastFlag && (! enforceLastDirFlag || ! mTmpAbsPath.IsDir())) { + break; + } + fa = 0; + if ((res = Lookup(*parentFid, name, fa, now, npath)) != 0) { + break; + } + if (! 
fa->isDirectory) { + res = -ENOTDIR; + break; + } + if (invalidateSubCountsFlag) { + fa->staleSubCountsFlag = true; + } + if (lastFlag) { + break; + } + *parentFid = fa->fileId; + mTmpPath.push_back(make_pair(*parentFid, i)); + } + if (path && res == 0 && npath.empty()) { + npath = name; + } + mTmpAbsPath.Clear(); + + KFS_LOG_STREAM_DEBUG << + "path: " << pathname << + " file: " << name << + " npath: " << (npath.empty() ? name : npath) << + " parent: " << *parentFid << + " ret: " << res << + KFS_LOG_EOM; + return res; +} + +int +KfsClientImpl::EnumerateBlocks(const char* pathname, KfsClient::BlockInfos& res) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + int ret; + if ((ret = StatSelf(pathname, attr, false)) < 0) { + KFS_LOG_STREAM_DEBUG << (pathname ? pathname : "null") << ": " << + ErrorCodeToStr(ret) << + KFS_LOG_EOM; + return -ENOENT; + } + if (attr.isDirectory) { + KFS_LOG_STREAM_DEBUG << pathname << ": is a directory" << KFS_LOG_EOM; + return -EISDIR; + } + KFS_LOG_STREAM_DEBUG << "path: " << pathname << + " file id: " << attr.fileId << + KFS_LOG_EOM; + + GetLayoutOp lop(nextSeq(), attr.fileId); + DoMetaOpWithRetry(&lop); + if (lop.status < 0) { + KFS_LOG_STREAM_ERROR << "get layout failed on path: " << pathname << " " + << ErrorCodeToStr(lop.status) << + KFS_LOG_EOM; + return lop.status; + } + + if (lop.ParseLayoutInfo()) { + KFS_LOG_STREAM_ERROR << "unable to parse layout for path: " << pathname << + KFS_LOG_EOM; + return -1; + } + + vector chunksize; + for (vector::const_iterator i = lop.chunks.begin(); + i != lop.chunks.end(); + ++i) { + if (i->chunkServers.empty()) { + res.push_back(KfsClient::BlockInfo()); + KfsClient::BlockInfo& chunk = res.back(); + chunk.offset = i->fileOffset; + chunk.id = i->chunkId; + chunk.version = i->chunkVersion; + continue; + } + chunksize.clear(); + // Get the size for the chunk from all the responding servers + chunksize.resize(i->chunkServers.size(), -1); + transform(i->chunkServers.begin(), i->chunkServers.end(), 
+ chunksize.begin(), RespondingServer2(*this, *i)); + for (size_t k = 0; k < chunksize.size(); k++) { + res.push_back(KfsClient::BlockInfo()); + KfsClient::BlockInfo& chunk = res.back(); + chunk.offset = i->fileOffset; + chunk.id = i->chunkId; + chunk.version = i->chunkVersion; + chunk.size = chunksize[k]; + chunk.server = i->chunkServers[k]; + } + } + return 0; +} + + +int +KfsClientImpl::GetDataChecksums(const ServerLocation &loc, + kfsChunkId_t chunkId, uint32_t *checksums, bool readVerifyFlag) +{ + TcpSocket sock; + int ret; + if ((ret = sock.Connect(loc)) < 0) { + return ret; + } + GetChunkMetadataOp op(nextSeq(), chunkId, readVerifyFlag); + const int numIO = DoOpCommon(&op, &sock); + if (numIO <= 0) { + return (numIO < 0 ? numIO : -EINVAL); + } + if (op.status == -EBADCKSUM) { + KFS_LOG_STREAM_INFO << + "Server " << loc.ToString() << + " reports checksum mismatch for scrub read on" + " chunk: " << chunkId << + KFS_LOG_EOM; + } + if (op.status < 0) { + return op.status; + } + const size_t numChecksums = CHUNKSIZE / CHECKSUM_BLOCKSIZE; + if (op.contentLength < numChecksums * sizeof(*checksums)) { + return -EINVAL; + } + memcpy(checksums, op.contentBuf, numChecksums * sizeof(*checksums)); + return 0; +} + +int +KfsClientImpl::VerifyDataChecksums(const char* pathname) +{ + KfsFileAttr attr; + int res; + + QCStMutexLocker l(mMutex); + + if ((res = StatSelf(pathname, attr, false)) < 0) { + return res; + } + if (attr.isDirectory) { + return -EISDIR; + } + return VerifyDataChecksumsFid(attr.fileId); +} + +int +KfsClientImpl::VerifyDataChecksums(int fd) +{ + QCStMutexLocker l(mMutex); + + if (! 
valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (entry.fattr.isDirectory) { + return -EISDIR; + } + return VerifyDataChecksumsFid(entry.fattr.fileId); +} + +int +KfsClientImpl::VerifyDataChecksumsFid(kfsFileId_t fileId) +{ + GetLayoutOp lop(nextSeq(), fileId); + DoMetaOpWithRetry(&lop); + if (lop.status < 0) { + KFS_LOG_STREAM_ERROR << "Get layout failed with error: " + << ErrorCodeToStr(lop.status) << + KFS_LOG_EOM; + return lop.status; + } + if (lop.ParseLayoutInfo()) { + KFS_LOG_STREAM_ERROR << "unable to parse layout info" << + KFS_LOG_EOM; + return -EINVAL; + } + const size_t numChecksums = CHUNKSIZE / CHECKSUM_BLOCKSIZE; + scoped_array chunkChecksums1; + chunkChecksums1.reset(new uint32_t[numChecksums]); + scoped_array chunkChecksums2; + chunkChecksums2.reset(new uint32_t[numChecksums]); + for (vector::const_iterator i = lop.chunks.begin(); + i != lop.chunks.end(); + ++i) { + int ret; + if ((ret = GetDataChecksums( + i->chunkServers[0], i->chunkId, chunkChecksums1.get())) < 0) { + KFS_LOG_STREAM_ERROR << "failed to get checksums from server " << + i->chunkServers[0].ToString() << + " " << ErrorCodeToStr(ret) << + KFS_LOG_EOM; + return ret; + } + for (size_t k = 1; k < i->chunkServers.size(); k++) { + if ((ret = GetDataChecksums( + i->chunkServers[k], i->chunkId, + chunkChecksums2.get())) < 0) { + KFS_LOG_STREAM_ERROR << "didn't get checksums from server: " << + i->chunkServers[k].ToString() << + " " << ErrorCodeToStr(ret) << + KFS_LOG_EOM; + return ret; + } + bool mismatch = false; + for (size_t v = 0; v < numChecksums; v++) { + if (chunkChecksums1[v] != chunkChecksums2[v]) { + KFS_LOG_STREAM_ERROR << + "checksum mismatch between servers: " << + i->chunkServers[0].ToString() << + " " << i->chunkServers[k].ToString() << + KFS_LOG_EOM; + mismatch = true; + } + } + if (mismatch) { + return 1; + } + } + } + return 0; +} + +int +KfsClientImpl::GetFileOrChunkInfo(kfsFileId_t fileId, kfsChunkId_t chunkId, + KfsFileAttr& fattr, 
chunkOff_t& offset, int64_t& chunkVersion, + vector& servers) +{ + QCStMutexLocker l(mMutex); + GetPathNameOp op(nextSeq(), fileId, chunkId); + DoMetaOpWithRetry(&op); + fattr = op.fattr; + fattr.filename = op.pathname; + offset = op.offset; + chunkVersion = op.chunkVersion; + servers = op.servers; + return op.status; +} + +int +KfsClientImpl::Chmod(const char* pathname, kfsMode_t mode) +{ + if (mode == kKfsModeUndef) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + int res; + FAttr* fa = 0; + if ((res = StatSelf(pathname, attr, false, 0, &fa)) < 0) { + return res; + } + if (! attr.IsAnyPermissionDefined()) { + return 0; // permissions aren't supported by the meta server. + } + ChmodOp op(nextSeq(), attr.fileId, mode & (attr.isDirectory ? + kfsMode_t(Permissions::kDirModeMask) : + kfsMode_t(Permissions::kFileModeMask))); + DoMetaOpWithRetry(&op); + if (op.status != 0) { + return op.status; + } + if (fa && fa->isDirectory) { + InvalidateAllCachedAttrs(); + } + return 0; +} + +int +KfsClientImpl::GetUserAndGroup(const char* user, const char* group, + kfsUid_t& uid, kfsGid_t& gid) +{ + uid = kKfsUserNone; + gid = kKfsGroupNone; + + QCStMutexLocker l(mMutex); + + const time_t now = time(0); + if (user && *user) { + uid = NameToUid(user, now); + if (uid == kKfsUserNone) { + return -EINVAL; + } + } + if (group && *group) { + gid = NameToGid(group, now); + if (gid == kKfsGroupNone) { + return -EINVAL; + } + } + return 0; +} + +template +int +KfsClientImpl::RecursivelyApply(string& path, const KfsFileAttr& attr, T& functor) +{ + if (attr.isDirectory) { + // don't compute any filesize; don't update client cache + const size_t prevSize = path.size(); + path += "/"; + path += attr.filename; + vector entries; + int res = ReaddirPlus(path, attr.fileId, entries, false, false); + if (res < 0) { + return res; + } + for (vector::const_iterator it = entries.begin(); + it != entries.end(); + ++it) { + if (it->filename == "." 
|| it->filename == "..") { + continue; + } + if ((res = RecursivelyApply(path, *it, functor)) != 0) { + break; + } + } + path.resize(prevSize); + if (res != 0) { + return res; + } + } + return functor(path, attr); +} + +template +int +KfsClientImpl::RecursivelyApply(const char* pathname, T& functor) +{ + string path; + path.reserve(MAX_PATH_NAME_LENGTH); + path = pathname; + KfsFileAttr attr; + int res = StatSelf(pathname, attr, false); + if (res != 0) { + return res; + } + res = RecursivelyApply(path, attr, functor); + InvalidateAllCachedAttrs(); + return res; +} + +int +KfsClientImpl::Chmod(int fd, kfsMode_t mode) +{ + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + if (mode == kKfsModeUndef) { + return -EINVAL; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (! entry.fattr.IsAnyPermissionDefined()) { + return 0; // permissions aren't supported by the meta server. + } + ChmodOp op(nextSeq(), entry.fattr.fileId, mode & (entry.fattr.isDirectory ? + kfsMode_t(Permissions::kDirModeMask) : + kfsMode_t(Permissions::kFileModeMask))); + DoMetaOpWithRetry(&op); + if (op.status != 0) { + return op.status; + } + entry.fattr.mode = op.mode; + if (entry.fattr.isDirectory) { + InvalidateAllCachedAttrs(); + } + return 0; +} + +class ChmodFunc +{ +private: + KfsClientImpl& mCli; + const kfsMode_t mMode; +public: + ChmodFunc(KfsClientImpl& cli, kfsMode_t mode) + : mCli(cli), + mMode(mode) + {} + int operator()(const string& /* path */, const KfsFileAttr& attr) const + { + ChmodOp op(mCli.nextSeq(), attr.fileId, (attr.isDirectory ? 
+ kfsMode_t(Permissions::kDirModeMask) : + kfsMode_t(Permissions::kFileModeMask))); + mCli.DoMetaOpWithRetry(&op); + return op.status; + } +}; + +int +KfsClientImpl::ChmodR(const char* pathname, kfsMode_t mode) +{ + QCStMutexLocker l(mMutex); + + ChmodFunc funct(*this, mode); + return RecursivelyApply(pathname, funct); +} + +int +KfsClientImpl::Chown(int fd, const char* user, const char* group) +{ + kfsUid_t uid = kKfsUserNone; + kfsGid_t gid = kKfsGroupNone; + int ret = GetUserAndGroup(user, group, uid, gid); + if (ret != 0) { + return ret; + } + return Chown(fd, uid, gid); +} + +int +KfsClientImpl::Chown(int fd, kfsUid_t user, kfsGid_t group) +{ + if (user == kKfsUserNone && group == kKfsGroupNone) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + if (! valid_fd(fd)) { + return -EBADF; + } + FileTableEntry& entry = *(mFileTable[fd]); + if (! entry.fattr.IsAnyPermissionDefined()) { + return 0; // permissions aren't supported by the meta server. + } + if (mEUser != kKfsUserRoot && (user == kKfsUserNone || user == mEUser) && + find(mGroups.begin(), mGroups.end(), group) == + mGroups.end()) { + return -EPERM; + } + ChownOp op(nextSeq(), entry.fattr.fileId, user, group); + DoMetaOpWithRetry(&op); + if (op.status != 0) { + return op.status; + } + if (user != kKfsUserNone) { + entry.fattr.user = user; + } + if (group != kKfsGroupNone) { + entry.fattr.group = group; + } + return 0; +} + +int +KfsClientImpl::Chown(const char* pathname, const char* user, const char* group) +{ + kfsUid_t uid = kKfsUserNone; + kfsGid_t gid = kKfsGroupNone; + int ret = GetUserAndGroup(user, group, uid, gid); + if (ret != 0) { + return ret; + } + return Chown(pathname, uid, gid); +} + +int +KfsClientImpl::Chown(const char* pathname, kfsUid_t user, kfsGid_t group) +{ + if (user == kKfsUserNone && group == kKfsGroupNone) { + return -EINVAL; + } + + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + int res; + FAttr* fa = 0; + if ((res = StatSelf(pathname, attr, false, 0, &fa)) < 0) { + 
return res; + } + if (! fa) { + return -EFAULT; + } + if (! attr.IsAnyPermissionDefined()) { + return 0; // permissions aren't supported by the meta server. + } + if (mEUser != kKfsUserRoot && (user == kKfsUserNone || user == mEUser) && + find(mGroups.begin(), mGroups.end(), group) == + mGroups.end()) { + return -EPERM; + } + ChownOp op(nextSeq(), attr.fileId, user, group); + DoMetaOpWithRetry(&op); + if (op.status != 0 || ! fa) { + return op.status; + } + if (user != kKfsUserNone) { + fa->user = user; + } + if (group != kKfsGroupNone) { + fa->group = group; + } + if (fa->isDirectory) { + InvalidateAllCachedAttrs(); + } + return 0; +} + +int +KfsClientImpl::ChownR(const char* pathname, const char* user, const char* group) +{ + kfsUid_t uid = kKfsUserNone; + kfsGid_t gid = kKfsGroupNone; + int ret = GetUserAndGroup(user, group, uid, gid); + if (ret != 0) { + return ret; + } + return ChownR(pathname, uid, gid); +} + +class ChownFunc +{ +private: + KfsClientImpl& mCli; + const kfsUid_t mUser; + const kfsGid_t mGroup; +public: + ChownFunc(KfsClientImpl& cli, kfsUid_t user, kfsGid_t group) + : mCli(cli), + mUser(user), + mGroup(group) + {} + int operator()(const string& /* path */, const KfsFileAttr& attr) const + { + ChownOp op(mCli.nextSeq(), attr.fileId, mUser, mGroup); + mCli.DoMetaOpWithRetry(&op); + return op.status; + } +}; + +int +KfsClientImpl::ChownR(const char* pathname, kfsUid_t user, kfsGid_t group) +{ + QCStMutexLocker l(mMutex); + + if (mEUser != kKfsUserRoot && (user == kKfsUserNone || user == mEUser) && + find(mGroups.begin(), mGroups.end(), group) == + mGroups.end()) { + return -EPERM; + } + ChownFunc funct(*this, user, group); + return RecursivelyApply(pathname, funct); +} + +void +KfsClientImpl::SetUMask(kfsMode_t mask) +{ + QCStMutexLocker l(mMutex); + mUMask = mask & Permissions::kAccessModeMask; +} + +kfsMode_t +KfsClientImpl::GetUMask() +{ + QCStMutexLocker l(mMutex); + return mUMask; +} + +int +KfsClientImpl::SetEUserAndEGroup(kfsUid_t user, 
kfsGid_t group, + kfsGid_t* groups, int groupsCnt) +{ + return ClientsList::SetEUserAndEGroup(user, group, groups, groupsCnt); +} + +int +KfsClientImpl::CompareChunkReplicas(const char* pathname, string& md5sum) +{ + QCStMutexLocker l(mMutex); + + KfsFileAttr attr; + int res; + if ((res = StatSelf(pathname, attr, false)) < 0) { + KFS_LOG_STREAM_ERROR << + (pathname ? pathname : "null") << ": " << ErrorCodeToStr(res) << + KFS_LOG_EOM; + return res; + } + if (attr.isDirectory) { + KFS_LOG_STREAM_ERROR << pathname << " is a directory" << KFS_LOG_EOM; + return -EISDIR; + } + + GetLayoutOp lop(nextSeq(), attr.fileId); + DoMetaOpWithRetry(&lop); + if (lop.status < 0) { + KFS_LOG_STREAM_ERROR << "get layout error: " << + ErrorCodeToStr(lop.status) << + KFS_LOG_EOM; + return lop.status; + } + if (lop.ParseLayoutInfo()) { + KFS_LOG_STREAM_ERROR << "Unable to parse layout info!" << KFS_LOG_EOM; + return -EINVAL; + } + MdStream mdsAll; + MdStream mds; + bool match = true; + for (vector::const_iterator i = lop.chunks.begin(); + i != lop.chunks.end(); + ++i) { + LeaseAcquireOp leaseOp(nextSeq(), i->chunkId, pathname); + DoMetaOpWithRetry(&leaseOp); + if (leaseOp.status < 0) { + KFS_LOG_STREAM_ERROR << "failed to acquire lease: " << + " chunk: " << i->chunkId << + ErrorCodeToStr(leaseOp.status) << + KFS_LOG_EOM; + return leaseOp.status; + } + mdsAll.flush(); + mds.Reset(&mdsAll); + const int nbytes = GetChunkFromReplica( + i->chunkServers[0], i->chunkId, i->chunkVersion, mds); + if (nbytes < 0) { + KFS_LOG_STREAM_ERROR << i->chunkServers[0].ToString() << + ": " << ErrorCodeToStr(nbytes) << + KFS_LOG_EOM; + match = false; + continue; + } + const string md5sumFirst = mds.GetMd(); + mds.Reset(); + KFS_LOG_STREAM_DEBUG << + "chunk: " << i->chunkId << + " replica: " << i->chunkServers[0].ToString() << + " size: " << nbytes << + " md5sum: " << md5sumFirst << + KFS_LOG_EOM; + for (uint32_t k = 1; k < i->chunkServers.size(); k++) { + mds.Reset(); + const int n = GetChunkFromReplica( + 
i->chunkServers[k], i->chunkId, i->chunkVersion, mds); + if (n < 0) { + KFS_LOG_STREAM_ERROR << i->chunkServers[0].ToString() << + ": " << ErrorCodeToStr(n) << + KFS_LOG_EOM; + match = false; + continue; + } + const string md5sumCur = mds.GetMd(); + KFS_LOG_STREAM_DEBUG << + "chunk: " << i->chunkId << + " replica: " << i->chunkServers[k].ToString() << + " size: " << nbytes << + " md5sum: " << md5sumCur << + KFS_LOG_EOM; + if (nbytes != n || md5sumFirst != md5sumCur) { + match = false; + } + if (! match) { + KFS_LOG_STREAM_ERROR << + "chunk: " << i->chunkId << + (nbytes != n ? "size" : "data") << + " mismatch: " << i->chunkServers[0].ToString() << + " size: " << nbytes << + " md5sum: " << md5sumFirst << + " vs " << i->chunkServers[k].ToString() << + " size: " << n << + " md5sum: " << md5sumCur << + KFS_LOG_EOM; + } + } + LeaseRelinquishOp lrelOp(nextSeq(), i->chunkId, leaseOp.leaseId); + DoMetaOpWithRetry(&lrelOp); + if (leaseOp.status < 0) { + KFS_LOG_STREAM_ERROR << "failed to relinquish lease: " << + " chunk: " << i->chunkId << + ErrorCodeToStr(lrelOp.status) << + KFS_LOG_EOM; + } + } + md5sum = mdsAll.GetMd(); + return (match ? 
0 : 1); +} + +int +KfsClientImpl::GetChunkFromReplica(const ServerLocation& loc, + kfsChunkId_t chunkId, int64_t chunkVersion, ostream& os) +{ + TcpSocket sock; + int res; + if ((res = sock.Connect(loc)) < 0) { + return res; + } + SizeOp sizeOp(nextSeq(), chunkId, chunkVersion); + res = DoOpCommon(&sizeOp, &sock); + if (res < 0) { + return res; + } + if (sizeOp.status < 0) { + return sizeOp.status; + } + if (sizeOp.size <= 0) { + return 0; + } + if ((chunkOff_t)CHUNKSIZE < sizeOp.size) { + KFS_LOG_STREAM_ERROR << + "chunk size: " << sizeOp.size << + " exceeds max chunk size: " << CHUNKSIZE << + KFS_LOG_EOM; + return -EINVAL; + } + chunkOff_t nread = 0; + ReadOp op(0, chunkId, chunkVersion); + while (nread < sizeOp.size) { + op.seq = nextSeq(); + op.numBytes = min(size_t(1) << 20, (size_t)(sizeOp.size - nread)); + op.offset = nread; + op.contentLength = 0; + const int res = DoOpCommon(&op, &sock); + if (res < 0) { + nread = res; + break; + } + if (op.status < 0) { + nread = op.status; + break; + } + if (op.numBytes != op.contentLength) { + KFS_LOG_STREAM_ERROR << + "invalid read return: " << op.contentLength << + " exected: " << op.numBytes << + KFS_LOG_EOM; + nread = -EINVAL; + break; + } + os.write(op.contentBuf, op.contentLength); + nread += op.contentLength; + } + return (int)nread; +} + +const string& +KfsClientImpl::UidToName(kfsUid_t uid, time_t now) +{ + if (uid == kKfsUserNone) { + static string empty; + return empty; + } + UserNames::const_iterator it = mUserNames.find(uid); + if (it == mUserNames.end() || it->second.second < now) { + struct passwd pwebuf = {0}; + struct passwd* pwe = 0; + char nameBuf[1024]; + const int err = getpwuid_r((uid_t)uid, + &pwebuf, nameBuf, sizeof(nameBuf), &pwe); + string name; + if (err || ! pwe) { + ostringstream os; + os << uid; + name = os.str(); + } else { + name = pwe->pw_name; + } + pair const val( + name, now + mFileAttributeRevalidateTime); + pair const res = mUserNames.insert( + make_pair(uid, val)); + if (! 
res.second) { + res.first->second = val; + } + it = res.first; + } + return it->second.first; +} + +const string& +KfsClientImpl::GidToName(kfsGid_t gid, time_t now) +{ + if (gid == kKfsGroupNone) { + static string empty; + return empty; + } + GroupNames::const_iterator it = mGroupNames.find(gid); + if (it == mGroupNames.end() || it->second.second < now) { + struct group gbuf = {0}; + struct group* pge = 0; + char nameBuf[1024]; + const int err = getgrgid_r((gid_t)gid, + &gbuf, nameBuf, sizeof(nameBuf), &pge); + string name; + if (err || ! pge) { + ostringstream os; + os << gid; + name = os.str(); + } else { + name = pge->gr_name; + } + pair const val( + name, now + mFileAttributeRevalidateTime); + pair const res = mGroupNames.insert( + make_pair(gid, val)); + if (! res.second) { + res.first->second = val; + } + it = res.first; + } + return it->second.first; +} + + +kfsUid_t +KfsClientImpl::NameToUid(const string& name, time_t now) +{ + UserIds::const_iterator it = mUserIds.find(name); + if (it == mUserIds.end() || it->second.second < now) { + struct passwd pwebuf = {0}; + struct passwd* pwe = 0; + char nameBuf[1024]; + const int err = getpwnam_r(name.c_str(), + &pwebuf, nameBuf, sizeof(nameBuf), &pwe); + kfsUid_t uid; + if (err || ! pwe) { + char* end = 0; + uid = (kfsUid_t)strtol(name.c_str(), &end, 0); + if (! end || *end > ' ') { + uid = kKfsUserNone; + } + } else { + uid = (kfsUid_t)pwe->pw_uid; + } + pair const val( + uid, now + mFileAttributeRevalidateTime); + pair const res = mUserIds.insert( + make_pair(name, val)); + if (! 
res.second) { + res.first->second = val; + } + it = res.first; + } + return it->second.first; +} + +kfsGid_t +KfsClientImpl::NameToGid(const string& name, time_t now) +{ + GroupIds::const_iterator it = mGroupIds.find(name); + if (it == mGroupIds.end() || it->second.second < now) { + struct group gbuf = {0}; + struct group* pge = 0; + char nameBuf[1024]; + const int err = getgrnam_r(name.c_str(), + &gbuf, nameBuf, sizeof(nameBuf), &pge); + kfsGid_t gid; + if (err || ! pge) { + char* end = 0; + gid = (kfsUid_t)strtol(name.c_str(), &end, 0); + if (! end || *end > ' ') { + gid = kKfsGroupNone; + } + } else { + gid = pge->gr_gid; + } + pair const val( + gid, now + mFileAttributeRevalidateTime); + pair const res = mGroupIds.insert( + make_pair(name, val)); + if (! res.second) { + res.first->second = val; + } + it = res.first; + } + return it->second.first; +} + +int +KfsClientImpl::GetUserAndGroupNames(kfsUid_t user, kfsGid_t group, + string& uname, string& gname) +{ + QCStMutexLocker l(mMutex); + const time_t now = time(0); + if (user != kKfsUserNone) { + uname = UidToName(user, now); + } + if (group != kKfsGroupNone) { + gname = GidToName(group, now); + } + return 0; +} + +} // client +} // KFS diff --git a/src/cc/libclient/KfsClient.h b/src/cc/libclient/KfsClient.h new file mode 100644 index 000000000..3306986e3 --- /dev/null +++ b/src/cc/libclient/KfsClient.h @@ -0,0 +1,644 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/04/18 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file KfsClient.h +// \brief Kfs Client-library code. +// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSCLIENT_KFSCLIENT_H +#define LIBKFSCLIENT_KFSCLIENT_H + +#include +#include + +#include "KfsAttr.h" + +namespace KFS { +using std::string; +using std::vector; + +namespace client { +class KfsClientImpl; +} + +/// Maximum length of a filename +const size_t MAX_FILENAME_LEN = 256; + +/// +/// \brief The KfsClient is the "bridge" between applications and the +/// KFS servers (either the metaserver or chunkserver): there can be +/// only one client per metaserver. +/// +/// The KfsClientFactory class can be used to produce KfsClient +/// objects, where each client is used to interface with a different +/// metaserver. The preferred method of creating a client object is +/// thru the client factory. +/// + + +class KfsClient { +public: + typedef client::KfsClientImpl KfsClientImpl; + + KfsClient(); + ~KfsClient(); + + /// + /// @param[in] metaServerHost Machine on meta is running + /// @param[in] metaServerPort Port at which we should connect to + /// @retval 0 on success; -1 on failure + /// + int Init(const string &metaServerHost, int metaServerPort); + + /// Set the logging level to control message verbosity + void SetLogLevel(const string &level); + + bool IsInitialized(); + + /// + /// Provide a "cwd" like facility for KFS. 
+ /// @param[in] pathname The pathname to change the "cwd" to + /// @retval 0 on success; -errno otherwise + /// + int Cd(const char *pathname); + + /// Get cwd + /// @retval a string that describes the current working dir. + /// + string GetCwd(); + + /// + /// Make a directory hierarchy in KFS. If the parent dirs are not + /// present, they are also made. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if mkdir is successful; -errno otherwise + int Mkdirs(const char *pathname, kfsMode_t mode = 0777); + + /// + /// Make a directory in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if mkdir is successful; -errno otherwise + int Mkdir(const char *pathname, kfsMode_t mode = 0777); + + /// + /// Remove a directory in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if rmdir is successful; -errno otherwise + int Rmdir(const char *pathname); + + /// + /// Remove a directory hierarchy in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if rmdir is successful; -errno otherwise + int Rmdirs(const char *pathname); + + int RmdirsFast(const char *pathname); + + /// + /// Read a directory's contents + /// @param[in] pathname The full pathname such as /.../dir + /// @param[out] result The contents of the directory + /// @retval 0 if readdir is successful; -errno otherwise + int Readdir(const char *pathname, vector &result); + + /// + /// Read a directory's contents and retrieve the attributes + /// @param[in] pathname The full pathname such as /.../dir + /// @param[out] result The files in the directory and their attributes. 
+ /// @retval 0 if readdirplus is successful; -errno otherwise + /// + int ReaddirPlus(const char *pathname, vector &result, + bool computeFilesize = true); + + /// + /// Read a directory's contents and retrieve the attributes + /// @retval 0 if readdirplus is successful; -errno otherwise + /// read() will retrieve directory entries in the form: + /// 64 bit mod time + /// 64 bit file size + /// 32 bit file replication + /// 32 bit file name length + /// 8 bit directory flag + /// file name: 8 bit times file name length + /// + int OpenDirectory(const char *pathname); + + /// + /// Stat a file and get its attributes. + /// @param[in] pathname The full pathname such as /.../foo + /// @param[out] result The attributes that we get back from server + /// @param[in] computeFilesize When set, for files, the size of + /// file is computed and the value is returned in result.st_size + /// @retval 0 if stat was successful; -errno otherwise + /// + int Stat(const char *pathname, KfsFileAttr &result, bool computeFilesize = true); + + /// + /// Given a file, return the # of chunks in the file + /// @param[in] pathname The full pathname such as /.../foo + /// @retval On success, # of chunks in the file; otherwise -1 + /// + int GetNumChunks(const char *pathname); + + int GetChunkSize(const char *pathname) { + return KFS::CHUNKSIZE; + } + + /// Update the size of a file that has been opened. It is likely + /// that the file is shared between two clients, one or more + /// writers, and a single reader. The reader needs to update its + /// view of the filesize so that it knows how much data there is. + int UpdateFilesize(int fd); + + /// + /// Helper APIs to check for the existence of (1) a path, (2) a + /// file, and (3) a directory. + /// Use Stat() if more than one property needs to be tested at the same + /// time. 
+ /// @param[in] pathname The full pathname such as /.../foo + /// @retval status: True if it exists; false otherwise + /// + bool Exists(const char *pathname); + bool IsFile(const char *pathname); + bool IsDirectory(const char *pathname); + + struct BlockInfo + { + chunkOff_t offset; + kfsChunkId_t id; + int64_t version; + ServerLocation server; + chunkOff_t size; + + BlockInfo() + : offset(-1), + id(-1), + version(-1), + server(), + size(-1) + {} + }; + typedef vector BlockInfos; + + /// + /// For testing/debugging purposes, it might be useful to know where + /// the blocks of a file are and what their sizes happen to be. + /// ** This method results in synchronous communication with all chunk + /// servers hosting chunk replicas of the file specified, no + /// other kfs client methods will execute concurrently until this method + /// returns. + /// @param[in] pathname the name of the file that is being queried. + /// @retval status code + /// + int EnumerateBlocks(const char* pathname, BlockInfos& res); + + /// + /// Given a file in KFS, verify that all N copies of each chunk are + /// identical. + /// For files with replication 1, and for files with recovery and + /// replication 1 this method presently effectively performs *no* + /// verification. + /// ** This method results in synchronous communication with all chunk + /// servers hosting chunk replicas of the file specified, no other kfs + /// client methods will execute concurrently until this method returns. + /// + /// @param[in] pathname the name of the file that is being queried. + /// @param[out] md5sum A string representation of the md5sum of + /// all chunks in the file. For striped files the returned md5sum will not + /// match md5sum of the file content. + /// @retval status code -- 0 OK, 1 mismatch < 0 -- error + /// + int CompareChunkReplicas(const char *pathname, string &md5sum); + + /// + /// Verify that the checksums on replicas are identical. 
+ /// For files with replication 1, and for files with recovery and + /// replication 1 this method presently effectively performs *no* + /// verification. + /// ** These methods result in synchronous communication with all chunk + /// servers hosting chunk replicas of the file specified, no other kfs + /// client methods will execute concurrently until this method returns. + /// + /// @param[in] pathname the name of the file that is being queried. + /// @retval status code -- 0 OK, 1 mismatch < 0 -- error + /// + int VerifyDataChecksums(const char* pathname); + int VerifyDataChecksums(int fd); + + + /// @param[in] params create params encoded as string: + /// empty or null string: replication 2 + /// S -- 6+3 Reed Solomon 64KB stripes 1 replica + /// ,,,, + /// type: + /// 1 -- mirror replicas, all parameters except replicas ignored + /// 2 -- reed solomon, only 0 or 3 recovery stripes are currently + /// supported + /// @param[out] numReplicas + /// @param[out] numStripes + /// @param[out] numRecoveryStripes + /// @param[out] stripeSize + /// @param[out] stripedType + /// @retval 0 on success; -errno on failure. + static int ParseCreateParams(const char* params, int& numReplicas, + int& numStripes, int& numRecoveryStripes, int& stripeSize, + int& stripedType); + + /// + /// Create a file which is specified by a complete path. + /// @param[in] pathname that has to be created + /// @param[in] numReplicas the desired degree of replication for + /// the file. + /// @param[in] exclusive create will fail if the exists (O_EXCL flag) + /// @retval on success, fd corresponding to the created file; + /// -errno on failure. + /// + int Create(const char *pathname, int numReplicas = 3, bool exclusive = false, + int numStripes = 0, int numRecoveryStripes = 0, int stripeSize = 0, + int stripedType = KFS_STRIPED_FILE_TYPE_NONE, bool forceTypeFlag = true, + kfsMode_t mode = 0666); + + /// + /// Create a file which is specified by a complete path. 
+ /// @param[in] pathname that has to be created + /// @param[in] exclusive create will fail if the file exists (O_EXCL flag) + /// @param[in] params in ParseCreateParams() format + /// @retval on success, fd corresponding to the created file; + /// -errno on failure. + /// + int Create(const char *pathname, bool exclusive, const char* params); + + /// + /// Remove a file which is specified by a complete path. + /// @param[in] pathname that has to be removed + /// @retval status code + /// + int Remove(const char *pathname); + + /// + /// Rename file/dir corresponding to oldpath to newpath + /// @param[in] oldpath path corresponding to the old name + /// @param[in] newpath path corresponding to the new name + /// @param[in] overwrite when set, overwrite the newpath if it + /// exists; otherwise, the rename will fail if newpath exists + /// @retval 0 on success; -1 on failure + /// + int Rename(const char *oldpath, const char *newpath, bool overwrite = true); + + int CoalesceBlocks(const char *srcPath, const char *dstPath, chunkOff_t *dstStartOffset); + /// + /// Set the mtime for a path + /// @param[in] pathname for which mtime has to be set + /// @param[in] mtime the desired mtime + /// @retval status code + /// + int SetMtime(const char *pathname, const struct timeval &mtime); + + /// + /// Open a file + /// @param[in] pathname that has to be opened + /// @param[in] openFlags modeled after open(). 
The specific set
+    /// of flags currently supported are:
+    /// O_CREAT, O_CREAT|O_EXCL, O_RDWR, O_RDONLY, O_WRONLY, O_TRUNC, O_APPEND
+    /// @param[in] numReplicas if O_CREAT is specified, then this is the
+    /// desired degree of replication for the file
+    /// @retval fd corresponding to the opened file; -errno on failure
+    ///
+    int Open(const char *pathname, int openFlags, int numReplicas = 3,
+        int numStripes = 0, int numRecoveryStripes = 0, int stripeSize = 0,
+        int stripedType = KFS_STRIPED_FILE_TYPE_NONE,
+        kfsMode_t mode = 0666);
+
+    ///
+    /// Open a file which is specified by a complete path.
+    /// @param[in] pathname that has to be opened
+    /// @param[in] params in ParseCreateParams() format, used if the
+    /// file is created
+    /// @retval on success, fd corresponding to the opened file;
+    /// -errno on failure.
+    ///
+    int Open(const char *pathname, int openFlags, const char* params,
+        kfsMode_t mode = 0666);
+
+    ///
+    /// Close a file
+    /// @param[in] fd that corresponds to a previously opened file
+    /// table entry.
+    ///
+    int Close(int fd);
+
+    ///
+    /// Append a record to the chunk that we are writing to in the
+    /// file with one caveat: the record should not straddle chunk
+    /// boundaries. That is, if there is insufficient space in the
+    /// chunk to hold the record, then this record will be written to
+    /// a newly allocated chunk.
+    ///
+    int RecordAppend(int fd, const char *buf, int reclen);
+
+    ///
+    /// With atomic record appends, if multiple clients are writing to
+    /// the same file, the writes are serialized by the chunk master.
+    ///
+    int AtomicRecordAppend(int fd, const char *buf, int reclen);
+
+    void EnableAsyncRW();
+    void DisableAsyncRW();
+
+    ///
+    /// To mask network latencies, applications can prefetch data from
+    /// a chunk. A request is enqueued and a prefetch thread will
+    /// start pulling the data. For the case of read, there can only
+    /// be one outstanding prefetch for a chunk: Data is read from
+    /// the current file position in the chunk.
As a result of the + /// read request, the file pointer is NOT modified. Furthermore, + /// the prefetch will not straddle chunk boundaries. When a + /// subsequent read is issued, that read will do the completion + /// handling; if data prefetched is less than what was asked, the + /// sync read call will read the additional data/do failover. + /// + /// @param[in] fd that corresponds to a previously opened file + /// table entry. + /// @param buf For read, the buffer will be filled with data. The + /// caller is expected to provide sufficient buffer to hold the + /// prefetch data AND the buffer shouldn't be mutated while the + /// read is on-going. + /// @param[in] numBytes The # of bytes of I/O to be done. + /// @retval status code + /// + int ReadPrefetch(int fd, char *buf, size_t numBytes); + + /// + /// Similar to read prefetch, queue a write to a chunk. In + /// contrast to the read case, there are several differences: + /// 1. Each time an async write is issued, the write starts at the + /// "current" file pointer for N bytes; the call WILL advance the + /// file pointer. + /// 2. The write may straddle chunk boundaries. This call will + /// breakup the write into multiple requests. + /// 3. When an async write request is queued, the app is expected + /// to NOT mutate the buffer until the write is complete. + /// + /// @param[in] fd that corresponds to a previously opened file + /// table entry. + /// @param buf For writes, the buffer containing data to be written. + /// @param[in] numBytes The # of bytes of I/O to be done. + /// @retval status code + /// + int WriteAsync(int fd, const char *buf, size_t numBytes); + + /// + /// A set of async writes were issued to a file. Call this method + /// to do completion handling. If any of the async writes had + /// failed, this method will do a sync write. If the sync write + /// fails, this method will return an error. + /// + /// @param[in] fd that corresponds to a previously opened file + /// table entry. 
+    /// @retval 0 on success; -1 if the writes failed
+    ///
+    int WriteAsyncCompletionHandler(int fd);
+
+    ///
+    /// Read/write the desired # of bytes to the file, starting at the
+    /// "current" position of the file.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// table entry.
+    /// @param buf For read, the buffer will be filled with data; for
+    /// writes, this buffer supplies the data to be written out.
+    /// @param[in] numBytes The # of bytes of I/O to be done.
+    /// @retval On success, return of bytes of I/O done (>= 0);
+    /// on failure, return status code (< 0).
+    ///
+    ssize_t Read(int fd, char *buf, size_t numBytes);
+    ssize_t Write(int fd, const char *buf, size_t numBytes);
+
+    ssize_t PRead(int fd, chunkOff_t pos, char *buf, size_t numBytes);
+    ssize_t PWrite(int fd, chunkOff_t pos, const char *buf, size_t numBytes);
+
+    /// If there are any holes in a file, such as those at the end of
+    /// a chunk, skip over them.
+    void SkipHolesInFile(int fd);
+
+    ///
+    /// \brief Sync out data that has been written (to the "current" chunk).
+    /// @param[in] fd that corresponds to a file that was previously
+    /// opened for writing.
+    ///
+    int Sync(int fd);
+
+    /// \brief Adjust the current position of the file pointer similar
+    /// to the seek() system call.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset offset to which the pointer should be moved
+    /// relative to whence.
+    /// @param[in] whence one of SEEK_CUR, SEEK_SET, SEEK_END
+    /// @retval On success, the offset to which the file
+    /// pointer was moved to; (chunkOff_t) -1 on failure.
+    ///
+    chunkOff_t Seek(int fd, chunkOff_t offset, int whence);
+    /// In this version of seek, whence == SEEK_SET
+    chunkOff_t Seek(int fd, chunkOff_t offset);
+
+    /// Return the current position of the file pointer in the file.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @retval value returned is analogous to calling ftell()
+    chunkOff_t Tell(int fd);
+
+    ///
+    /// Truncate a file to the specified offset.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset the offset to which the file should be truncated
+    /// @retval status code
+    int Truncate(int fd, chunkOff_t offset);
+    int Truncate(const char* pathname, chunkOff_t offset);
+
+    ///
+    /// Truncation, but going in the reverse direction: delete chunks
+    /// from the beginning of the file to the specified offset
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset the offset before which the chunks should
+    /// be deleted
+    /// @retval status code
+    int PruneFromHead(int fd, chunkOff_t offset);
+
+    ///
+    /// Given a starting offset/length, return the location of all the
+    /// chunks that cover this region. By location, we mean the name
+    /// of the chunkserver that is hosting the chunk. This API can be
+    /// used for job scheduling.
+    ///
+    /// @param[in] pathname The full pathname of the file such as /../foo
+    /// @param[in] start The starting byte offset
+    /// @param[in] len The length in bytes that define the region
+    /// @param[out] locations The location(s) of various chunks
+    /// @retval status: 0 on success; -errno otherwise
+    ///
+    int GetDataLocation(const char *pathname, chunkOff_t start, chunkOff_t len,
+        vector< vector<string> > &locations);
+
+    int GetDataLocation(int fd, chunkOff_t start, chunkOff_t len,
+        vector< vector<string> > &locations);
+
+    ///
+    /// Get the degree of replication for the pathname.
+    /// @param[in] pathname The full pathname of the file such as /../foo
+    /// @retval count
+    ///
+    int16_t GetReplicationFactor(const char *pathname);
+
+    ///
+    /// Set the degree of replication for the pathname.
+ /// @param[in] pathname The full pathname of the file such as /../foo + /// @param[in] numReplicas The desired degree of replication. + /// @retval -1 on failure; on success, the # of replicas that will be made. + /// + int16_t SetReplicationFactor(const char *pathname, int16_t numReplicas); + + ServerLocation GetMetaserverLocation() const; + + /// Set a timeout of nsecs for an IO op. If the op doesn't + /// complete in nsecs, it returns an error to the client. + /// @param[in] desired op timeout in secs + /// + void SetDefaultIOTimeout(int nsecs); + int GetDefaultIOTimeout() const; + + void SetRetryDelay(int nsecs); + int GetRetryDelay() const; + + void SetMaxRetryPerOp(int retryCount); + int GetMaxRetryPerOp() const; + + /// + /// Set default io buffer size. + /// This has no effect on already opened files. + /// SetIoBufferSize() can be used to change buffer size for opened file. + /// @param[in] desired buffer size + /// @retval actual buffer size + // + ssize_t SetDefaultIoBufferSize(size_t size); + + /// + /// Get read ahead / write behind default buffer size. + /// @retval buffer size + // + ssize_t GetDefaultIoBufferSize() const; + + /// + /// Set file io buffer size. + /// @param[in] fd that corresponds to a previously opened file + /// @param[in] desired buffer size + /// @retval actual buffer size + // + ssize_t SetIoBufferSize(int fd, size_t size); + + /// + /// Get file io buffer size. + /// @param[in] fd that corresponds to a previously opened file + /// @retval buffer size + // + ssize_t GetIoBufferSize(int fd) const; + + /// + /// Set default read ahead size. + /// This has no effect on already opened files. + /// @param[in] desired read ahead size + /// @retval actual default read ahead size + // + ssize_t SetDefaultReadAheadSize(size_t size); + + /// + /// Get read ahead / write behind default buffer size. + /// @retval buffer size + // + ssize_t GetDefaultReadAheadSize() const; + + /// + /// Set file read ahead size. 
+ /// @param[in] fd that corresponds to a previously opened file + /// @param[in] desired read ahead size + /// @retval actual read ahead size + // + ssize_t SetReadAheadSize(int fd, size_t size); + + /// A read for an offset that is after the specified value will result in EOF + void SetEOFMark(int fd, chunkOff_t offset); + + /// + /// Get file read ahead size. + /// @param[in] fd that corresponds to a previously opened file + /// @retval read ahead size + // + ssize_t GetReadAheadSize(int fd) const; + + int GetFileOrChunkInfo(kfsFileId_t fileId, kfsChunkId_t chunkId, + KfsFileAttr& fattr, chunkOff_t& offset, int64_t& chunkVersion, + vector& servers); + void SetDefaultFullSparseFileSupport(bool flag); + // Must be invoked before issuing the first read. + int SetFullSparseFileSupport(int fd, bool flag); + void SetFileAttributeRevalidateTime(int secs); + int Chmod(const char* pathname, kfsMode_t mode); + int Chmod(int fd, kfsMode_t mode); + int Chown(const char* pathname, kfsUid_t user, kfsGid_t group); + int Chown(int fd, kfsUid_t user, kfsGid_t group); + int Chown(const char* pathname, const char* user, const char* group); + int Chown(int fd, const char* user, const char* group); + int ChmodR(const char* pathname, kfsMode_t mode); + int ChownR(const char* pathname, kfsUid_t user, kfsGid_t group); + int ChownR(const char* pathname, const char* user, const char* group); + void SetUMask(kfsMode_t mask); + kfsMode_t GetUMask() const; + // Must be invoked before invoking any other method. + int SetEUserAndEGroup(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt); + int GetUserAndGroupNames(kfsUid_t user, kfsGid_t group, + string& uname, string& gname); +private: + KfsClientImpl* const mImpl; +}; + +/// +/// @param[in] propFile that describes where the server is and +/// other client configuration info. +/// +KfsClient *Connect(const char *propFile); + +/// +/// Get the client object corresponding to the specified +/// metaserver. 
If an object hasn't been created previously, +/// create a new one and return it. The client object returned is +/// all setup---connected to metaserver and such. +/// @retval if connection to metaserver succeeds, a client object +/// that is "ready" for use; NULL if there was an error +/// +KfsClient *Connect(const string &metaServerHost, int metaServerPort); + +/// Given a error status code, return a string describing the error. +/// @param[in] status The status code for an error. +/// @retval String that describes what the error is. +extern string ErrorCodeToStr(int status); +} + +#endif // LIBKFSCLIENT_KFSCLIENT_H diff --git a/src/cc/libclient/KfsClientInt.h b/src/cc/libclient/KfsClientInt.h new file mode 100644 index 000000000..c6063d011 --- /dev/null +++ b/src/cc/libclient/KfsClientInt.h @@ -0,0 +1,913 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/04/18 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSCLIENT_KFSCLIENTINT_H +#define LIBKFSCLIENT_KFSCLIENTINT_H + +#include "common/MsgLogger.h" +#include "common/hsieh_hash.h" +#include "common/kfstypes.h" +#include "common/PoolAllocator.h" +#include "kfsio/TcpSocket.h" +#include "kfsio/checksum.h" +#include "qcdio/QCDLList.h" + +#include "KfsAttr.h" +#include "KfsOps.h" +#include "KfsClient.h" +#include "Path.h" +#include "qcdio/QCMutex.h" + +#include +#include +#include +#include + +namespace KFS { +namespace client { + +using std::string; +using std::map; +using std::vector; +using std::pair; +using std::less; +using std::map; +using std::equal_to; +using std::less; +using std::ostream; + +/// If an op fails because the server crashed, retry the op. This +/// constant defines the # of retries before declaring failure. +const int DEFAULT_NUM_RETRIES_PER_OP = 30; + +/// Whenever an op fails, we need to give time for the server to +/// recover. So, introduce a delay of 5 secs between retries. +const int RETRY_DELAY_SECS = 5; + +/// Whenever we have issues with lease failures, we retry the op after 5 secs +const int LEASE_RETRY_DELAY_SECS = 5; + +/// +/// A KfsClient maintains a file-table that stores information about +/// KFS files on that client. Each file in the file-table is composed +/// of some number of chunks; the meta-information about each +/// chunk is stored in a chunk-table associated with that file. Thus, given a +/// , we can map it to the appropriate ; we can also find where that piece of +/// data is located and appropriately access it. +/// + +class ReadRequest; +class ReadRequestCondVar; + +/// +/// \brief Read buffer class used with read ahead. +/// +class ReadBuffer +{ +public: + ReadBuffer() + : mStart(-1), + mSize(0), + mBufSize(0), + mStatus(0), + mAllocBuf(0), + mBuf(0), + mReadReq(0) + {} + ~ReadBuffer() + { + assert(! 
mReadReq); + delete [] mAllocBuf; + } + void Invalidate() + { mSize = 0; } + char* GetBufPtr() + { + if (mReadReq) { + return 0; + } + if (mBufSize > 0) { + assert(mBuf); + return mBuf; + } + mBufSize = -mBufSize; + delete [] mAllocBuf; + mAllocBuf = 0; + mBuf = 0; + mSize = 0; + mStatus = 0; + if (mBufSize <= 0) { + return 0; + } + const int kAlign = 16; + mAllocBuf = new char[mBufSize + kAlign]; + mBuf = mAllocBuf + kAlign - (mAllocBuf - (char*)0) % kAlign; + return mBuf; + } + void SetBufSize(int size) + { + if (GetBufSize() != size) { + mBufSize = size < 0 ? 0 : -size; + } + } + int GetBufSize() const + { return (mBufSize < 0 ? -mBufSize : mBufSize); } +private: + chunkOff_t mStart; + int mSize; + int mBufSize; + int mStatus; + char* mAllocBuf; + char* mBuf; + ReadRequest* mReadReq; + + friend class ReadRequest; + + char* DetachBuffer() + { + char* const ret = mAllocBuf; + mStart = -1; + mBuf = 0; + mAllocBuf = 0; + mSize = 0; + mStatus = 0; + mBufSize = -GetBufSize(); + return ret; + } +private: + ReadBuffer(const ReadBuffer& buf); + ReadBuffer& operator=(const ReadBuffer& buf); +}; + +class KfsClientImpl; + +/// +/// \brief Location of the file pointer in a file. +/// +struct FilePosition { + FilePosition() + : fileOffset(0) + {} + void Reset() { + fileOffset = 0; + } + chunkOff_t fileOffset; // offset within the file +}; + +/// +/// \brief A table of entries that describe each open KFS file. +/// +struct FileTableEntry { + // the fid of the parent dir in which this entry "resides" + kfsFileId_t parentFid; + // stores the name of the file/directory. 
+ string name; + // the full pathname + string pathname; + // one of O_RDONLY, O_WRONLY, O_RDWR; when it is 0 for a file, + // this entry is used for attribute caching + int openMode; + FileAttr fattr; + // the position in the file at which the next read/write will occur + FilePosition currPos; + /// the user has set a marker beyond which reads should return EOF + chunkOff_t eofMark; + + bool skipHoles:1; + bool usedProtocolWorkerFlag:1; + bool readUsedProtocolWorkerFlag:1; + bool cachedAttrFlag:1; + bool failShortReadsFlag:1; + unsigned int instance; + int64_t pending; + vector* dirEntries; + int ioBufferSize; + ReadBuffer buffer; + ReadRequest* mReadQueue[1]; + + FileTableEntry(kfsFileId_t p, const string& n, unsigned int instance): + parentFid(p), + name(n), + pathname(), + openMode(0), + fattr(), + currPos(), + eofMark(-1), + skipHoles(false), + usedProtocolWorkerFlag(false), + readUsedProtocolWorkerFlag(false), + cachedAttrFlag(false), + failShortReadsFlag(false), + instance(instance), + pending(0), + dirEntries(0), + ioBufferSize(0), + buffer() + { mReadQueue[0] = 0; } + ~FileTableEntry() + { + delete dirEntries; + } +}; + +class KfsProtocolWorker; +/// +/// The kfs client implementation object. +/// +class KfsClientImpl { + +public: + enum { kMaxReadRequest = 1 << 20 }; + + KfsClientImpl(); + ~KfsClientImpl(); + + /// + /// @param[in] metaServerHost Machine on meta is running + /// @param[in] metaServerPort Port at which we should connect to + /// @retval 0 on success; -1 on failure + /// + int Init(const string &metaServerHost, int metaServerPort); + + ServerLocation GetMetaserverLocation() const { + return mMetaServerLoc; + } + + bool IsInitialized() { return mIsInitialized; }; + + /// + /// Provide a "cwd" like facility for KFS. + /// @param[in] pathname The pathname to change the "cwd" to + /// @retval 0 on sucess; -errno otherwise + /// + int Cd(const char *pathname); + + /// Get cwd + /// @retval a string that describes the current working dir. 
+ /// + string GetCwd(); + + /// + /// Make a directory hierarcy in KFS. If the parent dirs are not + /// present, they are also made. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if mkdir is successful; -errno otherwise + int Mkdirs(const char *pathname, kfsMode_t mode); + + /// + /// Make a directory in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if mkdir is successful; -errno otherwise + int Mkdir(const char *pathname, kfsMode_t mode); + + /// + /// Remove a directory in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if rmdir is successful; -errno otherwise + int Rmdir(const char *pathname); + + /// + /// Remove a directory hierarchy in KFS. + /// @param[in] pathname The full pathname such as /.../dir + /// @retval 0 if rmdir is successful; -errno otherwise + int Rmdirs(const char *pathname); + + int RmdirsFast(const char *pathname); + + /// + /// Read a directory's contents + /// @param[in] pathname The full pathname such as /.../dir + /// @param[out] result The contents of the directory + /// @retval 0 if readdir is successful; -errno otherwise + int Readdir(const char *pathname, vector &result); + + /// + /// Read a directory's contents and retrieve the attributes + /// @param[in] pathname The full pathname such as /.../dir + /// @param[out] result The files in the directory and their attributes. 
+ /// @param[in] computeFilesize By default, compute file size + /// @retval 0 if readdirplus is successful; -errno otherwise + /// + int ReaddirPlus(const char *pathname, vector &result, + bool computeFilesize = true); + + /// + /// Read a directory's contents and retrieve the attributes + /// @retval 0 if readdirplus is successful; -errno otherwise + /// read() will retrieve directory entries in the form: + /// 64 bit mod time + /// 64 bit file size + /// 32 bit file replication + /// 32 bit file name length + /// 8 bit directory flag + /// file name: 8 bit times file name length + /// + int OpenDirectory(const char *pathname); + + /// + /// Stat a file and get its attributes. + /// @param[in] pathname The full pathname such as /.../foo + /// @param[out] result The attributes that we get back from server + /// @param[in] computeFilesize When set, for files, the size of + /// file is computed and the value is returned in result.st_size + /// @retval 0 if stat was successful; -errno otherwise + /// + int Stat(const char *pathname, KfsFileAttr &result, bool computeFilesize = true); + + /// + /// Return the # of chunks in the file specified by the fully qualified pathname. + /// -1 if there is an error. + /// + int GetNumChunks(const char *pathname); + + int UpdateFilesize(int fd); + + /// + /// Helper APIs to check for the existence of (1) a path, (2) a + /// file, and (3) a directory. + /// @param[in] pathname The full pathname such as /.../foo + /// @retval status: True if it exists; false otherwise + /// + bool Exists(const char *pathname); + bool IsFile(const char *pathname); + bool IsDirectory(const char *pathname); + + int EnumerateBlocks(const char* pathname, KfsClient::BlockInfos& res); + + int CompareChunkReplicas(const char *pathname, string &md5sum); + + /// API to verify that checksums on all replicas are the same. + /// Each chunkserver scrubs the chunk and returns the adler-32 + /// checksum for its data. 
After scrubbing on all chunkservers, + /// we compare the checksums across replicas and return the result. + /// @retval status code -- 0 OK, 1 mismatch < 0 -- error + int VerifyDataChecksums(const char *pathname); + int VerifyDataChecksums(int fd); + + /// + /// Create a file which is specified by a complete path. + /// @param[in] pathname that has to be created + /// @param[in] numReplicas the desired degree of replication for + /// the file. + /// @param[in] exclusive create will fail if the exists (O_EXCL flag) + /// @retval on success, fd corresponding to the created file; + /// -errno on failure. + /// + int Create(const char *pathname, int numReplicas = 3, bool exclusive = false, + int numStripes = 0, int numRecoveryStripes = 0, int stripeSize = 0, + int stripedType = KFS_STRIPED_FILE_TYPE_NONE, bool forceTypeFlag = true, + kfsMode_t mode = kKfsModeUndef); + + /// + /// Remove a file which is specified by a complete path. + /// @param[in] pathname that has to be removed + /// @retval status code + /// + int Remove(const char *pathname); + + /// + /// Rename file/dir corresponding to oldpath to newpath + /// @param[in] oldpath path corresponding to the old name + /// @param[in] newpath path corresponding to the new name + /// @param[in] overwrite when set, overwrite the newpath if it + /// exists; otherwise, the rename will fail if newpath exists + /// @retval 0 on success; -1 on failure + /// + int Rename(const char *oldpath, const char *newpath, bool overwrite = true); + + int CoalesceBlocks(const char *srcPath, const char *dstPath, chunkOff_t *dstStartOffset); + + /// + /// Set the mtime for a path + /// @param[in] pathname for which mtime has to be set + /// @param[in] mtime the desired mtime + /// @retval status code + /// + int SetMtime(const char *pathname, const struct timeval &mtime); + + /// + /// Open a file + /// @param[in] pathname that has to be opened + /// @param[in] openFlags modeled after open(). 
The specific set + /// of flags currently supported are: + /// O_CREAT, O_CREAT|O_EXCL, O_RDWR, O_RDONLY, O_WRONLY, O_TRUNC, O_APPEND + /// @param[in] numReplicas if O_CREAT is specified, then this the + /// desired degree of replication for the file + /// @retval fd corresponding to the opened file; -errno on failure + /// + int Open(const char *pathname, int openFlags, int numReplicas, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + kfsMode_t mode); + + /// + /// Close a file + /// @param[in] fd that corresponds to a previously opened file + /// table entry. + /// + int Close(int fd); + + /// + /// Append a record to the chunk that we are writing to in the + /// file with one caveat: the record should not straddle chunk + /// boundaries. + /// @param[in] fd that correpsonds to the file open for writing + /// @param[in] buf the record that should be appended + /// @param[in] reclen the length of the record + /// @retval Status code + /// + int RecordAppend(int fd, const char *buf, int reclen); + int AtomicRecordAppend(int fd, const char *buf, int reclen); + + /// See the comments in KfsClient.h + int ReadPrefetch(int fd, char *buf, size_t numBytes); + + int WriteAsync(int fd, const char *buf, size_t numBytes); + int WriteAsyncCompletionHandler(int fd); + + /// + /// Read/write the desired # of bytes to the file, starting at the + /// "current" position of the file. + /// @param[in] fd that corresponds to a previously opened file + /// table entry. + /// @param buf For read, the buffer will be filled with data; for + /// writes, this buffer supplies the data to be written out. + /// @param[in] numBytes The # of bytes of I/O to be done. + /// @retval On success, return of bytes of I/O done (>= 0); + /// on failure, return status code (< 0). 
+    ///
+    ssize_t Read(int fd, char *buf, size_t numBytes, chunkOff_t* pos = 0);
+    ssize_t Write(int fd, const char *buf, size_t numBytes, chunkOff_t* pos = 0);
+
+    /// If there are any holes in a file, such as those at the end of
+    /// a chunk, skip over them.
+    void SkipHolesInFile(int fd);
+
+    ///
+    /// \brief Sync out data that has been written (to the "current" chunk).
+    /// @param[in] fd that corresponds to a file that was previously
+    /// opened for writing.
+    ///
+    int Sync(int fd);
+
+    /// \brief Adjust the current position of the file pointer similar
+    /// to the seek() system call.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset offset to which the pointer should be moved
+    /// relative to whence.
+    /// @param[in] whence one of SEEK_CUR, SEEK_SET, SEEK_END
+    /// @retval On success, the offset to which the file
+    /// pointer was moved to; (chunkOff_t) -1 on failure.
+    ///
+    chunkOff_t Seek(int fd, chunkOff_t offset, int whence);
+    /// In this version of seek, whence == SEEK_SET
+    chunkOff_t Seek(int fd, chunkOff_t offset);
+
+    /// Return the current position of the file pointer in the file.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @retval value returned is analogous to calling ftell()
+    chunkOff_t Tell(int fd);
+
+    ///
+    /// Truncate a file to the specified offset.
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset the offset to which the file should be truncated
+    /// @retval status code
+    int Truncate(int fd, chunkOff_t offset);
+    int Truncate(const char* pathname, chunkOff_t offset);
+
+    ///
+    /// Truncation, but going in the reverse direction: delete chunks
+    /// from the beginning of the file to the specified offset
+    /// @param[in] fd that corresponds to a previously opened file
+    /// @param[in] offset the offset before which the chunks should
+    /// be deleted
+    /// @retval status code
+    int PruneFromHead(int fd, chunkOff_t offset);
+
+    ///
+    /// Given a starting offset/length, return the location of all the
+    /// chunks that cover this region. By location, we mean the name
+    /// of the chunkserver that is hosting the chunk. This API can be
+    /// used for job scheduling.
+    ///
+    /// @param[in] pathname The full pathname of the file such as /../foo
+    /// @param[in] start The starting byte offset
+    /// @param[in] len The length in bytes that define the region
+    /// @param[out] locations The location(s) of various chunks
+    /// @retval status: 0 on success; -errno otherwise
+    ///
+    int GetDataLocation(const char *pathname, chunkOff_t start, chunkOff_t len,
+        vector<vector<string> > &locations);
+
+    int GetDataLocation(int fd, chunkOff_t start, chunkOff_t len,
+        vector<vector<string> > &locations);
+
+    ///
+    /// Get the degree of replication for the pathname.
+    /// @param[in] pathname The full pathname of the file such as /../foo
+    /// @retval count
+    ///
+    int16_t GetReplicationFactor(const char *pathname);
+
+    ///
+    /// Set the degree of replication for the pathname.
+    /// @param[in] pathname The full pathname of the file such as /../foo
+    /// @param[in] numReplicas The desired degree of replication.
+    /// @retval -1 on failure; on success, the # of replicas that will be made.
+ /// + int16_t SetReplicationFactor(const char *pathname, int16_t numReplicas); + + void SetDefaultIOTimeout(int nsecs); + int GetDefaultIOTimeout() const; + void SetRetryDelay(int nsecs); + int GetRetryDelay() const; + void SetMaxRetryPerOp(int retryCount); + int GetMaxRetryPerOp() const; + + ssize_t SetDefaultIoBufferSize(size_t size); + ssize_t GetDefaultIoBufferSize() const; + ssize_t SetIoBufferSize(int fd, size_t size); + ssize_t GetIoBufferSize(int fd) const; + + ssize_t SetDefaultReadAheadSize(size_t size); + ssize_t GetDefaultReadAheadSize() const; + ssize_t SetReadAheadSize(int fd, size_t size); + ssize_t GetReadAheadSize(int fd) const; + + /// A read for an offset that is after the specified value will result in EOF + void SetEOFMark(int fd, chunkOff_t offset); + + void SetMaxNumRetriesPerOp(int maxNumRetries); + int GetFileOrChunkInfo(kfsFileId_t fileId, kfsChunkId_t chunkId, + KfsFileAttr& fattr, chunkOff_t& offset, int64_t& chunkVersion, + vector& servers); + void SetDefaultFullSparseFileSupport(bool flag); + // Must be invoked before issuing the first read. 
+ int SetFullSparseFileSupport(int fd, bool flag); + void SetFileAttributeRevalidateTime(int secs); + int Chmod(const char* pathname, kfsMode_t mode); + int Chmod(int fd, kfsMode_t mode); + int Chown(const char* pathname, kfsUid_t user, kfsGid_t group); + int Chown(int fd, kfsUid_t user, kfsGid_t group); + int Chown(const char* pathname, const char* user, const char* group); + int Chown(int fd, const char* user, const char* group); + int ChmodR(const char* pathname, kfsMode_t mode); + int ChownR(const char* pathname, kfsUid_t user, kfsGid_t group); + int ChownR(const char* pathname, const char* user, const char* group); + void SetUMask(kfsMode_t mask); + kfsMode_t GetUMask(); + int SetEUserAndEGroup(kfsUid_t user, kfsGid_t group, + kfsGid_t* groups, int groupsCnt); + int GetUserAndGroupNames(kfsUid_t user, kfsGid_t group, + string& uname, string& gname); + +private: + /// Maximum # of files a client can have open. + enum { MAX_FILES = 128 << 10 }; + + QCMutex mMutex; + + /// Seed to the random number generator + bool mIsInitialized; + /// where is the meta server located + ServerLocation mMetaServerLoc; + + /// a tcp socket that holds the connection with the server + TcpSocket mMetaServerSock; + /// seq # that we send in each command + kfsSeq_t mCmdSeqNum; + + /// The current working directory in KFS + string mCwd; + + class FAttr; + typedef map< + string, FAttr*, + less, + StdFastAllocator > + > NameToFAttrMap; + typedef map< + pair, FAttr*, + less >, + StdFastAllocator, FAttr*> > + > FidNameToFAttrMap; + class FAttr : public FileAttr + { + public: + typedef QCDLList List; + FAttr(FAttr** list) + : FileAttr(), + validatedTime(0), + generation(0), + staleSubCountsFlag(false), + fidNameIt(), + nameIt() + { + List::Init(*this); + List::PushBack(list, *this); + } + FAttr& operator=(const FileAttr& fa) + { + *static_cast(this) = fa; + return *this; + } + time_t validatedTime; + unsigned int generation; + bool staleSubCountsFlag; + FidNameToFAttrMap::iterator fidNameIt; 
+ NameToFAttrMap::iterator nameIt; + private: + FAttr* mPrevPtr[1]; + FAttr* mNextPtr[1]; + friend class QCDLListOp; + }; + typedef FAttr::List FAttrLru; + + /// keep a table of open files/directory handles. + typedef vector FileTable; + typedef PoolAllocator < + sizeof(FAttr), + 1 << 20, + 32 << 20, + true + > FAttrPool; + typedef vector FreeFileTableEntires; + typedef vector > TmpPath; + + FileTable mFileTable; + FidNameToFAttrMap mFidNameToFAttrMap; + NameToFAttrMap mPathCache; + NameToFAttrMap::iterator const mPathCacheNone; + FAttrPool mFAttrPool; + FAttr* mFAttrLru[1]; + FreeFileTableEntires mFreeFileTableEntires; + unsigned int mFattrCacheSkipValidateCnt; + int mFileAttributeRevalidateTime; + unsigned int mFAttrCacheGeneration; + TmpPath mTmpPath; + string mTmpAbsPathStr; + Path mTmpAbsPath; + string mTmpCurPath; + string mTmpDirName; + const string mSlash; + + size_t mDefaultIoBufferSize; + size_t mDefaultReadAheadSize; + bool mFailShortReadsFlag; + + unsigned int mFileInstance; + KfsProtocolWorker* mProtocolWorker; + int mMaxNumRetriesPerOp; + int mRetryDelaySec; + int mDefaultOpTimeout; + ReadRequestCondVar* mFreeCondVarsHead; + kfsUid_t mEUser; + kfsGid_t mEGroup; + kfsMode_t mUMask; + vector mGroups; + kfsSeq_t mCreateId; + + typedef map, + less, + StdFastAllocator > > + > UserNames; + typedef map, + less, + StdFastAllocator > > + > GroupNames; + typedef map, + less, + StdFastAllocator > > + > UserIds; + typedef map, + less, + StdFastAllocator > > + > GroupIds; + + UserNames mUserNames; + GroupNames mGroupNames; + UserIds mUserIds; + GroupIds mGroupIds; + + KfsClientImpl* mPrevPtr[1]; + KfsClientImpl* mNextPtr[1]; + friend class QCDLListOp; + class ClientsList; + friend class ClientsList; + + // Kfs client presently always allocated with new / malloc. Allocating large + // buffer as part of the object should present no problem. 
+ enum { kResponseBufferSize = MAX_RPC_HEADER_LEN }; + char mResponseBuffer[kResponseBufferSize + 1]; + + // Next sequence number for operations. + // This is called in a thread safe manner. + kfsSeq_t nextSeq() { return mCmdSeqNum++; } + kfsSeq_t NextCreateId() { return mCreateId++; } + + + bool IsValid(const FAttr& fa, time_t now) const + { + return (fa.generation == mFAttrCacheGeneration && + now <= fa.validatedTime + mFileAttributeRevalidateTime); + } + + void Shutdown(); + + /// Check that fd is in range + bool valid_fd(int fd) const { + return (fd >= 0 && fd < MAX_FILES && + (size_t)fd < mFileTable.size() && mFileTable[fd]); + } + + FAttr* NewFattr(kfsFileId_t parentFid, const string& name, const string& pathname); + void Delete(FAttr* fa); + bool Cache(time_t now, const string& dirname, kfsFileId_t dirFid, + const KfsFileAttr& attr); + int StatSelf(const char *pathname, KfsFileAttr &kfsattr, bool computeFilesize, + string* path = 0, FAttr** fa = 0, bool validSubCountsRequiredFlag = false); + int OpenSelf(const char *pathname, int openFlags, int numReplicas = 3, + int numStripes = 0, int numRecoveryStripes = 0, int stripeSize = 0, + int stripedType = KFS_STRIPED_FILE_TYPE_NONE, bool cacheAttributesFlag = false, + kfsMode_t mode = kKfsModeUndef, string* path = 0); + int CacheAttributes(const char* pathname); + int GetDataLocationSelf(int fd, chunkOff_t start, chunkOff_t len, + vector > &locations); + int TruncateSelf(int fd, chunkOff_t offset); + int CreateSelf(const char *pathname, int numReplicas, bool exclusive, + int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, + bool forceTypeFlag, kfsMode_t mode); + ssize_t SetReadAheadSize(FileTableEntry& inEntry, size_t inSize, bool optimalFlag = false); + ssize_t SetIoBufferSize(FileTableEntry& entry, size_t size, bool optimalFlag = false); + ssize_t SetOptimalIoBufferSize(FileTableEntry& entry, size_t size) { + return SetIoBufferSize(entry, size, true); + } + ssize_t 
SetOptimalReadAheadSize(FileTableEntry& entry, size_t size) { + return SetReadAheadSize(entry, size, true); + } + + + /// Connect to the meta server and return status. + /// @retval true if connect succeeds; false otherwise. + bool ConnectToMetaServer(); + + /// Lookup the attributes of a file given its parent file-id + /// @param[in] parentFid file-id of the parent directory + /// @param[in] filename filename whose attributes are being + /// asked + /// @param[out] result the resultant attributes + /// @param[in] computeFilesize when set, for files, the size of + /// the file is computed and returned in result.fileSize + /// @retval 0 on success; -errno otherwise + /// + int LookupAttr(kfsFileId_t parentFid, const string& filename, + FAttr*& result, bool computeFilesize, const string& path, + bool validSubCountsRequiredFlag = false); + + FAttr* LookupFAttr(kfsFileId_t parentFid, const string& name); + FAttr* LookupFAttr(const string& pathname, string* path); + FAttr* NewFAttr(kfsFileId_t parentFid, const string& name, + const string& pathname); + + /// Given a chunk, find out which chunk-server is hosting it. It + /// is possible that no server is hosting the chunk---if there is + /// a hole in the file. + /// @retval status code: 0 on success; < 0 => failure + int LocateChunk(int fd, chunkOff_t chunkOffset, ChunkAttr& chunk); + + /// Given a kfsfid with some # of chunks, compute the size of the + /// file. This involves looking up the size of the last chunk of + /// the file and then adding with the size of the remaining (full) chunks. 
+ chunkOff_t ComputeFilesize(kfsFileId_t kfsfid); + + /// Given the attributes for a set of files and the location info + /// of the last chunk of each file, compute the filesizes for each file + void ComputeFilesizes(vector &fattrs, + const vector &lastChunkInfo); + + /// Helper function: given a starting index to the two vectors, + /// compute the file sizes for each file whose last chunk is + /// stored in chunkserver at location loc. + void ComputeFilesizes(vector &fattrs, + const vector &lastChunkInfo, + size_t startIdx, const ServerLocation &loc); + + FileTableEntry* FdInfo(int fd) { return mFileTable[fd]; } + FileAttr* FdAttr(int fd) { return &FdInfo(fd)->fattr; } + + /// Do the work for an op with the metaserver; if the metaserver + /// dies in the middle, retry the op a few times before giving up. + void DoMetaOpWithRetry(KfsOp *op); + + /// Get a response from the server, where, the response is + /// terminated by "\r\n\r\n". + int GetResponse(char *buf, int bufSize, int *delims, TcpSocket *sock); + + /// Validate file or directory name. + int ValidateName(const string& name); + + /// Given a path, get the parent fileid and the name following the + /// trailing "/" + int GetPathComponents(const char *pathname, kfsFileId_t *parentFid, + string &name, string* path = 0, bool invalidateSubCountsFlag = false, + bool enforceLastDirFlag = true); + + /// File table management utilities: find a free entry in the + /// table, find the entry corresponding to a pathname, "mark" an + /// entry in the table as in use, and "mark" an entry in the table + /// as free. + int FindFreeFileTableEntry(); + + bool IsFileTableEntryValid(int fte); + + + /// Given a parent fid and name, get the corresponding entry in + /// the file table. Note: if needed, attributes will be + /// downloaded from the server. 
+ int Lookup(kfsFileId_t parentFid, const string& name, FAttr*& fa, time_t now, + const string& path); + FAttr* LookupFattr(kfsFileId_t parentFid, const string& name); + + // name -- is the last component of the pathname + int ClaimFileTableEntry(kfsFileId_t parentFid, const string& name, const string& pathname); + int AllocFileTableEntry(kfsFileId_t parentFid, const string& name, const string& pathname); + void ReleaseFileTableEntry(int fte); + + int GetDataChecksums(const ServerLocation &loc, + kfsChunkId_t chunkId, uint32_t *checksums, bool readVerifyFlag = true); + + int VerifyDataChecksumsFid(kfsFileId_t fileId); + + int GetChunkFromReplica(const ServerLocation& loc, kfsChunkId_t chunkId, + int64_t chunkVersion, ostream& os); + + int ReaddirPlus(const string& pathname, kfsFileId_t dirFid, + vector &result, + bool computeFilesize = true, bool updateClientCache = true); + + int Rmdirs(const string &parentDir, kfsFileId_t parentFid, const string &dirname, kfsFileId_t dirFid); + int Remove(const string &parentDir, kfsFileId_t parentFid, const string &entryName); + + int ReadDirectory(int fd, char *buf, size_t bufSize); + ssize_t Write(int fd, const char *buf, size_t numBytes, + bool asyncFlag, bool appendOnlyFlag, chunkOff_t* pos = 0); + void InitPendingRead(FileTableEntry& entry); + void CancelPendingRead(FileTableEntry& entry); + void CleanupPendingRead(); + int DoOpResponse(KfsOp *op, TcpSocket *sock); + int DoOpCommon(KfsOp *op, TcpSocket *sock); + int DoOpSend(KfsOp *op, TcpSocket *sock); + int RmdirsSelf(const string& path, const string& dirname, kfsFileId_t parentFid, kfsFileId_t dirFid); + void StartProtocolWorker(); + void InvalidateAllCachedAttrs(); + int GetUserAndGroup(const char* user, const char* group, kfsUid_t& uid, kfsGid_t& gid); + template int RecursivelyApply(string& path, const KfsFileAttr& attr, T& functor); + template int RecursivelyApply(const char* pathname, T& functor); + const string& UidToName(kfsUid_t uid, time_t now); + const string& 
GidToName(kfsUid_t uid, time_t now); + kfsUid_t NameToUid(const string& name, time_t now); + kfsGid_t NameToGid(const string& name, time_t now); + void InvalidateAttribute(const string& path, + bool countsOnlyFlag, bool deleteAttrFlag); + void InvalidateAttributeCounts(const string& path) + { InvalidateAttribute(path, true, false); } + void InvalidateAttributeAndCounts(const string& path) + { InvalidateAttribute(path, true, true); } + void ValidateFAttrCache(time_t now, int maxScan); + void UpdatePath(KfsClientImpl::FAttr* fa, const string& path, + bool copyPathFlag = true); + const char* GetTmpAbsPath(const char* pathname, size_t& ioLen); + + friend struct RespondingServer; + friend struct RespondingServer2; + friend class ChmodFunc; + friend class ChownFunc; +}; + +}} + +#endif // LIBKFSCLIENT_KFSCLIENTINT_H diff --git a/src/cc/libclient/KfsNetClient.cc b/src/cc/libclient/KfsNetClient.cc new file mode 100644 index 000000000..a56627c44 --- /dev/null +++ b/src/cc/libclient/KfsNetClient.cc @@ -0,0 +1,1327 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/05/20 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "KfsNetClient.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "kfsio/IOBuffer.h" +#include "kfsio/NetConnection.h" +#include "kfsio/NetManager.h" +#include "kfsio/ITimeout.h" +#include "common/kfstypes.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "common/StdAllocator.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/QCDLList.h" +#include "KfsOps.h" +#include "utils.h" + +namespace KFS +{ +namespace client +{ +using std::string; +using std::map; +using std::list; +using std::pair; +using std::make_pair; +using std::less; +using std::max; + +// Generic KFS request / response protocol state machine implementation. +class KfsNetClient::Impl : + public KfsCallbackObj, + public QCRefCountedObj, + private ITimeout +{ +public: + typedef QCRefCountedObj::StRef StRef; + + Impl( + string inHost, + int inPort, + int inMaxRetryCount, + int inTimeSecBetweenRetries, + int inOpTimeoutSec, + int inIdleTimeoutSec, + kfsSeq_t inInitialSeqNum, + const char* inLogPrefixPtr, + NetManager& inNetManager, + bool inResetConnectionOnOpTimeoutFlag, + int inMaxContentLength, + bool inFailAllOpsOnOpTimeoutFlag, + bool inMaxOneOutstandingOpFlag) + : KfsCallbackObj(), + QCRefCountedObj(), + ITimeout(), + mServerLocation(inHost, inPort), + mPendingOpQueue(), + mQueueStack(), + mConnPtr(), + mNextSeqNum( + (inInitialSeqNum < 0 ? 
-inInitialSeqNum : inInitialSeqNum) >> 1), + mReadHeaderDoneFlag(false), + mSleepingFlag(false), + mDataReceivedFlag(false), + mDataSentFlag(false), + mAllDataSentFlag(false), + mRetryConnectOnlyFlag(false), + mIdleTimeoutFlag(false), + mResetConnectionOnOpTimeoutFlag(inResetConnectionOnOpTimeoutFlag), + mFailAllOpsOnOpTimeoutFlag(inFailAllOpsOnOpTimeoutFlag), + mMaxOneOutstandingOpFlag(inMaxOneOutstandingOpFlag), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + mOpTimeoutSec(inOpTimeoutSec), + mIdleTimeoutSec(inIdleTimeoutSec), + mRetryCount(0), + mContentLength(0), + mMaxRetryCount(inMaxRetryCount), + mMaxContentLength(inMaxContentLength), + mInFlightOpPtr(0), + mOutstandingOpPtr(0), + mCurOpIt(), + mIstream(), + mOstream(), + mProperties(), + mStats(), + mEventObserverPtr(0), + mLogPrefix((inLogPrefixPtr && inLogPrefixPtr[0]) ? + (inLogPrefixPtr + string(" ")) : string()), + mNetManager(inNetManager) + { + SET_HANDLER(this, &KfsNetClient::Impl::EventHandler); + } + bool IsConnected() const + { return (mConnPtr && mConnPtr->IsGood()); } + bool Start( + string inServerName, + int inServerPort, + string* inErrMsgPtr, + bool inRetryPendingOpsFlag, + int inMaxRetryCount, + int inTimeSecBetweenRetries, + bool inRetryConnectOnlyFlag) + { + if (! 
inRetryPendingOpsFlag) { + Cancel(); + } + mRetryConnectOnlyFlag = inRetryConnectOnlyFlag; + mMaxRetryCount = inMaxRetryCount; + mTimeSecBetweenRetries = inTimeSecBetweenRetries; + return SetServer(ServerLocation(inServerName, inServerPort), + false, inErrMsgPtr); + } + bool SetServer( + const ServerLocation& inLocation, + bool inCancelPendingOpsFlag = true, + string* inErrMsgPtr = 0) + { + if (inLocation == mServerLocation) { + EnsureConnected(); + return (mSleepingFlag || IsConnected()); + } + if (inCancelPendingOpsFlag) { + Cancel(); + } + if (mSleepingFlag || IsConnected()) { + Reset(); + } + mServerLocation = inLocation; + mRetryCount = 0; + mNextSeqNum += 100; + EnsureConnected(inErrMsgPtr); + return (mSleepingFlag || IsConnected()); + } + void Reset() + { + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + ResetConnection(); + } + void Stop() + { + Reset(); + Cancel(); + } + int GetMaxRetryCount() const + { return mMaxRetryCount; } + void SetMaxRetryCount( + int inMaxRetryCount) + { mMaxRetryCount = inMaxRetryCount; } + int GetOpTimeoutSec() const + { return mOpTimeoutSec; } + void SetOpTimeoutSec( + int inTimeout) + { + mOpTimeoutSec = inTimeout; + if (IsConnected() && ! 
mPendingOpQueue.empty()) { + mConnPtr->SetInactivityTimeout(mOpTimeoutSec); + } + } + int GetIdleTimeoutSec() const + { return mIdleTimeoutSec; } + void SetIdleTimeoutSec( + int inTimeout) + { + mIdleTimeoutSec = inTimeout; + if (IsConnected() && mPendingOpQueue.empty()) { + mConnPtr->SetInactivityTimeout(mIdleTimeoutSec); + } + } + int GetTimeSecBetweenRetries() const + { return mTimeSecBetweenRetries; } + void SetTimeSecBetweenRetries( + int inTimeSec) + { mTimeSecBetweenRetries = inTimeSec; } + bool IsAllDataSent() const + { return (mDataSentFlag && mAllDataSentFlag); } + bool IsDataReceived() const + { return mDataReceivedFlag; } + bool IsDataSent() const + { return mDataSentFlag; } + bool IsRetryConnectOnly() const + { return mRetryConnectOnlyFlag; } + bool WasDisconnected() const + { return ((mDataSentFlag || mDataReceivedFlag) && ! IsConnected()); } + void SetRetryConnectOnly( + bool inFlag) + { mRetryConnectOnlyFlag = inFlag; } + void SetOpTimeout( + int inOpTimeoutSec) + { mOpTimeoutSec = inOpTimeoutSec; } + void GetStats( + Stats& outStats) const + { outStats = mStats; } + string GetServerLocation() const + { return mServerLocation.ToString(); } + bool Enqueue( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr, + IOBuffer* inBufferPtr = 0) + { + // Ensure that the op is in the queue before attempting to re-establish + // connection, as later can fail other ops, and invoke the op cancel. + // The op has to be in the queue in order for cancel to work. + mStats.mOpsQueuedCount++; + const bool theOkFlag = EnqueueSelf(inOpPtr, inOwnerPtr, inBufferPtr, 0); + EnsureConnected(0, inOpPtr); + return theOkFlag; + } + bool Cancel( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr) + { + if (! inOpPtr) { + return true; // Nothing to do. 
+ } + OpQueue::iterator theIt = mPendingOpQueue.find(inOpPtr->seq); + if (theIt != mPendingOpQueue.end()) { + if (theIt->second.mOwnerPtr != inOwnerPtr || + theIt->second.mOpPtr != inOpPtr) { + return false; + } + Cancel(theIt); + return true; + } + for (QueueStack::iterator theStIt = mQueueStack.begin(); + theStIt != mQueueStack.end(); + ++theStIt) { + if ((theIt = theStIt->find(inOpPtr->seq)) != theStIt->end()) { + if (theIt->second.mOwnerPtr != inOwnerPtr || + theIt->second.mOpPtr != inOpPtr) { + return false; + } + if (&theIt->second == mInFlightOpPtr) { + CancelInFlightOp(); + } + theIt->second.Cancel(); + break; + } + } + return true; + } + bool Cancel() + { + CancelInFlightOp(); + const bool thePendingEmptyFlag = mPendingOpQueue.empty(); + if (thePendingEmptyFlag && mQueueStack.empty()) { + return false; + } + QueueStack::iterator theIt; + if (! thePendingEmptyFlag) { + theIt = mQueueStack.insert(mQueueStack.end(), OpQueue()); + mPendingOpQueue.swap(*theIt); + } + for (QueueStack::iterator theStIt = mQueueStack.begin(); + theStIt != mQueueStack.end(); + ++theStIt) { + for (OpQueue::iterator theIt = theStIt->begin(); + theIt != theStIt->end(); + ++theIt) { + if (! theIt->second.mOpPtr) { + continue; + } + mStats.mOpsCancelledCount++; + theIt->second.Cancel(); + } + } + if (! thePendingEmptyFlag) { + mQueueStack.erase(theIt); + } + return true; + } + int EventHandler( + int inCode, + void* inDataPtr) + { + const int thePrefRefCount = GetRefCount(); + StRef theRef(*this); + + if (mEventObserverPtr && mEventObserverPtr->Event(inCode, inDataPtr)) { + return 0; + } + + const char* theReasonPtr = "network error"; + OpQueueEntry* const theOutstandingOpPtr = mOutstandingOpPtr; + switch (inCode) { + case EVENT_NET_READ: { + assert(inDataPtr && mConnPtr); + IOBuffer& theBuffer = *reinterpret_cast(inDataPtr); + mDataReceivedFlag = mDataReceivedFlag || ! 
theBuffer.IsEmpty(); + HandleResponse(theBuffer); + } + break; + + case EVENT_NET_WROTE: + assert(inDataPtr && mConnPtr); + mDataSentFlag = true; + break; + + case EVENT_INACTIVITY_TIMEOUT: + if (! mIdleTimeoutFlag && + IsConnected() && mPendingOpQueue.empty()) { + mConnPtr->SetInactivityTimeout(mIdleTimeoutSec); + mIdleTimeoutFlag = true; + break; + } + theReasonPtr = "inactivity timeout"; + // Fall through. + case EVENT_NET_ERROR: + if (mConnPtr) { + mAllDataSentFlag = ! mConnPtr->IsWriteReady(); + KFS_LOG_STREAM(mPendingOpQueue.empty() ? + MsgLogger::kLogLevelDEBUG : + MsgLogger::kLogLevelERROR) << mLogPrefix << + "closing connection: " << mConnPtr->GetSockName() << + " to: " << + mServerLocation.ToString() << + " due to " << theReasonPtr << + " error: " << + QCUtils::SysError(mConnPtr->GetSocketError()) << + " pending read: " << mConnPtr->GetNumBytesToRead() << + " write: " << mConnPtr->GetNumBytesToWrite() << + " ops: " << mPendingOpQueue.size() << + KFS_LOG_EOM; + Reset(); + } + if (mIdleTimeoutFlag) { + mStats.mConnectionIdleTimeoutCount++; + } else if (mPendingOpQueue.empty()) { + break; + } else if (mDataSentFlag || mDataReceivedFlag) { + mStats.mNetErrorCount++; + if (inCode == EVENT_INACTIVITY_TIMEOUT) { + mStats.mResponseTimeoutCount++; + } + } else { + mStats.mConnectFailureCount++; + } + if (! mPendingOpQueue.empty()) { + RetryConnect(theOutstandingOpPtr); + } + break; + + default: + assert(!"Unknown event"); + break; + } + if (thePrefRefCount <= GetRefCount()) { + OpsTimeout(); + } + return 0; + } + void SetEventObserver( + EventObserver* inEventObserverPtr) + { mEventObserverPtr = inEventObserverPtr; } + time_t Now() const + { return mNetManager.Now(); } + NetManager& GetNetManager() + { return mNetManager; } + const NetManager& GetNetManager() const + { return mNetManager; } + void SetMaxContentLength( + int inMax) + { mMaxContentLength = inMax; } + void ClearMaxOneOutstandingOpFlag() + { + if (! 
mMaxOneOutstandingOpFlag) { + return; + } + mMaxOneOutstandingOpFlag = false; + OpQueueEntry* const theOutstandingOpPtr = mOutstandingOpPtr; + mOutstandingOpPtr = 0; + bool theResetTimerFlag = ! mOutstandingOpPtr; + for (OpQueue::iterator theIt = mPendingOpQueue.begin(); + theIt != mPendingOpQueue.end() && IsConnected(); + ++theIt) { + OpQueueEntry& theEntry = theIt->second; + if (&theEntry != theOutstandingOpPtr) { + Request(theEntry, theResetTimerFlag, theEntry.mRetryCount); + theResetTimerFlag = false; + } + } + } + void SetFailAllOpsOnOpTimeoutFlag( + bool inFlag) + { + mFailAllOpsOnOpTimeoutFlag = inFlag; + } +private: + class DoNotDeallocate + { + public: + DoNotDeallocate() + {} + void operator()( + char* /* inBufferPtr */) + {} + }; + struct OpQueueEntry + { + OpQueueEntry( + KfsOp* inOpPtr = 0, + OpOwner* inOwnerPtr = 0, + IOBuffer* inBufferPtr = 0) + : mOpPtr(inOpPtr), + mOwnerPtr(inOwnerPtr), + mBufferPtr(inBufferPtr), + mTime(0), + mRetryCount(0) + {} + void Cancel() + { OpDone(true); } + void Done() + { OpDone(false); } + void OpDone( + bool inCanceledFlag) + { + if (mOwnerPtr) { + if (mOpPtr) { + KfsOp* const theOpPtr = mOpPtr; + mOpPtr = 0; + mOwnerPtr->OpDone(theOpPtr, inCanceledFlag, mBufferPtr); + } + mOwnerPtr = 0; + mBufferPtr = 0; + } else { + delete mOpPtr; + delete mBufferPtr; + mBufferPtr = 0; + mOpPtr = 0; + } + } + void Clear() + { + mOpPtr = 0; + mOwnerPtr = 0; + mBufferPtr = 0; + } + KfsOp* mOpPtr; + OpOwner* mOwnerPtr; + IOBuffer* mBufferPtr; + time_t mTime; + int mRetryCount; + }; + typedef map, + StdFastAllocator > + > OpQueue; + typedef list + > QueueStack; + enum { kMaxReadAhead = 4 << 10 }; + + ServerLocation mServerLocation; + OpQueue mPendingOpQueue; + QueueStack mQueueStack; + NetConnectionPtr mConnPtr; + kfsSeq_t mNextSeqNum; + bool mReadHeaderDoneFlag; + bool mSleepingFlag; + bool mDataReceivedFlag; + bool mDataSentFlag; + bool mAllDataSentFlag; + bool mRetryConnectOnlyFlag; + bool mIdleTimeoutFlag; + bool 
mResetConnectionOnOpTimeoutFlag; + bool mFailAllOpsOnOpTimeoutFlag; + bool mMaxOneOutstandingOpFlag; + int mTimeSecBetweenRetries; + int mOpTimeoutSec; + int mIdleTimeoutSec; + int mRetryCount; + int mContentLength; + int mMaxRetryCount; + int mMaxContentLength; + OpQueueEntry* mInFlightOpPtr; + OpQueueEntry* mOutstandingOpPtr; + OpQueue::iterator mCurOpIt; + IOBuffer::IStream mIstream; + IOBuffer::WOStream mOstream; + Properties mProperties; + Stats mStats; + EventObserver* mEventObserverPtr; + const string mLogPrefix; + NetManager& mNetManager; + + virtual ~Impl() + { Impl::Reset(); } + bool EnqueueSelf( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr, + IOBuffer* inBufferPtr, + int inRetryCount) + { + if (! inOpPtr) { + return false; + } + mIdleTimeoutFlag = false; + inOpPtr->seq = mNextSeqNum++; + const bool theResetTimerFlag = mPendingOpQueue.empty(); + pair const theRes = + mPendingOpQueue.insert(make_pair( + inOpPtr->seq, OpQueueEntry(inOpPtr, inOwnerPtr, inBufferPtr) + )); + if (! theRes.second || ! IsConnected()) { + return theRes.second; + } + if (mMaxOneOutstandingOpFlag) { + if (mOutstandingOpPtr) { + return theRes.second; + } + mOutstandingOpPtr = &(mPendingOpQueue.begin()->second); + } + Request(mOutstandingOpPtr ? 
*mOutstandingOpPtr : theRes.first->second, + theResetTimerFlag || mOutstandingOpPtr, inRetryCount); + return theRes.second; + } + void Request( + OpQueueEntry& inEntry, + bool inResetTimerFlag, + int inRetryCount) + { + KfsOp& theOp = *inEntry.mOpPtr; + theOp.Request(mOstream.Set(mConnPtr->GetOutBuffer())); + mOstream.Reset(); + if (theOp.contentLength > 0) { + if (theOp.contentBuf && theOp.contentBufLen > 0) { + assert(theOp.contentBufLen >= theOp.contentLength); + mConnPtr->Write(theOp.contentBuf, theOp.contentLength, + inResetTimerFlag); + } else if (inEntry.mBufferPtr) { + assert(size_t(inEntry.mBufferPtr->BytesConsumable()) >= + theOp.contentLength); + mConnPtr->WriteCopy(inEntry.mBufferPtr, theOp.contentLength, + inResetTimerFlag); + } + } + while (theOp.NextRequest( + mNextSeqNum, + mOstream.Set(mConnPtr->GetOutBuffer()))) { + mNextSeqNum++; + mOstream.Reset(); + } + mOstream.Reset(); + // Start the timer. + inEntry.mTime = Now(); + inEntry.mRetryCount = inRetryCount; + mConnPtr->SetInactivityTimeout(mOpTimeoutSec); + mConnPtr->Flush(inResetTimerFlag); + } + void HandleResponse( + IOBuffer& inBuffer) + { + for (; ;) { + if (! mReadHeaderDoneFlag && ! ReadHeader(inBuffer)) { + break; + } + if (mContentLength > inBuffer.BytesConsumable()) { + if (! mInFlightOpPtr) { + // Discard content. + mContentLength -= inBuffer.Consume(mContentLength); + } + if (mConnPtr) { + mConnPtr->SetMaxReadAhead(max(int(kMaxReadAhead), + mContentLength - inBuffer.BytesConsumable())); + } + break; + } + // Get ready for next op. + if (mConnPtr) { + mConnPtr->SetMaxReadAhead(kMaxReadAhead); + } + mReadHeaderDoneFlag = false; + if (! mInFlightOpPtr) { + inBuffer.Consume(mContentLength); + mContentLength = 0; + mProperties.clear(); + // Don't rely on compiler to properly handle tail recursion, + // use for loop instead. 
+ continue; + } + assert(&mCurOpIt->second == mInFlightOpPtr); + assert(mInFlightOpPtr->mOpPtr); + KfsOp& theOp = *mInFlightOpPtr->mOpPtr; + IOBuffer* const theBufPtr = mInFlightOpPtr->mBufferPtr; + mInFlightOpPtr = 0; + theOp.ParseResponseHeader(mProperties); + mProperties.clear(); + if (mContentLength > 0) { + if (theBufPtr) { + theBufPtr->RemoveSpaceAvailable(); + QCVERIFY(mContentLength == + theBufPtr->MoveSpace(&inBuffer, mContentLength) + ); + } else { + IOBuffer theBuf; + QCVERIFY(mContentLength == + theBuf.MoveSpace(&inBuffer, mContentLength) + ); + } + mContentLength = 0; + } + mRetryCount = 0; + HandleOp(mCurOpIt); + } + } + bool ReadHeader( + IOBuffer& inBuffer) + { + const int theIdx = inBuffer.IndexOf(0, "\r\n\r\n"); + if (theIdx < 0) { + if (inBuffer.BytesConsumable() > MAX_RPC_HEADER_LEN) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "error: " << mServerLocation.ToString() << + ": exceeded max. response header size: " << + MAX_RPC_HEADER_LEN << "; got " << + inBuffer.BytesConsumable() << " resetting connection" << + KFS_LOG_EOM; + Reset(); + EnsureConnected(); + } + return false; + } + const int theHdrLen = theIdx + 4; + const char theSeparator = ':'; + const bool theMultiLineFlag = false; + mProperties.clear(); + mProperties.loadProperties( + mIstream.Set(inBuffer, theHdrLen), theSeparator, theMultiLineFlag); + mIstream.Reset(); + inBuffer.Consume(theHdrLen); + mReadHeaderDoneFlag = true; + mContentLength = mProperties.getValue("Content-length", 0); + const kfsSeq_t theOpSeq = mProperties.getValue("Cseq", kfsSeq_t(-1)); + if (mContentLength > mMaxContentLength) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "error: " << mServerLocation.ToString() << + ": exceeded max. 
response content length: " << mContentLength << + " > " << mMaxContentLength << + " seq: " << theOpSeq << + ", resetting connection" << + KFS_LOG_EOM; + Reset(); + EnsureConnected(); + return false; + } + mCurOpIt = mPendingOpQueue.find(theOpSeq); + mInFlightOpPtr = + mCurOpIt != mPendingOpQueue.end() ? &mCurOpIt->second : 0; + if (! mInFlightOpPtr) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "no operation found with seq: " << theOpSeq << + ", discarding response " << + " content length: " << mContentLength << + KFS_LOG_EOM; + mContentLength -= inBuffer.Consume(mContentLength); + return true; + } + if (mOutstandingOpPtr && mOutstandingOpPtr != mInFlightOpPtr) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "error: " << mServerLocation.ToString() << + " seq: " << theOpSeq << + " op:" + " expected: " << static_cast(mOutstandingOpPtr) << + " actual: " << static_cast(mInFlightOpPtr) << + " seq:" << + " expected: " << mOutstandingOpPtr->mOpPtr->seq << + " actual: " << mInFlightOpPtr->mOpPtr->seq << + ", resetting connection" << + KFS_LOG_EOM; + Reset(); + EnsureConnected(); + return false; + } + if (mContentLength <= 0) { + return true; + } + if (mInFlightOpPtr->mBufferPtr) { + if (inBuffer.UseSpaceAvailable( + mInFlightOpPtr->mBufferPtr, mContentLength) <= 0) { + // Move the payload, if any, to the beginning of the new buffer. + inBuffer.MakeBuffersFull(); + } + return true; + } + KfsOp& theOp = *mInFlightOpPtr->mOpPtr; + theOp.EnsureCapacity(size_t(mContentLength)); + IOBuffer theBuf; + theBuf.Append(IOBufferData( + IOBufferData::IOBufferBlockPtr( + theOp.contentBuf, + DoNotDeallocate() + ), + mContentLength, + 0, + 0 + )); + inBuffer.UseSpaceAvailable(&theBuf, mContentLength); + return true; + } + void CancelInFlightOp() + { + if (! mInFlightOpPtr) { + return; + } + if (mContentLength > 0 && mConnPtr && mInFlightOpPtr->mBufferPtr) { + // Detach shared buffers, if any. 
+ IOBuffer& theBuf = mConnPtr->GetInBuffer(); + mContentLength -= theBuf.BytesConsumable(); + theBuf.Clear(); + } + mInFlightOpPtr = 0; + } + void EnsureConnected( + string* inErrMsgPtr = 0, + const KfsOp* inLastOpPtr = 0) + { + if (mSleepingFlag || IsConnected()) { + return; + } + mDataReceivedFlag = false; + mDataSentFlag = false; + mAllDataSentFlag = true; + mIdleTimeoutFlag = false; + ResetConnection(); + mStats.mConnectCount++; + const bool theNonBlockingFlag = true; + TcpSocket& theSocket = *(new TcpSocket()); + const int theErr = theSocket.Connect( + mServerLocation, theNonBlockingFlag); + if (theErr && theErr != -EINPROGRESS) { + if (inErrMsgPtr) { + *inErrMsgPtr = QCUtils::SysError(-theErr); + } + KFS_LOG_STREAM_ERROR << mLogPrefix << + "failed to connect to server " << mServerLocation.ToString() << + " : " << QCUtils::SysError(-theErr) << + KFS_LOG_EOM; + delete &theSocket; + mStats.mConnectFailureCount++; + RetryConnect(); + return; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "connecting to server: " << mServerLocation.ToString() << + KFS_LOG_EOM; + mConnPtr.reset(new NetConnection(&theSocket, this)); + mConnPtr->EnableReadIfOverloaded(); + mConnPtr->SetDoingNonblockingConnect(); + mConnPtr->SetMaxReadAhead(kMaxReadAhead); + mConnPtr->SetInactivityTimeout(mOpTimeoutSec); + // Add connection to the poll vector + mNetManager.AddConnection(mConnPtr); + RetryAll(inLastOpPtr); + } + void RetryAll( + const KfsOp* inLastOpPtr = 0) + { + if (mPendingOpQueue.empty()) { + return; + } + CancelInFlightOp(); + mNextSeqNum += 1000; // For debugging to see retries. + QueueStack::iterator const theIt = + mQueueStack.insert(mQueueStack.end(), OpQueue()); + OpQueue& theQueue = *theIt; + theQueue.swap(mPendingOpQueue); + for (OpQueue::iterator theIt = theQueue.begin(); + theIt != theQueue.end(); + ++theIt) { + OpQueueEntry& theEntry = theIt->second; + if (! 
theEntry.mOpPtr) { + continue; + } + if (theIt->second.mOwnerPtr) { + if (theEntry.mRetryCount > mMaxRetryCount) { + mStats.mOpsTimeoutCount++; + theEntry.mOpPtr->status = kErrorMaxRetryReached; + theEntry.Done(); + } else { + if (inLastOpPtr != theEntry.mOpPtr && + theEntry.mRetryCount > 0) { + mStats.mOpsRetriedCount++; + } + EnqueueSelf(theEntry.mOpPtr, theEntry.mOwnerPtr, + theEntry.mBufferPtr, theEntry.mRetryCount); + theEntry.Clear(); + } + } else { + mStats.mOpsCancelledCount++; + theEntry.Cancel(); + } + } + mQueueStack.erase(theIt); + } + void ResetConnection() + { + CancelInFlightOp(); + mOutstandingOpPtr = 0; + if (! mConnPtr) { + return; + } + mConnPtr->Close(); + mConnPtr->GetInBuffer().Clear(); + mConnPtr->SetOwningKfsCallbackObj(0); + mConnPtr.reset(); + mReadHeaderDoneFlag = false; + mContentLength = 0; + } + void HandleOp( + OpQueue::iterator inIt, + bool inCanceledFlag = false) + { + if (inCanceledFlag) { + mStats.mOpsCancelledCount++; + } + const bool theScheduleNextOpFlag = mOutstandingOpPtr == &inIt->second; + if (theScheduleNextOpFlag) { + mOutstandingOpPtr = 0; + } + if (&inIt->second == mInFlightOpPtr) { + CancelInFlightOp(); + } + OpQueueEntry theOpEntry = inIt->second; + mPendingOpQueue.erase(inIt); + const int thePrefRefCount = GetRefCount(); + theOpEntry.OpDone(inCanceledFlag); + if (! mOutstandingOpPtr && + theScheduleNextOpFlag && thePrefRefCount <= GetRefCount() && + ! 
mPendingOpQueue.empty() && IsConnected()) { + mOutstandingOpPtr = &(mPendingOpQueue.begin()->second); + const bool kResetTimerFlag = true; + Request(*mOutstandingOpPtr, kResetTimerFlag, + mOutstandingOpPtr->mRetryCount); + } + } + void Cancel( + OpQueue::iterator inIt) + { HandleOp(inIt, true); } + virtual void Timeout() + { + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + if (mPendingOpQueue.empty()) { + return; + } + EnsureConnected(); + } + void RetryConnect( + OpQueueEntry* inOutstandingOpPtr = 0) + { + if (mSleepingFlag) { + return; + } + CancelInFlightOp(); + if (mRetryCount < mMaxRetryCount && (! mRetryConnectOnlyFlag || + (! mDataSentFlag && ! mDataReceivedFlag))) { + mRetryCount++; + if (mTimeSecBetweenRetries > 0) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "retry attempt " << mRetryCount << + " of " << mMaxRetryCount << + ", will retry " << mPendingOpQueue.size() << + " pending operation(s) in " << mTimeSecBetweenRetries << + " seconds" << + KFS_LOG_EOM; + mStats.mSleepTimeSec += mTimeSecBetweenRetries; + SetTimeoutInterval(mTimeSecBetweenRetries * 1000, true); + mSleepingFlag = true; + mNetManager.RegisterTimeoutHandler(this); + } else { + Timeout(); + } + } else if (inOutstandingOpPtr && ! mFailAllOpsOnOpTimeoutFlag && + ! mPendingOpQueue.empty() && + &(mPendingOpQueue.begin()->second) == inOutstandingOpPtr) { + HandleSingleOpTimeout(mPendingOpQueue.begin()); + } else { + QueueStack::iterator const theIt = + mQueueStack.insert(mQueueStack.end(), OpQueue()); + OpQueue& theQueue = *theIt; + theQueue.swap(mPendingOpQueue); + for (OpQueue::iterator theIt = theQueue.begin(); + theIt != theQueue.end(); + ++theIt) { + if (! 
theIt->second.mOpPtr) { + continue; + } + theIt->second.mOpPtr->status = kErrorMaxRetryReached; + theIt->second.Done(); + } + mQueueStack.erase(theIt); + } + } + void HandleSingleOpTimeout( + OpQueue::iterator inIt) + { + OpQueueEntry& theEntry = inIt->second; + if (theEntry.mRetryCount < mMaxRetryCount) { + theEntry.mRetryCount++; + } else { + theEntry.mOpPtr->status = kErrorMaxRetryReached; + const int thePrefRefCount = GetRefCount(); + HandleOp(inIt); + if (thePrefRefCount > GetRefCount()) { + return; + } + } + if (! mPendingOpQueue.empty()) { + EnsureConnected(); + } + } + void OpsTimeout() + { + if (mOpTimeoutSec <= 0 || ! IsConnected()) { + return; + } + // Timeout ops waiting for response. + // The ops in the queue are ordered by op seq. number. + // The code ensures (initial seq. number in ctor) that seq. numbers + // never wrap around, and are monotonically increase, so that the last + // (re)queued operation seq. number is always the largest. + // First move all timed out ops into the temporary queue, then process + // the temporary queue. This is less error prone than dealing with + // completion changing mPendingOpQueue while iterating. 
+ QueueStack::iterator theStIt = mQueueStack.end(); + const time_t theNow = Now(); + time_t theExpireTime = theNow - mOpTimeoutSec; + for (OpQueue::iterator theIt = mPendingOpQueue.begin(); + theIt != mPendingOpQueue.end() && + theIt->second.mTime < theExpireTime; ) { + OpQueueEntry& theEntry = theIt->second; + assert(theEntry.mOpPtr); + if (&theEntry == mInFlightOpPtr) { + CancelInFlightOp(); + } + if (mResetConnectionOnOpTimeoutFlag && + theStIt == mQueueStack.end()) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "op timed out: seq: " << theEntry.mOpPtr->seq << + " " << theEntry.mOpPtr->Show() << + " retry count: " << theEntry.mRetryCount << + " wait time: " << (theNow - theEntry.mTime) << + " pending ops: " << mPendingOpQueue.size() << + " resetting connecton" << + KFS_LOG_EOM; + // Increment the retry count only for the op that timed out. + // This mode assumes all other ops were blocked by the first + // one. + Reset(); + if (mFailAllOpsOnOpTimeoutFlag) { + // Restart from the first op, and go until the end. + theIt = mPendingOpQueue.begin(); + theExpireTime = + max(theNow, mPendingOpQueue.rbegin()->second.mTime) + 1; + } else { + HandleSingleOpTimeout(theIt); + return; + } + } + if (theStIt == mQueueStack.end()) { + theStIt = mQueueStack.insert(mQueueStack.end(), OpQueue()); + } + theStIt->insert(*theIt); + mPendingOpQueue.erase(theIt++); + if (&theEntry == mOutstandingOpPtr) { + break; + } + } + if (theStIt == mQueueStack.end()) { + return; + } + for (OpQueue::iterator theIt = theStIt->begin(); + theIt != theStIt->end(); + ++theIt) { + OpQueueEntry& theEntry = theIt->second; + if (! 
theEntry.mOpPtr) { + continue; + } + KFS_LOG_STREAM_INFO << mLogPrefix << + "op timed out: seq: " << theEntry.mOpPtr->seq << + " " << theEntry.mOpPtr->Show() << + " retry count: " << theEntry.mRetryCount << + " max: " << mMaxRetryCount << + " wait time: " << (theNow - theEntry.mTime) << + KFS_LOG_EOM; + mStats.mOpsTimeoutCount++; + if (theEntry.mRetryCount >= mMaxRetryCount) { + theEntry.mOpPtr->status = kErrorMaxRetryReached; + theEntry.Done(); + } else { + mStats.mOpsRetriedCount++; + const OpQueueEntry theTmp = theEntry; + theEntry.Clear(); + EnqueueSelf(theTmp.mOpPtr, theTmp.mOwnerPtr, + theTmp.mBufferPtr, theTmp.mRetryCount + 1); + } + } + mQueueStack.erase(theStIt); + if (! mPendingOpQueue.empty()) { + EnsureConnected(); + } + } + friend class StImplRef; +private: + Impl( + const Impl& inClient); + Impl& operator=( + const Impl& inClient); +}; + +KfsNetClient::KfsNetClient( + NetManager& inNetManager, + string inHost /* = string() */, + int inPort /* = 0 */, + int inMaxRetryCount /* = 0 */, + int inTimeSecBetweenRetries /* = 10 */, + int inOpTimeoutSec /* = 5 * 60 */, + int inIdleTimeoutSec /* = 30 * 60 */, + int64_t inInitialSeqNum /* = 1 */, + const char* inLogPrefixPtr /* = 0 */, + bool inResetConnectionOnOpTimeoutFlag /* = true */, + int inMaxContentLength /* = MAX_RPC_HEADER_LEN */, + bool inFailAllOpsOnOpTimeoutFlag /* = false */, + bool inMaxOneOutstandingOpFlag /* = false */) + : mImpl(*new Impl( + inHost, + inPort, + inMaxRetryCount, + inTimeSecBetweenRetries, + inOpTimeoutSec, + inIdleTimeoutSec, + kfsSeq_t(inInitialSeqNum), + inLogPrefixPtr, + inNetManager, + inResetConnectionOnOpTimeoutFlag, + inMaxContentLength, + inFailAllOpsOnOpTimeoutFlag, + inMaxOneOutstandingOpFlag + )) +{ + mImpl.Ref(); +} + + /* virtual */ +KfsNetClient::~KfsNetClient() +{ + mImpl.Stop(); + mImpl.UnRef(); +} + + bool +KfsNetClient::IsConnected() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsConnected(); +} + + bool +KfsNetClient::Start( + string inServerName, + 
int inServerPort, + string* inErrMsgPtr, + bool inRetryPendingOpsFlag, + int inMaxRetryCount, + int inTimeSecBetweenRetries, + bool inRetryConnectOnlyFlag) +{ + return mImpl.Start( + inServerName, + inServerPort, + inErrMsgPtr, + inRetryPendingOpsFlag, + inMaxRetryCount, + inTimeSecBetweenRetries, + inRetryConnectOnlyFlag + ); +} + + bool +KfsNetClient::SetServer( + const ServerLocation& inLocation, + bool inCancelPendingOpsFlag /* = true */) +{ + Impl::StRef theRef(mImpl); + return mImpl.SetServer(inLocation, inCancelPendingOpsFlag); +} + + void +KfsNetClient::Stop() +{ + Impl::StRef theRef(mImpl); + mImpl.Stop(); +} + + int +KfsNetClient::GetMaxRetryCount() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetMaxRetryCount(); +} + + void +KfsNetClient::SetMaxRetryCount( + int inMaxRetryCount) +{ + Impl::StRef theRef(mImpl); + mImpl.SetMaxRetryCount(inMaxRetryCount); +} + + int +KfsNetClient::GetOpTimeoutSec() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetOpTimeoutSec(); +} + + void +KfsNetClient::SetOpTimeoutSec( + int inTimeout) +{ + Impl::StRef theRef(mImpl); + mImpl.SetOpTimeoutSec(inTimeout); +} + + int +KfsNetClient::GetIdleTimeoutSec() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetIdleTimeoutSec(); +} + + void +KfsNetClient::SetIdleTimeoutSec( + int inTimeout) +{ + Impl::StRef theRef(mImpl); + mImpl.SetIdleTimeoutSec(inTimeout); +} + + int +KfsNetClient::GetTimeSecBetweenRetries() +{ + return mImpl.GetTimeSecBetweenRetries(); +} + + void +KfsNetClient::SetTimeSecBetweenRetries( + int inTimeSec) +{ + Impl::StRef theRef(mImpl); + mImpl.SetTimeSecBetweenRetries(inTimeSec); +} + + bool +KfsNetClient::IsAllDataSent() const +{ + return mImpl.IsAllDataSent(); +} + + bool +KfsNetClient::IsDataReceived() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsDataReceived(); +} + + bool +KfsNetClient::IsDataSent() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsDataSent(); +} + + bool +KfsNetClient::IsRetryConnectOnly() const +{ + 
Impl::StRef theRef(mImpl); + return mImpl.IsRetryConnectOnly(); +} + + bool +KfsNetClient::WasDisconnected() const +{ + Impl::StRef theRef(mImpl); + return mImpl.WasDisconnected(); +} + + void +KfsNetClient::SetRetryConnectOnly( + bool inFlag) +{ + Impl::StRef theRef(mImpl); + mImpl.SetRetryConnectOnly(inFlag); +} + + void +KfsNetClient::SetOpTimeout( + int inOpTimeoutSec) +{ + Impl::StRef theRef(mImpl); + mImpl.SetOpTimeout(inOpTimeoutSec); +} + + void +KfsNetClient::GetStats( + Stats& outStats) const +{ + Impl::StRef theRef(mImpl); + mImpl.GetStats(outStats); +} + + bool +KfsNetClient::Enqueue( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr, + IOBuffer* inBufferPtr /* = 0 */) +{ + Impl::StRef theRef(mImpl); + return mImpl.Enqueue(inOpPtr, inOwnerPtr, inBufferPtr); +} + + bool +KfsNetClient::Cancel( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr) +{ + Impl::StRef theRef(mImpl); + return mImpl.Cancel(inOpPtr, inOwnerPtr); +} + + bool +KfsNetClient::Cancel() +{ + Impl::StRef theRef(mImpl); + return mImpl.Cancel(); +} + + string +KfsNetClient::GetServerLocation() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetServerLocation(); +} + + void +KfsNetClient::SetEventObserver( + KfsNetClient::EventObserver* inEventObserverPtr) +{ + Impl::StRef theRef(mImpl); + return mImpl.SetEventObserver(inEventObserverPtr); +} + + NetManager& +KfsNetClient::GetNetManager() +{ + Impl::StRef theRef(mImpl); + return mImpl.GetNetManager(); +} + + const NetManager& +KfsNetClient::GetNetManager() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetNetManager(); +} + + void +KfsNetClient::SetMaxContentLength( + int inMax) +{ + Impl::StRef theRef(mImpl); + return mImpl.SetMaxContentLength(inMax); +} + + void +KfsNetClient::ClearMaxOneOutstandingOpFlag() +{ + Impl::StRef theRef(mImpl); + return mImpl.ClearMaxOneOutstandingOpFlag(); +} + + void +KfsNetClient::SetFailAllOpsOnOpTimeoutFlag( + bool inFlag) +{ + Impl::StRef theRef(mImpl); + return mImpl.SetFailAllOpsOnOpTimeoutFlag(inFlag); +} + +}} 
/* namespace client KFS */ diff --git a/src/cc/libclient/KfsNetClient.h b/src/cc/libclient/KfsNetClient.h new file mode 100644 index 000000000..b43d87466 --- /dev/null +++ b/src/cc/libclient/KfsNetClient.h @@ -0,0 +1,234 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/05/20 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef KFS_NET_CLIENT_H +#define KFS_NET_CLIENT_H + +#include "common/kfstypes.h" + +#include <string> +#include <ostream> +#include <cerrno> + +namespace KFS +{ + +class IOBuffer; +class NetManager; +struct ServerLocation; + +namespace client +{ +struct KfsOp; + +using std::ostream; +using std::string; + +// Generic KFS request / response protocol state machine. 
+class KfsNetClient +{ +private: + class Impl; +public: + class OpOwner + { + // protected: + public: + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) = 0; + virtual ~OpOwner() {} + friend class Impl; + }; + struct Stats + { + typedef int64_t Counter; + Stats() + : mConnectCount(0), + mConnectFailureCount(0), + mNetErrorCount(0), + mConnectionIdleTimeoutCount(0), + mResponseTimeoutCount(0), + mOpsQueuedCount(0), + mOpsTimeoutCount(0), + mOpsRetriedCount(0), + mOpsCancelledCount(0), + mSleepTimeSec(0) + {} + void Clear() + { *this = Stats(); } + Stats& Add( + const Stats& inStats) + { + mConnectCount += inStats.mConnectCount; + mConnectFailureCount += inStats.mConnectFailureCount; + mNetErrorCount += inStats.mNetErrorCount; + mConnectionIdleTimeoutCount += inStats.mConnectionIdleTimeoutCount; + mResponseTimeoutCount += inStats.mResponseTimeoutCount; + mOpsQueuedCount += inStats.mOpsQueuedCount; + mOpsTimeoutCount += inStats.mOpsTimeoutCount; + mOpsRetriedCount += inStats.mOpsRetriedCount; + mOpsCancelledCount += inStats.mOpsCancelledCount; + mSleepTimeSec += inStats.mSleepTimeSec; + return *this; + } + ostream& Display( + ostream& inStream, + const char* inSeparatorPtr = 0, + const char* inDelimiterPtr = 0) const + { + const char* const theSeparatorPtr = + inSeparatorPtr ? inSeparatorPtr : " "; + const char* const theDelimiterPtr = + inDelimiterPtr ? 
inDelimiterPtr : ": "; + inStream << + "Connect" << theDelimiterPtr << + mConnectCount << theSeparatorPtr << + "ConnectFailure" << theDelimiterPtr << + mConnectFailureCount << theSeparatorPtr << + "NetError" << theDelimiterPtr << + mNetErrorCount << theSeparatorPtr << + "ConnectionIdleTimeout" << theDelimiterPtr << + mConnectionIdleTimeoutCount << theSeparatorPtr << + "ResponseTimeout" << theDelimiterPtr << + mResponseTimeoutCount << theSeparatorPtr << + "OpsQueued" << theDelimiterPtr << + mOpsQueuedCount << theSeparatorPtr << + "OpsTimeout" << theDelimiterPtr << + mOpsTimeoutCount << theSeparatorPtr << + "OpsRetried" << theDelimiterPtr << + mOpsRetriedCount << theSeparatorPtr << + "OpsCancelled" << theDelimiterPtr << + mOpsCancelledCount << theSeparatorPtr << + "SleepTimeSec" << theDelimiterPtr << + mSleepTimeSec + ; + return inStream; + } + Counter mConnectCount; + Counter mConnectFailureCount; + Counter mNetErrorCount; + Counter mConnectionIdleTimeoutCount; + Counter mResponseTimeoutCount; + Counter mOpsQueuedCount; + Counter mOpsTimeoutCount; + Counter mOpsRetriedCount; + Counter mOpsCancelledCount; + Counter mSleepTimeSec; + }; + enum { kErrorMaxRetryReached = -(10000 + ETIMEDOUT) }; + class EventObserver + { + public: + virtual bool Event( + int& ioCode, + void*& ioDataPtr) = 0; + protected: + EventObserver() {} + virtual ~EventObserver() {} + }; + + KfsNetClient( + NetManager& inNetManager, + string inHost = string(), + int inPort = 0, + int inMaxRetryCount = 0, + int inTimeSecBetweenRetries = 10, + int inOpTimeoutSec = 5 * 60, + int inIdleTimeoutSec = 30 * 60, + int64_t inInitialSeqNum = 1, + const char* inLogPrefixPtr = 0, + bool inResetConnectionOnOpTimeoutFlag = true, + int inMaxContentLength = MAX_RPC_HEADER_LEN, + bool inFailAllOpsOnOpTimeoutFlag = false, + bool inMaxOneOutstandingOpFlag = false); + virtual ~KfsNetClient(); + bool IsConnected() const; + bool Start( + string inServerName, + int inServerPort, + string* inErrMsgPtr, + bool 
inRetryPendingOpsFlag, + int inMaxRetryCount, + int inTimeSecBetweenRetries, + bool inRetryConnectOnlyFlag); + bool SetServer( + const ServerLocation& inLocation, + bool inCancelPendingOpsFlag = true); + void Stop(); + int GetMaxRetryCount() const; + void SetMaxRetryCount( + int inMaxRetryCount); + int GetOpTimeoutSec() const; + void SetOpTimeoutSec( + int inTimeout); + int GetIdleTimeoutSec() const; + void SetIdleTimeoutSec( + int inTimeout); + int GetTimeSecBetweenRetries(); + void SetTimeSecBetweenRetries( + int inTimeSec); + bool IsAllDataSent() const; + bool IsDataReceived() const; + bool IsDataSent() const; + bool IsRetryConnectOnly() const; + bool WasDisconnected() const; + void SetRetryConnectOnly( + bool inFlag); + void SetOpTimeout( + int inOpTimeoutSec); + void GetStats( + Stats& outStats) const; + bool Enqueue( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr, + IOBuffer* inBufferPtr = 0); + bool Cancel( + KfsOp* inOpPtr, + OpOwner* inOwnerPtr); + bool Cancel(); + std::string GetServerLocation() const; + NetManager& GetNetManager(); + const NetManager& GetNetManager() const; + void SetEventObserver( + EventObserver* inEventObserverPtr); // Debug hook + void SetMaxContentLength( + int inMax); + void ClearMaxOneOutstandingOpFlag(); + void SetFailAllOpsOnOpTimeoutFlag( + bool inFlag); +private: + Impl& mImpl; +private: + KfsNetClient( + const KfsNetClient& inClient); + KfsNetClient& operator=( + const KfsNetClient& inClient); +}; + +}} + +#endif /* KFS_NET_CLIENT_H */ diff --git a/src/cc/libclient/KfsOps.cc b/src/cc/libclient/KfsOps.cc new file mode 100644 index 000000000..3c1c8cb6e --- /dev/null +++ b/src/cc/libclient/KfsOps.cc @@ -0,0 +1,1050 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/05/24 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). 
+// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Client side RPCs implementation. +// +//---------------------------------------------------------------------------- + +#include "KfsOps.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "kfsio/checksum.h" +#include "common/RequestParser.h" +#include "utils.h" + +namespace KFS +{ +namespace client +{ +using std::istringstream; +using std::ostream; +using std::string; +using std::min; +using std::max; + +static const char* InitHostName() +{ + const int maxLen = 1024; + static char sHostName[maxLen + 1]; + sHostName[gethostname(sHostName, maxLen) ? 
0 : min(64, maxLen)] = 0; + return sHostName; +} +static const char* const sHostName(InitHostName()); + +string KfsOp::sExtraHeaders; + +class KfsOp::ReqHeaders +{ +public: + ReqHeaders(const KfsOp& o) + : op(o) + {} + ostream& Insert(ostream& os) const + { + return (os << + "Cseq: " << op.seq << "\r\n" + "Version: " "KFS/1.0" "\r\n" + "Client-Protocol-Version: " << KFS_CLIENT_PROTO_VERS << "\r\n" + << KfsOp::sExtraHeaders + ); + } +private: + const KfsOp& op; +}; + +inline ostream& operator<<(ostream& os, const KfsOp::ReqHeaders& hdrs) { + return hdrs.Insert(os); +} + +inline ostream& PutPermissions(ostream& os, const Permissions& permissions) +{ + if (permissions.user != kKfsUserNone) { + os << "Owner: " << permissions.user << "\r\n"; + } + if (permissions.group != kKfsGroupNone) { + os << "Group: " << permissions.group << "\r\n"; + } + if (permissions.mode != kKfsModeUndef) { + os << "Mode: " << permissions.mode << "\r\n"; + } + return os; +} + +/// +/// All Request() methods build a request RPC based on the KFS +/// protocol and output the request into a ostream. +/// @param[out] os which contains the request RPC. +/// +void +CreateOp::Request(ostream &os) +{ + os << + "CREATE \r\n" << ReqHeaders(*this) << + "Parent File-handle: " << parentFid << "\r\n" + "Filename: " << filename << "\r\n" + "Num-replicas: " << numReplicas << "\r\n" + "Exclusive: " << (exclusive ? 
1 : 0) << "\r\n" + ; + if (striperType != KFS_STRIPED_FILE_TYPE_NONE && + striperType != KFS_STRIPED_FILE_TYPE_UNKNOWN) { + os << + "Striper-type: " << striperType << "\r\n" + "Num-stripes: " << numStripes << "\r\n" + "Num-recovery-stripes: " << numRecoveryStripes << "\r\n" + "Stripe-size: " << stripeSize << "\r\n" + ; + } + PutPermissions(os, permissions); + if (reqId >= 0) { + os << "ReqId: " << reqId << "\r\n"; + } + os << "\r\n"; +} + +void +MkdirOp::Request(ostream &os) +{ + os << + "MKDIR \r\n" << ReqHeaders(*this) << + "Parent File-handle: " << parentFid << "\r\n" + "Directory: " << dirname << "\r\n" + ; + PutPermissions(os, permissions); + if (reqId >= 0) { + os << "ReqId: " << reqId << "\r\n"; + } + os << "\r\n"; +} + +void +RmdirOp::Request(ostream &os) +{ + os << + "RMDIR \r\n" << ReqHeaders(*this) << + "Parent File-handle: " << parentFid << "\r\n" + "Pathname: " << pathname << "\r\n" + "Directory: " << dirname << "\r\n" + "\r\n"; +} + +void +RenameOp::Request(ostream &os) +{ + os << + "RENAME \r\n" << ReqHeaders(*this) << + "Parent File-handle: " << parentFid << "\r\n" + "Old-name: " << oldname << "\r\n" + "New-path: " << newpath << "\r\n" + "Old-path: " << oldpath << "\r\n" + "Overwrite: " << (overwrite ? 1 : 0) << "\r\n" + "\r\n"; +} + +void +ReaddirOp::Request(ostream &os) +{ + os << + "READDIR \r\n" << ReqHeaders(*this) << + "Directory File-handle: " << fid << "\r\n" + "Max-entries: " << numEntries << "\r\n" + ; + if (! 
fnameStart.empty()) { + os << "Fname-start: " << fnameStart << "\r\n"; + } + os << "\r\n"; +} + +void +SetMtimeOp::Request(ostream &os) +{ + os << + "SET_MTIME\r\n" << ReqHeaders(*this) << + "Pathname: " << pathname << "\r\n" + "Mtime-sec: " << mtime.tv_sec << "\r\n" + "Mtime-usec: " << mtime.tv_usec << "\r\n" + "\r\n"; +} + +void +DumpChunkServerMapOp::Request(ostream &os) +{ + os << + "DUMP_CHUNKTOSERVERMAP\r\n" << ReqHeaders(*this) << + "\r\n"; +} + +void +DumpChunkMapOp::Request(ostream &os) +{ + os << + "DUMP_CHUNKMAP\r\n" << ReqHeaders(*this) << + "\r\n"; +} + +void +UpServersOp::Request(ostream &os) +{ + os << + "UPSERVERS\r\n" << ReqHeaders(*this) << + "\r\n"; +} + +void +ReaddirPlusOp::Request(ostream &os) +{ + os << + "READDIRPLUS \r\n" << ReqHeaders(*this) << + "Directory File-handle: " << fid << "\r\n" + "GetLastChunkInfoOnlyIfSizeUnknown: " << + (getLastChunkInfoOnlyIfSizeUnknown ? 1 : 0) << "\r\n" + "Max-entries: " << numEntries << "\r\n" + ; + if (! fnameStart.empty()) { + os << "Fname-start: " << fnameStart << "\r\n"; + } + os << "\r\n"; +} + +void +RemoveOp::Request(ostream &os) +{ + os << + "REMOVE \r\n" << ReqHeaders(*this) << + "Pathname: " << pathname << "\r\n" + "Parent File-handle: " << parentFid << "\r\n" + "Filename: " << filename << "\r\n" + "\r\n"; +} + +void +LookupOp::Request(ostream &os) +{ + os << + "LOOKUP\r\n" << ReqHeaders(*this) << + "Parent File-handle: " << parentFid << "\r\n" + "Filename: " << filename << "\r\n" + "\r\n"; +} + +void +LookupPathOp::Request(ostream &os) +{ + os << + "LOOKUP_PATH\r\n" << ReqHeaders(*this) << + "Root File-handle: " << rootFid << "\r\n" + "Pathname: " << filename << "\r\n" + "\r\n"; +} + +void +GetAllocOp::Request(ostream &os) +{ + assert(fileOffset >= 0); + + os << + "GETALLOC\r\n" << ReqHeaders(*this) << + "Pathname: " << filename << "\r\n" + "File-handle: " << fid << "\r\n" + "Chunk-offset: " << fileOffset << "\r\n" + "\r\n"; +} + +void +GetLayoutOp::Request(ostream &os) +{ + os << + 
"GETLAYOUT\r\n" << ReqHeaders(*this) << + "File-handle: " << fid << "\r\n" + ; + if (startOffset > 0) { + os << "Start-offset: " << startOffset << "\r\n"; + } + if (omitLocationsFlag) { + os << "Omit-locations: 1\r\n"; + } + if (lastChunkOnlyFlag) { + os << "Last-chunk-only: 1\r\n"; + } + if (maxChunks > 0) { + os << "Max-chunks : " << maxChunks << "\r\n"; + } + os << "\r\n"; +} + +void +CoalesceBlocksOp::Request(ostream &os) +{ + os << + "COALESCE_BLOCKS\r\n" << ReqHeaders(*this) << + "Src-path: " << srcPath << "\r\n" + "Dest-path: " << dstPath << "\r\n" + "\r\n"; +} + +void +GetChunkMetadataOp::Request(ostream &os) +{ + os << + "GET_CHUNK_METADATA\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Read-verify: " << (readVerifyFlag ? 1 : 0) << "\r\n" + "\r\n"; +} + +void +AllocateOp::Request(ostream &os) +{ + os << + "ALLOCATE\r\n" << ReqHeaders(*this) << + "Client-host: " << sHostName << "\r\n" + "Pathname: " << pathname << "\r\n" + "File-handle: " << fid << "\r\n" + "Chunk-offset: " << fileOffset << "\r\n" + ; + if (invalidateAllFlag) { + os << "Invalidate-all: 1\r\n"; + } + if (append) { + os << + "Chunk-append: 1\r\n" + "Space-reserve: " << spaceReservationSize << "\r\n" + "Max-appenders: " << maxAppendersPerChunk << "\r\n" + ; + } + os << "\r\n"; +} + +void +TruncateOp::Request(ostream &os) +{ + os << + "TRUNCATE\r\n" << ReqHeaders(*this) << + "Pathname: " << pathname << "\r\n" + "File-handle: " << fid << "\r\n" + "Offset: " << fileOffset << "\r\n" + ; + if (pruneBlksFromHead) { + os << "Prune-from-head: 1\r\n"; + } + os << "\r\n"; +} + +void +OpenOp::Request(ostream &os) +{ + os << + "OPEN\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Intent: " << (openFlags == O_RDONLY ? "READ" : "WRITE") << "\r\n" + "\r\n"; +} + +void +CloseOp::Request(ostream &os) +{ + os << + "CLOSE\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + ; + if (! 
writeInfo.empty()) { + os << + "Has-write-id: 1\r\n" + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::const_iterator i = writeInfo.begin(); + i < writeInfo.end(); ++i) { + os << " " << i->serverLoc.ToString() << " " << i->writeId; + } + os << "\r\n"; + } else if (chunkServerLoc.size() > 1) { + os << + "Num-servers: " << chunkServerLoc.size() << "\r\n" + "Servers:" + ; + for (vector::const_iterator i = chunkServerLoc.begin(); + i != chunkServerLoc.end(); ++i) { + os << " " << i->ToString(); + } + os << "\r\n"; + } + os << "\r\n"; +} + +void +ReadOp::Request(ostream &os) +{ + os << + "READ\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "\r\n"; +} + +void +WriteIdAllocOp::Request(ostream &os) +{ + os << + "WRITE_ID_ALLOC\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "For-record-append: " << (isForRecordAppend ? 
1 : 0) << "\r\n" + "Num-servers: " << chunkServerLoc.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < chunkServerLoc.size(); ++i) { + os << chunkServerLoc[i].ToString() << ' '; + } + os << "\r\n\r\n"; +} + +void +ChunkSpaceReserveOp::Request(ostream &os) +{ + os << + "CHUNK_SPACE_RESERVE\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < writeInfo.size(); ++i) { + os << writeInfo[i].serverLoc.ToString() << + ' ' << writeInfo[i].writeId << ' '; + } + os << "\r\n\r\n"; +} + +void +ChunkSpaceReleaseOp::Request(ostream &os) +{ + os << + "CHUNK_SPACE_RELEASE\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < writeInfo.size(); ++i) { + os << writeInfo[i].serverLoc.ToString() << + ' ' << writeInfo[i].writeId << ' '; + } + os << "\r\n\r\n"; +} + +void +WritePrepareOp::Request(ostream &os) +{ + // one checksum over the whole data plus one checksum per 64K block + os << + "WRITE_PREPARE\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "Checksum: " << checksum << "\r\n" + "Checksum-entries: " << checksums.size() << "\r\n" + ; + if (checksums.size() > 0) { + os << "Checksums: "; + for (uint32_t i = 0; i < checksums.size(); i++) { + os << checksums[i] << ' '; + } + os << "\r\n"; + } + if (replyRequestedFlag) { + os << "Reply: 1\r\n"; + } + os << + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < writeInfo.size(); ++i) { + os << writeInfo[i].serverLoc.ToString() << + ' ' << 
writeInfo[i].writeId << ' '; + } + os << "\r\n\r\n"; +} + +void +WriteSyncOp::Request(ostream &os) +{ + os << + "WRITE_SYNC\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Offset: " << offset << "\r\n" + "Num-bytes: " << numBytes << "\r\n" + "Checksum-entries: " << checksums.size() << "\r\n" + ; + if (checksums.size() > 0) { + os << "Checksums: "; + for (uint32_t i = 0; i < checksums.size(); i++) { + os << checksums[i] << ' '; + } + os << "\r\n"; + } + os << + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < writeInfo.size(); ++i) { + os << writeInfo[i].serverLoc.ToString() << + ' ' << writeInfo[i].writeId << ' '; + } + os << "\r\n\r\n"; +} + +void +SizeOp::Request(ostream &os) +{ + os << + "SIZE\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "\r\n"; +} + +void +LeaseAcquireOp::Request(ostream &os) +{ + os << "LEASE_ACQUIRE\r\n" << ReqHeaders(*this); + if (pathname && pathname[0]) { + os << "Pathname: " << pathname << "\r\n"; + } + if (chunkId >= 0) { + os << "Chunk-handle: " << chunkId << "\r\n"; + } + if (flushFlag) { + os << "Flush-write-lease: 1\r\n"; + } + if (leaseTimeout >= 0) { + os << "Lease-timeout: " << leaseTimeout << "\r\n"; + } + if (chunkIds && leaseIds && chunkIds[0] >= 0) { + os << "Chunk-ids:"; + for (int i = 0; i < kMaxChunkIds && chunkIds[i] >= 0; i++) { + os << " " << chunkIds[i]; + } + os << "\r\n"; + } + os << "\r\n"; +} + +void +LeaseRenewOp::Request(ostream &os) +{ + os << + "LEASE_RENEW\r\n" << ReqHeaders(*this) << + "Pathname: " << pathname << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "Lease-id: " << leaseId << "\r\n" + "Lease-type: " "READ_LEASE" "\r\n" + "\r\n"; +} + +void +LeaseRelinquishOp::Request(ostream &os) +{ + os << + "LEASE_RELINQUISH\r\n" << ReqHeaders(*this) << + "Chunk-handle:" << chunkId << "\r\n" + "Lease-id: " << leaseId << 
"\r\n" + "Lease-type: " "READ_LEASE" "\r\n" + "\r\n"; +} + +void +RecordAppendOp::Request(ostream &os) +{ + os << + "RECORD_APPEND\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Num-bytes: " << contentLength << "\r\n" + "Checksum: " << checksum << "\r\n" + "Offset: " << offset << "\r\n" + "File-offset: " "-1" "\r\n" + "Num-servers: " << writeInfo.size() << "\r\n" + "Servers:" + ; + for (vector::size_type i = 0; i < writeInfo.size(); ++i) { + os << writeInfo[i].serverLoc.ToString() << + ' ' << writeInfo[i].writeId << ' '; + } + os << "\r\n\r\n"; +} + +void +GetRecordAppendOpStatus::Request(ostream &os) +{ + os << + "GET_RECORD_APPEND_OP_STATUS\r\n" << ReqHeaders(*this) << + "Chunk-handle: " << chunkId << "\r\n" + "Write-id: " << writeId << "\r\n" + "\r\n"; +} + +void +ChangeFileReplicationOp::Request(ostream &os) +{ + os << + "CHANGE_FILE_REPLICATION\r\n" << ReqHeaders(*this) << + "File-handle: " << fid << "\r\n" + "Num-replicas: " << numReplicas << "\r\n" + "\r\n"; +} + +void +GetRecordAppendOpStatus::ParseResponseHeaderSelf(const Properties &prop) +{ + chunkVersion = prop.getValue("Chunk-version", (int64_t)-1); + opSeq = prop.getValue("Op-seq", (int64_t)-1); + opStatus = prop.getValue("Op-status", -1); + opOffset = prop.getValue("Op-offset", (int64_t)-1); + opLength = (size_t)prop.getValue("Op-length", (uint64_t)0); + widAppendCount = (size_t)prop.getValue("Wid-append-count", (uint64_t)0); + widBytesReserved = (size_t)prop.getValue("Wid-bytes-reserved", (uint64_t)0); + chunkBytesReserved = (size_t)prop.getValue("Chunk-bytes-reserved",(uint64_t)0); + remainingLeaseTime = prop.getValue("Remaining-lease-time", (int64_t)-1); + widWasReadOnlyFlag = prop.getValue("Wid-was-read-only", 0) != 0; + masterFlag = prop.getValue("Chunk-master", 0) != 0; + stableFlag = prop.getValue("Stable-flag", 0) != 0; + openForAppendFlag = prop.getValue("Open-for-append-flag", 0) != 0; + appenderState = 
prop.getValue("Appender-state", -1); + appenderStateStr = prop.getValue("Appender-state-string", ""); + masterCommitOffset = prop.getValue("Master-commit-offset", (int64_t)-1); + nextCommitOffset = prop.getValue("Next-commit-offset", (int64_t)-1); + widReadOnlyFlag = prop.getValue("Wid-read-only", 0) != 0; +} + +/// +/// Handlers to parse a response sent by the server. The model is +/// similar to what is done by ChunkServer/metaserver: put the +/// response into a properties object and then extract out the values. +/// +/// \brief Parse the response common to all RPC requests. +/// @param[in] resp: a string consisting of header/value pairs in +/// which header/value is separated by a ':' character. +/// @param[out] prop: a properties object that contains the result of +/// parsing. +/// +void +KfsOp::ParseResponseHeader(std::istream& is) +{ + const char separator = ':'; + Properties prop; + prop.loadProperties(is, separator, false); + ParseResponseHeader(prop); +} + +void +KfsOp::ParseResponseHeader(const Properties &prop) +{ + // kfsSeq_t resSeq = prop.getValue("Cseq", (kfsSeq_t) -1); + status = prop.getValue("Status", -1); + contentLength = prop.getValue("Content-length", 0); + statusMsg = prop.getValue("Status-message", string()); + ParseResponseHeaderSelf(prop); +} + +/// +/// Default parse response handler. +/// @param[in] buf: buffer containing the response +/// @param[in] len: str-len of the buffer. +void +KfsOp::ParseResponseHeaderSelf(const Properties &prop) +{ +} + +/* static */ void +KfsOp::AddDefaultRequestHeaders( + kfsUid_t euser /* = kKfsUserNone */, kfsGid_t egroup /* = kKfsGroupNone */) +{ + ostringstream os; + os << "UserId: "; + if (euser == kKfsUserNone) { + os << geteuid(); + } else { + os << euser; + } + os << "\r\n" + "GroupId: "; + if (egroup == kKfsGroupNone) { + os << getegid(); + } else { + os << egroup; + } + os << "\r\n"; + KfsOp::AddExtraRequestHeaders(os.str()); +} + +/// +/// Specific response parsing handlers. 
+/// +void +CreateOp::ParseResponseHeaderSelf(const Properties &prop) +{ + fileId = prop.getValue("File-handle", (kfsFileId_t) -1); + metaStriperType = prop.getValue("Striper-type", KFS_STRIPED_FILE_TYPE_NONE); + permissions.user = prop.getValue("User", permissions.user); + permissions.group = prop.getValue("Group", permissions.group); + permissions.mode = prop.getValue("Mode", permissions.mode); +} + +void +ReaddirOp::ParseResponseHeaderSelf(const Properties &prop) +{ + numEntries = prop.getValue("Num-Entries", 0); + hasMoreEntriesFlag = prop.getValue("Has-more-entries", 0) != 0; +} + +void +DumpChunkServerMapOp::ParseResponseHeaderSelf(const Properties &prop) +{ +} + +void +DumpChunkMapOp::ParseResponseHeaderSelf(const Properties &prop) +{ +} + +void +UpServersOp::ParseResponseHeaderSelf(const Properties &prop) +{ +} + +void +ReaddirPlusOp::ParseResponseHeaderSelf(const Properties &prop) +{ + numEntries = prop.getValue("Num-Entries", 0); + hasMoreEntriesFlag = prop.getValue("Has-more-entries", 0) != 0; +} + +void +MkdirOp::ParseResponseHeaderSelf(const Properties &prop) +{ + fileId = prop.getValue("File-handle", (kfsFileId_t) -1); + permissions.user = prop.getValue("User", permissions.user); + permissions.group = prop.getValue("Group", permissions.group); + permissions.mode = prop.getValue("Mode", permissions.mode); +} + +static void +ParseFileAttribute(const Properties &prop, FileAttr &fattr) +{ + const string estr; + + fattr.fileId = prop.getValue("File-handle", kfsFileId_t(-1)); + fattr.isDirectory = prop.getValue("Type", estr) == "dir"; + if (fattr.isDirectory) { + fattr.subCount1 = prop.getValue("File-count", int64_t(-1)); + fattr.subCount2 = prop.getValue("Dir-count", int64_t(-1)); + } else { + fattr.subCount1 = prop.getValue("Chunk-count", int64_t(0)); + } + fattr.fileSize = prop.getValue("File-size", chunkOff_t(-1)); + fattr.numReplicas = (int16_t)prop.getValue("Replication", 1); + + GetTimeval(prop.getValue("M-Time", ""), fattr.mtime); + 
GetTimeval(prop.getValue("C-Time", ""), fattr.ctime); + GetTimeval(prop.getValue("CR-Time", ""), fattr.crtime); + + switch (prop.getValue("Striper-type", int(KFS_STRIPED_FILE_TYPE_NONE))) { + case KFS_STRIPED_FILE_TYPE_NONE: + fattr.striperType = KFS_STRIPED_FILE_TYPE_NONE; + break; + case KFS_STRIPED_FILE_TYPE_RS: + fattr.striperType = KFS_STRIPED_FILE_TYPE_RS; + break; + default: + fattr.striperType = KFS_STRIPED_FILE_TYPE_UNKNOWN; + break; + } + + fattr.numStripes = (int16_t)prop.getValue("Num-stripes", 0); + fattr.numRecoveryStripes = (int16_t)prop.getValue("Num-recovery-stripes", 0); + fattr.stripeSize = prop.getValue("Stripe-size", int32_t(0)); + fattr.user = prop.getValue("User", kKfsUserNone); + fattr.group = prop.getValue("Group", kKfsGroupNone); + fattr.mode = prop.getValue("Mode", kKfsModeUndef); +} + +void +LookupOp::ParseResponseHeaderSelf(const Properties &prop) +{ + ParseFileAttribute(prop, fattr); +} + +void +LookupPathOp::ParseResponseHeaderSelf(const Properties &prop) +{ + ParseFileAttribute(prop, fattr); +} + +void +AllocateOp::ParseResponseHeaderSelf(const Properties &prop) +{ + chunkId = prop.getValue("Chunk-handle", (kfsFileId_t) -1); + chunkVersion = prop.getValue("Chunk-version", (int64_t) -1); + if (append) + fileOffset = prop.getValue("Chunk-offset", (chunkOff_t) 0); + + string master = prop.getValue("Master", ""); + if (master != "") { + istringstream ist(master); + + ist >> masterServer.hostname; + ist >> masterServer.port; + // put the master the first in the list + chunkServers.push_back(masterServer); + } + + int numReplicas = prop.getValue("Num-replicas", 0); + string replicas = prop.getValue("Replicas", ""); + + if (replicas != "") { + istringstream ser(replicas); + ServerLocation loc; + + for (int i = 0; i < numReplicas; ++i) { + ser >> loc.hostname; + ser >> loc.port; + if (loc != masterServer) + chunkServers.push_back(loc); + } + } +} + +void +GetAllocOp::ParseResponseHeaderSelf(const Properties &prop) +{ + chunkId = 
prop.getValue("Chunk-handle", (kfsFileId_t) -1); + chunkVersion = prop.getValue("Chunk-version", (int64_t) -1); + serversOrderedFlag = prop.getValue("Replicas-ordered", 0) != 0; + int numReplicas = prop.getValue("Num-replicas", 0); + string replicas = prop.getValue("Replicas", ""); + if (replicas != "") { + istringstream ser(replicas); + ServerLocation loc; + + for (int i = 0; i < numReplicas; ++i) { + ser >> loc.hostname; + ser >> loc.port; + chunkServers.push_back(loc); + } + } +} + +void +CoalesceBlocksOp::ParseResponseHeaderSelf(const Properties &prop) +{ + dstStartOffset = prop.getValue("Dst-start-offset", (chunkOff_t) 0); +} + +void +GetLayoutOp::ParseResponseHeaderSelf(const Properties &prop) +{ + numChunks = prop.getValue("Num-chunks", 0); + hasMoreChunksFlag = prop.getValue("Has-more-chunks", 0) != 0; +} + +int +GetLayoutOp::ParseLayoutInfo() +{ + if (numChunks <= 0 || contentBuf == NULL) { + return 0; + } + BufferInputStream is(contentBuf, contentLength); + chunks.clear(); + chunks.reserve(numChunks); + for (int i = 0; i < numChunks; ++i) { + chunks.push_back(ChunkLayoutInfo()); + if (! (is >> chunks.back())) { + chunks.clear(); + return -EINVAL; + } + } + return 0; +} + +istream& +ChunkLayoutInfo::Parse(istream& is) +{ + chunkServers.clear(); + if (! (is >> fileOffset >> chunkId >> chunkVersion)) { + return is; + } + int numServers = 0; + if (! (is >> numServers)) { + return is; + } + chunkServers.reserve(max(0, numServers)); + for (int j = 0; j < numServers; j++) { + chunkServers.push_back(ServerLocation()); + ServerLocation& s = chunkServers.back(); + if (! 
(is >> s.hostname >> s.port)) { + return is; + } + } + return is; +} + +void +SizeOp::ParseResponseHeaderSelf(const Properties &prop) +{ + size = prop.getValue("Size", (long long) 0); +} + +void +ReadOp::ParseResponseHeaderSelf(const Properties &prop) +{ + string checksumStr; + uint32_t nentries; + + nentries = prop.getValue("Checksum-entries", 0); + checksumStr = prop.getValue("Checksums", ""); + diskIOTime = prop.getValue("DiskIOtime", 0.0); + drivename = prop.getValue("Drivename", ""); + istringstream ist(checksumStr); + checksums.clear(); + for (uint32_t i = 0; i < nentries; i++) { + uint32_t cksum; + ist >> cksum; + checksums.push_back(cksum); + } +} + +void +WriteIdAllocOp::ParseResponseHeaderSelf(const Properties &prop) +{ + writeIdStr = prop.getValue("Write-id", string()); + writePrepReplySupportedFlag = prop.getValue("Write-prepare-reply", 0) != 0; +} + +void +LeaseAcquireOp::ParseResponseHeaderSelf(const Properties &prop) +{ + leaseId = prop.getValue("Lease-id", (long long) -1); + const string lids = prop.getValue("Lease-ids", string()); + if (leaseIds) { + leaseIds[0] = -1; + } + if (! lids.empty() && chunkIds && leaseIds) { + istringstream is(lids); + for (int i = 0; i < kMaxChunkIds && chunkIds[i] >= 0; i++) { + if (! 
(is >> leaseIds[i])) { + leaseIds[i] = -1; + while (++i < kMaxChunkIds && chunkIds[i] >= 0) { + leaseIds[i] = -1; + } + break; + } + } + } +} + +void +ChangeFileReplicationOp::ParseResponseHeaderSelf(const Properties &prop) +{ + numReplicas = prop.getValue("Num-replicas", 1); +} + +void +GetPathNameOp::Request(ostream& os) +{ + os << + "GETPATHNAME\r\n" << ReqHeaders(*this); + if (fid > 0) { + os << "File-handle: " << fid << "\r\n"; + } + if (chunkId > 0) { + os << "Chunk-handle: " << chunkId << "\r\n"; + } + os << "\r\n"; +} + +void +GetPathNameOp::ParseResponseHeaderSelf(const Properties& prop) +{ + ParseFileAttribute(prop, fattr); + pathname = prop.getValue("Path-name", string()); + + offset = prop.getValue("Chunk-offset", chunkOff_t(-1)); + chunkVersion = prop.getValue("Chunk-version", int64_t(-1)); + + const int numReplicas = prop.getValue("Num-replicas", 0); + const string replicas = prop.getValue("Replicas", string()); + if (! replicas.empty()) { + istringstream ser(replicas); + for (int i = 0; i < numReplicas; ++i) { + ServerLocation loc; + ser >> loc.hostname; + ser >> loc.port; + servers.push_back(loc); + } + } +} + +void +ChmodOp::Request(ostream &os) +{ + os << + "CHMOD\r\n" << ReqHeaders(*this) << + "File-handle: " << fid << "\r\n" + "Mode: " << mode << "\r\n" + "\r\n"; + ; +} + +void +ChownOp::Request(ostream &os) +{ + os << + "CHOWN\r\n" << ReqHeaders(*this) << + "File-handle: " << fid << "\r\n"; + if (user != kKfsUserNone) { + os << "Owner: " << user << "\r\n"; + } + if (group != kKfsGroupNone) { + os << "Group: " << group << "\r\n"; + } + os << "\r\n"; +} + +} //namespace client +} //namespace KFS diff --git a/src/cc/libclient/KfsOps.h b/src/cc/libclient/KfsOps.h new file mode 100644 index 000000000..5205953ec --- /dev/null +++ b/src/cc/libclient/KfsOps.h @@ -0,0 +1,1190 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/05/24 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast 
Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef _LIBKFSCLIENT_KFSOPS_H +#define _LIBKFSCLIENT_KFSOPS_H + +#include "common/kfstypes.h" +#include "common/Properties.h" +#include "KfsAttr.h" + +#include +#include +#include +#include +#include +#include + +#include + +namespace KFS { +namespace client { +using std::string; +using std::ostringstream; +using std::ostream; +using std::istream; +using std::oct; +using std::dec; + +// KFS client library RPCs. 
+enum KfsOp_t { + CMD_UNKNOWN, + // Meta-data server RPCs + CMD_GETALLOC, + CMD_GETLAYOUT, + CMD_ALLOCATE, + CMD_TRUNCATE, + CMD_LOOKUP, + CMD_MKDIR, + CMD_RMDIR, + CMD_READDIR, + CMD_READDIRPLUS, + CMD_GETDIRSUMMARY, + CMD_CREATE, + CMD_REMOVE, + CMD_RENAME, + CMD_SETMTIME, + CMD_LEASE_ACQUIRE, + CMD_LEASE_RENEW, + CMD_LEASE_RELINQUISH, + CMD_COALESCE_BLOCKS, + CMD_CHUNK_SPACE_RESERVE, + CMD_CHUNK_SPACE_RELEASE, + CMD_RECORD_APPEND, + CMD_GET_RECORD_APPEND_STATUS, + CMD_CHANGE_FILE_REPLICATION, + CMD_DUMP_CHUNKTOSERVERMAP, + CMD_UPSERVERS, + // Chunkserver RPCs + CMD_OPEN, + CMD_CLOSE, + CMD_READ, + CMD_WRITE_ID_ALLOC, + CMD_WRITE_PREPARE, + CMD_WRITE_SYNC, + CMD_SIZE, + CMD_GET_CHUNK_METADATA, + CMD_DUMP_CHUNKMAP, + CMD_WRITE, + CMD_GETPATHNAME, + CMD_CHMOD, + CMD_CHOWN, + + CMD_NCMDS, +}; + +struct KfsOp { + KfsOp_t op; + kfsSeq_t seq; + int32_t status; + uint32_t checksum; // a checksum over the data + size_t contentLength; + size_t contentBufLen; + char* contentBuf; + string statusMsg; // optional, mostly for debugging + + KfsOp (KfsOp_t o, kfsSeq_t s) + : op(o), seq(s), status(0), checksum(0), contentLength(0), + contentBufLen(0), contentBuf(0), statusMsg() + {} + // to allow dynamic-type-casting, make the destructor virtual + virtual ~KfsOp() { + delete [] contentBuf; + } + void AttachContentBuf(const char *buf, size_t len) { + AttachContentBuf((char *) buf, len); + } + void EnsureCapacity(size_t len) { + if (contentBufLen >= len) { + return; + } + DeallocContentBuf(); + AllocContentBuf(len); + } + void AllocContentBuf(size_t len) { + contentBuf = new char[len + 1]; + contentBuf[len] = 0; + contentBufLen = len; + } + void DeallocContentBuf() { + delete [] contentBuf; + ReleaseContentBuf(); + } + void AttachContentBuf(char *buf, size_t len) { + contentBuf = buf; + contentBufLen = len; + } + void ReleaseContentBuf() { + contentBuf = 0; + contentBufLen = 0; + } + // Build a request RPC that can be sent to the server + virtual void Request(ostream &os) = 0; + 
virtual bool NextRequest(kfsSeq_t /* seq */, ostream& /* os */) + { return false; } + + // Common parsing code: parse the response from string and fill + // that into a properties structure. + void ParseResponseHeader(istream& is); + // Parse a response header from the server: This does the + // default parsing of OK/Cseq/Status/Content-length. + void ParseResponseHeader(const Properties& prop); + + // Return information about op that can printed out for debugging. + virtual string Show() const = 0; + virtual void ParseResponseHeaderSelf(const Properties& prop); + // Global setting use only at startup, not re-entrant. + // The string added to the headers section as is. + // The headers must be properly formatted: each header line must end with + // \r\n + static void SetExtraRequestHeaders(const string& headers) { + sExtraHeaders = headers; + } + static void AddExtraRequestHeaders(const string& headers) { + sExtraHeaders += headers; + } + static void AddDefaultRequestHeaders( + kfsUid_t euser = kKfsUserNone, kfsGid_t egroup = kKfsGroupNone); + class ReqHeaders; + friend class OpsHeaders; +private: + static string sExtraHeaders; +}; + +struct CreateOp : public KfsOp { + kfsFileId_t parentFid; // input parent file-id + const char *filename; + kfsFileId_t fileId; // result + int numReplicas; // desired degree of replication + bool exclusive; // O_EXCL flag + int striperType; + int numStripes; + int numRecoveryStripes; + int stripeSize; + int metaStriperType; + Permissions permissions; + kfsSeq_t reqId; + CreateOp(kfsSeq_t s, kfsFileId_t p, const char *f, int n, bool e, + const Permissions& perms = Permissions(), + kfsSeq_t id = -1) : + KfsOp(CMD_CREATE, s), + parentFid(p), + filename(f), + numReplicas(n), + exclusive(e), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + numStripes(0), + numRecoveryStripes(0), + stripeSize(0), + metaStriperType(KFS_STRIPED_FILE_TYPE_UNKNOWN), + permissions(perms), + reqId(id) + {} + void Request(ostream &os); + virtual void 
ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << "create: " << filename << " parent: " << parentFid; + return os.str(); + } +}; + +struct RemoveOp : public KfsOp { + kfsFileId_t parentFid; // input parent file-id + const char *filename; + const char *pathname; + RemoveOp(kfsSeq_t s, kfsFileId_t p, const char *f, const char *pn) : + KfsOp(CMD_REMOVE, s), parentFid(p), filename(f), pathname(pn) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "remove: " << filename << " (parentfid = " << parentFid << ")"; + return os.str(); + } +}; + +struct MkdirOp : public KfsOp { + kfsFileId_t parentFid; // input parent file-id + const char *dirname; + Permissions permissions; + kfsSeq_t reqId; + kfsFileId_t fileId; // result + MkdirOp(kfsSeq_t s, kfsFileId_t p, const char *d, + const Permissions& perms = Permissions(), kfsSeq_t id = -1) + : KfsOp(CMD_MKDIR, s), + parentFid(p), + dirname(d), + permissions(perms), + reqId(id), + fileId(-1) + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << "mkdir: " << dirname << " parent: " << parentFid; + return os.str(); + } +}; + +struct RmdirOp : public KfsOp { + kfsFileId_t parentFid; // input parent file-id + const char *dirname; + const char *pathname; // input: full pathname + RmdirOp(kfsSeq_t s, kfsFileId_t p, const char *d, const char *pn) : + KfsOp(CMD_RMDIR, s), parentFid(p), dirname(d), pathname(pn) + { + + } + void Request(ostream &os); + // default parsing of OK/Cseq/Status/Content-length will suffice. 
+ + string Show() const { + ostringstream os; + + os << "rmdir: " << dirname << " (parentfid = " << parentFid << ")"; + return os.str(); + } +}; + +struct RenameOp : public KfsOp { + kfsFileId_t parentFid; // input parent file-id + const char *oldname; // old file name/dir + const char *newpath; // new path to be renamed to + const char *oldpath; // old path (starting from /) + bool overwrite; // set if the rename can overwrite newpath + RenameOp(kfsSeq_t s, kfsFileId_t p, const char *o, + const char *n, const char *op, bool c) : + KfsOp(CMD_RENAME, s), parentFid(p), oldname(o), + newpath(n), oldpath(op), overwrite(c) + { + + } + void Request(ostream &os); + + // default parsing of OK/Cseq/Status/Content-length will suffice. + + string Show() const { + ostringstream os; + + if (overwrite) + os << "rename_overwrite: "; + else + os << "rename: "; + os << " old: " << oldname << " (parentfid: " << parentFid << ")"; + os << " new: " << newpath; + return os.str(); + } +}; + +struct ReaddirOp : public KfsOp { + kfsFileId_t fid; // fid of the directory + int numEntries; // # of entries in the directory + bool hasMoreEntriesFlag; + string fnameStart; + ReaddirOp(kfsSeq_t s, kfsFileId_t f) + : KfsOp(CMD_READDIR, s), + fid(f), + numEntries(0), + hasMoreEntriesFlag(false), + fnameStart() + {} + void Request(ostream &os); + // This will only extract out the default+num-entries. The actual + // dir. 
entries are in the content-length portion of things + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << + "readdir:" + " fid: " << fid << + " start: " << fnameStart << + " entries: " << numEntries << + " hasmore: " << hasMoreEntriesFlag; + return os.str(); + } +}; + +struct SetMtimeOp : public KfsOp { + const char *pathname; + struct timeval mtime; + SetMtimeOp(kfsSeq_t s, const char *p, const struct timeval &m): + KfsOp(CMD_SETMTIME, s), pathname(p), mtime(m) + { + } + void Request(ostream &os); + string Show() const { + ostringstream os; + os << "setmtime: " << pathname << + " mtime: " << mtime.tv_sec << ':' << mtime.tv_usec; + return os.str(); + } +}; + +struct DumpChunkServerMapOp : public KfsOp { + DumpChunkServerMapOp(kfsSeq_t s): + KfsOp(CMD_DUMP_CHUNKTOSERVERMAP, s) + { + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << "dumpchunktoservermap"; + return os.str(); + } +}; + +struct UpServersOp : public KfsOp { + UpServersOp(kfsSeq_t s): + KfsOp(CMD_UPSERVERS, s) + { + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << "upservers"; + return os.str(); + } +}; + +struct DumpChunkMapOp : public KfsOp { + DumpChunkMapOp(kfsSeq_t s): + KfsOp(CMD_DUMP_CHUNKMAP, s) + { + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << "dumpchunkmap"; + return os.str(); + } +}; + +struct ReaddirPlusOp : public KfsOp { + kfsFileId_t fid; // fid of the directory + bool getLastChunkInfoOnlyIfSizeUnknown; + bool hasMoreEntriesFlag; + int numEntries; // # of entries in the directory + string fnameStart; + ReaddirPlusOp(kfsSeq_t s, kfsFileId_t f, bool cif) + : KfsOp(CMD_READDIRPLUS, s), + fid(f), + getLastChunkInfoOnlyIfSizeUnknown(cif), + 
hasMoreEntriesFlag(false), + numEntries(0), + fnameStart() + {} + void Request(ostream &os); + // This will only extract out the default+num-entries. The actual + // dir. entries are in the content-length portion of things + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << + "readdirplus:" + " fid: " << fid << + " start: " << fnameStart << + " entries: " << numEntries << + " hasmore: " << hasMoreEntriesFlag; + return os.str(); + } +}; + +// Lookup the attributes of a file in a directory +struct LookupOp : public KfsOp { + kfsFileId_t parentFid; // fid of the parent dir + const char *filename; // file in the dir + FileAttr fattr; // result + LookupOp(kfsSeq_t s, kfsFileId_t p, const char *f) : + KfsOp(CMD_LOOKUP, s), parentFid(p), filename(f) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + + string Show() const { + ostringstream os; + + os << "lookup: " << filename << " parent: " << parentFid; + return os.str(); + } +}; + +// Lookup the attributes of a file relative to a root dir. +struct LookupPathOp : public KfsOp { + kfsFileId_t rootFid; // fid of the root dir + const char *filename; // path relative to root + FileAttr fattr; // result + LookupPathOp(kfsSeq_t s, kfsFileId_t r, const char *f) : + KfsOp(CMD_LOOKUP, s), rootFid(r), filename(f) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + + string Show() const { + ostringstream os; + + os << "lookup_path: " << filename << " (rootFid = " << rootFid << ")"; + return os.str(); + } +}; + +/// Coalesce blocks from src->dst by appending the blocks of src to +/// dst. If the op is successful, src will end up with 0 blocks. 
+struct CoalesceBlocksOp: public KfsOp { + string srcPath; // input + string dstPath; // input + chunkOff_t dstStartOffset; // output + CoalesceBlocksOp(kfsSeq_t s, const string& o, const string& n) : + KfsOp(CMD_COALESCE_BLOCKS, s), srcPath(o), dstPath(n) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << "coalesce blocks: " << srcPath << "<-" << dstPath; + return os.str(); + } +}; + +/// Get the allocation information for a chunk in a file. +struct GetAllocOp: public KfsOp { + kfsFileId_t fid; + chunkOff_t fileOffset; + kfsChunkId_t chunkId; // result + int64_t chunkVersion; // result + bool serversOrderedFlag; // result: meta server ordered the servers list + // by its preference / load -- try the servers in this order. + // result: where the chunk is hosted name/port + vector chunkServers; + string filename; // input + GetAllocOp(kfsSeq_t s, kfsFileId_t f, chunkOff_t o) + : KfsOp(CMD_GETALLOC, s), + fid(f), + fileOffset(o), + chunkId(-1), + chunkVersion(-1), + serversOrderedFlag(false), + chunkServers(), + filename() + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << + "getalloc:" + " fid: " << fid << + " offset: " << fileOffset << + " chunkId: " << chunkId << + " version: " << chunkVersion + ; + return os.str(); + } +}; + + +struct ChunkLayoutInfo { + ChunkLayoutInfo() + : fileOffset(-1), + chunkId(-1), + chunkVersion(-1), + chunkServers() + {} + chunkOff_t fileOffset; + kfsChunkId_t chunkId; // result + int64_t chunkVersion; // result + vector chunkServers; // where the chunk lives + istream& Parse(istream& os); +}; + +inline static istream& operator>>(istream& is, ChunkLayoutInfo& li) { + return li.Parse(is); +} + +/// Get the layout information for all chunks in a file. 
+struct GetLayoutOp: public KfsOp { + kfsFileId_t fid; + chunkOff_t startOffset; + bool omitLocationsFlag; + bool lastChunkOnlyFlag; + int numChunks; + int maxChunks; + bool hasMoreChunksFlag; + vector chunks; + GetLayoutOp(kfsSeq_t s, kfsFileId_t f) + : KfsOp(CMD_GETLAYOUT, s), + fid(f), + startOffset(0), + omitLocationsFlag(false), + lastChunkOnlyFlag(false), + numChunks(0), + maxChunks(-1), + hasMoreChunksFlag(false), + chunks() + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + int ParseLayoutInfo(); + string Show() const { + ostringstream os; + + os << "getlayout: fid: " << fid; + return os.str(); + } +}; + +// Get the chunk metadata (aka checksums) stored on the chunkservers +struct GetChunkMetadataOp: public KfsOp { + kfsChunkId_t chunkId; + bool readVerifyFlag; + GetChunkMetadataOp(kfsSeq_t s, kfsChunkId_t c, bool verifyFlag) : + KfsOp(CMD_GET_CHUNK_METADATA, s), chunkId(c), readVerifyFlag(verifyFlag) { } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "get chunk metadata: chunkId: " << chunkId; + return os.str(); + } +}; + +struct AllocateOp : public KfsOp { + kfsFileId_t fid; + chunkOff_t fileOffset; + string pathname; // input: the full pathname corresponding to fid + kfsChunkId_t chunkId; // result + int64_t chunkVersion; // result---version # for the chunk + // where is the chunk hosted name/port + ServerLocation masterServer; // master for running the write transaction + vector chunkServers; + // if this is set, then the metaserver will pick the offset in the + // file at which the chunk was allocated. + bool append; + // the space reservation size that will follow the allocation. + int spaceReservationSize; + // suggested max. 
# of concurrent appenders per chunk + int maxAppendersPerChunk; + bool invalidateAllFlag; + AllocateOp(kfsSeq_t s, kfsFileId_t f, const string &p) : + KfsOp(CMD_ALLOCATE, s), + fid(f), + fileOffset(0), + pathname(p), + chunkId(-1), + chunkVersion(-1), + masterServer(), + chunkServers(), + append(false), + spaceReservationSize(1 << 20), + maxAppendersPerChunk(64), + invalidateAllFlag(false) + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << "allocate: fid: " << fid << " offset: " << fileOffset << + (invalidateAllFlag ? " invalidate" : "") ; + return os.str(); + } +}; + +struct TruncateOp : public KfsOp { + const char* pathname; + kfsFileId_t fid; + chunkOff_t fileOffset; + bool pruneBlksFromHead; + TruncateOp(kfsSeq_t s, const char *p, kfsFileId_t f, chunkOff_t o) : + KfsOp(CMD_TRUNCATE, s), pathname(p), fid(f), fileOffset(o), + pruneBlksFromHead(false) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + if (pruneBlksFromHead) + os << "prune blks from head: "; + else + os << "truncate: "; + os << " fid: " << fid << " offset: " << fileOffset; + return os.str(); + } +}; + +struct OpenOp : public KfsOp { + kfsChunkId_t chunkId; + int openFlags; // either O_RDONLY, O_WRONLY or O_RDWR + OpenOp(kfsSeq_t s, kfsChunkId_t c) : + KfsOp(CMD_OPEN, s), chunkId(c) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "open: chunkid: " << chunkId; + return os.str(); + } +}; + +struct WriteInfo { + ServerLocation serverLoc; + int64_t writeId; + WriteInfo() : writeId(-1) { } + WriteInfo(ServerLocation loc, int64_t w) : + serverLoc(loc), writeId(w) { } + WriteInfo & operator = (const WriteInfo &other) { + serverLoc = other.serverLoc; + writeId = other.writeId; + return *this; + } + string Show() const { + ostringstream os; + + os << " location: " << serverLoc.ToString() << " writeId: " << writeId; + return 
os.str(); + } +}; + +class ShowWriteInfo { + ostringstream &os; +public: + ShowWriteInfo(ostringstream &o) : os(o) { } + void operator() (WriteInfo w) { + os << w.Show() << ' '; + } +}; + +struct CloseOp : public KfsOp { + kfsChunkId_t chunkId; + vector chunkServerLoc; + vector writeInfo; + CloseOp(kfsSeq_t s, kfsChunkId_t c) : + KfsOp(CMD_CLOSE, s), chunkId(c), writeInfo() + {} + CloseOp(kfsSeq_t s, kfsChunkId_t c, const vector& wi) : + KfsOp(CMD_CLOSE, s), chunkId(c), writeInfo(wi) + {} + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "close: chunkid: " << chunkId; + return os.str(); + } +}; + +// used for retrieving a chunk's size +struct SizeOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + chunkOff_t size; /* result */ + SizeOp(kfsSeq_t s, kfsChunkId_t c, int64_t v) : + KfsOp(CMD_SIZE, s), chunkId(c), chunkVersion(v), size(-1) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << + "size:" + " chunkid: " << chunkId << + " version: " << chunkVersion << + " size: " << size + ; + return os.str(); + } +}; + + +struct ReadOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + chunkOff_t offset; /* input */ + size_t numBytes; /* input */ + struct timeval submitTime; /* when the client sent the request to the server */ + vector checksums; /* checksum for each 64KB block */ + float diskIOTime; /* as reported by the server */ + float elapsedTime; /* as measured by the client */ + string drivename; /* drive from which data was read */ + + ReadOp(kfsSeq_t s, kfsChunkId_t c, int64_t v) : + KfsOp(CMD_READ, s), chunkId(c), chunkVersion(v), + offset(0), numBytes(0), diskIOTime(0.0), elapsedTime(0.0) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + + string Show() const { + ostringstream os; + + os << "read:" + " chunkid: " << chunkId << + 
" version: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes << + " iotm: " << diskIOTime + ; + return os.str(); + } +}; + +// op that defines the write that is going to happen +struct WriteIdAllocOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + chunkOff_t offset; /* input */ + size_t numBytes; /* input */ + bool isForRecordAppend; /* set if this is for a record append that is coming */ + bool writePrepReplySupportedFlag; + string writeIdStr; /* output */ + vector chunkServerLoc; + WriteIdAllocOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, chunkOff_t o, size_t n) + : KfsOp(CMD_WRITE_ID_ALLOC, s), + chunkId(c), + chunkVersion(v), + offset(o), + numBytes(n), + isForRecordAppend(false), + writePrepReplySupportedFlag(false) + { + + } + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + + os << "write-id-alloc: chunkid: " << chunkId << + " version: " << chunkVersion; + return os.str(); + } +}; + +struct WritePrepareOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + chunkOff_t offset; /* input */ + size_t numBytes; /* input */ + bool replyRequestedFlag; + vector checksums; /* checksum for each 64KB block */ + vector writeInfo; /* input */ + WritePrepareOp(kfsSeq_t s, kfsChunkId_t c, int64_t v) + : KfsOp(CMD_WRITE_PREPARE, s), + chunkId(c), + chunkVersion(v), + offset(0), + numBytes(0), + replyRequestedFlag(false), + checksums(), + writeInfo() + {} + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "write-prepare:" + " chunkid: " << chunkId << + " version: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes << + " checksum: " << checksum; + for_each(writeInfo.begin(), writeInfo.end(), ShowWriteInfo(os)); + return os.str(); + } +}; + +struct WriteSyncOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; + // The range of data we are 
sync'ing + chunkOff_t offset; /* input */ + size_t numBytes; /* input */ + vector writeInfo; + // The checksums that cover the region. + vector checksums; + WriteSyncOp() + : KfsOp(CMD_WRITE_SYNC, 0), + chunkId(0), + chunkVersion(0), + offset(0), + numBytes(0), + writeInfo() + {} + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "write-sync:" + " chunkid: " << chunkId << + " version: " << chunkVersion << + " offset: " << offset << + " numBytes: " << numBytes; + for_each(writeInfo.begin(), writeInfo.end(), ShowWriteInfo(os)); + return os.str(); + } +}; + +struct LeaseAcquireOp : public KfsOp { + enum { kMaxChunkIds = 256 }; + BOOST_STATIC_ASSERT(kMaxChunkIds * 21 + (1<<10) < MAX_RPC_HEADER_LEN); + + kfsChunkId_t chunkId; // input + const char* pathname; // input + bool flushFlag; // input + int leaseTimeout; // input + int64_t leaseId; // output + kfsChunkId_t* chunkIds; + int64_t* leaseIds; + LeaseAcquireOp(kfsSeq_t s, kfsChunkId_t c, const char *p) : + KfsOp(CMD_LEASE_ACQUIRE, s), + chunkId(c), pathname(p), flushFlag(false), leaseTimeout(-1), + leaseId(-1), + chunkIds(0), + leaseIds(0) + {} + + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + + string Show() const { + ostringstream os; + os << "lease-acquire:" + " chunkid: " << chunkId << + " leaseid: " << leaseId + ; + return os.str(); + } +}; + +struct LeaseRenewOp : public KfsOp { + kfsChunkId_t chunkId; // input + int64_t leaseId; // input + const char *pathname; // input + LeaseRenewOp(kfsSeq_t s, kfsChunkId_t c, int64_t l, const char *p) : + KfsOp(CMD_LEASE_RENEW, s), chunkId(c), leaseId(l), pathname(p) + { + + } + + void Request(ostream &os); + // default parsing of status is sufficient + + string Show() const { + ostringstream os; + os << "lease-renew: chunkid: " << chunkId << " leaseId: " << leaseId; + return os.str(); + } +}; + +// Whenever we want to give up a lease early, we notify the metaserver +// using this op. 
+struct LeaseRelinquishOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t leaseId; + string leaseType; + LeaseRelinquishOp(kfsSeq_t s, kfsChunkId_t c, int64_t l) : + KfsOp(CMD_LEASE_RELINQUISH, s), chunkId(c), leaseId(l) + { + + } + void Request(ostream &os); + // defaut parsing of status is sufficient + string Show() const { + ostringstream os; + + os << "lease-relinquish: chunkid: " << chunkId << + " leaseId: " << leaseId << " type: " << leaseType; + return os.str(); + } +}; + +/// add in ops for space reserve/release/record-append +struct ChunkSpaceReserveOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + size_t numBytes; /* input */ + vector writeInfo; /* input */ + ChunkSpaceReserveOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, vector &w, size_t n) : + KfsOp(CMD_CHUNK_SPACE_RESERVE, s), chunkId(c), chunkVersion(v), + numBytes(n), writeInfo(w) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "chunk-space-reserve: chunkid: " << chunkId << + " version: " << chunkVersion << " num-bytes: " << numBytes; + return os.str(); + } +}; + +struct ChunkSpaceReleaseOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + size_t numBytes; /* input */ + vector writeInfo; /* input */ + ChunkSpaceReleaseOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, vector &w, size_t n) : + KfsOp(CMD_CHUNK_SPACE_RELEASE, s), chunkId(c), chunkVersion(v), + numBytes(n), writeInfo(w) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "chunk-space-release: chunkid: " << chunkId << + " version: " << chunkVersion << " num-bytes: " << numBytes; + return os.str(); + } +}; + +struct RecordAppendOp : public KfsOp { + kfsChunkId_t chunkId; + int64_t chunkVersion; /* input */ + chunkOff_t offset; /* input: this client's view of where it is writing in the file */ + vector writeInfo; /* input */ + RecordAppendOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, chunkOff_t o, vector &w) : 
+ KfsOp(CMD_RECORD_APPEND, s), chunkId(c), chunkVersion(v), offset(o), writeInfo(w) + { + + } + void Request(ostream &os); + string Show() const { + ostringstream os; + + os << "record-append: chunkid: " << chunkId << + " version: " << chunkVersion << + " num-bytes: " << contentLength; + return os.str(); + } +}; + +struct GetRecordAppendOpStatus : public KfsOp +{ + kfsChunkId_t chunkId; // input + int64_t writeId; // input + kfsSeq_t opSeq; // output + int64_t chunkVersion; + int64_t opOffset; + size_t opLength; + int opStatus; + size_t widAppendCount; + size_t widBytesReserved; + size_t chunkBytesReserved; + int64_t remainingLeaseTime; + int64_t masterCommitOffset; + int64_t nextCommitOffset; + int appenderState; + string appenderStateStr; + bool masterFlag; + bool stableFlag; + bool openForAppendFlag; + bool widWasReadOnlyFlag; + bool widReadOnlyFlag; + + GetRecordAppendOpStatus(kfsSeq_t seq, kfsChunkId_t c, int64_t w) : + KfsOp(CMD_GET_RECORD_APPEND_STATUS, seq), + chunkId(c), + writeId(w), + opSeq(-1), + chunkVersion(-1), + opOffset(-1), + opLength(0), + opStatus(-1), + widAppendCount(0), + widBytesReserved(0), + chunkBytesReserved(0), + remainingLeaseTime(0), + masterCommitOffset(-1), + nextCommitOffset(-1), + appenderState(0), + appenderStateStr(), + masterFlag(false), + stableFlag(false), + openForAppendFlag(false), + widWasReadOnlyFlag(false), + widReadOnlyFlag(false) + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const + { + ostringstream os; + os << "get-record-append-op-status:" + " seq: " << seq << + " chunkId: " << chunkId << + " writeId: " << writeId << + " chunk-version: " << chunkVersion << + " op-seq: " << opSeq << + " op-status: " << opStatus << + " op-offset: " << opOffset << + " op-length: " << opLength << + " wid-read-only: " << widReadOnlyFlag << + " master-commit: " << masterCommitOffset << + " next-commit: " << nextCommitOffset << + " wid-append-count: " << widAppendCount << 
+ " wid-bytes-reserved: " << widBytesReserved << + " chunk-bytes-reserved: " << chunkBytesReserved << + " remaining-lease-time: " << remainingLeaseTime << + " wid-was-read-only: " << widWasReadOnlyFlag << + " chunk-master: " << masterFlag << + " stable-flag: " << stableFlag << + " open-for-append-flag: " << openForAppendFlag << + " appender-state: " << appenderState << + " appender-state-string: " << appenderStateStr + ; + return os.str(); + } +}; + +struct ChangeFileReplicationOp : public KfsOp { + kfsFileId_t fid; // input + int16_t numReplicas; // desired replication + ChangeFileReplicationOp(kfsSeq_t s, kfsFileId_t f, int16_t r) : + KfsOp(CMD_CHANGE_FILE_REPLICATION, s), fid(f), numReplicas(r) + { + + } + + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + + string Show() const { + ostringstream os; + os << "change-file-replication: fid: " << fid + << " # of replicas: " << numReplicas; + return os.str(); + } +}; + +struct GetPathNameOp : public KfsOp { + kfsFileId_t fid; + kfsChunkId_t chunkId; + chunkOff_t offset; + int64_t chunkVersion; + vector servers; + FileAttr fattr; + string pathname; + GetPathNameOp(kfsSeq_t s, kfsFileId_t f, kfsChunkId_t c) + : KfsOp(CMD_GETPATHNAME, s), + fid(f), + chunkId(c), + offset(-1), + chunkVersion(-1), + servers(), + fattr(), + pathname() + {} + void Request(ostream &os); + virtual void ParseResponseHeaderSelf(const Properties& prop); + string Show() const { + ostringstream os; + os << "getpathname:" + " fid: " << fid << + " cid: " << chunkId << + " status: " << status + ; + return os.str(); + } +}; + +struct ChmodOp : public KfsOp { + kfsFileId_t fid; + kfsMode_t mode; + ChmodOp(kfsSeq_t s, kfsFileId_t f, kfsMode_t m) + : KfsOp(CMD_CHMOD, s), + fid(f), + mode(m) + {} + void Request(ostream &os); + string Show() const { + ostringstream os; + os << "chmod:" + " fid: " << fid << + " mode: " << oct << mode << dec << + " status: " << status + ; + return os.str(); + } +}; + +struct 
ChownOp : public KfsOp { + kfsFileId_t fid; + kfsUid_t user; + kfsGid_t group; + ChownOp(kfsSeq_t s, kfsFileId_t f, kfsUid_t u, kfsGid_t g) + : KfsOp(CMD_CHOWN, s), + fid(f), + user(u), + group(g) + {} + void Request(ostream &os); + string Show() const { + ostringstream os; + os << "chown:" + " fid: " << fid << + " user: " << user << + " group: " << group << + " status: " << status + ; + return os.str(); + } +}; + +} //namespace client +} //namespace KFS + +#endif // _LIBKFSCLIENT_KFSOPS_H diff --git a/src/cc/libclient/KfsProtocolWorker.cc b/src/cc/libclient/KfsProtocolWorker.cc new file mode 100644 index 000000000..28d115e62 --- /dev/null +++ b/src/cc/libclient/KfsProtocolWorker.cc @@ -0,0 +1,1767 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/10/10 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#include "KfsProtocolWorker.h" + +#include +#include +#include +#include +#include + +#include + +#include "kfsio/IOBuffer.h" +#include "kfsio/NetConnection.h" +#include "kfsio/NetManager.h" +#include "kfsio/ITimeout.h" +#include "common/kfstypes.h" +#include "common/kfsdecls.h" +#include "common/time.h" +#include "common/MsgLogger.h" +#include "common/StdAllocator.h" +#include "qcdio/QCUtils.h" +#include "qcdio/QCThread.h" +#include "qcdio/QCMutex.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/QCDLList.h" +#include "qcdio/qcdebug.h" + +#include "WriteAppender.h" +#include "Writer.h" +#include "Reader.h" + +namespace KFS +{ +namespace client +{ + +using std::string; +using std::max; +using std::min; +using std::make_pair; +using std::ostringstream; +using std::pair; +using std::map; +using std::less; + +// KFS client side protocol worker thread implementation. +class KfsProtocolWorker::Impl : + public QCRunnable, + public ITimeout +{ +public: + typedef KfsNetClient MetaServer; + typedef QCDLList WorkQueue; + + Impl( + string inMetaHost, + int inMetaPort, + const Parameters& inParameters) + : QCRunnable(), + ITimeout(), + mNetManager(), + mMetaServer( + mNetManager, + inMetaHost, + inMetaPort, + inParameters.mMetaMaxRetryCount, + inParameters.mMetaTimeSecBetweenRetries, + inParameters.mMetaOpTimeoutSec, + inParameters.mMetaIdleTimeoutSec, + inParameters.mMetaInitialSeqNum > 0 ? + inParameters.mMetaInitialSeqNum : GetInitalSeqNum(), + inParameters.mMetaLogPrefixPtr ? 
+ inParameters.mMetaLogPrefixPtr : "PWM", + true, // inResetConnectionOnOpTimeoutFlag + inParameters.mMaxMetaServerContentLength + ), + mWorkers(), + mMaxRetryCount(inParameters.mMaxRetryCount), + mTimeSecBetweenRetries(inParameters.mTimeSecBetweenRetries), + mWriteAppendThreshold( + inParameters.mWriteAppendThreshold), + mDefaultSpaceReservationSize(inParameters.mDefaultSpaceReservationSize), + mPreferredAppendSize(inParameters.mPreferredAppendSize), + mOpTimeoutSec(inParameters.mOpTimeoutSec), + mIdleTimeoutSec(inParameters.mIdleTimeoutSec), + mLogPrefixPtr(inParameters.mLogPrefixPtr ? + inParameters.mLogPrefixPtr : "PW"), + mPreAllocateFlag(inParameters.mPreAllocateFlag), + mMaxWriteSize(inParameters.mMaxWriteSize), + mRandomWriteThreshold(inParameters.mRandomWriteThreshold), + mMaxReadSize(inParameters.mMaxReadSize), + mReadLeaseRetryTimeout(inParameters.mReadLeaseRetryTimeout), + mLeaseWaitTimeout(inParameters.mLeaseWaitTimeout), + mChunkServerInitialSeqNum( + inParameters.mChunkServerInitialSeqNum > 0 ? 
+ inParameters.mChunkServerInitialSeqNum : + GetInitalSeqNum(0x19885a10)), + mDoNotDeallocate(), + mStopRequest(), + mWorker(this, "KfsProtocolWorker"), + mMutex() + { + WorkQueue::Init(mWorkQueue); + FreeSyncRequests::Init(mFreeSyncRequests); + CleanupList::Init(mCleanupList); + } + virtual ~Impl() + { Impl::Stop(); } + virtual void Run() + { + mNetManager.RegisterTimeoutHandler(this); + mNetManager.MainLoop(); + mNetManager.UnRegisterTimeoutHandler(this); + } + void Start() + { + if (mWorker.IsStarted()) { + return; + } + mStopRequest.mState = Request::kStateNone; + const int kStackSize = 64 << 10; + mWorker.Start(this, kStackSize); + } + void Stop() + { + { + QCStMutexLocker lock(mMutex); + if (mStopRequest.mState == Request::kStateNone) { + Enqueue(mStopRequest); + } + } + mWorker.Join(); + { + QCStMutexLocker lock(mMutex); + QCRTASSERT( + mWorkers.empty() && + CleanupList::IsEmpty(mCleanupList) + ); + SyncRequest* theReqPtr; + while ((theReqPtr = + FreeSyncRequests::PopFront(mFreeSyncRequests))) { + delete theReqPtr; + } + } + } + virtual void Timeout() + { + Request* theWorkQueue[1]; + { + QCStMutexLocker theLock(mMutex); + theWorkQueue[0] = mWorkQueue[0]; + WorkQueue::Init(mWorkQueue); + } + bool theShutdownFlag = false; + Request* theReqPtr; + while ((theReqPtr = WorkQueue::PopFront(theWorkQueue))) { + Request& theReq = *theReqPtr; + QCRTASSERT(theReq.mState == Request::kStateInFlight); + theShutdownFlag = theShutdownFlag || &mStopRequest == theReqPtr; + if (theShutdownFlag) { + Done(theReq, kErrShutdown); + continue; + } + WorkerKey const theKey(theReq.mFileInstance, theReq.mFileId); + Workers::iterator theIt; + if (NeedNewWorker(theReq)) { + theIt = mWorkers.insert( + make_pair(theKey, (Worker*)0)).first; + if (! theIt->second && ! NewWorker(theReq, theIt)) { + continue; + } + } else { + theIt = mWorkers.find(theKey); + if (theIt == mWorkers.end()) { + Done( + theReq, + theReq.mRequestType == kRequestTypeWriteAppendShutdown ? 
+ kErrNone : kErrProtocol + ); + continue; + } + } + QCASSERT(IsValid(theReq)); + theIt->second->Process(theReq); + } + while (! CleanupList::IsEmpty(mCleanupList)) { + delete CleanupList::Front(mCleanupList); + } + if (theShutdownFlag) { + Workers theWorkers; + theWorkers.swap(mWorkers); + for (Workers::iterator theIt = theWorkers.begin(); + theIt != theWorkers.end(); + ) { + delete theIt++->second; + } + QCASSERT(mWorkers.empty()); + mNetManager.Shutdown(); + } + } + int64_t Execute( + RequestType inRequestType, + FileInstance inFileInstance, + FileId inFileId, + const Request::Params* inParamsPtr, + void* inBufferPtr, + int inSize, + int inMaxPending, + int64_t inOffset) + { + if (IsSync(inRequestType)) { + SyncRequest& theReq = GetSyncRequest( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + ); + const int64_t theRet = theReq.Execute(*this); + PutSyncRequest(theReq); + return theRet; + } + return Enqueue(AsyncRequest::Create( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + )); + } + int64_t Enqueue( + Request& inRequest) + { + if (! IsValid(inRequest)) { + inRequest.mState = Request::kStateDone; + inRequest.Done(kErrParameters); + return kErrParameters; + } + if (! 
IsSync(inRequest.mRequestType) && inRequest.mSize <= 0) { + inRequest.mState = Request::kStateDone; + inRequest.Done(kErrNone); + return kErrParameters; + } + { + QCStMutexLocker theLock(mMutex); + if (mStopRequest.mState != Request::kStateNone) { + theLock.Unlock(); + Done(inRequest, kErrShutdown); + return kErrShutdown; + } + QCRTASSERT(inRequest.mState != Request::kStateInFlight); + inRequest.mState = Request::kStateInFlight; + WorkQueue::PushBack(mWorkQueue, inRequest); + } + mNetManager.Wakeup(); + return 0; + } + static bool IsAsync( + RequestType inRequestType) + { + return ( + inRequestType == kRequestTypeWriteAppendAsync || + inRequestType == kRequestTypeWriteAppendAsyncNoCopy || + inRequestType == kRequestTypeWriteAsync || + inRequestType == kRequestTypeWriteAsyncNoCopy || + inRequestType == kRequestTypeReadAsync + ); + } + static bool IsSync( + RequestType inRequestType) + { return (! IsAsync(inRequestType)); } + static bool IsSync( + const Request& inRequest) + { return (IsSync(inRequest.mRequestType)); } + static bool IsAppend( + const Request& inRequest) + { + switch (inRequest.mRequestType) { + case kRequestTypeWriteAppend: + case kRequestTypeWriteAppendThrottle: + return true; + case kRequestTypeWriteAppendClose: + return (inRequest.mSize > 0); + case kRequestTypeWriteAppendAsync: + case kRequestTypeWriteAppendAsyncNoCopy: + return (inRequest.mSize > 0 || + inRequest.mMaxPendingOrEndPos >= 0); + default: + break; + } + return false; + } + static bool IsWrite( + const Request& inRequest) + { + switch (inRequest.mRequestType) { + case kRequestTypeWrite: + case kRequestTypeWriteThrottle: + return true; + case kRequestTypeWriteClose: + return (inRequest.mSize > 0); + case kRequestTypeWriteAsync: + case kRequestTypeWriteAsyncNoCopy: + return (inRequest.mSize > 0 || + inRequest.mMaxPendingOrEndPos >= 0); + default: + break; + } + return false; + } + static bool IsRead( + const Request& inRequest) + { + switch (inRequest.mRequestType) { + case 
kRequestTypeRead: + case kRequestTypeReadAsync: + return true; + case kRequestTypeReadClose: + return (inRequest.mSize > 0); + default: + break; + } + return false; + } + static bool NeedNewWorker( + const Request& inRequest) + { + return ( + IsAppend(inRequest) || + IsWrite(inRequest) || + IsRead(inRequest) + ); + } + static bool IsValid( + const Request& inRequest) + { + if (inRequest.mFileId <= 0) { + return false; + } + switch (inRequest.mRequestType) { + case kRequestTypeWriteAppend: + case kRequestTypeWriteAppendClose: + case kRequestTypeWriteAppendAsync: + case kRequestTypeWriteAppendAsyncNoCopy: + case kRequestTypeWriteAppendThrottle: + return (inRequest.mBufferPtr || inRequest.mSize <= 0); + return (inRequest.mSize == 0); + case kRequestTypeWriteAppendSetWriteThreshold: + return true; + + case kRequestTypeWrite: + case kRequestTypeWriteClose: + case kRequestTypeWriteAsync: + case kRequestTypeWriteAsyncNoCopy: + case kRequestTypeWriteThrottle: + case kRequestTypeRead: + case kRequestTypeReadAsync: + case kRequestTypeReadClose: + return ( + (inRequest.mBufferPtr || inRequest.mSize <= 0) && + (inRequest.mOffset >= 0 || inRequest.mSize <= 0) + ); + case kRequestTypeWriteAppendShutdown: + case kRequestTypeWriteShutdown: + case kRequestTypeReadShutdown: + return (inRequest.mSize == 0); + case kRequestTypeWriteSetWriteThreshold: + return true; + + default: + break; + } + return false; + } + void SetMetaMaxRetryCount( + int inMaxRetryCount) + { + QCStMutexLocker theLock(mMutex); + mMetaServer.SetMaxRetryCount(inMaxRetryCount); + } + void SetMetaTimeSecBetweenRetries( + int inSecs) + { + QCStMutexLocker theLock(mMutex); + mMetaServer.SetTimeSecBetweenRetries(inSecs); + } + void SetMaxRetryCount( + int inMaxRetryCount) + { + QCStMutexLocker theLock(mMutex); + mMaxRetryCount = inMaxRetryCount; + } + void SetTimeSecBetweenRetries( + int inSecs) + { + QCStMutexLocker theLock(mMutex); + mTimeSecBetweenRetries = inSecs; + } + void SetMetaOpTimeoutSec( + int inSecs) + { + 
QCStMutexLocker theLock(mMutex); + mMetaServer.SetOpTimeoutSec(inSecs); + } + void SetOpTimeoutSec( + int inSecs) + { + QCStMutexLocker theLock(mMutex); + mOpTimeoutSec = inSecs; + } +private: + class StopRequest : public Request + { + public: + StopRequest() + : Request(kRequestTypeWriteAppendClose, 0, 1) + {} + virtual void Done( + int64_t Status) + {} + }; + class AsyncRequest : public Request + { + public: + static AsyncRequest& Create( + RequestType inRequestType, + FileInstance inFileInstance, + FileId inFileId, + const Params* inParamsPtr, + void* inBufferPtr, + int inSize, + int inMaxPending, + int64_t inOffset) + { + QCASSERT(IsAsync(inRequestType)); + const bool theCopyFlag = inSize > 0 && ! ( + inRequestType == kRequestTypeWriteAppendAsyncNoCopy || + inRequestType == kRequestTypeWriteAsyncNoCopy || + inRequestType == kRequestTypeReadAsync + ); + const size_t theReqHdr = sizeof(AsyncRequest) + + (inParamsPtr ? sizeof(*inParamsPtr) : 0); + const int kAlign = 16; + const int theBufAlign = + (theCopyFlag && inOffset > 0) ? (int)(inOffset % kAlign) : 0; + char* const theAllocPtr = new char[ + theReqHdr + + (theCopyFlag ? (inSize + kAlign + theBufAlign) : 0) + ]; + void* theBufferPtr; + if (theCopyFlag) { + char* const thePtr = theAllocPtr + theReqHdr; + theBufferPtr = thePtr + + (kAlign - ((thePtr - (char*)0) % kAlign)) + theBufAlign; + memcpy(theBufferPtr, inBufferPtr, inSize); + } else { + theBufferPtr = inBufferPtr; + } + AsyncRequest* const theRetPtr = new(theAllocPtr) AsyncRequest( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr ? 
+ new(theAllocPtr + sizeof(AsyncRequest)) Params(*inParamsPtr) + : 0, + theBufferPtr, + inSize, + inMaxPending, + inOffset + ); + QCASSERT(reinterpret_cast(theRetPtr) == theAllocPtr); + return *theRetPtr; + } + virtual void Done( + int64_t /* inStatus */) + { + if (mParamsPtr) { + const Params* const theParamsPtr = mParamsPtr; + mParamsPtr = 0; + theParamsPtr->~Params(); + } + this->~AsyncRequest(); + delete [] reinterpret_cast(this); + } + private: + AsyncRequest( + RequestType inRequestType, + FileInstance inFileInstance, + FileId inFileId, + const Params* inParamsPtr, + void* inBufferPtr, + int inSize, + int inMaxPending, + int64_t inOffset) + : Request( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset) + {} + virtual ~AsyncRequest() + {} + private: + AsyncRequest( + const AsyncRequest& inReq); + AsyncRequest& operator=( + const AsyncRequest& inReq); + }; + class SyncRequest : public Request + { + public: + SyncRequest( + RequestType inRequestType = kRequestTypeUnknown, + FileInstance inFileInstance = 0, + FileId inFileId = -1, + const Params* inParamsPtr = 0, + void* inBufferPtr = 0, + int inSize = 0, + int inMaxPending = -1, + int64_t inOffset = -1) + : Request( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset), + mMutex(), + mCond(), + mRetStatus(0), + mWaitingFlag(false) + { FreeSyncRequests::Init(*this); } + SyncRequest& Reset( + RequestType inRequestType = kRequestTypeUnknown, + FileInstance inFileInstance = 0, + FileId inFileId = -1, + const Params* inParamsPtr = 0, + void* inBufferPtr = 0, + int inSize = 0, + int inMaxPending = -1, + int64_t inOffset = -1) + { + QCRTASSERT(! mWaitingFlag); + Request::Reset( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + ); + mRetStatus = 0; + mWaitingFlag = 0; + return *this; + } + virtual ~SyncRequest() + { QCRTASSERT(! 
mWaitingFlag); } + virtual void Done( + int64_t inStatus) + { + QCStMutexLocker theLock(mMutex); + mRetStatus = inStatus; + mWaitingFlag = false; + mCond.Notify(); + } + int64_t Execute( + Impl& inWorker) + { + mWaitingFlag = true; + inWorker.Enqueue(*this); + QCStMutexLocker theLock(mMutex); + while (mWaitingFlag && mCond.Wait(mMutex)) + {} + return mRetStatus; + } + private: + QCMutex mMutex; + QCCondVar mCond; + int64_t mRetStatus; + bool mWaitingFlag; + SyncRequest* mPrevPtr[1]; + SyncRequest* mNextPtr[1]; + friend class QCDLListOp; + private: + SyncRequest( + const SyncRequest& inReq); + SyncRequest& operator=( + const SyncRequest& inReq); + }; + typedef QCDLList FreeSyncRequests; + class DoNotDeallocate : public libkfsio::IOBufferAllocator + { + public: + DoNotDeallocate() + : mCurBufSize(0) + {} + virtual size_t GetBufferSize() const + { return mCurBufSize; } + virtual char* Allocate() + { + QCRTASSERT(! "unexpected invocation"); + return 0; + } + virtual void Deallocate( + char* /* inBufferPtr */) + {} + libkfsio::IOBufferAllocator& Get( + size_t inBufSize) + { + mCurBufSize = inBufSize; + return *this; + } + private: + size_t mCurBufSize; + }; + class RequestDeallocator + { + public: + RequestDeallocator( + Request& inRequest) + : mRequest(inRequest) + { QCRTASSERT(inRequest.mSize > 0); } + void operator()( + char* inBufferPtr) + { + QCRTASSERT(inBufferPtr == mRequest.mBufferPtr); + Impl::Done(mRequest); + } + private: + Request& mRequest; + }; + friend class RequestDeallocator; + + class Worker; + typedef QCDLList CleanupList; + typedef pair WorkerKey; + typedef map< + WorkerKey, + Worker*, + less, + StdFastAllocator< + pair + > + > Workers; + class Worker + { + public: + typedef KfsProtocolWorker::Impl Owner; + Worker( + Owner& inOwner, + Workers::iterator inWorkersIt) + : mOwner(inOwner), + mWorkersIt(inWorkersIt), + mDeleteScheduledFlag(false) + { CleanupList::Init(*this); } + virtual ~Worker() + { CleanupList::Remove(mOwner.mCleanupList, *this); } + 
virtual int Open( + FileId inFileId, + const Request::Params& inParams) = 0; + virtual void Process( + Request& inRequest) = 0; + bool IsDeleteScheduled() const + { return mDeleteScheduledFlag; } + protected: + Owner& mOwner; + + void ScheduleDelete() + { + if (mDeleteScheduledFlag) { + return; + } + mDeleteScheduledFlag = true; + mOwner.mWorkers.erase(mWorkersIt); + CleanupList::PushBack(mOwner.mCleanupList, *this); + } + + private: + Worker* mPrevPtr[1]; + Worker* mNextPtr[1]; + const Workers::iterator mWorkersIt; + bool mDeleteScheduledFlag; + friend class QCDLListOp; + + private: + Worker( + const Worker& inWorker); + Worker& operator=( + const Worker& inWorker); + }; + friend class Worker; + + class Appender : public Worker, public WriteAppender::Completion + { + enum { kNoBufferCompaction = -1 }; + public: + Appender( + Owner& inOwner, + Workers::iterator inWorkersIt, + const char* inLogPrefixPtr) + : Worker(inOwner, inWorkersIt), + WriteAppender::Completion(), + mWAppender( + inOwner.mMetaServer, + this, + inOwner.mMaxRetryCount, + inOwner.mWriteAppendThreshold, + inOwner.mTimeSecBetweenRetries, + inOwner.mDefaultSpaceReservationSize, + inOwner.mPreferredAppendSize, + kNoBufferCompaction, // no buffer compaction + inOwner.mOpTimeoutSec, + inOwner.mIdleTimeoutSec, + inLogPrefixPtr, + inOwner.mChunkServerInitialSeqNum, + inOwner.mPreAllocateFlag + ), + mWriteThreshold(inOwner.mWriteAppendThreshold), + mPending(0), + mCurPos(0), + mDonePos(0), + mLastSyncReqPtr(0), + mCloseReqPtr(0) + { WorkQueue::Init(mWorkQueue); } + virtual ~Appender() + { + mWAppender.Shutdown(); + QCASSERT(! 
mWAppender.IsActive()); + const int64_t thePending = mPending; + mPending = 0; + Appender::Done(thePending, kErrShutdown); + QCRTASSERT(WorkQueue::IsEmpty(mWorkQueue)); + mWAppender.Unregister(this); + } + virtual int Open( + FileId inFileId, + const Request::Params& inParams) + { return mWAppender.Open(inFileId, inParams.mPathName.c_str()); } + virtual void Done( + WriteAppender& inAppender, + int inStatusCode) + { + const int theRem = inStatusCode == 0 ? + mWAppender.GetPendingSize() : 0; + QCRTASSERT(&inAppender == &mWAppender && theRem <= mPending); + const int theDone = mPending - theRem; + mPending = theRem; + Done(theDone, inStatusCode); + } + virtual void Process( + Request& inRequest) + { + const bool theShutdownFlag = + inRequest.mRequestType == kRequestTypeWriteAppendShutdown; + if (theShutdownFlag) { + mWAppender.Shutdown(); + } else if (inRequest.mRequestType == + kRequestTypeWriteAppendSetWriteThreshold) { + mWriteThreshold = inRequest.mSize; + Impl::Done(inRequest, mWAppender.SetWriteThreshold( + mLastSyncReqPtr != 0 ? 0 : mWriteThreshold)); + } else { + QCRTASSERT( + (IsAppend(inRequest) || + inRequest.mRequestType == kRequestTypeWriteAppendClose + ) && + inRequest.mState == Request::kStateInFlight && + (inRequest.mBufferPtr || inRequest.mSize <= 0) + ); + const bool theCloseFlag = + inRequest.mRequestType == kRequestTypeWriteAppendClose; + const bool theFlushFlag = + ! theCloseFlag && + inRequest.mRequestType != kRequestTypeWriteAppendAsync && + (inRequest.mRequestType != kRequestTypeWriteAppendThrottle || + (inRequest.mMaxPendingOrEndPos >= 0 && + inRequest.mMaxPendingOrEndPos < + mPending + max(0, inRequest.mSize))); + if (theFlushFlag) { + mLastSyncReqPtr = &inRequest; + } + if (inRequest.mSize <= 0) { + inRequest.mMaxPendingOrEndPos = mCurPos; + if (theCloseFlag) { + if (mCloseReqPtr || ! 
mWAppender.IsOpen()) { + if (&inRequest != mCloseReqPtr) { + Impl::Done(inRequest, kErrProtocol); + } + } else { + mCloseReqPtr = &inRequest; + const int theStatus = mWAppender.Close(); + if (theStatus != 0 && &inRequest == mCloseReqPtr) { + mCloseReqPtr = 0; + Impl::Done(inRequest, theStatus); + } + } + } + if (theFlushFlag) { + if (mPending > 0) { + WorkQueue::PushBack(mWorkQueue, inRequest); + } else { + mLastSyncReqPtr = 0; + // SetWriteThreshold() should be effectively a + // no op, it is here to get status. + Impl::Done(inRequest, + mWAppender.SetWriteThreshold(mWriteThreshold)); + } + } else if (inRequest.mRequestType == + kRequestTypeWriteAppendThrottle) { + const int theStatus = mWAppender.GetErrorCode(); + Impl::Done(inRequest, + theStatus == 0 ? mPending : theStatus); + } + } else { + IOBuffer theBuf; + const bool theAsyncThrottle = ! theFlushFlag && + inRequest.mRequestType == + kRequestTypeWriteAppendThrottle; + if (theAsyncThrottle) { + theBuf.CopyIn( + reinterpret_cast(inRequest.mBufferPtr), + inRequest.mSize + ); + } else { + theBuf.Append(IOBufferData( + reinterpret_cast(inRequest.mBufferPtr), + 0, + inRequest.mSize, + mOwner.mDoNotDeallocate.Get(inRequest.mSize) + )); + WorkQueue::PushBack(mWorkQueue, inRequest); + } + mCurPos += inRequest.mSize; + inRequest.mMaxPendingOrEndPos = mCurPos; + const int theStatus = + mWAppender.Append(theBuf, inRequest.mSize); + if (theStatus <= 0) { + if (! theAsyncThrottle) { + WorkQueue::Remove(mWorkQueue, inRequest); + } + Impl::Done(inRequest, theStatus); + } else { + QCRTASSERT(theStatus == inRequest.mSize); + mPending = mWAppender.GetPendingSize(); + if (theAsyncThrottle) { + // Tell the caller the pending size. + Impl::Done(inRequest, mPending); + } + } + if (theCloseFlag) { + mWAppender.Close(); + } + } + if (theFlushFlag && mLastSyncReqPtr && mWriteThreshold > 0) { + mWAppender.SetWriteThreshold(0); + } + } + if (! 
mWAppender.IsActive()) { + const int thePending = mPending; + mPending = 0; + Done(thePending, kErrShutdown); + } + if (theShutdownFlag) { + Impl::Done(inRequest, kErrNone); + } + } + private: + WriteAppender mWAppender; + int mWriteThreshold; + int64_t mPending; + int64_t mCurPos; + int64_t mDonePos; + Request* mLastSyncReqPtr; + Request* mCloseReqPtr; + Request* mWorkQueue[1]; + + void Done( + int inDone, + int64_t inStatus) + { + QCRTASSERT(inDone >= 0 && mDonePos + inDone <= mCurPos); + mDonePos += inDone; + const bool theHadSynRequestFlag = mLastSyncReqPtr != 0; + Request* theReqPtr; + while ((theReqPtr = WorkQueue::Front(mWorkQueue)) && + theReqPtr->mMaxPendingOrEndPos <= mDonePos) { + Request& theReq = *theReqPtr; + QCRTASSERT(theReq.mMaxPendingOrEndPos >= 0); + if (&theReq == mLastSyncReqPtr) { + mLastSyncReqPtr = 0; + } + WorkQueue::PopFront(mWorkQueue); + Impl::Done(theReq, inStatus); + } + if (mCloseReqPtr && + WorkQueue::IsEmpty(mWorkQueue) && ! mWAppender.IsActive()) { + Request& theReq = *mCloseReqPtr; + mCloseReqPtr = 0; + ScheduleDelete(); + Impl::Done(theReq, inStatus); + return; + } + if (theHadSynRequestFlag && mWriteThreshold > 0 && + ! 
mLastSyncReqPtr) { + mWAppender.SetWriteThreshold(mWriteThreshold); + } + } + Appender( + const Appender& inAppender); + Appender& operator=( + const Appender& inAppender); + }; + friend class Appender; + + class FileWriter : public Worker, public Writer::Completion + { + enum { kNoBufferCompaction = -1 }; + public: + typedef Writer::Offset Offset; + + FileWriter( + Owner& inOwner, + Workers::iterator inWorkersIt, + const char* inLogPrefixPtr) + : Worker(inOwner, inWorkersIt), + Writer::Completion(), + mWriter( + inOwner.mMetaServer, + this, + inOwner.mMaxRetryCount, + inOwner.mRandomWriteThreshold, + kNoBufferCompaction, + inOwner.mTimeSecBetweenRetries, + inOwner.mOpTimeoutSec, + inOwner.mIdleTimeoutSec, + inOwner.mMaxWriteSize, + inLogPrefixPtr, + inOwner.mChunkServerInitialSeqNum + ), + mCurRequestPtr(0) + { WorkQueue::Init(mWorkQueue); } + virtual ~FileWriter() + { + mWriter.Shutdown(); + while (! WorkQueue::IsEmpty(mWorkQueue)) { + Impl::Done( + *WorkQueue::PopFront(mWorkQueue), + kErrShutdown + ); + } + mWriter.Unregister(this); + } + virtual int Open( + FileId inFileId, + const Request::Params& inParams) + { + return mWriter.Open( + inFileId, + inParams.mPathName.c_str(), + inParams.mFileSize, + inParams.mStriperType, + inParams.mStripeSize, + inParams.mStripeCount, + inParams.mRecoveryStripeCount, + inParams.mReplicaCount + ); + } + virtual void Process( + Request& inRequest) + { + if (! 
mWriter.IsOpen()) { + Impl::Done(inRequest, kErrProtocol); + return; + } + const int theErrorCode = mWriter.GetErrorCode(); + if (theErrorCode != 0) { + Impl::Done(inRequest, theErrorCode); + return; + } + switch (inRequest.mRequestType) { + case kRequestTypeWriteAsync: + if (inRequest.mSize <= 0) { + Impl::Done(inRequest, 0); + return; + } + case kRequestTypeWriteThrottle: + if (inRequest.mSize <= 0 && + (inRequest.mMaxPendingOrEndPos < 0 || + mWriter.GetPendingSize() <= + inRequest.mMaxPendingOrEndPos)) { + const int theStatus = mWriter.GetErrorCode(); + Impl::Done(inRequest, theStatus == 0 ? + mWriter.GetPendingSize() : int64_t(theStatus)); + return; + } + case kRequestTypeWriteClose: + case kRequestTypeWrite: + QCRTASSERT(inRequest.mBufferPtr || inRequest.mSize <= 0); + if (! WorkQueue::IsEmpty(mWorkQueue)) { + // For now allow only one flush, block all writers + // (threads) until in flight flush done. + WorkQueue::PushBack(mWorkQueue, inRequest); + return; + } + if (inRequest.mSize > 0) { + Write(inRequest); + } else { + const bool theCloseFlag = + inRequest.mRequestType == kRequestTypeWriteClose; + if (! theCloseFlag && mWriter.GetPendingSize() <= 0) { + Impl::Done(inRequest, mWriter.GetErrorCode()); + return; + } + mCurRequestPtr = &inRequest; + WorkQueue::PushBack(mWorkQueue, inRequest); + const int theRet = theCloseFlag ? + mWriter.Close() : mWriter.Flush(); + if (mCurRequestPtr == &inRequest) { + mCurRequestPtr = 0; + if (theRet < 0) { + WorkQueue::Remove(mWorkQueue, inRequest); + Impl::Done(inRequest, theRet); + return; + } + } + } + return; + case kRequestTypeWriteShutdown: + QCASSERT(inRequest.mSize <= 0); + mWriter.Shutdown(); + Impl::Done(inRequest, kErrNone); + return; + case kRequestTypeWriteSetWriteThreshold: + Impl::Done( + inRequest, + mWriter.SetWriteThreshold(inRequest.mSize) + ); + return; + default: + QCRTASSERT(! 
"unexpected request"); + return; + } + } + virtual void Done( + Writer& inWriter, + int inStatusCode, + Offset /* inOffset */, + Offset /* inSize */) + { + // See comment in Writer.h about inOffset and inSize for striped + // files. + QCRTASSERT(&inWriter == &mWriter); + if (IsDeleteScheduled()) { + return; + } + Request* theWorkQueue[1]; + theWorkQueue[0] = mWorkQueue[0]; + WorkQueue::Init(mWorkQueue); + mCurRequestPtr = 0; + Request* theReqPtr; + if (inStatusCode != 0) { + const int theStatus = + inStatusCode <= 0 ? inStatusCode : -inStatusCode; + // Error: fail all pending requests. + mWriter.Stop(); + while ((theReqPtr = WorkQueue::PopFront(theWorkQueue))) { + Impl::Done(*theReqPtr, theStatus); + } + } + if (! mWriter.IsOpen() && ! mWriter.IsActive()) { + ScheduleDelete(); + while ((theReqPtr = WorkQueue::PopFront(theWorkQueue))) { + const int theStatus = mWriter.GetErrorCode(); + Impl::Done( + *theReqPtr, + theReqPtr->mRequestType == kRequestTypeWriteClose ? + theStatus : kErrProtocol + ); + } + return; + } + if (! (theReqPtr = WorkQueue::Front(theWorkQueue))) { + return; + } + // Check in flight request completion. + Request& theReq = *theReqPtr; + const int64_t thePendingSize = mWriter.GetPendingSize(); + if ((theReq.mRequestType == kRequestTypeWriteThrottle ? + (theReq.mMaxPendingOrEndPos >= 0 && + thePendingSize > theReq.mMaxPendingOrEndPos) : + (thePendingSize > 0)) || + theReq.mRequestType == kRequestTypeWriteClose) { + mWorkQueue[0] = theWorkQueue[0]; + return; // Wait for completion. + } + WorkQueue::PopFront(theWorkQueue); + Impl::Done( + theReq, + theReq.mRequestType == kRequestTypeWriteThrottle ? + thePendingSize : mWriter.GetErrorCode() + ); + // Process the remaining requests, if any. 
+ while ((theReqPtr = WorkQueue::PopFront(theWorkQueue))) { + Process(*theReqPtr); + } + } + private: + Writer mWriter; + const Request* mCurRequestPtr; + Request* mWorkQueue[1]; + + void Write( + Request& inRequest) + { + QCASSERT(inRequest.mSize > 0 && inRequest.mBufferPtr); + + IOBuffer theBuf; + const bool theThrottleFlag = + inRequest.mRequestType == kRequestTypeWriteThrottle; + char* const theBufferPtr = + reinterpret_cast(inRequest.mBufferPtr); + if (inRequest.mRequestType == kRequestTypeWriteAsync || + inRequest.mRequestType == kRequestTypeWriteAsyncNoCopy || + (theThrottleFlag && ( + inRequest.mMaxPendingOrEndPos < 0 || ( + inRequest.mMaxPendingOrEndPos > 0 && + inRequest.mMaxPendingOrEndPos + + max(0, mOwner.mRandomWriteThreshold) > + max(inRequest.mSize, 0) + + mWriter.GetPendingSize())))) { + if (theThrottleFlag) { + theBuf.CopyIn(theBufferPtr, inRequest.mSize); + } else { + theBuf.Append(IOBufferData( + IOBufferData::IOBufferBlockPtr( + theBufferPtr, + RequestDeallocator(inRequest) + ), + inRequest.mSize, + 0, + inRequest.mSize + )); + } + // The request might get deleted by write, save the params. + const int theSize = inRequest.mSize; + const bool kFlushFlag = false; + const int theRet = mWriter.Write( + theBuf, + theSize, + inRequest.mOffset, + kFlushFlag, + (int)inRequest.mMaxPendingOrEndPos + ); + QCASSERT(theRet < 0 || theRet == theSize); + if (theThrottleFlag) { + Impl::Done(inRequest, theRet >= 0 ? + mWriter.GetPendingSize() : int64_t(theRet)); + } + return; + } + if (theThrottleFlag && inRequest.mMaxPendingOrEndPos > 0) { + theBuf.CopyIn(theBufferPtr, inRequest.mSize); + } else { + theBuf.Append(IOBufferData( + theBufferPtr, + 0, + inRequest.mSize, + mOwner.mDoNotDeallocate.Get(inRequest.mSize) + )); + inRequest.mMaxPendingOrEndPos = 0; + } + WorkQueue::PushBack(mWorkQueue, inRequest); + mCurRequestPtr = &inRequest; + const bool theCloseFlag = + inRequest.mRequestType == kRequestTypeWriteClose; + const bool theFlushFlag = ! 
theThrottleFlag; + const int theRet = mWriter.Write( + theBuf, + inRequest.mSize, + inRequest.mOffset, + theFlushFlag, + theThrottleFlag ? (int)inRequest.mMaxPendingOrEndPos : -1 + ); + QCASSERT(theRet < 0 || theRet == inRequest.mSize); + if (theRet < 0) { + if (mCurRequestPtr == &inRequest) { + mCurRequestPtr = 0; + WorkQueue::Remove(mWorkQueue, inRequest); + Impl::Done(inRequest, theRet); + } + return; + } + mCurRequestPtr = 0; + if (theCloseFlag) { + mWriter.Close(); + } + } + private: + FileWriter( + const FileWriter& inWriter); + FileWriter& operator=( + const FileWriter& inWriter); + }; + friend class FileWriter; + + class FileReader : public Worker, public Reader::Completion + { + public: + typedef Reader::Offset Offset; + typedef Reader::RequestId RequestId; + + FileReader( + Owner& inOwner, + Workers::iterator inWorkersIt, + const char* inLogPrefixPtr) + : Worker(inOwner, inWorkersIt), + Reader::Completion(), + mReader( + inOwner.mMetaServer, + this, + inOwner.mMaxRetryCount, + inOwner.mTimeSecBetweenRetries, + inOwner.mOpTimeoutSec, + inOwner.mIdleTimeoutSec, + inOwner.mMaxReadSize, + inOwner.mReadLeaseRetryTimeout, + inOwner.mLeaseWaitTimeout, + inLogPrefixPtr, + inOwner.mChunkServerInitialSeqNum), + mCurRequestPtr(0), + mAsyncReadStatus(0), + mAsyncReadDoneCount(0) + { WorkQueue::Init(mWorkQueue); } + virtual ~FileReader() + { + mReader.Shutdown(); + while (! 
WorkQueue::IsEmpty(mWorkQueue)) { + Impl::Done( + *WorkQueue::PopFront(mWorkQueue), + kErrShutdown + ); + } + mReader.Unregister(this); + } + virtual int Open( + FileId inFileId, + const Request::Params& inParams) + { + const bool kUseDefaultBufferAllocatorFlag = false; + const Offset kRecoverChunkPos = -1; + return mReader.Open( + inFileId, + inParams.mPathName.c_str(), + inParams.mFileSize, + inParams.mStriperType, + inParams.mStripeSize, + inParams.mStripeCount, + inParams.mRecoveryStripeCount, + inParams.mSkipHolesFlag, + kUseDefaultBufferAllocatorFlag, + kRecoverChunkPos, + inParams.mFailShortReadsFlag + ); + } + virtual void Process( + Request& inRequest) + { + if (! mReader.IsOpen()) { + Impl::Done(inRequest, kErrProtocol); + return; + } + const int theErrorCode = mReader.GetErrorCode(); + if (theErrorCode != 0) { + Impl::Done(inRequest, theErrorCode); + return; + } + int theStatus = 0; + switch (inRequest.mRequestType) { + case kRequestTypeReadAsync: + case kRequestTypeRead: + if (inRequest.mSize <= 0 && WorkQueue::IsEmpty(mWorkQueue)) { + Done(inRequest, 0); + return; + } + case kRequestTypeReadClose: + QCRTASSERT(inRequest.mBufferPtr || inRequest.mSize <= 0); + WorkQueue::PushBack(mWorkQueue, inRequest); + inRequest.mMaxPendingOrEndPos = -1; + inRequest.mStatus = 0; + mCurRequestPtr = &inRequest; + if (inRequest.mSize > 0) { + IOBuffer theBuf; + theBuf.Append(IOBufferData( + reinterpret_cast(inRequest.mBufferPtr), + 0, + 0, + mOwner.mDoNotDeallocate.Get(inRequest.mSize) + )); + inRequest.mMaxPendingOrEndPos = inRequest.mSize; + RequestId theReqId = RequestId(); + theReqId.mPtr = &inRequest; + theStatus = mReader.Read( + theBuf, + inRequest.mSize, + inRequest.mOffset, + theReqId + ); + } + if (inRequest.mRequestType == kRequestTypeReadClose && + theStatus == 0 && mCurRequestPtr == &inRequest) { + theStatus = mReader.Close(); + } + if (mCurRequestPtr == &inRequest) { + mCurRequestPtr = 0; + if (theStatus != 0) { + WorkQueue::Remove(mWorkQueue, inRequest); + 
Done(inRequest, theStatus); + } + } + return; + case kRequestTypeReadShutdown: + QCASSERT(inRequest.mSize <= 0); + mReader.Shutdown(); + Done(inRequest, kErrNone); + return; + default: + QCRTASSERT(! "unexpected request"); + return; + } + } + virtual void Done( + Reader& inReader, + int inStatusCode, + Offset inOffset, + Offset inSize, + IOBuffer* inBufferPtr, + RequestId inRequestId) + { + QCRTASSERT(&inReader == &mReader); + if (IsDeleteScheduled()) { + return; + } + mCurRequestPtr = 0; + Request* theReqPtr = static_cast(inRequestId.mPtr); + if (inStatusCode != 0 && mReader.GetErrorCode() != 0) { + const int theStatus = mReader.GetErrorCode(); + // Fatal error: fail all pending requests. + mReader.Stop(); + while ((theReqPtr = WorkQueue::PopFront(mWorkQueue))) { + Done(*theReqPtr, theStatus); + } + } + if (theReqPtr && ! WorkQueue::IsEmpty(mWorkQueue)) { + Request& theReq = *theReqPtr; + if (theReq.mMaxPendingOrEndPos > 0) { + QCRTASSERT( + theReq.mMaxPendingOrEndPos >= inSize && + (inSize <= 0 || + (theReq.mOffset <= inOffset && + inOffset + inSize <= theReq.mOffset + theReq.mSize)) + ); + theReq.mMaxPendingOrEndPos -= inSize; + } + if (inStatusCode != 0) { + theReq.mStatus = + inStatusCode <= 0 ? inStatusCode : -inStatusCode; + } else if (inBufferPtr && theReq.mStatus >= 0) { + theReq.mStatus += inBufferPtr->BytesConsumable(); + QCRTASSERT(theReq.mStatus <= theReq.mSize); + } + if (theReq.mMaxPendingOrEndPos <= 0) { + WorkQueue::Remove(mWorkQueue, theReq); + Done(theReq, theReq.mStatus); + } + } + // Process completion wait requests, if any. + while ((theReqPtr = WorkQueue::Front(mWorkQueue)) && + theReqPtr->mSize <= 0 && + (theReqPtr->mRequestType == kRequestTypeRead || + theReqPtr->mRequestType == kRequestTypeReadAsync)) { + WorkQueue::Remove(mWorkQueue, *theReqPtr); + Done(*theReqPtr, 0); + } + if (! mReader.IsOpen() && ! 
mReader.IsActive()) { + ScheduleDelete(); + const int theStatus = mReader.GetErrorCode(); + while ((theReqPtr = WorkQueue::PopFront(mWorkQueue))) { + Done( + *theReqPtr, + theReqPtr->mRequestType == kRequestTypeReadClose ? + theStatus : kErrProtocol + ); + } + } + } + private: + Reader mReader; + Request* mCurRequestPtr; + int mAsyncReadStatus; + int mAsyncReadDoneCount; + Request* mWorkQueue[1]; + + void Done( + Request& inReq, + int inStatus) + { + int theStatus = inStatus; + if (inReq.mRequestType == kRequestTypeReadAsync && + inReq.mSize > 0) { + if (inStatus >= 0) { + mAsyncReadStatus += inStatus; + } else { + mAsyncReadStatus = inStatus; + } + mAsyncReadDoneCount++; + } else if (inReq.mSize <= 0 && ( + inReq.mRequestType == kRequestTypeRead || + inReq.mRequestType == kRequestTypeReadAsync)) { + theStatus = mAsyncReadStatus; + mAsyncReadStatus = 0; + mAsyncReadDoneCount = 0; + } + Impl::Done(inReq, theStatus); + } + + private: + FileReader( + const FileReader& inReader); + FileReader& operator=( + const FileReader& inReader); + }; + friend class FileReader; + + NetManager mNetManager; + MetaServer mMetaServer; + Workers mWorkers; + int mMaxRetryCount; + int mTimeSecBetweenRetries; + const int mWriteAppendThreshold; + const int mDefaultSpaceReservationSize; + const int mPreferredAppendSize; + int mOpTimeoutSec; + const int mIdleTimeoutSec; + const char* const mLogPrefixPtr; + const bool mPreAllocateFlag; + const int mMaxWriteSize; + const int mRandomWriteThreshold; + const int mMaxReadSize; + const int mReadLeaseRetryTimeout; + const int mLeaseWaitTimeout; + int64_t mChunkServerInitialSeqNum; + DoNotDeallocate mDoNotDeallocate; + StopRequest mStopRequest; + QCThread mWorker; + QCMutex mMutex; + Request* mWorkQueue[1]; + SyncRequest* mFreeSyncRequests[1]; + Worker* mCleanupList[1]; + + static void Done( + Request& inRequest, + int inStatus) + { + if (inRequest.mState == Request::kStateDone) { + return; + } + QCRTASSERT(inRequest.mState == 
Request::kStateInFlight); + inRequest.mState = Request::kStateDone; + inRequest.mStatus = inStatus; + inRequest.Done(inStatus); + } + static void Done( + Request& inRequest) + { Done(inRequest, inRequest.mStatus); } + bool NewWorker( + Request& inRequest, + Workers::iterator& inWorkersIt) + { + if (! inRequest.mParamsPtr) { + return false; + } + string theName = inRequest.mParamsPtr->mPathName; + size_t thePos = theName.find_last_of('/'); + if (thePos != string::npos && ++thePos < theName.length()) { + theName = theName.substr(thePos); + } + ostringstream theStream; + theStream << + mLogPrefixPtr << + " " << inRequest.mParamsPtr->mMsgLogId << + "," << inRequest.mFileId << + "," << inRequest.mFileInstance << + "," << theName + ; + const string theLogPrefix = theStream.str(); + Worker* const theRetPtr = IsAppend(inRequest) ? + static_cast(new Appender( + *this, inWorkersIt, theLogPrefix.c_str())) : + (IsWrite(inRequest) ? + static_cast(new FileWriter( + *this, inWorkersIt, theLogPrefix.c_str())) : + (IsRead(inRequest) ? + static_cast(new FileReader( + *this, inWorkersIt, theLogPrefix.c_str())) : + 0 + )); + QCRTASSERT(theRetPtr); + const int theStatus = theRetPtr->Open( + inRequest.mFileId, *inRequest.mParamsPtr); + if (theStatus != kErrNone) { + mWorkers.erase(inWorkersIt); + delete theRetPtr; + Done(inRequest, theStatus); + return false; + } + inWorkersIt->second = theRetPtr; + mChunkServerInitialSeqNum += 100000; + return true; + } + SyncRequest& GetSyncRequest( + RequestType inRequestType, + FileInstance inFileInstance, + FileId inFileId, + const Request::Params* inParamsPtr, + void* inBufferPtr, + int inSize, + int inMaxPending, + int64_t inOffset) + { + QCStMutexLocker lock(mMutex); + SyncRequest* theReqPtr = FreeSyncRequests::PopFront(mFreeSyncRequests); + return (theReqPtr ? 
theReqPtr->Reset( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + ) : *(new SyncRequest( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + ))); + } + void PutSyncRequest( + SyncRequest& inRequest) + { + QCStMutexLocker lock(mMutex); + FreeSyncRequests::PushFront(mFreeSyncRequests, inRequest); + } + static int64_t GetInitalSeqNum( + int64_t inSeed = 0) + { + boost::mt19937 theRandom(boost::mt19937::result_type( + microseconds() + inSeed + )); + const int64_t theRet( + (int64_t)theRandom() | ((int64_t)theRandom() << 32)); + return ((theRet < 0 ? -theRet : theRet) >> 1); + } +private: + Impl( + const Impl& inImpl); + Impl& operator=( + const Impl& inImpl); +}; + +KfsProtocolWorker::Request::Request( + KfsProtocolWorker::RequestType inRequestType /* = kRequestTypeUnknown */, + KfsProtocolWorker::FileInstance inFileInstance /* = 0 */, + KfsProtocolWorker::FileId inFileId /* = -1 */, + const KfsProtocolWorker::Request::Params* inParamsPtr /* = 0 */, + void* inBufferPtr /* = 0 */, + int inSize /* = 0 */, + int inMaxPending /* = -1 */, + int64_t inOffset /* = -1 */) + : mRequestType(inRequestType), + mFileInstance(inFileInstance), + mFileId(inFileId), + mParamsPtr(inParamsPtr), + mBufferPtr(inBufferPtr), + mSize(inSize), + mState(KfsProtocolWorker::Request::kStateNone), + mStatus(0), + mMaxPendingOrEndPos(inMaxPending), + mOffset(inOffset) +{ + KfsProtocolWorker::Impl::WorkQueue::Init(*this); +} + +void +KfsProtocolWorker::Request::Reset( + KfsProtocolWorker::RequestType inRequestType /* = kRequestTypeUnknown */, + KfsProtocolWorker::FileInstance inFileInstance /* = 0 */, + KfsProtocolWorker::FileId inFileId /* = -1 */, + const KfsProtocolWorker::Request::Params* inParamsPtr /* = 0 */, + void* inBufferPtr /* = 0 */, + int inSize /* = 0 */, + int inMaxPending /* = -1 */, + int64_t inOffset /* = -1 */) +{ + mRequestType = inRequestType; + 
mFileInstance = inFileInstance; + mFileId = inFileId; + mParamsPtr = inParamsPtr; + mBufferPtr = inBufferPtr; + mSize = inSize; + mMaxPendingOrEndPos = inMaxPending; + mState = KfsProtocolWorker::Request::kStateNone; + mStatus = 0; + mOffset = inOffset; +} + +/* virtual */ +KfsProtocolWorker::Request::~Request() +{ + QCRTASSERT(mState != kStateInFlight); +} + +KfsProtocolWorker::KfsProtocolWorker( + string inMetaHost, + int inMetaPort, + const KfsProtocolWorker::Parameters* inParametersPtr /* = 0 */) + : mImpl(*(new Impl( + inMetaHost, + inMetaPort, + inParametersPtr ? *inParametersPtr : KfsProtocolWorker::Parameters() + ))) +{ +} + +KfsProtocolWorker::~KfsProtocolWorker() +{ + delete &mImpl; +} + +void +KfsProtocolWorker::Start() +{ + mImpl.Start(); +} + +void +KfsProtocolWorker::Stop() +{ + mImpl.Stop(); +} + +int64_t +KfsProtocolWorker::Execute( + KfsProtocolWorker::RequestType inRequestType, + KfsProtocolWorker::FileInstance inFileInstance, + KfsProtocolWorker::FileId inFileId, + const KfsProtocolWorker::Request::Params* inParamsPtr, + void* inBufferPtr, + int inSize, + int inMaxPending, + int64_t inOffset) +{ + return mImpl.Execute( + inRequestType, + inFileInstance, + inFileId, + inParamsPtr, + inBufferPtr, + inSize, + inMaxPending, + inOffset + ); +} + +void +KfsProtocolWorker::Enqueue( + Request& inRequest) +{ + mImpl.Enqueue(inRequest); +} + +void +KfsProtocolWorker::SetMetaMaxRetryCount( + int inMaxRetryCount) +{ + mImpl.SetMetaMaxRetryCount(inMaxRetryCount); +} + +void +KfsProtocolWorker::SetMetaTimeSecBetweenRetries( + int inSecs) +{ + mImpl.SetMetaTimeSecBetweenRetries(inSecs); +} + +void +KfsProtocolWorker::SetMaxRetryCount( + int inMaxRetryCount) +{ + mImpl.SetMaxRetryCount(inMaxRetryCount); +} + +void +KfsProtocolWorker::SetTimeSecBetweenRetries( + int inSecs) +{ + mImpl.SetTimeSecBetweenRetries(inSecs); +} + +void +KfsProtocolWorker::SetMetaOpTimeoutSec( + int inSecs) +{ + mImpl.SetMetaOpTimeoutSec(inSecs); +} + +void 
+KfsProtocolWorker::SetOpTimeoutSec( + int inSecs) +{ + mImpl.SetOpTimeoutSec(inSecs); +} + +}} /* namespace client KFS */ diff --git a/src/cc/libclient/KfsProtocolWorker.h b/src/cc/libclient/KfsProtocolWorker.h new file mode 100644 index 000000000..5aab55e7b --- /dev/null +++ b/src/cc/libclient/KfsProtocolWorker.h @@ -0,0 +1,290 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/10/10 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef FKS_PROTOCOL_WORKER_H +#define FKS_PROTOCOL_WORKER_H + +#include "common/kfstypes.h" +#include "kfsio/checksum.h" +#include "qcdio/QCDLList.h" + +#include +#include + +namespace KFS +{ +namespace client +{ +using std::string; + +// KFS client side protocol worker thread runs client side network io state +// machines. 
+class KfsProtocolWorker +{ +private: + class Impl; +public: + enum + { + kErrNone = 0, + kErrParameters = -EINVAL, + kErrProtocol = -EBADF, + kErrShutdown = -91010 + }; + enum RequestType + { + kRequestTypeUnknown = 0, + kRequestTypeWriteAppend = 1, + kRequestTypeWriteAppendClose = 2, + kRequestTypeWriteAppendShutdown = 3, + kRequestTypeWriteAppendSetWriteThreshold = 4, + kRequestTypeWriteAppendAsync = 5, + kRequestTypeWriteAppendAsyncNoCopy = 6, + kRequestTypeWriteAppendThrottle = 7, + kRequestTypeWrite = 20, + kRequestTypeWriteClose = 21, + kRequestTypeWriteAsync = 22, + kRequestTypeWriteAsyncNoCopy = 23, + kRequestTypeWriteThrottle = 24, + kRequestTypeWriteShutdown = 25, + kRequestTypeWriteSetWriteThreshold = 26, + kRequestTypeRead = 27, + kRequestTypeReadAsync = 28, + kRequestTypeReadClose = 29, + kRequestTypeReadShutdown = 30 + }; + typedef kfsFileId_t FileId; + typedef unsigned int FileInstance; + class Request + { + public: + struct Params + { + Params( + string inPathName = string(), + int64_t inFileSize = -1, + int inStriperType = -1, + int inStripeSize = 0, + int inStripeCount = 0, + int inRecoveryStripeCount = 0, + int inReplicaCount = 0, + bool inSkipHolesFlag = false, + int inMsgLogId = -1, + bool inFailShortReadsFlag = false) + : mPathName(inPathName), + mFileSize(inFileSize), + mStriperType(inStriperType), + mStripeSize(inStripeSize), + mStripeCount(inStripeCount), + mRecoveryStripeCount(inRecoveryStripeCount), + mReplicaCount(inReplicaCount), + mSkipHolesFlag(inSkipHolesFlag), + mMsgLogId(inMsgLogId), + mFailShortReadsFlag(inFailShortReadsFlag) + {} + + string mPathName; + int64_t mFileSize; + int mStriperType; + int mStripeSize; + int mStripeCount; + int mRecoveryStripeCount; + int mReplicaCount; + bool mSkipHolesFlag; + int mMsgLogId; + bool mFailShortReadsFlag; + }; + Request( + RequestType inOpType = kRequestTypeUnknown, + FileInstance inFileInstance = 0, + FileId inFileId = -1, + const Params* inParamsPtr = 0, + void* inBufferPtr = 0, + int 
inSize = 0, + int inMaxPending = -1, + int64_t inOffset = -1); + void Reset( + RequestType inOpType = kRequestTypeUnknown, + FileInstance inFileInstance = 0, + FileId inFileId = -1, + const Params* inParamsPtr = 0, + void* inBufferPtr = 0, + int inSize = 0, + int inMaxPending = -1, + int64_t inOffset = -1); + virtual void Done( + int64_t inStatus) = 0; + int64_t GetOffset() const + { return mOffset; } + int GetSize() const + { return mSize; } + void* GetBufferPtr() const + { return mBufferPtr; } + protected: + virtual ~Request(); + private: + enum State + { + kStateNone = 0, + kStateInFlight = 1, + kStateDone = 2, + kStateDeleted = 3 + }; + RequestType mRequestType; + FileInstance mFileInstance; + FileId mFileId; + const Params* mParamsPtr; + void* mBufferPtr; + int mSize; + State mState; + int64_t mStatus; + int64_t mMaxPendingOrEndPos; + int64_t mOffset; + private: + Request* mPrevPtr[1]; + Request* mNextPtr[1]; + friend class QCDLListOp; + friend class Impl; + private: + Request( + const Request& inReq); + Request& operator=( + const Request& inReq); + }; + class Parameters + { + public: + Parameters( + int inMetaMaxRetryCount = 6, + int inMetaTimeSecBetweenRetries = 10, + int inMetaOpTimeoutSec = 3 * 60, + int inMetaIdleTimeoutSec = 5 * 60, + int64_t inMetaInitialSeqNum = 0, + const char* inMetaLogPrefixPtr = 0, + int inMaxRetryCount = 10, + int inWriteAppendThreshold = KFS::CHECKSUM_BLOCKSIZE, + int inTimeSecBetweenRetries = 15, + int inDefaultSpaceReservationSize = 1 << 20, + int inPreferredAppendSize = KFS::CHECKSUM_BLOCKSIZE, + int inOpTimeoutSec = 120, + int inIdleTimeoutSec = 5 * 30, + const char* inLogPrefixPtr = 0, + int64_t inChunkServerInitialSeqNum = 0, + bool inPreAllocateFlag = false, + int inMaxWriteSize = 1 << 20, + int inRandomWriteThreshold = 1 << 20, + int inMaxReadSize = 1 << 20, + int inReadLeaseRetryTimeout = 3, + int inLeaseWaitTimeout = 900, + int inMaxMetaServerContentLength = 1 << 20) + : mMetaMaxRetryCount(inMetaMaxRetryCount), + 
mMetaTimeSecBetweenRetries(inMetaTimeSecBetweenRetries), + mMetaOpTimeoutSec(inMetaOpTimeoutSec), + mMetaIdleTimeoutSec(inMetaIdleTimeoutSec), + mMetaInitialSeqNum(inMetaInitialSeqNum), + mMetaLogPrefixPtr(inMetaLogPrefixPtr), + mMaxRetryCount(inMaxRetryCount), + mWriteAppendThreshold(inWriteAppendThreshold), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + mDefaultSpaceReservationSize(inDefaultSpaceReservationSize), + mPreferredAppendSize(inPreferredAppendSize), + mOpTimeoutSec(inOpTimeoutSec), + mIdleTimeoutSec(inIdleTimeoutSec), + mLogPrefixPtr(inLogPrefixPtr), + mChunkServerInitialSeqNum(inChunkServerInitialSeqNum), + mPreAllocateFlag(inPreAllocateFlag), + mMaxWriteSize(inMaxWriteSize), + mRandomWriteThreshold(inRandomWriteThreshold), + mMaxReadSize(inMaxReadSize), + mReadLeaseRetryTimeout(inReadLeaseRetryTimeout), + mLeaseWaitTimeout(inLeaseWaitTimeout), + mMaxMetaServerContentLength(inMaxMetaServerContentLength) + {} + int mMetaMaxRetryCount; + int mMetaTimeSecBetweenRetries; + int mMetaOpTimeoutSec; + int mMetaIdleTimeoutSec; + int64_t mMetaInitialSeqNum; + const char* mMetaLogPrefixPtr; + int mMaxRetryCount; + int mWriteAppendThreshold; + int mTimeSecBetweenRetries; + int mDefaultSpaceReservationSize; + int mPreferredAppendSize; + int mOpTimeoutSec; + int mIdleTimeoutSec; + const char* mLogPrefixPtr; + int64_t mChunkServerInitialSeqNum; + bool mPreAllocateFlag; + int mMaxWriteSize; + int mRandomWriteThreshold; + int mMaxReadSize; + int mReadLeaseRetryTimeout; + int mLeaseWaitTimeout; + int mMaxMetaServerContentLength; + }; + KfsProtocolWorker( + std::string inMetaHost, + int inMetaPort, + const Parameters* inParametersPtr = 0); + ~KfsProtocolWorker(); + int64_t Execute( + RequestType inRequestType, + FileInstance inFileInstance, + FileId inFileId, + const Request::Params* inParamsPtr = 0, + void* inBufferPtr = 0, + int inSize = 0, + int inMaxPending = -1, + int64_t inOffset = -1); + void Enqueue( + Request& inRequest); + void Start(); + void Stop(); + 
void SetMetaMaxRetryCount( + int inMaxRetryCount); + void SetMetaTimeSecBetweenRetries( + int inSecs); + // The following two might not have effect on already opened files. + void SetMaxRetryCount( + int inMaxRetryCount); + void SetTimeSecBetweenRetries( + int inSecs); + void SetMetaOpTimeoutSec( + int inSecs); + void SetOpTimeoutSec( + int inSecs); +private: + Impl& mImpl; +private: + KfsProtocolWorker( + const KfsProtocolWorker& inWorker); + KfsProtocolWorker& operator=( + const KfsProtocolWorker& inWorker); +}; + +}} + +#endif /* FKS_PROTOCOL_WORKER_H */ diff --git a/src/cc/libclient/KfsRead.cc b/src/cc/libclient/KfsRead.cc new file mode 100644 index 000000000..a4ed641e0 --- /dev/null +++ b/src/cc/libclient/KfsRead.cc @@ -0,0 +1,829 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/09/08 +// Author: Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Kfs client blocking read and read ahead implementation. 
+//---------------------------------------------------------------------------- + +#include "KfsClientInt.h" +#include "KfsProtocolWorker.h" +#include "common/MsgLogger.h" +#include "qcdio/qcstutils.h" +#include "qcdio/QCDLList.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include +#include + +namespace KFS +{ +namespace client +{ + +using std::string; +using std::max; +using std::min; +using std::numeric_limits; + +// Blocking read conditional variables with free/unused list "next" pointer. +class ReadRequestCondVar : public QCCondVar +{ +public: + ReadRequestCondVar() + : QCCondVar(), + mNextPtr(0) + {} + ReadRequestCondVar* mNextPtr; +}; + +// Non-blocking read request with blocking completion "wait" used to implement +// blocking read. +class ReadRequest : public KfsProtocolWorker::Request +{ +public: + static int64_t GetEof( + const FileTableEntry& inEntry) + { + return (inEntry.eofMark < 0 ? + inEntry.fattr.fileSize : + min(inEntry.eofMark, inEntry.fattr.fileSize) + ); + } + static int MaxRequestSize( + const FileTableEntry& inEntry, + int inSize, + int64_t inOffset) + { + const int64_t kChunkSize = (int64_t)CHUNKSIZE; + return (int)max(int64_t(0), min( + GetEof(inEntry) - inOffset, + inEntry.skipHoles ? 
+ min(int64_t(inSize), kChunkSize - inOffset % kChunkSize) : + int64_t(inSize) + )); + } + static ReadRequest* Create( + QCMutex& inMutex, + FileTableEntry& inEntry, + void* inBufPtr, + int inSize, + int64_t inOffset, + int inMsgLogId) + { + QCASSERT(inMutex.IsOwned()); + const int theSize = MaxRequestSize(inEntry, inSize, inOffset); + if (theSize <= 0) { + return 0; + } + ReadRequest& theReq = *(new ReadRequest(inMutex)); + if (theReq.Init( + inEntry, inBufPtr, theSize, inOffset, inMsgLogId) <= 0) { + delete &theReq; + return 0; + } + return &theReq; + } + virtual void Done( + int64_t inStatus) + { + // Using the same global lock for both client (mainly meta server + // blocking requests now) and completion isn't ideal. Though the meta + // server sync. call will presently block all clients anyway. + // + // Probably the best way to fix that is to make meta server requests + // non blocking by moving the corresponding logic into protocol worker, + // and change meta server to process requests out of order for the same + // client connection, i.e. allow more than one suspended request per + // client connection. + QCStMutexLocker theLocker(mMutex); + QCASSERT(! mDoneFlag && (mCondVarPtr || mWaitingCount == 0)); + mDoneFlag = true; + mStatus = inStatus; + if (mCondVarPtr) { + mCondVarPtr->Notify(); + } + if (mCanceledFlag && mWaitingCount == 0) { + theLocker.Unlock(); + delete this; + } + } + int64_t Wait( + ReadRequestCondVar*& ioFreeCondVarsHeadPtr, + FileTableEntry& inEntry) + { + QCASSERT(mMutex.IsOwned() && mWaitingCount >= 0); + if (++mWaitingCount <= 1 && ! mDoneFlag) { + QCRTASSERT(! mCondVarPtr); + if (ioFreeCondVarsHeadPtr) { + mCondVarPtr = ioFreeCondVarsHeadPtr; + ioFreeCondVarsHeadPtr = mCondVarPtr->mNextPtr; + mCondVarPtr->mNextPtr = 0; + } else { + mCondVarPtr = new ReadRequestCondVar(); + } + } + while (! mDoneFlag) { + QCASSERT(mCondVarPtr); + mCondVarPtr->Wait(mMutex); + } + const int64_t theStatus = mCanceledFlag ? 
-ECANCELED : mStatus; + // Do not access inEntry if request was canceled. inEntry might not be + // valid in the case if one thread waits, while the other closes + // the fd. + if (! mCanceledFlag && inEntry.buffer.mReadReq == this) { + inEntry.buffer.mStatus = + (theStatus == -ENOENT && inEntry.skipHoles) ? + 0 : (int)theStatus; + } + QCASSERT(mWaitingCount > 0); + if (--mWaitingCount > 0) { + QCASSERT(mCondVarPtr); + mCondVarPtr->Notify(); + } else { + if (mCondVarPtr) { + mCondVarPtr->mNextPtr = ioFreeCondVarsHeadPtr; + ioFreeCondVarsHeadPtr = mCondVarPtr; + mCondVarPtr = 0; + } + if (! mCanceledFlag) { + Queue::Remove(inEntry.mReadQueue, *this); + if (inEntry.buffer.mReadReq == this) { + inEntry.buffer.mReadReq = 0; + } + } + delete this; + } + return theStatus; + } + void Cancel( + FileTableEntry& inEntry) + { + QCASSERT(mMutex.IsOwned()); + Queue::Remove(inEntry.mReadQueue, *this); + if (inEntry.buffer.mReadReq == this) { + if (! mDoneFlag) { + mBufToDeletePtr = inEntry.buffer.DetachBuffer(); + } + // Even if request is done, mark it as canceled, to make any + // threads waiting in GetReadAhead() unwind immediately + // without accessing the file table entry. + // The file table entry might be invalid when wait returns in the + // case when one thread calls close while the other threads + // are blocked in read. 
+ inEntry.buffer.mStatus = -ECANCELED; + inEntry.buffer.mReadReq = 0; + } + mCanceledFlag = true; + if (mWaitingCount == 0 && mDoneFlag) { + delete this; + } + } + static int64_t Wait( + ReadRequestCondVar*& ioFreeCondVarsHeadPtr, + FileTableEntry& inEntry, + int64_t inOffset, + int inSize) + { + Queue::Iterator theIt(inEntry.mReadQueue); + const int64_t theEndPos = inOffset + inSize; + ReadRequest* thePtr; + while ((thePtr = theIt.Next())) { + const int64_t theReqStart = thePtr->GetOffset(); + const int64_t theReqEnd = theReqStart + thePtr->GetSize(); + if (theReqStart < theEndPos && inOffset < theReqEnd) { + return thePtr->Wait(ioFreeCondVarsHeadPtr, inEntry); + } + } + return 0; + } + static ReadRequest* Find( + FileTableEntry& inEntry, + void* inBufPtr, + int64_t inSize, + int64_t inOffset) + { + QCASSERT(inSize >= 0); + const char* const theLPtr = reinterpret_cast(inBufPtr); + const char* const theRPtr = theLPtr + inSize; + Queue::Iterator theIt(inEntry.mReadQueue); + ReadRequest* thePtr; + while ((thePtr = theIt.Next())) { + const char* const theBPtr = + reinterpret_cast(thePtr->GetBufferPtr()); + const char* const theEPtr = theBPtr + thePtr->GetSize(); + if (theLPtr < theEPtr && theBPtr <= theRPtr) { + return thePtr; + } + } + return 0; + } + static void CancelAll( + FileTableEntry& inEntry) + { + while (! Queue::IsEmpty(inEntry.mReadQueue)) { + Queue::Front(inEntry.mReadQueue)->Cancel(inEntry); + } + QCRTASSERT(! inEntry.buffer.mReadReq); + } + static void InitEntry( + FileTableEntry& inEntry) + { + Queue::Init(inEntry.mReadQueue); + } + static int GetReadAhead( + ReadRequestCondVar*& ioFreeCondVarsHeadPtr, + FileTableEntry& inEntry, + void* inBufPtr, + int inSize, + int64_t inOffset, + bool& outShortReadFlag) + { + outShortReadFlag = false; + if (inOffset < inEntry.buffer.mStart || + inEntry.buffer.mStatus < 0 || + inEntry.buffer.mSize <= 0 || + inEntry.buffer.mStart < 0 || + ! 
inEntry.buffer.mBuf || + inSize <= 0 || + inEntry.buffer.mStart + inEntry.buffer.mSize <= inOffset) { + return 0; + } + if (inEntry.buffer.mReadReq) { + const int64_t theRet = inEntry.buffer.mReadReq->Wait( + ioFreeCondVarsHeadPtr, inEntry); + // The last thread leaving wait sets inEntry.buffer.mReadReq to 0, + // this guarantees that read ahead buffer and result remains valid, + // and corresponds to the read ahead request that was waited for. + // All other fields of inEntry, can change. + if (theRet <= 0) { + // inEntry might be invalid -- it might not exist anymore. + return (int)theRet; + } + } + return CopyReadAhead( + inEntry, inBufPtr, inSize, inOffset, outShortReadFlag); + } + static int GetReadAheadSize( + FileTableEntry& inEntry, + int64_t inOffset) + { + const int theBufSize = inEntry.buffer.GetBufSize(); + if (theBufSize <= 0) { + return 0; + } + int theSize = MaxRequestSize(inEntry, theBufSize, inOffset); + if (theSize <= 0) { + return 0; + } + char* const thePtr = inEntry.buffer.GetBufPtr(); + if (! thePtr) { + return 0; + } + // SetReadAheadSize() sets "optimal" buffer size, try to align the + // end of the request to this size, to make all subsequent requests + // aligned. 
+ if (inOffset + theSize < GetEof(inEntry)) { + const int theTail = (int)((inOffset + theSize) % theBufSize); + if (theSize > theTail) { + theSize -= theTail; + } + } + return theSize; + } + static ReadRequest* InitReadAhead( + QCMutex& inMutex, + FileTableEntry& inEntry, + int inMsgLogId, + chunkOff_t inPos) + { + QCASSERT(inMutex.IsOwned()); + if (inEntry.buffer.mReadReq || + inPos >= GetEof(inEntry) || + (inEntry.buffer.mSize > 0 && + inEntry.buffer.mStatus > 0 && + inPos < inEntry.buffer.mStart + inEntry.buffer.mStatus)) { + return 0; + } + inEntry.buffer.mStatus = 0; + inEntry.buffer.mSize = 0; + inEntry.buffer.mStart = -1; + const int64_t theOffset = inPos; + int theSize = GetReadAheadSize(inEntry, theOffset); + if (theSize <= 0) { + return 0; + } + char* const thePtr = inEntry.buffer.GetBufPtr(); + QCASSERT(thePtr); + ReadRequest& theReq = *(new ReadRequest(inMutex)); + if (theReq.Init(inEntry, thePtr, theSize, theOffset, inMsgLogId) <= 0) { + delete &theReq; + return 0; + } + inEntry.buffer.mStart = theReq.GetOffset(); + inEntry.buffer.mSize = theReq.GetSize(); + inEntry.buffer.mReadReq = &theReq; + return &theReq; + } +private: + typedef QCDLList Queue; + + Params mOpenParams; + QCMutex& mMutex; + ReadRequestCondVar* mCondVarPtr; + int mWaitingCount; + bool mDoneFlag:1; + bool mCanceledFlag:1; + char* mBufToDeletePtr; + int64_t mStatus; + ReadRequest* mPrevPtr[1]; + ReadRequest* mNextPtr[1]; + + friend class QCDLListOp; + + ReadRequest( + QCMutex& inMutex) + : Request(), + mOpenParams(), + mMutex(inMutex), + mCondVarPtr(0), + mWaitingCount(0), + mDoneFlag(false), + mCanceledFlag(false), + mBufToDeletePtr(0), + mStatus(0) + { Queue::Init(*this); } + virtual ~ReadRequest() + { + QCASSERT(mWaitingCount == 0 && mCondVarPtr == 0); + delete [] mBufToDeletePtr; + } + int Init( + FileTableEntry& inEntry, + void* inBufPtr, + int inSize, + int64_t inOffset, + int inMsgLogId) + { + QCASSERT(! mCondVarPtr && mMutex.IsOwned()); + if (inOffset < 0 || ! 
inBufPtr) { + return -EINVAL; + } + if (inSize <= 0) { + return 0; + } + Reset( + KfsProtocolWorker::kRequestTypeReadAsync, + inEntry.instance + 1, + inEntry.fattr.fileId, + &mOpenParams, + inBufPtr, + inSize, + 0, // inMaxPending, + inOffset + ); + if (GetSize() <= 0) { + return 0; + } + mOpenParams.mPathName = inEntry.pathname; + mOpenParams.mFileSize = inEntry.fattr.fileSize; + mOpenParams.mStriperType = inEntry.fattr.striperType; + mOpenParams.mStripeSize = inEntry.fattr.stripeSize; + mOpenParams.mStripeCount = inEntry.fattr.numStripes; + mOpenParams.mRecoveryStripeCount = inEntry.fattr.numRecoveryStripes; + mOpenParams.mReplicaCount = inEntry.fattr.numReplicas; + mOpenParams.mSkipHolesFlag = inEntry.skipHoles; + mOpenParams.mFailShortReadsFlag = inEntry.failShortReadsFlag; + mOpenParams.mMsgLogId = inMsgLogId; + mWaitingCount = 0; + mDoneFlag = false; + mCanceledFlag = false; + mStatus = 0; + Queue::PushBack(inEntry.mReadQueue, *this); + return GetSize(); + } + static bool IsReadAheadInFlight( + FileTableEntry& inEntry) + { + return (inEntry.buffer.mReadReq && + ! inEntry.buffer.mReadReq->mDoneFlag); + } + static int CopyReadAhead( + FileTableEntry& inEntry, + void* inBufPtr, + int inSize, + int64_t inOffset, + bool outShortReadFlag) + { + QCASSERT(! 
IsReadAheadInFlight(inEntry)); + outShortReadFlag = + inEntry.buffer.mStatus >= 0 && + inEntry.buffer.mStatus < inEntry.buffer.mSize && + inEntry.buffer.mStart + inEntry.buffer.mStatus < inOffset + inSize; + const int64_t thePos = inOffset - inEntry.buffer.mStart; + QCASSERT(thePos >= 0); + const int theLen = (int)min( + int64_t(inSize), inEntry.buffer.mStatus - thePos); + if (theLen <= 0) { + return 0; + } + memcpy(inBufPtr, inEntry.buffer.mBuf + (size_t)thePos, (size_t)theLen); + return theLen; + } +private: + ReadRequest( + const ReadRequest& inReq); + ReadRequest& operator=( + const ReadRequest& inReq); +}; + +void +KfsClientImpl::InitPendingRead( + FileTableEntry& inEntry) +{ + QCASSERT(mMutex.IsOwned()); + ReadRequest::InitEntry(inEntry); +} + +void +KfsClientImpl::CancelPendingRead( + FileTableEntry& inEntry) +{ + QCASSERT(mMutex.IsOwned()); + ReadRequest::CancelAll(inEntry); +} + +void +KfsClientImpl::CleanupPendingRead() +{ + while (mFreeCondVarsHead) { + ReadRequestCondVar* const thePtr = mFreeCondVarsHead; + mFreeCondVarsHead = thePtr->mNextPtr; + delete thePtr; + } +} + +int +KfsClientImpl::ReadPrefetch( + int inFd, + char* inBufPtr, + size_t inSize) +{ + if (! inBufPtr) { + return -EINVAL; + } + + QCStMutexLocker theLocker(mMutex); + + if (! 
valid_fd(inFd)) { + KFS_LOG_STREAM_ERROR << + "read prefetch error invalid fd: " << inFd << + KFS_LOG_EOM; + return -EBADF; + } + FileTableEntry& theEntry = *mFileTable[inFd]; + if (theEntry.openMode == O_WRONLY || + theEntry.currPos.fileOffset < 0 || + theEntry.cachedAttrFlag) { + return -EINVAL; + } + if (theEntry.fattr.isDirectory) { + return -EISDIR; + } + if (inSize <= 0) { + return 0; + } + const int64_t theOffset = theEntry.currPos.fileOffset; + if (theOffset >= ReadRequest::GetEof(theEntry) || + ReadRequest::Find(theEntry, inBufPtr, (int64_t)inSize, theOffset)) { + return 0; + } + StartProtocolWorker(); + ReadRequest* const theReqPtr = ReadRequest::Create( + mMutex, + theEntry, + inBufPtr, + (int)min(inSize, (size_t)numeric_limits::max()), + theOffset, + inFd + ); + if (! theReqPtr) { + return 0; + } + theEntry.readUsedProtocolWorkerFlag = true; + const int theRet = theReqPtr->GetSize(); + theLocker.Unlock(); + QCASSERT(! mMutex.IsOwned()); + + mProtocolWorker->Enqueue(*theReqPtr); + return theRet; +} + +ssize_t +KfsClientImpl::Read( + int inFd, + char* inBufPtr, + size_t inSize, + chunkOff_t* inPosPtr /* = 0 */) +{ + QCStMutexLocker theLocker(mMutex); + + if (! valid_fd(inFd)) { + KFS_LOG_STREAM_ERROR << + "read error invalid fd: " << inFd << + KFS_LOG_EOM; + return -EBADF; + } + FileTableEntry& theEntry = *mFileTable[inFd]; + if (theEntry.openMode == O_WRONLY || theEntry.cachedAttrFlag) { + return -EINVAL; + } + if (theEntry.fattr.isDirectory) { + return ReadDirectory(inFd, inBufPtr, inSize); + } + + chunkOff_t& theFilePos = inPosPtr ? 
*inPosPtr : theEntry.currPos.fileOffset; + int64_t theFdPos = theFilePos; + if (theFdPos < 0) { + return -EINVAL; + } + + const KfsProtocolWorker::FileId theFileId = theEntry.fattr.fileId; + const KfsProtocolWorker::FileInstance theInstance = theEntry.instance + 1; + + const int64_t kChunkSize = (int64_t)CHUNKSIZE; + const int64_t theEof = ReadRequest::GetEof(theEntry); + int theRet = 0; + int64_t thePos = theFdPos; + int64_t theLen = min(theEof - thePos, (int64_t)inSize); + const int theSize = (int)theLen; + const bool theSkipHolesFlag = theEntry.skipHoles; + // Wait for prefetch with this buffer, if any. + ReadRequest* const theReqPtr = ReadRequest::Find( + theEntry, inBufPtr, (int64_t)inSize, thePos); + if (theReqPtr) { + void* const theBufPtr = theReqPtr->GetBufferPtr(); + const int64_t theReqPos = theReqPtr->GetOffset(); + const int theReqSize = theReqPtr->GetSize(); + int64_t theRes = theReqPtr->Wait(mFreeCondVarsHead, theEntry); + if (theSkipHolesFlag && theRes == -ENOENT) { + theRes = 0; + } + if (theRes < 0) { + return (ssize_t)theRes; + } + // For now discard pre-fetch if pre-fetch position is larger than the + // current. + if (theReqPos <= thePos && theSize == theLen) { + const int64_t theNRd = + min(int64_t(theSize) - theRet, theRes - (thePos - theReqPos)); + if (theNRd > 0) { + if (inBufPtr + theRet != theBufPtr || thePos != theReqPos) { + memmove(inBufPtr + theRet, theBufPtr, (size_t)theNRd); + } + thePos += theNRd; + theRet += theNRd; + if (theSkipHolesFlag && + theRes < theReqSize && + theReqPos + theRes <= thePos) { + // Move to the next chunk if read was short. + thePos += kChunkSize - thePos % kChunkSize; + } + } + } + // Request wait releases mutex, ensure that the fd wasn't closed by + // other thread. + if (! 
valid_fd(inFd) || mFileTable[inFd] != &theEntry || + theEntry.instance + 1 != theInstance) { + return theRet; + } + if (theFilePos == theFdPos) { + theFilePos = thePos; + theFdPos = thePos; + } + } + // Do the check here to allow to de-queue prefetch regardless of the buffer + // size. + if (theSize != theLen) { + return -EOVERFLOW; + } + // Do not return if nothing more to read -- start the read ahead. + StartProtocolWorker(); + theEntry.readUsedProtocolWorkerFlag = true; + + bool theShortReadFlag = false; + const int theRes = ReadRequest::GetReadAhead( + mFreeCondVarsHead, + theEntry, + inBufPtr + theRet, + theSize - theRet, + thePos, + theShortReadFlag + ); + if (theRes < 0) { + return theRes; + } + thePos += theRes; + theRet += theRes; + if (theSkipHolesFlag && theShortReadFlag) { + // Move to the next chunk if read was short. + thePos += kChunkSize - thePos % kChunkSize; + } + // Wait in GetReadAhead() releases the mutex, ensure that + // the file position remains the same before updating it, or using it. + // Wait returns an error if theEntry (theRet < 0) becomes invalid as result + // of Close() call for example. + // Use read ahead to get the remainder of the request if the remainder is + // small enough. + if (theSize <= theRet || + (theFdPos == theFilePos && + (theSize - theRet) <= + ReadRequest::GetReadAheadSize(theEntry, thePos) / 2)) { + if (theFdPos == theFilePos) { + theFilePos = thePos; + theFdPos = thePos; + } + ReadRequest* const theReqPtr = + ReadRequest::InitReadAhead(mMutex, theEntry, inFd, theFilePos); + if (theReqPtr) { + // Theoretically Enqueue can immediately invoke Request::Done(), + // this should not be a problem as mMutex is recursive. 
+ mProtocolWorker->Enqueue(*theReqPtr); + if (theSize <= theRet) { + return theRet; + } + } + const int theRes = ReadRequest::GetReadAhead( + mFreeCondVarsHead, + theEntry, + inBufPtr + theRet, + theSize - theRet, + thePos, + theShortReadFlag + ); + if (theRes < 0) { + return theRes; + } + thePos += theRes; + theRet += theRes; + if (theSkipHolesFlag && theShortReadFlag) { + // Move to the next chunk if read was short. + thePos += kChunkSize - thePos % kChunkSize; + } + if (theFdPos == theFilePos) { + theFilePos = thePos; + theFdPos = thePos; + } + if (theSize <= theRet) { + return theRet; + } + } + KfsProtocolWorker::Request::Params theOpenParams; + theOpenParams.mPathName = theEntry.pathname; + theOpenParams.mFileSize = theEntry.fattr.fileSize; + theOpenParams.mStriperType = theEntry.fattr.striperType; + theOpenParams.mStripeSize = theEntry.fattr.stripeSize; + theOpenParams.mStripeCount = theEntry.fattr.numStripes; + theOpenParams.mRecoveryStripeCount = theEntry.fattr.numRecoveryStripes; + theOpenParams.mReplicaCount = theEntry.fattr.numReplicas; + theOpenParams.mSkipHolesFlag = theSkipHolesFlag; + theOpenParams.mFailShortReadsFlag = theEntry.failShortReadsFlag; + theOpenParams.mMsgLogId = inFd; + + theLocker.Unlock(); + QCASSERT(! mMutex.IsOwned()); + + int64_t theChunkEnd = theSkipHolesFlag ? + min(theEof, (thePos - thePos % kChunkSize + kChunkSize)) : theEof; + for (; ;) { + int theStatus = (int)mProtocolWorker->Execute( + KfsProtocolWorker::kRequestTypeRead, + theInstance, + theFileId, + &theOpenParams, + inBufPtr + theRet, + (int)min(int64_t(theSize) - theRet, theChunkEnd - thePos), + 0, + thePos + ); + if (theSkipHolesFlag && theStatus == -ENOENT) { + theStatus = 0; + } + if (theStatus < 0) { + theRet = theStatus; + break; + } + theRet += theStatus; + thePos += theStatus; + if (theSize <= theRet) { + break; + } + if (! 
theSkipHolesFlag || thePos >= theEof) { + break; + } + thePos = theChunkEnd; + if (thePos >= theEof) { + thePos = theEof; + break; + } + theChunkEnd = min(theEof, theChunkEnd + kChunkSize); + } + ReadRequest* theReadAheadReqPtr = 0; + if (theRet > 0) { + QCStMutexLocker theLocker(mMutex); + if (! valid_fd(inFd) || mFileTable[inFd] != &theEntry) { + return theRet; + } + if (theEntry.instance + 1 == theInstance && theFilePos == theFdPos) { + QCASSERT(mProtocolWorker); + theFilePos = thePos; + theReadAheadReqPtr = + ReadRequest::InitReadAhead(mMutex, theEntry, inFd, theFilePos); + } + } + if (theReadAheadReqPtr) { + mProtocolWorker->Enqueue(*theReadAheadReqPtr); + } + return theRet; +} + +ssize_t +KfsClientImpl::SetReadAheadSize( + int inFd, + size_t inSize) +{ + QCStMutexLocker theLocker(mMutex); + if (! valid_fd(inFd)) { + KFS_LOG_STREAM_ERROR << + "read error invalid inFd: " << inFd << + KFS_LOG_EOM; + return -EBADF; + } + return SetReadAheadSize(*mFileTable[inFd], inSize); +} + +ssize_t +KfsClientImpl::SetReadAheadSize( + FileTableEntry& inEntry, + size_t inSize, + bool inOptimalFlag) +{ + QCASSERT(mMutex.IsOwned()); + + int theSize = (int)min((size_t)numeric_limits::max(), + (inSize + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE); + FileAttr& theAttr = inEntry.fattr; + if (theSize > 0 && + theAttr.striperType != KFS_STRIPED_FILE_TYPE_NONE && + theAttr.stripeSize > 0 && + theAttr.numStripes > 0 && + theAttr.stripeSize < theSize) { + const int theStride = theAttr.stripeSize * theAttr.numStripes; + theSize = (max( + inOptimalFlag ? (1 << 20) * theAttr.numStripes : 0, theSize) + + theStride - 1) / theStride * theStride; + } + inEntry.buffer.SetBufSize(theSize); + return inEntry.buffer.GetBufSize(); +} + +ssize_t +KfsClientImpl::GetReadAheadSize( + int inFd) const +{ + QCStMutexLocker theLocker(const_cast(this)->mMutex); + + if (! 
valid_fd(inFd)) { + KFS_LOG_STREAM_ERROR << + "read error invalid inFd: " << inFd << + KFS_LOG_EOM; + return -EBADF; + } + return mFileTable[inFd]->buffer.GetBufSize(); +} + +}} diff --git a/src/cc/libclient/KfsWrite.cc b/src/cc/libclient/KfsWrite.cc new file mode 100644 index 000000000..4dd04f0e2 --- /dev/null +++ b/src/cc/libclient/KfsWrite.cc @@ -0,0 +1,215 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/10/02 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// All the code to deal with writes from the client. 
+//---------------------------------------------------------------------------- + +#include "KfsClientInt.h" +#include "KfsProtocolWorker.h" +#include "common/MsgLogger.h" +#include "qcdio/qcstutils.h" + +#include +#include +#include +#include +#include + +namespace KFS { +namespace client { + +using std::string; + +int +KfsClientImpl::RecordAppend(int fd, const char *buf, int numBytes) +{ + const bool asyncFlag = false; + const bool appendOnlyFlag = false; + return Write(fd, buf, numBytes, asyncFlag, appendOnlyFlag); +} + +int +KfsClientImpl::AtomicRecordAppend(int fd, const char *buf, int numBytes) +{ + const bool asyncFlag = false; + const bool appendOnlyFlag = true; + return Write(fd, buf, numBytes, asyncFlag, appendOnlyFlag); +} + +ssize_t +KfsClientImpl::Write(int fd, const char *buf, size_t numBytes, chunkOff_t* pos) +{ + const bool asyncFlag = false; + const bool appendOnlyFlag = false; + return Write(fd, buf, numBytes, asyncFlag, appendOnlyFlag, pos); +} + +int +KfsClientImpl::WriteAsync(int fd, const char *buf, size_t numBytes) +{ + const bool asyncFlag = true; + const bool appendOnlyFlag = false; + return Write(fd, buf, numBytes, asyncFlag, appendOnlyFlag); +} + +int +KfsClientImpl::WriteAsyncCompletionHandler(int fd) +{ + return Sync(fd); +} + +ssize_t +KfsClientImpl::Write(int fd, const char *buf, size_t numBytes, + bool asyncFlag, bool appendOnlyFlag, chunkOff_t* pos /* = 0 */) +{ + QCStMutexLocker lock(mMutex); + + if (! valid_fd(fd)) { + KFS_LOG_STREAM_ERROR << + "write error invalid fd: " << fd << + KFS_LOG_EOM; + return -EBADF; + } + FileTableEntry& entry = *mFileTable[fd]; + if (entry.openMode == O_RDONLY) { + return -EINVAL; + } + if (entry.fattr.fileId <= 0) { + return -EBADF; + } + if (entry.fattr.isDirectory) { + return -EISDIR; + } + if (numBytes <= 0) { + return 0; + } + if (! buf) { + return -EINVAL; + } + + chunkOff_t& filePos = pos ? 
*pos : entry.currPos.fileOffset; + const int64_t offset = filePos; + const bool appendFlag = (entry.openMode & O_APPEND) != 0; + if ((offset < 0 || appendOnlyFlag) && ! appendFlag) { + return -EINVAL; + } + if (appendFlag) { + if (numBytes > (int)CHUNKSIZE) { + return -EFBIG; + } + } else { + if (offset < 0 || appendOnlyFlag) { + return -EINVAL; + } + if (filePos + (chunkOff_t)numBytes < 0) { + return -EFBIG; + } + filePos += numBytes; + } + StartProtocolWorker(); + KfsProtocolWorker::Request::Params openParams; + openParams.mPathName = entry.pathname; + openParams.mFileSize = entry.fattr.fileSize; + openParams.mStriperType = entry.fattr.striperType; + openParams.mStripeSize = entry.fattr.stripeSize; + openParams.mStripeCount = entry.fattr.numStripes; + openParams.mRecoveryStripeCount = entry.fattr.numRecoveryStripes; + openParams.mReplicaCount = entry.fattr.numReplicas; + openParams.mMsgLogId = fd; + entry.usedProtocolWorkerFlag = true; + entry.pending += numBytes; + const KfsProtocolWorker::FileId fileId = entry.fattr.fileId; + const KfsProtocolWorker::FileInstance fileInstance = entry.instance; + const string pathName = entry.pathname; + const int bufsz = entry.ioBufferSize; + const int prevPending = entry.pending; + const bool throttle = + ! asyncFlag && bufsz > 0 && bufsz <= entry.pending; + if ((throttle || bufsz <= 0) && ! asyncFlag) { + entry.pending = 0; + } + lock.Unlock(); + + KFS_LOG_STREAM_DEBUG << + fd << "," << fileId << "," << fileInstance << "," << pathName << + (appendFlag ? " append ->" : " write ->") << + " offset: " << offset << + " size: " << numBytes << + " throttle: " << throttle << + " pending: " << prevPending << + " bufsz: " << bufsz << + KFS_LOG_EOM; + + const int64_t status = mProtocolWorker->Execute( + asyncFlag ? + (appendFlag ? + KfsProtocolWorker::kRequestTypeWriteAppendAsyncNoCopy : + KfsProtocolWorker::kRequestTypeWriteAsyncNoCopy) : + (bufsz <= 0 ? + (appendFlag ? 
+ KfsProtocolWorker::kRequestTypeWriteAppend : + KfsProtocolWorker::kRequestTypeWrite) : + (throttle ? + (appendFlag ? + KfsProtocolWorker::kRequestTypeWriteAppendThrottle : + KfsProtocolWorker::kRequestTypeWriteThrottle) : + (appendFlag ? + KfsProtocolWorker::kRequestTypeWriteAppendAsync : + KfsProtocolWorker::kRequestTypeWriteAsync) + )), + fileInstance, + fileId, + &openParams, + const_cast(buf), + numBytes, + (throttle || (! appendFlag && bufsz >= 0)) ? bufsz : -1, + offset + ); + if (status < 0) { + return (ssize_t)status; + } + if (throttle && status > 0) { + QCStMutexLocker lock(mMutex); + // File can be closed by other thread, fd entry can be re-used. + // In this cases close / sync should have returned the corresponding + // status. + // Throttle returns current number of bytes pending. + if (valid_fd(fd) && &entry == mFileTable[fd] && + entry.instance == fileInstance) { + KFS_LOG_STREAM_DEBUG << + fd << "," << fileId << "," << fileInstance << "," << pathName << + (appendFlag ? " append <+" : " write <+") << + " offset: " << offset << + " size: " << numBytes << + " pending:" + " prev: " << prevPending << + " cur: " << entry.pending << + " add: " << status << + KFS_LOG_EOM; + entry.pending += status; + } + } + return numBytes; +} + +}} diff --git a/src/cc/libclient/Path.cc b/src/cc/libclient/Path.cc new file mode 100644 index 000000000..33da47a72 --- /dev/null +++ b/src/cc/libclient/Path.cc @@ -0,0 +1,132 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/07/20 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Generic i-node path implementation. +// +//---------------------------------------------------------------------------- + +#include "Path.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace KFS +{ +namespace client +{ +using std::string; +using std::istringstream; + +Path::Path( + const char* inPathPtr, + size_t inLength) + : mComponents(), + mNormComponents(), + mDirFlag(false) +{ + if (inLength > 0) { + Path::Set(inPathPtr, inLength); + } +} + +bool +Path::Set( + const char* inPathPtr, + size_t inLength) +{ + mComponents.clear(); + mNormComponents.clear(); + mComponents.reserve(16); + mNormComponents.reserve(16); + + const Token kRootDir("/", 1); + const Token kThisDir(".", 1); + const Token kParentDir("..", 2); + const char* const theEndPtr = inPathPtr + inLength; + for (const char* theStartPtr = inPathPtr; + theStartPtr < theEndPtr; + ++theStartPtr) { + const char* const theSlashPtr = + (const char*)memchr(theStartPtr, '/', theEndPtr - theStartPtr); + if (theSlashPtr == inPathPtr) { + const Token theRoot(theSlashPtr, 1); + mComponents.push_back(theRoot); + mNormComponents.push_back(theRoot); + continue; + } + if (theSlashPtr == theStartPtr) { + continue; + } + const char* const thePtr = theSlashPtr ? 
theSlashPtr : theEndPtr; + const Token theName(theStartPtr, thePtr - theStartPtr); + theStartPtr = thePtr; + if (theName == kParentDir) { + const size_t theSize = mNormComponents.size(); + if (theSize <= 0) { + mComponents.clear(); + break; // invalid path + } + if (theSize > 1 || mNormComponents[0] != kRootDir) { + mNormComponents.pop_back(); + if (theSize == 1) { + mComponents.clear(); + break; // invalid relative path + } + } + mComponents.push_back(theName); + } else if (theName != kThisDir) { + mComponents.push_back(theName); + mNormComponents.push_back(theName); + } + } + if (mComponents.empty()) { + mDirFlag = false; + return false; + } + mDirFlag = inLength > 0 && theEndPtr[-1] == '/'; + return true; +} + +string +Path::ToString(const Path::Components& inComponents) const +{ + string result; + for (iterator theIt = inComponents.begin(); + theIt != inComponents.end(); + theIt++) { + if (theIt > inComponents.begin() + 1 || + (theIt == inComponents.begin() + 1 && + (inComponents[0].mLen != 1 || inComponents[0].mPtr[0] != '/'))) { + result += "/"; + } + result.append(theIt->mPtr, theIt->mLen); + } + return result; +} + +}} diff --git a/src/cc/libclient/Path.h b/src/cc/libclient/Path.h new file mode 100644 index 000000000..5e931bd49 --- /dev/null +++ b/src/cc/libclient/Path.h @@ -0,0 +1,97 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/07/20 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief I-node path. Used for path resolution, and "normalization". +// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSCLIENT_PATH_H +#define LIBKFSCLIENT_PATH_H + +#include + +#include +#include + +namespace KFS { +namespace client { + +using std::string; +using std::vector; + +class Path +{ +public: + class Token + { + public: + Token( + const char* inPtr = 0, + size_t inLen = 0) + : mPtr(inPtr), + mLen(inLen) + {} + bool operator==( + const Token& inToken) const + { + return (inToken.mLen == mLen && + memcmp(inToken.mPtr, mPtr, mLen) == 0); + } + bool operator!=( + const Token& inToken) const + { return ! 
(*this == inToken); } + const char* mPtr; + size_t mLen; + }; + typedef vector Components; + typedef Components::const_iterator iterator; + + Path( + const char* inPathPtr = 0, + size_t inPathLength = 0); + bool Set( + const char* inPathPtr, + size_t inPathLength); + void Clear() { Set(0, 0); } + string ToString() const { return ToString(mComponents); } + string NormPath() const { return ToString(mNormComponents); } + iterator begin() const { return mComponents.begin(); } + iterator end() const { return mComponents.end(); } + size_t size() const { return mComponents.size(); } + bool empty() const { return mComponents.empty(); } + const Token& operator[](size_t i) const { return mComponents[i]; } + bool IsDir() const { return mDirFlag; } +private: + Components mComponents; + Components mNormComponents; + bool mDirFlag; + + string ToString(const Path::Components& comps) const; + +private: + Path(const Path& inPath); + Path& operator=(const Path& inPath); +}; + +}} + +#endif /* LIBKFSCLIENT_PATH_H */ diff --git a/src/cc/libclient/RSStriper.cc b/src/cc/libclient/RSStriper.cc new file mode 100644 index 000000000..252054033 --- /dev/null +++ b/src/cc/libclient/RSStriper.cc @@ -0,0 +1,3396 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/07/27 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "RSStriper.h" +#include "Writer.h" + +#include "qcrs/rs.h" +#include "kfsio/IOBuffer.h" +#include "kfsio/checksum.h" +#include "common/MsgLogger.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCDLList.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include +#include + +namespace KFS +{ +namespace client +{ +using std::min; +using std::max; +using std::string; +using std::ostringstream; + +// Stripe iterator / positioning logic used for both read and write striped +// files io. The main goal is to avoid division (and to lesser extend +// multiplication) if possible. +class RSStriper +{ +public: + typedef int64_t Offset; + + enum + { + kErrorNone = 0, + kErrorParameters = -EINVAL, + kErrorIO = -EIO, + kErrorCanceled = -EINTR, + kErrorNoEntry = -ENOENT, + kErrorInvalChunkSize = -EINVALCHUNKSIZE, + kErrorInvalidChunkSizes = -(10000 + EINVAL) + }; + + static bool Validate( + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + string& outErrMsg) + { + const int kChunkSize = (int)CHUNKSIZE; + if (inStripeSize < KFS_MIN_STRIPE_SIZE || + inStripeSize > KFS_MAX_STRIPE_SIZE || + inStripeCount <= 0 || + (inRecoveryStripeCount != 0 && + inStripeCount > RS_LIB_MAX_DATA_BLOCKS) || + inStripeSize % KFS_STRIPE_ALIGNMENT != 0 || + kChunkSize % inStripeSize != 0 || + (inRecoveryStripeCount != 0 && + inRecoveryStripeCount != kMaxRecoveryStripes) || + (inRecoveryStripeCount > 0 && inStripeSize % kAlign != 0)) { + ostringstream theErrStream; + theErrStream << "invalid parameters:" + " stripe count: " << inStripeCount << + " recovery stripe count: " << inRecoveryStripeCount << + " stripe size: " << inStripeSize + ; + outErrMsg = theErrStream.str(); + return false; + } + return true; + } + RSStriper( + int inStripeSize, + int inStripeCount, + 
int inRecoveryStripeCount, + string inLogPrefix) + : mLogPrefix(inLogPrefix), + mStripeSize(inStripeSize), + mStripeCount(inStripeCount), + mRecoveryStripeCount(inRecoveryStripeCount), + mStrideSize(inStripeSize * inStripeCount), + mChunkBlockSize((Offset)CHUNKSIZE * mStripeCount), + mChunkBlockTotalSize( + (Offset)CHUNKSIZE * (mStripeCount + mRecoveryStripeCount)), + mPos(0), + mChunkBlockPos(0), + mStripePos(0), + mStripeIdx(0), + mChunkBlockStartFilePos(0), + mFilePos(0), + mTempBufAllocPtr(0), + mTempBufPtr(0), + mBufPtr(inRecoveryStripeCount > 0 ? + new void*[inStripeCount + inRecoveryStripeCount] : 0) + { + QCRTASSERT( + mStripeCount > 0 && + (mRecoveryStripeCount == 0 || + mRecoveryStripeCount == kMaxRecoveryStripes) && + mStripeSize >= KFS_MIN_STRIPE_SIZE && + mStripeSize <= KFS_MAX_STRIPE_SIZE && + CHUNKSIZE % mStripeSize == 0 + ); + } + virtual ~RSStriper() + { + delete [] mTempBufAllocPtr; + delete [] mBufPtr; + } + void SetPos( + Offset inPos) + { + QCRTASSERT(inPos >= 0); + if (mPos == inPos) { + return; + } + mPos = inPos; + mChunkBlockPos = mPos % mChunkBlockSize; + mStripePos = mChunkBlockPos % mStripeSize; + mStripeIdx = (mChunkBlockPos / mStripeSize) % mStripeCount; + mChunkBlockStartFilePos = mPos / mChunkBlockSize * mChunkBlockTotalSize; + mFilePos = + mChunkBlockStartFilePos + + mChunkBlockPos / mStrideSize * mStripeSize + + mStripeIdx * (Offset)CHUNKSIZE + + mStripePos; + } + bool SeekStripe( + Offset inCount) + { + QCRTASSERT(inCount >= 0 && mStripePos + inCount <= mStripeSize); + mPos += inCount; + mChunkBlockPos += inCount; + mStripePos += inCount; + mFilePos += inCount; + if (mStripePos < mStripeSize) { + return false; + } + QCASSERT(mStripePos == mStripeSize); + mStripePos = 0; + mFilePos += (Offset)CHUNKSIZE - mStripeSize; + if (++mStripeIdx < mStripeCount) { + return false; + } + mStripeIdx = 0; + mFilePos -= mChunkBlockSize - mStripeSize; + if (mChunkBlockPos >= mChunkBlockSize) { + mChunkBlockPos = 0; + mFilePos += 
mChunkBlockTotalSize - (Offset)CHUNKSIZE; + mChunkBlockStartFilePos = mFilePos; + QCASSERT(mFilePos % mChunkBlockTotalSize == 0); + return true; // Next chunk block start. + } + return false; + } + bool Seek( + Offset inCount) + { + if (inCount >= 0 && mStripePos + inCount <= mStripeSize) { + return SeekStripe(inCount); + } + SetPos(mPos + inCount); + return true; + } + int GetStripeRemaining() const + { return (int)(mStripeSize - mStripePos); } + Offset GetPos() const + { return mPos; } + Offset GetChunkBlockPos() const + { return mChunkBlockPos; } + Offset GetStripePos() const + { return mStripePos; } + int GetStripeIdx() const + { return (int)mStripeIdx; } + Offset GetFilePos() const + { return mFilePos; } + Offset GetChunkBlockStartFilePos() const + { return mChunkBlockStartFilePos; } + int GetNextStripeIdx( + int inIdx) const + { + QCASSERT(inIdx >= 0 && inIdx < mStripeCount); + return ((inIdx + 1 >= mStripeCount) ? 0 : inIdx + 1); + } + Offset GetChunkSize( + int inStripeIdx, + Offset inBlockPos, + Offset inFileSize) + { + const Offset kChunkSize = (Offset)CHUNKSIZE; + if (inFileSize < 0 || inStripeIdx < 0 || inBlockPos < 0) { + return kChunkSize; + } + QCASSERT( + inStripeIdx < mStripeCount + mRecoveryStripeCount && + inBlockPos % mChunkBlockSize == 0 + ); + const Offset theSize = inFileSize - inBlockPos; + if (theSize <= 0 || theSize >= mChunkBlockSize) { + return kChunkSize; + } + const Offset theStrideCount = theSize / mStrideSize; + const Offset theStrideHead = theSize % mStrideSize; + const Offset theStripeIdx = theStrideHead / mStripeSize; + const int theIdx = + inStripeIdx < mStripeCount ? 
inStripeIdx : 0; + Offset theChunkSize = theStrideCount * mStripeSize; + if (theIdx < theStripeIdx) { + theChunkSize += mStripeSize; + } else if (theIdx == theStripeIdx) { + theChunkSize += theStrideHead % mStripeSize; + } + QCASSERT(theChunkSize <= kChunkSize); + return theChunkSize; + } + static int GetChunkPos( + Offset inPos) + { return (int)(inPos % (Offset)CHUNKSIZE); } + static IOBufferData NewDataBuffer( + int inSize) + { + char* const thePtr = new char [inSize + kAlign]; + const int theOffset = kAlign - (thePtr - (const char*)0) % kAlign; + return IOBufferData(thePtr, inSize + theOffset, theOffset, 0); + } + static void InternalError( + const char* inMsgPtr = 0) + { + if (inMsgPtr) { + KFS_LOG_STREAM_FATAL << inMsgPtr << KFS_LOG_EOM; + } + MsgLogger::Stop(); + abort(); + } + + enum { kAlign = 16 }; + enum { kMaxRecoveryStripes = RS_LIB_MAX_RECOVERY_BLOCKS }; + + const string mLogPrefix; + const int mStripeSize; + const int mStripeCount; + const int mRecoveryStripeCount; + const int mStrideSize; + const Offset mChunkBlockSize; + const Offset mChunkBlockTotalSize; + +private: + // Stripe iterator. + Offset mPos; + Offset mChunkBlockPos; + Offset mStripePos; + Offset mStripeIdx; + Offset mChunkBlockStartFilePos; + Offset mFilePos; // Meta server / linear file position. + char* mTempBufAllocPtr; + char* mTempBufPtr; + +protected: + void** const mBufPtr; + + enum { kTempBufSize = kAlign * 16 }; + + char* GetTempBufSelfPtr( + int inIndex, + int inBufsCount) + { + if (! mTempBufAllocPtr) { + const size_t theSize = kTempBufSize * inBufsCount; + mTempBufAllocPtr = new char [theSize + kAlign]; + mTempBufPtr = mTempBufAllocPtr + + (kAlign - (mTempBufAllocPtr - (const char*)0) % kAlign); + memset(mTempBufPtr, 0, theSize); + } + QCASSERT(0 <= inIndex && inIndex < inBufsCount); + return (mTempBufPtr + inIndex * kTempBufSize); + } +}; + +// Striped files with and without Reed-Solomon recovery writer implementation. 
+class RSWriteStriper : public Writer::Striper, private RSStriper +{ +public: + typedef RSStriper::Offset Offset; + + static Striper* Create( + StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + Writer::Striper::Offset inFileSize, + string inLogPrefix, + Impl& inOuter, + Writer::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) + { + if (inType != kStriperTypeRS) { + outErrMsg = "invalid striper type"; + return 0; + } + if (! Validate( + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + outErrMsg)) { + return 0; + } + outOpenChunkBlockSize = + Offset(CHUNKSIZE) * (inStripeCount + inRecoveryStripeCount); + return new RSWriteStriper( + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inFileSize, + inLogPrefix, + inOuter + ); + } + virtual ~RSWriteStriper() + { + delete [] mBuffersPtr; + } + virtual int Process( + IOBuffer& inBuffer, + Offset& ioOffset, + int inWriteThreshold) + { + const int theSize = inBuffer.BytesConsumable(); + if (ioOffset < 0 && theSize > 0) { + return kErrorParameters; + } + if (mRecoveryStripeCount > 0 && ioOffset != mOffset) { + if (theSize > 0 && + (mRecoveryEndPos < mOffset || + ioOffset % mStrideSize != 0)) { + NotSupported(ioOffset, theSize, + "non sequential unaligned write"); + return kErrorParameters; + } + Flush(0); + QCASSERT(mPendingCount == 0); + mOffset = ioOffset; + mRecoveryEndPos = mOffset; + } + while (! inBuffer.IsEmpty()) { + mOffset = Stripe(inBuffer, ioOffset); + ioOffset = mOffset; + if (mOffset > mFileSize) { + mFileSize = mOffset; + } + ComputeRecovery(); + if (inBuffer.IsEmpty()) { + break; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + " end chunk block: " << mOffset << + " rem: " << inBuffer.BytesConsumable() << + KFS_LOG_EOM; + // Flush at the end of chunk block. 
+ QCRTASSERT( + ioOffset % (CHUNKSIZE * mStripeCount) == 0 && + mOffset == mRecoveryEndPos + ); + for (int i = 0; i < mStripeCount + mRecoveryStripeCount; i++) { + Write(mBuffersPtr[i]); + } + } + if (mOffset - mRecoveryEndPos < max(1, inWriteThreshold)) { + Flush(inWriteThreshold); + return 0; + } + QCASSERT(mRecoveryStripeCount > 0); + if (mOffset < mFileSize) { + NotSupported(ioOffset, theSize, + "non sequential unaligned write/flush"); + return kErrorParameters; + } + // Zero padd to full stride, and compute recovery. + QCASSERT(mOffset - mRecoveryEndPos < mStrideSize); + const int thePaddSize = (int)(mStrideSize - mOffset % mStrideSize); + int theAlign = (int)(mOffset % kAlign); + const int theBufSize = + min(thePaddSize, ((4 << 10) + kAlign - 1) / kAlign * kAlign); + IOBufferData theBuf = NewDataBuffer(theBufSize); + theBuf.ZeroFill(theBufSize); + SetPos(mOffset); + const int theRecoveryPadd = + GetStripeIdx() > 0 ? 0 : GetStripeRemaining(); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + " pos: " << mOffset << + " padd: " << thePaddSize << + " rec padd: " << theRecoveryPadd << + KFS_LOG_EOM; + IOBuffer theIoBuf; + for (int theRem = thePaddSize; theRem > 0; ) { + theIoBuf.Append(theBuf); + if (theAlign > 0) { + theIoBuf.Consume(theAlign); + theAlign = 0; + } + theIoBuf.Trim(theRem); + theRem -= theIoBuf.BytesConsumable(); + mOffset = Stripe(theIoBuf, mOffset); + QCASSERT(theIoBuf.IsEmpty()); + } + int theFrontTrim = 0; + ComputeRecovery(&theFrontTrim); + // Undo the padding and write the buffers. 
+ QCRTASSERT(mOffset == mRecoveryEndPos && mOffset % mStrideSize == 0); + mOffset -= thePaddSize; + mPendingCount -= thePaddSize; + mRecoveryEndPos -= mStrideSize; + int theStrideHead = mStrideSize - thePaddSize; + QCASSERT(mRecoveryEndPos + theStrideHead == mOffset); + for (int i = 0; i < mStripeCount; i++) { + const int theHeadSize = min(theStrideHead, mStripeSize); + theStrideHead -= theHeadSize; + const int theBPaddSize = mStripeSize - theHeadSize; + Buffer& theBuf = mBuffersPtr[i]; + const int theLen = theBuf.mBuffer.BytesConsumable() - theBPaddSize; + QCASSERT(theLen >= theHeadSize); + theBuf.mBuffer.Trim(theLen); + if (theLen > 0) { + // Write the buffer, keep only the stride head piece, to handle + // possible next sequential write. + theBuf.mEndPos -= theBPaddSize; + Buffer theWrBuf; + theWrBuf.mWriteLen = theLen; + theWrBuf.mEndPos = theBuf.mEndPos; + theWrBuf.mBuffer.Move(&theBuf.mBuffer, theLen - theHeadSize); + if (theHeadSize > 0) { + // The stripe head is always aligned, make sure the new + // buffer is aligned too. + // Copy the data into the new buffer as write request + // nomally owns the buffer. 
+ IOBufferData theHead = NewDataBuffer(theHeadSize); + theHead.Fill(theBuf.mBuffer.CopyOut( + theHead.Producer(), theHeadSize)); + theWrBuf.mBuffer.Move(&theBuf.mBuffer, theHeadSize); + QCASSERT( + theBuf.mBuffer.IsEmpty() && + theHead.BytesConsumable() == theHeadSize + ); + theBuf.mBuffer.Clear(); + theBuf.mBuffer.Append(theHead); + } + TrimBufferFront(theWrBuf, theFrontTrim, mPendingCount); + Write(theWrBuf); + QCASSERT(theWrBuf.mBuffer.IsEmpty()); + } else { + QCASSERT(theBuf.mBuffer.IsEmpty()); + theBuf.mEndPos = 0; + } + theBuf.mWriteLen = 0; + } + for (int i = mStripeCount; + i < mStripeCount + mRecoveryStripeCount; + i++) { + Buffer& theBuf = mBuffersPtr[i]; + QCASSERT( + theBuf.mWriteLen >= theRecoveryPadd && + theBuf.mEndPos >= theRecoveryPadd + ); + theBuf.mWriteLen -= theRecoveryPadd; + theBuf.mEndPos -= theRecoveryPadd; + theBuf.mBuffer.Trim(theBuf.mWriteLen); + mPendingCount -= theRecoveryPadd; + Write(theBuf); + QCASSERT(theBuf.mBuffer.IsEmpty()); + } + QCASSERT(mPendingCount == 0); + mLastPartialFlushPos = mOffset; + return 0; + } + virtual Offset GetPendingSize() const + { return mPendingCount; } + virtual bool IsWriteRetryNeeded( + Offset inChunkOffset, + int inRetryCount, + int inMaxRetryCount, + int& ioStatus) + { + QCASSERT(inChunkOffset % (Offset)CHUNKSIZE == 0); + if (inMaxRetryCount < 1) { + // Do not use chunk invalidation with no retries to prevent + // data loss in the case of rewrite. If the app. doesn't + // need retries it can always delete or truncate the file + // in the case of write failure. + return true; + } + if (mRecoveryStripeCount <= 0) { + return true; + } + if (mWriteFailures.find(inChunkOffset) != mWriteFailures.end()) { + return false; + } + // Retry all but allocation failures, where the chunk being written into + // is not available (chunk server down or lost chunk). 
+ if (ioStatus != -EDATAUNAVAIL && inRetryCount < inMaxRetryCount) { + return true; + } + Offset theChunkBlockStart = 0; + Offset theChunkBlockEnd = 0; + int theFailedCount = GetWriteFailuresCount( + inChunkOffset, theChunkBlockStart, theChunkBlockEnd); + // Allow one failure, after 1 retry. + if (mRecoveryStripeCount <= theFailedCount || + (inRetryCount < inMaxRetryCount && inRetryCount > 0 && + (mRecoveryStripeCount > 1 ? 1 : 0) <= theFailedCount)) { + return true; + } + mWriteFailures.insert(inChunkOffset); + theFailedCount++; + QCASSERT(theFailedCount > 0); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "turn off write to chunk" + " offset: " << inChunkOffset << + " block: [" << theChunkBlockStart << + "," << theChunkBlockEnd << ")" + " failures: " << theFailedCount << + " status: " << ioStatus << + " retry: " << inRetryCount << + " of " << inMaxRetryCount << + KFS_LOG_EOM; + ioStatus = 0; + return false; + } + virtual Offset GetFileSize() const + { return mFileSize; } + +private: + struct Buffer + { + Buffer() + : mEndPos(0), + mBuffer(), + mCurIt(mBuffer.end()), + mCurPos(0), + mWriteLen(0) + {} + Offset mEndPos; + IOBuffer mBuffer; + IOBuffer::iterator mCurIt; + int mCurPos; + int mWriteLen; + }; + typedef std::set< + Offset, + std::less, + StdFastAllocator + > WriteFailures; + + Offset mFileSize; + Offset mPendingCount; + Offset mOffset; + Offset mRecoveryEndPos; + Offset mLastPartialFlushPos; + WriteFailures mWriteFailures; + Buffer* const mBuffersPtr; + + RSWriteStriper( + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + Offset inFileSize, + string inFilePrefix, + Impl& inOuter) + : Striper(inOuter), + RSStriper( + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inFilePrefix), + mFileSize(inFileSize), + mPendingCount(0), + mOffset(0), + mRecoveryEndPos(0), + mLastPartialFlushPos(0), + mWriteFailures(), + mBuffersPtr(new Buffer[inStripeCount + inRecoveryStripeCount]) + {} + bool IsChunkWriterFailed( + Offset inOffset) const + { + 
if (mWriteFailures.empty()) { + return false; + } + WriteFailures::const_iterator theIt = + mWriteFailures.upper_bound(inOffset); + if (theIt != mWriteFailures.begin()) { + --theIt; + } + return (*theIt <= inOffset && inOffset < *theIt + (Offset)CHUNKSIZE); + } + void NotSupported( + Offset inOffset, + int inSize, + const char* inMsgPtr) + { + KFS_LOG_STREAM_ERROR << mLogPrefix << + (inMsgPtr ? inMsgPtr : "io alignment") << + " is not supported:" << + " offset: " << inOffset << + " (" << inOffset % mStrideSize << ")" << + " size: " << inSize << + " (" << inSize % mStrideSize << ")" << + " current: " << mOffset << + " (" << mOffset % mStrideSize << ")" << + " sripe: " << mStripeSize << + " stripe count: " << mStripeCount << + " file size: " << mFileSize << + KFS_LOG_EOM; + } + Offset Stripe( + IOBuffer& inBuffer, + Offset inOffset) + { + if (inBuffer.IsEmpty()) { + return inOffset; + } + SetPos(inOffset); + do { + Buffer& theBuffer = mBuffersPtr[GetStripeIdx()]; + KFS_LOG_STREAM_DEBUG << mLogPrefix << + " pos: " << GetPos() << + " stripe: " << GetStripeIdx() << " " << GetStripePos() << + " off: " << GetFilePos() << + " in: " << inBuffer.BytesConsumable() << + " end: " << theBuffer.mEndPos << + KFS_LOG_EOM; + if (theBuffer.mEndPos != GetFilePos() && + ! theBuffer.mBuffer.IsEmpty()) { + if (mRecoveryStripeCount > 0) { + InternalError("non sequential write is not supported"); + } + Write(theBuffer); // Flush + } + const int theCnt = + theBuffer.mBuffer.Move(&inBuffer, GetStripeRemaining()); + mPendingCount += theCnt; + theBuffer.mEndPos = GetFilePos() + theCnt; + if (SeekStripe(theCnt)) { + // Always stop and write buffers before moving to the next chunk + // block. This is needed as the recovery stripes are at the end + // of the block, thus the recovery has to be computed and + // written before moving to the next chunk block. + break; + } + } while (! 
inBuffer.IsEmpty()); + return GetPos(); + } + void Write( + Buffer& inBuffer, + int inWriteThreshold = 0) + { + if (inBuffer.mWriteLen < 0) { + return; + } + const int theSize = inBuffer.mBuffer.BytesConsumable(); + QCASSERT( + mPendingCount >= theSize && + inBuffer.mWriteLen <= theSize && + (inBuffer.mEndPos - 1) % (int)CHUNKSIZE >= (theSize - 1) + ); + const Offset theOffset = inBuffer.mEndPos - theSize; + if (IsChunkWriterFailed(theOffset)) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "discarding: " << inBuffer.mWriteLen << " bytes" + " at: " << theOffset << + " due to chunk writer failure" << + KFS_LOG_EOM; + const int theDiscCnt = inBuffer.mBuffer.Consume(inBuffer.mWriteLen); + QCASSERT(theDiscCnt == inBuffer.mWriteLen); + mPendingCount -= theDiscCnt; + inBuffer.mWriteLen -= theDiscCnt; + return; + } + QCRTASSERT(! IsWriteQueued()); + const int theQueuedCount = QueueWrite( + inBuffer.mBuffer, + inBuffer.mWriteLen, + theOffset, + inWriteThreshold + ); + QCRTASSERT( + theQueuedCount <= inBuffer.mWriteLen && + inBuffer.mBuffer.BytesConsumable() + theQueuedCount == theSize + ); + if (theQueuedCount <= 0) { + return; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "queued write:" + " len: " << inBuffer.mWriteLen << + " thresh: " << inWriteThreshold << + " queued: " << theQueuedCount << + KFS_LOG_EOM; + inBuffer.mWriteLen -= theQueuedCount; + mPendingCount -= theQueuedCount; + StartQueuedWrite(theQueuedCount); + } + void Flush( + int inWriteThreshold) + { + const int theThreshold = max(1, max( + (int)(mOffset - mRecoveryEndPos), inWriteThreshold)); + int theCurThreshold = max(0, inWriteThreshold); + while (mPendingCount >= theThreshold) { + for (int i = 0; i < mStripeCount + mRecoveryStripeCount; i++) { + Write(mBuffersPtr[i], theCurThreshold); + } + if (mPendingCount < theThreshold || theCurThreshold <= 0) { + break; + } + theCurThreshold /= (mStripeCount + mRecoveryStripeCount); + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + " flush: thresh:" + " min: " << (mOffset - 
mRecoveryEndPos) << + " in: " << inWriteThreshold << + " pending: " << mPendingCount << + " cur: " << theCurThreshold << + KFS_LOG_EOM; + } + void ComputeRecovery( + int* ioPaddSizeWriteFrontTrimPtr = 0) + { + if (mRecoveryStripeCount <= 0) { + for (int i = 0; i < mStripeCount; i++) { + mBuffersPtr[i].mWriteLen = + mBuffersPtr[i].mBuffer.BytesConsumable(); + } + mRecoveryEndPos = mOffset; + return; + } + if (mOffset < mRecoveryEndPos + mStrideSize) { + return; // At least one full stride required. + } + QCASSERT(mRecoveryEndPos % mStrideSize == 0); + int theStrideHead = (int)(mOffset % mStrideSize); + const int theTotalSize = + (int)((mOffset - theStrideHead) - mRecoveryEndPos); + const int theSize = theTotalSize / mStripeCount; + QCASSERT(theSize * mStripeCount == theTotalSize); + Offset thePendingCount = 0; + for (int i = mStripeCount; + i < mStripeCount + mRecoveryStripeCount; + i++) { + IOBufferData theBuf = NewDataBuffer(theSize); + mBufPtr[i] = theBuf.Producer(); + theBuf.Fill(theSize); + if (mBuffersPtr[i].mBuffer.IsEmpty()) { + const Offset thePos = mBuffersPtr[i - 1].mEndPos + CHUNKSIZE; + mBuffersPtr[i].mEndPos = thePos; + if (i == mStripeCount) { + mBuffersPtr[i].mEndPos -= thePos % mStripeSize; + } + mBuffersPtr[i].mWriteLen = theSize; + } else { + mBuffersPtr[i].mWriteLen += theSize; + mBuffersPtr[i].mEndPos += theSize; + } + mBuffersPtr[i].mBuffer.Append(theBuf); + thePendingCount += mBuffersPtr[i].mBuffer.BytesConsumable(); + } + for (int thePos = 0, thePrevLen = 0; thePos < theSize; ) { + int theLen = theSize - thePos; + for (int i = 0; i < mStripeCount; i++) { + IOBuffer& theBuf = mBuffersPtr[i].mBuffer; + IOBuffer::iterator& theIt = mBuffersPtr[i].mCurIt; + int& theSkip = mBuffersPtr[i].mCurPos; + if (thePos == 0) { + const int theTail = min(theStrideHead, mStripeSize); + theStrideHead -= theTail; + const int theBufSize = theBuf.BytesConsumable(); + thePendingCount += theBufSize; + theSkip = theBufSize - (theSize + theTail); + theIt = 
theBuf.begin(); + mBuffersPtr[i].mWriteLen = theSkip + theSize; + } else { + theSkip += thePrevLen; + } + int theBufSize; + while ((theBufSize = theIt->BytesConsumable()) <= theSkip) { + theSkip -= theBufSize; + ++theIt; + } + theBufSize -= theSkip; + QCRTASSERT( + theIt != theBuf.end() && + theSkip >= 0 && + theBufSize > 0 + ); + const char* const thePtr = theIt->Consumer() + theSkip; + if (theBufSize < kAlign) { + char* theDestPtr = GetTempBufPtr(i); + mBufPtr[i] = memcpy(theDestPtr, thePtr, theBufSize); + theDestPtr += theBufSize; + int theRem = kAlign - theBufSize; + do { + ++theIt; + QCASSERT(theIt != theBuf.end()); + theSkip = theIt->CopyOut(theDestPtr, theRem); + theDestPtr += theSkip; + theRem -= theSkip; + } while (theRem > 0); + theLen = kAlign; + theSkip -= theLen; // To cancel the thePrevLen addition. + } else { + theBufSize -= theBufSize % kAlign; + theLen = min(theLen, theBufSize); + if ((thePtr - (const char*)0) % kAlign != 0) { + theLen = min((int)kTempBufSize, theLen); + mBufPtr[i] = memcpy(GetTempBufPtr(i), thePtr, theLen); + } else { + mBufPtr[i] = const_cast(thePtr); + } + } + } + QCASSERT(theLen > 0 && theLen % kAlign == 0); + if (thePos == 0 || thePos + theLen == theSize) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + " recovery:" + " off: " << mRecoveryEndPos << + " pos: " << thePos << + " len: " << theLen << + KFS_LOG_EOM; + } + rs_encode(mStripeCount + mRecoveryStripeCount, theLen, mBufPtr); + for (int i = mStripeCount; + i < mStripeCount + mRecoveryStripeCount; + i++) { + mBufPtr[i] = (char*)mBufPtr[i] + theLen; + } + thePos += theLen; + thePrevLen = theLen; + } + mRecoveryEndPos += theTotalSize; + if (mLastPartialFlushPos + mStrideSize > mRecoveryEndPos) { + // The partial stride was previously written / flushed. 
+ int theHead = (int)( + mLastPartialFlushPos + mStrideSize - mRecoveryEndPos); + QCRTASSERT(theHead > 0 && theHead < mStrideSize); + if (ioPaddSizeWriteFrontTrimPtr) { + *ioPaddSizeWriteFrontTrimPtr = theHead; + // Do not trim if padded, the caller will do the trimming. + theHead = 0; + } + // Trim only the data buffers, as the recovery stripes may have + // changed. + for (int i = 0; i < mStripeCount && theHead > 0; i++) { + QCASSERT(mBuffersPtr[i].mBuffer.BytesConsumable() >= + min(theHead, mStripeSize)); + TrimBufferFront(mBuffersPtr[i], theHead, thePendingCount); + } + } + mPendingCount = thePendingCount; + } + void TrimBufferFront( + Buffer& inBuf, + int& ioStrideTrim, + Offset& ioPendingCount) + { + if (ioStrideTrim <= 0) { + return; + } + // Try to align the write position to the checksum boundary to avoid + // read modify write, at the expense of the extra network transfer. + const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + int theTrim = min(ioStrideTrim, mStripeSize); + ioStrideTrim -= theTrim; + const int theChecksumBlockHead = (int)((theTrim + + inBuf.mEndPos - inBuf.mBuffer.BytesConsumable()) % + kChecksumBlockSize + ); + if (theChecksumBlockHead <= theTrim) { + theTrim -= theChecksumBlockHead; + if (theTrim <= 0) { + return; + } + } + theTrim = inBuf.mBuffer.Consume(theTrim); + inBuf.mWriteLen -= theTrim; + ioPendingCount -= theTrim; + QCASSERT( + ioPendingCount >= 0 && + inBuf.mWriteLen >= 0 && + inBuf.mBuffer.BytesConsumable() >= inBuf.mWriteLen + ); + } + char* GetTempBufPtr( + int inIndex) + { return GetTempBufSelfPtr(inIndex, mStripeCount); } + int GetWriteFailuresCount( + Offset inChunkOffset, + Offset& outChunkBlockStart, + Offset& outChunkBlockEnd) const + { + outChunkBlockStart = + inChunkOffset - inChunkOffset % mChunkBlockTotalSize; + outChunkBlockEnd = outChunkBlockStart + mChunkBlockTotalSize; + int theFailedCount = 0; + for (WriteFailures::const_iterator theIt = + mWriteFailures.lower_bound(outChunkBlockStart); + theIt != 
mWriteFailures.end() && *theIt < outChunkBlockEnd; + ++theIt, theFailedCount++) + {} + return theFailedCount; + } +private: + RSWriteStriper( + const RSWriteStriper& inRSWriteStriper); + RSWriteStriper& operator=( + const RSWriteStriper& inRSWriteStriper); +}; + +// Striped files with and without Reed-Solomon recovery reader implementation. +// The reader is used by chunk server for RS recovery of both "data" and +// "recovery" chunks. +class RSReadStriper : public Reader::Striper, private RSStriper +{ +public: + typedef RSStriper::Offset Offset; + + static Striper* Create( + StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + int inMaxAtomicReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Reader::Striper::Offset inRecoverChunkPos, + Reader::Striper::Offset inFileSize, + Reader::Striper::SeqNum inInitialSeqNum, + string inLogPrefix, + Impl& inOuter, + Reader::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) + { + if (inType != kStriperTypeRS) { + outErrMsg = "invalid striper type"; + return 0; + } + if (inMaxAtomicReadRequestSize <= 0) { + outErrMsg = "invalid max. read request size"; + return 0; + } + if (inRecoverChunkPos > 0 && inRecoverChunkPos % CHUNKSIZE != 0) { + outErrMsg = "invalid chunk recovery position"; + return 0; + } + if (! 
Validate( + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + outErrMsg)) { + return 0; + } + outOpenChunkBlockSize = + Offset(CHUNKSIZE) * (inStripeCount + inRecoveryStripeCount); + return new RSReadStriper( + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inMaxAtomicReadRequestSize, + inUseDefaultBufferAllocatorFlag, + inFailShortReadsFlag, + inRecoverChunkPos, + inFileSize, + inInitialSeqNum, + inLogPrefix, + inOuter + ); + } + virtual ~RSReadStriper() + { + Request* thePtr; + while ((thePtr = Requests::Front(mPendingQueue))) { + thePtr->Delete(*this, mPendingQueue); + } + while ((thePtr = Requests::Front(mFreeList))) { + thePtr->Delete(*this, mFreeList); + } + while ((thePtr = Requests::Front(mInFlightList))) { + thePtr->Delete(*this, mInFlightList); + } + delete [] mBufIteratorsPtr; + delete mZeroBufferPtr; + } + virtual int Process( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId) + { + if (inLength <= 0) { + return 0; + } + if (inOffset < 0) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "invalid read offset: " << inOffset << + KFS_LOG_EOM; + return kErrorParameters; + } + if (mRecoverStripeIdx >= 0) { + return RecoverChunk(inBuffer, inLength, inOffset, inRequestId); + } + // Ensure that all reads in the request including recovery read are + // "atomic", i.e less or equal than mMaxReadSize. + SetPos(inOffset); + int theMin = 0; + int theMax = 0; + Offset theChunkBlockStartPos = 0; + int theLen = inLength; + Request* theRequestPtr = 0; + while (theLen > 0) { + const int theChunkPos = GetChunkPos(GetFilePos()); + if (! 
theRequestPtr) { + theRequestPtr = &GetRequest(inRequestId, GetPos(), 0); + theMin = theChunkPos; + theMax = theChunkPos; + theChunkBlockStartPos = GetChunkBlockStartFilePos(); + } + QCASSERT(theChunkPos <= theMax); // max at stride beginning + Buffer& theBuf = theRequestPtr->GetBuffer(GetStripeIdx()); + const int theRem = GetStripeRemaining(); + const int theSize = min( + min(theLen, theRem), + theChunkPos >= theMin ? + theMin + mMaxReadSize - theChunkPos : + ((theChunkPos >= theMax - mMaxReadSize) ? mMaxReadSize : 0) + ); + // Align request on the stripe block boundary if possible. + if (theSize <= 0 || + (theSize < theRem && theSize < theLen && theMin < theMax)) { + theRequestPtr->mRecoverySize = -(theMax - theMin); + theRequestPtr->mRecoveryPos = theChunkBlockStartPos + theMin; + QueueRequest(*theRequestPtr); + theRequestPtr = 0; + continue; + } + if (theBuf.mBuf.mSize <= 0) { + theBuf.mBuf.mSize = theSize; + theBuf.mPos = GetFilePos(); + } else { + theBuf.mBuf.mSize += theSize; + } + theBuf.mBuf.mBuffer.MoveSpaceAvailable(&inBuffer, theSize); + theLen -= theSize; + theRequestPtr->mSize += theSize; + theRequestPtr->mPendingCount += theSize; + theMin = min(theMin, theChunkPos); + theMax = max(theMax, theChunkPos + theSize); + QCASSERT(theMin < theMax && theMax - theMin <= mMaxReadSize); + if (SeekStripe(theSize) || theLen <= 0) { + // Create new request when moving to the next chunk block. + theRequestPtr->mRecoverySize = -(theMax - theMin); + theRequestPtr->mRecoveryPos = theChunkBlockStartPos + theMin; + QueueRequest(*theRequestPtr); + theRequestPtr = 0; + continue; + } + } + QCASSERT(! 
theRequestPtr); + Read(); + return inLength; + } + virtual void ReadCompletion( + int inStatus, + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inOriginalRequestId, + RequestId inRequestId, + kfsChunkId_t inChunkId, + int64_t inChunkVersion) + { + QCASSERT(inRequestId.mPtr && inLength <= mPendingCount); + PBuffer& theBuf = *reinterpret_cast(inRequestId.mPtr); + mPendingCount -= inLength; + theBuf.ReadDone( + *this, + inStatus, + inBuffer, + inLength, + inOffset, + inOriginalRequestId, + inChunkId, + inChunkVersion + ); + } + virtual bool CanCancelRead( + RequestId inRequestId) + { + QCASSERT(inRequestId.mPtr); + PBuffer& theBuf = *reinterpret_cast(inRequestId.mPtr); + return theBuf.CancelRead(*this); + } + int GetBufferCount() const + { return (mStripeCount + mRecoveryStripeCount); } + +private: + class Buffer; + class PBuffer + { + public: + typedef RSReadStriper Outer; + + Buffer& mParent; + IOBuffer mBuffer; + int mSize; + int mStatus; + bool mInFlightFlag:1; + bool mDoneFlag:1; + bool mCancelFlag:1; + + PBuffer( + Buffer& inParent) + : mParent(inParent), + mBuffer(), + mSize(0), + mStatus(0), + mInFlightFlag(false), + mDoneFlag(false), + mCancelFlag(false) + {} + void Clear() + { + mBuffer.Clear(); + mSize = 0; + mStatus = 0; + mInFlightFlag = false; + mDoneFlag = false; + mCancelFlag = false; + } + bool IsFailed() const + { return Outer::IsFailure(mStatus); } + Offset GetPos() const + { return mParent.GetPos(*this); } + int GetStripeIdx() const + { return (mParent.GetStripeIdx()); } + int Retry( + bool inClearFlag) + { + QCASSERT(! mInFlightFlag && mSize >= 0); + if (! IsFailed()) { + return 0; + } + if (inClearFlag) { + mBuffer.Clear(); + } + mDoneFlag = false; + mStatus = 0; + return mSize; + } + int MarkFailed() + { + QCASSERT(! 
mInFlightFlag); + if (mSize <= 0 || IsFailed()) { + return 0; + } + mStatus = kErrorIO; + mDoneFlag = true; + return mSize; + } /* ReadDone: completion of this stripe fragment's chunk read. Records the status, logs short reads (recovery validates stripe sizes later, so a short read is not an error here), adopts the returned data, and notifies the parent Buffer. */ + void ReadDone( + Outer& inOuter, + int inStatus, + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + RequestId inRequestId, + kfsChunkId_t inChunkId, + int64_t inChunkVersion) + { + QCRTASSERT( + mInFlightFlag && + inSize == mSize && + inOffset == GetPos() && + mBuffer.IsEmpty() + ); + mInFlightFlag = false; + mDoneFlag = true; + const bool thePrevOk = ! mParent.IsFailed(); + mStatus = inStatus; + if (! IsFailed() && inBuffer.BytesConsumable() < mSize) { + KFS_LOG_STREAM_DEBUG << inOuter.mLogPrefix << + "short read:" + " req: " << mParent.mRequest.mPos << + "," << mParent.mRequest.mSize << + " got: " << inBuffer.BytesConsumable() << + " of: " << mSize << + " stripe: " << GetStripeIdx() << + " pos: " << GetPos() << + " chunk: " << inChunkId << + " version: " << inChunkVersion << + KFS_LOG_EOM; + } + mBuffer.Clear(); + mBuffer.Move(&inBuffer); + mParent.ReadDone( + inOuter, + *this, + thePrevOk && IsFailed(), + inRequestId, + inChunkId, + inChunkVersion + ); + } /* InitRecoveryRead: arm this fragment for an extra (recovery) read of inSize bytes. Inherits the parent's failure status so an already-failed request marks the extra read failed up front. Returns the byte count still to be read, 0 when nothing to do. */ + int InitRecoveryRead( + Offset inSize) + { + if (inSize <= 0) { + return 0; + } + QCASSERT(! mInFlightFlag); + // If read has failed already mark the extra read as failed. + mStatus = mParent.GetStatus(); + mSize = (int)inSize; + if (IsFailed() || mBuffer.BytesConsumable() == mSize) { + mDoneFlag = true; + return 0; + } + mBuffer.Clear(); + mDoneFlag = false; + return mSize; + } /* Read: queue this fragment's chunk read with the striper. Buffer ownership differs for the middle fragment (&mParent.mBuf) vs. the side fragments; on a retry round only the buffer's space is lent out so cancellation without a completion callback cannot lose the buffer list. A queueing failure is converted into a synthetic error completion. */ + int Read( + Outer& inOuter) + { + if (mSize <= 0 || mInFlightFlag || mDoneFlag) { + return 0; + } + QCASSERT(GetPos() >= 0); + const bool theRetryFlag = mParent.mRequest.mRecoveryRound > 0; + RequestId theId = RequestId(); + theId.mPtr = this; + IOBuffer theBuffer; + if (this != &mParent.mBuf) { + mBuffer.Clear(); + if (!
inOuter.mUseDefaultBufferAllocatorFlag) { + theBuffer.Append(NewDataBuffer(mSize)); + } + } else { + QCASSERT(mBuffer.IsEmpty()); + if (theRetryFlag) { + // This is needed to cancel the requests with no completion + // invocation: the buffers list must be saved. + theBuffer.UseSpaceAvailable(&mBuffer, mSize); + } else { + theBuffer.Move(&mBuffer); + } + } + mParent.mRequest.mInFlightCount += mSize; + QCASSERT(mParent.mRequest.mInFlightCount <= + mParent.mRequest.mPendingCount); + inOuter.mPendingCount += mSize; + mInFlightFlag = true; + // Recovery validates stripe sizes, and wont work if short reads + // fail -- do not fail short reads if recovery is running. + const int theQueuedCount = inOuter.QueueRead( + theBuffer, + mSize, + GetPos(), + mParent.mRequest.mRequestId, + theId, + theRetryFlag || + inOuter.mRecoveryStripeCount <= 0, + inOuter.mFailShortReadsFlag && + mParent.mRequest.mRecoverySize <= 0 + ); + if (theQueuedCount != mSize) { + if (theQueuedCount > 0) { + InternalError("failed to queue complete chunk read"); + } + inOuter.ReadCompletion( + theQueuedCount < 0 ? theQueuedCount : kErrorParameters, + theBuffer, + mSize, + GetPos(), + mParent.mRequest.mRequestId, + theId, + -1, + -1 + ); + return 0; + } + return theQueuedCount; + } /* Cancel: flag an in-flight read for cancellation; the accounting rollback happens later in CancelRead(). NOTE(review): the "if (mDoneFlag) return;" below is unreachable when asserts abort -- it appears to be a release-mode guard; confirm intended. */ + void Cancel( + Outer& inOuter) + { + if (! mInFlightFlag) { + return; + } + QCASSERT(! mDoneFlag); + if (mDoneFlag) { + return; + } + mCancelFlag = true; + } /* CancelRead: complete a flagged cancellation, rolling back the request's pending/in-flight byte counts and marking this fragment done with kErrorCanceled. */ + bool CancelRead( + Outer& inOuter) + { + if (! mInFlightFlag || mDoneFlag || !
mCancelFlag) { + return false; + } + QCASSERT( + mSize >= 0 && + mParent.mRequest.mPendingCount >= mSize && + mParent.mRequest.mInFlightCount >= mSize + ); + mParent.mRequest.mPendingCount -= mSize; + mParent.mRequest.mInFlightCount -= mSize; + mStatus = kErrorCanceled; + mInFlightFlag = false; + mDoneFlag = true; + mCancelFlag = false; + return true; + } /* CancelPendingRead: cancel a read that has not yet been queued by momentarily marking it in flight, then unwinding through CancelRead(). */ + bool CancelPendingRead( + Outer& inOuter) + { + if (mSize <= 0 || mInFlightFlag || mDoneFlag) { + return false; + } + mParent.mRequest.mInFlightCount += mSize; + QCASSERT(mParent.mRequest.mInFlightCount <= + mParent.mRequest.mPendingCount); + mInFlightFlag = true; + mCancelFlag = true; + const bool theRetFlag = CancelRead(inOuter); + QCASSERT(theRetFlag); + return theRetFlag; + } + private: + PBuffer( + const PBuffer& inBuf); + PBuffer& operator=( + const PBuffer& inBuf); + }; + friend class PBuffer; + + class Request; /* Buffer: one stripe's worth of a request, split into left/middle/right fragments (PBuffer mBufL/mBuf/mBufR); the side fragments hold data read only for recovery alignment. NOTE(review): the ctor's inPos/inSize arguments are ignored and mPos is not initialized until Clear() -- confirm intended. */ + class Buffer + { + public: + typedef RSReadStriper Outer; + + Request& mRequest; + Offset mPos; + PBuffer mBufL; + PBuffer mBuf; + PBuffer mBufR; + kfsChunkId_t mChunkId; + int64_t mChunkVersion; + + Buffer( + Request& inRequest, + Offset inPos = -1, + int inSize = 0) + : mRequest(inRequest), + mBufL(*this), + mBuf(*this), + mBufR(*this), + mChunkId(-1), + mChunkVersion(-1) + {} + ~Buffer() + {} + void Clear() + { + mBufL.Clear(); + mBuf.Clear(); + mBufR.Clear(); + mPos = -1; + } + int GetStripeIdx() const + { return mRequest.GetStripeIdx(*this); } + int Retry() + { + return (mBufL.Retry(true) + mBuf.Retry(false) + mBufR.Retry(true)); + } /* GetPos: file position of a given fragment relative to the middle fragment's mPos. */ + Offset GetPos( + const PBuffer& inBuffer) const + { + Offset theRet = -1; + if (&inBuffer == &mBufL) { + theRet = mPos - mBufL.mSize; + } else if (&inBuffer == &mBuf) { + theRet = mPos; + } else if (&inBuffer == &mBufR) { + theRet = mPos + mBuf.mSize; + } + QCASSERT(theRet >= 0); + return theRet; + } + void ReadDone( + Outer& inOuter, + PBuffer& inBuffer, + bool inNewFailureFlag, + RequestId inRequestId, + kfsChunkId_t inChunkId, + int64_t inChunkVersion) + {
+ /* Per-fragment completion: on failure drop side-fragment data; on success record the chunk identity, then forward to the owning Request. */ if (inBuffer.IsFailed()) { + if (&inBuffer != &mBuf) { + inBuffer.mBuffer.Clear(); + } + } else { + mChunkId = inChunkId; + mChunkVersion = inChunkVersion; + } + mRequest.ReadDone( + inOuter, + inBuffer, + *this, + inNewFailureFlag, + inRequestId + ); + } /* InitRecoveryRead: size the side fragments so that [inOffset, inOffset+inSize) is fully covered around the middle fragment; rejects fragments larger than the striper's max read size. Returns bytes scheduled. */ + int InitRecoveryRead( + Outer& inOuter, + Offset inOffset, + int inSize) + { + if (inSize <= 0) { + return 0; + } + QCASSERT(inOffset >= 0); + if (mBuf.mSize <= 0) { + mBuf.mSize = 0; + QCASSERT(GetSize() == 0); + mPos = inOffset + inSize; + } + const int theRet = + mBufL.InitRecoveryRead(mPos - inOffset) + + mBufR.InitRecoveryRead( + inOffset + inSize - (mPos + mBuf.mSize)); + if (mBufL.mSize > inOuter.mMaxReadSize || + mBufR.mSize > inOuter.mMaxReadSize) { + InternalError( + "failed to start recovery: invalid request boundary"); + mBufL.mStatus = kErrorParameters; + mBufR.mStatus = kErrorParameters; + return 0; + } + return theRet; + } + int GetSize() const + { return (mBufL.mSize + mBuf.mSize + mBufR.mSize); } + bool IsInFlight() const + { + return ( + mBufL.mInFlightFlag || + mBuf.mInFlightFlag || + mBufR.mInFlightFlag + ); + } + bool IsFailed() const + { + return ( + mBufL.IsFailed() || + mBuf.IsFailed() || + mBufR.IsFailed() + ); + } + bool IsReadyForRecovery() const + { + return ( + ! IsInFlight() && + !
IsFailed() && + GetSize() == mRequest.mRecoverySize + ); + } /* GetStatus: first failure status, middle fragment taking precedence over the sides. */ + int GetStatus() const + { + if (mBuf.IsFailed()) { + return mBuf.mStatus; + } + if (mBufL.IsFailed()) { + return mBufL.mStatus; + } + if (mBufR.IsFailed()) { + return mBufR.mStatus; + } + return 0; + } /* SetSetatus: (sic -- long-standing typo for SetStatus, kept because in-view callers use this spelling) force a status onto every non-empty fragment. */ + void SetSetatus( + int inStatus) + { + if (mBuf.mSize > 0) { + mBuf.mStatus = inStatus; + } + if (mBufL.mSize > 0) { + mBufL.mStatus = inStatus; + } + if (mBufR.mSize > 0) { + mBufR.mStatus = inStatus; + } + } + void Read( + Outer& inOuter) + { + if (IsFailed()) { + return; + } + const int theQueuedCnt = + mBufL.Read(inOuter) + + mBuf.Read(inOuter) + + mBufR.Read(inOuter); + if (theQueuedCnt > 0) { + inOuter.StartQueuedRead(theQueuedCnt); + } + } + void Cancel( + Outer& inOuter) + { + mBufL.Cancel(inOuter); + mBuf.Cancel(inOuter); + mBufR.Cancel(inOuter); + } + void CancelPendingRead( + Outer& inOuter) + { + mBufL.CancelPendingRead(inOuter); + mBuf.CancelPendingRead(inOuter); + mBufR.CancelPendingRead(inOuter); + } + int MarkFailed() + { + return ( + mBufL.MarkFailed() + + mBuf.MarkFailed() + + mBufR.MarkFailed() + ); + } /* MakeBufferForRecovery: build the flat IOBuffer the RS decoder works on. Failure path: allocate fresh space, sharing it with mBuf's buffer when only the extra (side) reads failed so the already-read data is returned to the caller. Success path: copy the fragments and zero-pad each to its full size. Returns true when this stripe must be recovered; outSize = total stripe bytes, outRdSize = bytes actually read. */ + bool MakeBufferForRecovery( + Outer& inOuter, + IOBuffer& inBuffer, + int& outSize, + int& outRdSize) + { + inBuffer.Clear(); + outRdSize = 0; + outSize = 0; + const bool theReadFailedFlag = IsFailed(); + if (theReadFailedFlag) { + if (mBuf.mSize > 0) { + // If only the extra read failed, but read succeeded then + // return the read result to the caller, and make new temp. + // buffer to run recovery with, just to keep it simple. + // Such "half" failures are expected to be rare. + if (inOuter.mUseDefaultBufferAllocatorFlag && + mBuf.IsFailed()) { + mBuf.mBuffer.EnsureSpaceAvailable(mBuf.mSize); + } + const int theAvail = mBuf.IsFailed() ?
+ inBuffer.UseSpaceAvailable(&mBuf.mBuffer, mBuf.mSize) + : 0; + if (theAvail < mBuf.mSize) { + if (inOuter.mUseDefaultBufferAllocatorFlag) { + inBuffer.EnsureSpaceAvailable(mBuf.mSize); + } else { + IOBufferData theBuf = + NewDataBuffer(mBuf.mSize - theAvail); + inBuffer.Append(theBuf); + if (mBuf.IsFailed()) { + mBuf.mBuffer.Append(theBuf); + } + } + } + outSize += mBuf.mSize; + } + if (mBufL.mSize > 0) { + IOBuffer theBuf; + theBuf.Move(&inBuffer); + if (inOuter.mUseDefaultBufferAllocatorFlag) { + inBuffer.EnsureSpaceAvailable(mBufL.mSize); + } else { + inBuffer.Append(NewDataBuffer(mBufL.mSize)); + } + inBuffer.Move(&theBuf); + outSize += mBufL.mSize; + } + if (mBufR.mSize > 0) { + if (inOuter.mUseDefaultBufferAllocatorFlag) { + inBuffer.EnsureSpaceAvailable(mBufR.mSize + outSize); + } else { + inBuffer.Append(NewDataBuffer(mBufR.mSize)); + } + outSize += mBufR.mSize; + } + } else { + if (mBufL.mSize > 0) { + const int theSize = mBufL.mBuffer.BytesConsumable(); + outSize += mBufL.mSize; + outRdSize += theSize; + inBuffer.Copy(&mBufL.mBuffer, theSize); + inOuter.ZeroPaddTo(inBuffer, outSize); + } + if (mBuf.mSize > 0) { + const int theSize = mBuf.mBuffer.BytesConsumable(); + outSize += mBuf.mSize; + outRdSize += theSize; + // Copy to save the original length and buffer space of + // short read. + inBuffer.Copy(&mBuf.mBuffer, theSize); + inOuter.ZeroPaddTo(inBuffer, outSize); + } + if (mBufR.mSize > 0) { + const int theSize = mBufR.mBuffer.BytesConsumable(); + outSize += mBufR.mSize; + outRdSize += theSize; + inBuffer.Copy(&mBufR.mBuffer, theSize); + inOuter.ZeroPaddTo(inBuffer, outSize); + } + QCASSERT(inBuffer.BytesConsumable() == outSize); + } + return theReadFailedFlag; + } /* GetReadSize: total bytes actually read across the fragments, or -1 when any fragment failed. */ + int GetReadSize() + { + if (IsFailed()) { + return -1; + } + return ( + (mBufL.mSize > 0 ? mBufL.mBuffer.BytesConsumable() : 0) + + ( mBuf.mSize > 0 ? mBuf.mBuffer.BytesConsumable() : 0) + + (mBufR.mSize > 0 ?
mBufR.mBuffer.BytesConsumable() : 0) + ); + } /* SetRecoveryResult: move the recovered bytes for this stripe (skipping the left-fragment prefix) into the middle fragment's buffer; a no-op when the read itself succeeded. */ + void SetRecoveryResult( + IOBuffer& inBuffer) + { + if (mBuf.mSize <= 0 || ! mBuf.IsFailed()) { + // Result is already there. + inBuffer.Clear(); + return; + } + QCASSERT(inBuffer.BytesConsumable() <= GetSize()); + // The missing chunk size is not known, thus the recovery might zero + // fill the hole. + // In other words "skip holes" mode will not work, as the + // holes boundary is known only with stripe size precision, in the + // case when the trailing data stripes in the block are missing. + // + // dddd dddd dddd + // d... => ???? => d000 + // rrrr rrrr rrrr + if (mBufL.mSize > 0) { + IOBuffer theBuf; + const int theCnt = theBuf.MoveSpace(&inBuffer, mBufL.mSize); + if (theCnt != mBufL.mSize) { + InternalError("invalid left buffer space"); + } + } + mBuf.mBuffer.Clear(); + const int theCnt = mBuf.mBuffer.MoveSpace(&inBuffer, mBuf.mSize); + if (theCnt != mBuf.mSize) { + InternalError("invalid middle buffer space"); + } + } + private: + Buffer( + const Buffer& inBuffer); + Buffer& operator=( + const Buffer& inBuffer); + }; /* NOTE(review): template arguments below were stripped by extraction and have been restored from the list usage in this file (single-link index 0, see mPrevPtr[1]/mNextPtr[1]); verify against QCDLList's declaration. */ + typedef QCDLList<Request, 0> Requests; + + class RecoveryInfo; /* Request: one logical striped read, followed in memory by GetBufferCount() Buffer objects (one per data + recovery stripe) -- see Create()/Delete(). */ + class Request + { + public: + typedef RSReadStriper Outer; + + RequestId mRequestId; + Offset mPos; + Offset mRecoveryPos; + int mSize; + int mPendingCount; + int mInFlightCount; + int mStatus; + int mRecoveryRound; + int mRecursionCount; + int mRecoverySize; + int mBadStripeCount; + /* Create: placement-construct a Request immediately followed by its Buffer array inside a single char[] allocation; Delete() must mirror this layout exactly. */ + static Request& Create( + Outer& inOuter, + RequestId inRequestId, + Offset inPos, + int inSize) + { + const size_t theSize = + sizeof(Request) + sizeof(Buffer) * inOuter.GetBufferCount(); + char* const thePtr = new char[theSize]; + Request& theRet = *new(thePtr) Request(inRequestId, inPos, inSize); + for (size_t i = sizeof(Request); i < theSize; i += sizeof(Buffer)) { + new (thePtr + i) Buffer(theRet); + } + return theRet; + } /* GetBuffer: the Buffer array lives directly after this object (see Create()). Template argument of the cast restored (stripped by extraction). */ + Buffer& GetBuffer( + int inIdx) + { + QCASSERT(inIdx >= 0); + return reinterpret_cast<Buffer*>(this + 1)[inIdx]; + } + const Buffer&
GetBuffer( + int inIdx) const + { + QCASSERT(inIdx >= 0); + return reinterpret_cast<const Buffer*>(this + 1)[inIdx]; + } + int GetStripeIdx( + const Buffer& inBuffer) const + { + const int theIdx = (int)(&inBuffer - &GetBuffer(0)); + QCASSERT(theIdx >= 0); + return theIdx; + } /* Delete: explicit destruction mirroring Create(): unlink, destroy the trailing Buffers, then the Request, then release the raw char[] storage (cast restored to match the new char[] in Create()). */ + void Delete( + Outer& inOuter, + Request** inListPtr) + { + Requests::Remove(inListPtr, *this); + const int theBufCount = inOuter.GetBufferCount(); + for (int i = 0; i < theBufCount; i++) { + GetBuffer(i).~Buffer(); + } + this->~Request(); + delete [] reinterpret_cast<char*>(this); + } /* Reset: reinitialize for reuse from the free list. */ + void Reset( + Outer& inOuter, + RequestId inRequestId, + Offset inPos, + int inSize) + { + mRequestId = inRequestId; + mPos = inPos; + mRecoveryPos = 0; + mSize = inSize; + mPendingCount = 0; + mInFlightCount = 0; + mStatus = 0; + mRecoveryRound = 0; + mRecursionCount = 0; + mRecoverySize = 0; + mBadStripeCount = 0; + const int theBufCount = inOuter.GetBufferCount(); + for (int i = 0; i < theBufCount; i++) { + GetBuffer(i).Clear(); + } + } /* ReadDone: request-level completion for one fragment: update byte accounting; on a new failure start recovery; on success during a recovery round, finish early (cancelling the remaining reads) once enough stripes are available. */ + void ReadDone( + Outer& inOuter, + PBuffer& inPBuffer, + Buffer& inBuffer, + bool inNewFailureFlag, + RequestId inRequestId) + { + const int theStripeIdx = inBuffer.GetStripeIdx(); + const int theBufCount = inOuter.GetBufferCount(); + QCRTASSERT( + inPBuffer.mSize >= 0 && + theStripeIdx >= 0 && + theStripeIdx < theBufCount && + mPendingCount >= inPBuffer.mSize && + mInFlightCount >= inPBuffer.mSize && + mRequestId.mId == inRequestId.mId + ); + mPendingCount -= inPBuffer.mSize; + mInFlightCount -= inPBuffer.mSize; + if (inPBuffer.IsFailed()) { + KFS_LOG_STREAM_INFO << inOuter.mLogPrefix << + "read failure:" + " req: " << mPos << + "," << mSize << + " status: " << inPBuffer.mStatus << + " stripe: " << theStripeIdx << + " bad: " << mBadStripeCount << + " round: " << mRecoveryRound << + KFS_LOG_EOM; + if (inNewFailureFlag && mRecoveryRound <= 0 && + Recovery(inOuter)) { + return; + } + } else if (mRecoveryRound > 0 && + inBuffer.IsReadyForRecovery() && + --mBadStripeCount <=
inOuter.mRecoveryStripeCount) { + // Can finish recovery now, cancel all pending reads if any. + if (mRecursionCount > 0) { + // Cancel pending reads that are possibly in the process of + // being scheduled by Read() below. + for (int i = 0; i < theBufCount; i++) { + GetBuffer(i).CancelPendingRead(inOuter); + } + } + QCRTASSERT(mPendingCount == mInFlightCount); + KFS_LOG_STREAM_INFO << inOuter.mLogPrefix << + "can finish recovery:" + " req: " << mPos << + "," << mSize << + " bad: " << mBadStripeCount << + " round: " << mRecoveryRound << + " in flight: " << mInFlightCount << + KFS_LOG_EOM; + if (mInFlightCount > 0) { + for (int i = 0; i < theBufCount; i++) { + GetBuffer(i).Cancel(inOuter); + } + inOuter.CancelRead(); + QCRTASSERT(mPendingCount == 0 && mInFlightCount == 0); + } + } + if (mPendingCount <= 0 && mRecursionCount <= 0) { + Done(inOuter); + } + } /* Read: drive all per-stripe reads; mRecursionCount guards against re-entry because completions may run inline from QueueRead(). */ + void Read( + Outer& inOuter) + { + if (mRecursionCount > 0 || + mPendingCount <= 0 || + mInFlightCount >= mPendingCount) { + return; + } + mRecursionCount++; + const int theBufCount = inOuter.GetBufferCount(); + while (mPendingCount > 0 && mInFlightCount < mPendingCount) { + for (int i = 0; + i < theBufCount && + mInFlightCount < mPendingCount; + i++) { + GetBuffer(i).Read(inOuter); + } + } + mRecursionCount--; + if (mPendingCount <= 0) { + Done(inOuter); + } + } + bool IsFailed() const + { return Outer::IsFailure(mStatus); } /* InitRecovery: turn the negated recovery size stashed by the caller into an actual recovery read covering all data stripes (one chunk-sized step per stripe). */ + void InitRecovery( + Outer& inOuter) + { + if (mSize <= 0 || mRecoverySize > 0) { + return; + } + // Process() ensures that the "recovery width" does not exceed max + // atomic read size, and calculates recovery size.
+ mRecoverySize = -mRecoverySize; + if (mRecoverySize <= 0 || + mRecoverySize > inOuter.mMaxReadSize || + mRecoveryPos < 0) { + InternalError( + "failed to start recovery: invalid request"); + return; + } + KFS_LOG_STREAM_INFO << inOuter.mLogPrefix << + "init recovery:" + " req: " << mPos << + "," << mSize << + " pos: " << mRecoveryPos << + " size: " << mRecoverySize << + " [" << GetChunkPos(mRecoveryPos) << "," << + (GetChunkPos(mRecoveryPos) + mRecoverySize) << ")" << + KFS_LOG_EOM; + Offset theOffset = mRecoveryPos; + for (int i = 0; i < inOuter.mStripeCount; i++) { + mPendingCount += GetBuffer(i).InitRecoveryRead( + inOuter, theOffset, mRecoverySize); + theOffset += (Offset)CHUNKSIZE; + } + } + + private: + Request* mPrevPtr[1]; + Request* mNextPtr[1]; /* NOTE(review): template arguments restored (stripped by extraction) to match the Requests typedef. */ + friend class QCDLListOp<Request, 0>; + + Request( + RequestId inRequestId, + Offset inPos, + int inSize) + : mRequestId(inRequestId), + mPos(inPos), + mRecoveryPos(0), + mSize(inSize), + mPendingCount(0), + mInFlightCount(0), + mStatus(0), + mRecoveryRound(0), + mRecursionCount(0), + mRecoverySize(0), + mBadStripeCount(0) + { Requests::Init(*this); } + ~Request() + {} /* Recovery: account a newly failed stripe and schedule a read of the next recovery stripe; returns false once more stripes failed than the code can recover. */ + bool Recovery( + Outer& inOuter) + { + if (++mBadStripeCount > inOuter.mRecoveryStripeCount) { + return false; + } + if (mBadStripeCount <= 1 && mRecoverySize <= 0) { + InitRecovery(inOuter); + } + if (mRecoverySize <= 0) { + return false; + } + const int i = inOuter.mStripeCount + mBadStripeCount - 1; + mPendingCount += GetBuffer(i).InitRecoveryRead( + inOuter, mRecoveryPos + i * (Offset)CHUNKSIZE, mRecoverySize); + Read(inOuter); + return true; + } /* Done: request-level completion -- run RS recovery when needed and, on failure, decide whether to start another recovery round (retrying canceled or short stripes) before reporting completion. */ + void Done( + Outer& inOuter) + { + if (mPendingCount > 0) { + return; + } + if (mRecoverySize > 0 || + inOuter.mStripeCount <= inOuter.mRecoverStripeIdx) { + inOuter.FinishRecovery(*this); + QCRTASSERT(mPendingCount == 0 && mInFlightCount == 0); + if (IsFailed() && + (inOuter.mRecoverStripeIdx < 0 || + mStatus != kErrorInvalidChunkSizes)) { + bool theTryAgainFlag = mRecoveryRound <= 0 || + mStatus ==
kErrorInvalidChunkSizes; + const int theBufCount = inOuter.GetBufferCount(); + if (! theTryAgainFlag) { + // Retry canceled reads, if any. + for (int i = 0; i < theBufCount; i++) { + if (GetBuffer(i).GetStatus() == kErrorCanceled) { + theTryAgainFlag = true; + break; + } + } + } + if (theTryAgainFlag) { + KFS_LOG_STREAM_INFO << inOuter.mLogPrefix << + "read recovery failed:" + " req: " << mPos << + "," << mSize << + " status: " << mStatus << + " round: " << mRecoveryRound << + " bad stripes: " << mBadStripeCount << + " " << (mRecoveryRound <= 0 ? + "turning on read retries" : + "get remaining stripes") << + KFS_LOG_EOM; + mRecoveryRound++; + mStatus = 0; + int theInvalidChunkSizeCount = 0; + for (int i = 0; i < theBufCount; i++) { + Buffer& theBuf = GetBuffer(i); + const int theStatus = theBuf.GetStatus(); + if (theStatus == kErrorInvalidChunkSizes || + theStatus == kErrorInvalChunkSize) { + theInvalidChunkSizeCount++; + } else { + if (theStatus == 0 && + theBuf.GetSize() != mRecoverySize) { + // Ensure that reads on all stripes are + // scheduled.
Size check in FinishRecovery() + // might fail extra stripes without invoking + // Request::Recovery() + mPendingCount += theBuf.InitRecoveryRead( + inOuter, + mRecoveryPos + i * (Offset)CHUNKSIZE, + mRecoverySize); + mBadStripeCount++; + } else { + mPendingCount += theBuf.Retry(); + } + } + } + if (theInvalidChunkSizeCount > + inOuter.mRecoveryStripeCount || + mPendingCount <= 0) { + KFS_LOG_STREAM_ERROR << inOuter.mLogPrefix << + "read recovery failed:" + " req: " << mPos << + "," << mSize << + " status: " << mStatus << + " bad stripes: " << mBadStripeCount << + " invalid chunks: " << + theInvalidChunkSizeCount << + " pending: " << mPendingCount << + KFS_LOG_EOM; + mPendingCount = 0; + mStatus = kErrorIO; + } + if (mPendingCount > 0) { + Read(inOuter); + return; + } + } + } + } else if (mStatus == 0 && + mBadStripeCount > inOuter.mRecoveryStripeCount) { + mStatus = kErrorIO; + } + inOuter.RequestCompletion(*this); + } + private: + Request( + const Request& inRequest); + Request& operator=( + const Request& inRequest); + + friend class RecoveryInfo; + }; + friend class Request; + /* BufIterator: flat, pointer-style iteration over an IOBuffer's segments for the RS decoder. For failed (to-be-recovered) stripes the unfilled space of each segment is exposed as a writable range as well. */ + class BufIterator + { + public: + typedef RSReadStriper Outer; + + BufIterator() + : mBuffer(), + mCurIt(), + mEndIt(), + mCurPtr(0), + mEndPtr(0), + mSize(0), + mReadFailureFlag(false) + {} /* Set: adopt the stripe's buffer (built by Buffer::MakeBufferForRecovery) and position at its start; returns the stripe size. */ + int Set( + Outer& inOuter, + Buffer& inBuffer, + int& outRdSize) + { + mBuffer.Clear(); + mSize = 0; + mReadFailureFlag = inBuffer.MakeBufferForRecovery( + inOuter, mBuffer, mSize, outRdSize); + return Reset(); + } + int MakeScratchBuffer( + Outer& inOuter, + int inSize) + { + return MakeScratchBuffer(inOuter.mUseDefaultBufferAllocatorFlag ?
+ IOBufferData() : NewDataBuffer(4096 / kAlign * kAlign), inSize); + } /* Advance: move the cursor inLen bytes forward, hopping segments as needed; returns the new position or 0 at end. */ + char* Advance( + int inLen) + { + QCASSERT(inLen >= 0); + int theLen = inLen; + for (; ;) { + if (mCurPtr && mCurPtr + theLen < mEndPtr) { + mCurPtr += theLen; + break; + } + if (mCurIt == mEndIt || ++mCurIt == mEndIt) { + mCurPtr = 0; + mEndPtr = 0; + break; + } + theLen -= mEndPtr - mCurPtr; + SetPtrs(); + } + return mCurPtr; + } /* CopyIn: append recovered bytes into the pre-allocated space; only meaningful for failed stripes. Template argument of the cast restored (stripped by extraction). */ + int CopyIn( + const void* inPtr, + int inLen) + { + // Copy should use available buffer space, allocated by + // MakeBufferForRecovery(), thus it should not invalidate iterators. + return (mReadFailureFlag ? + mBuffer.CopyIn(static_cast<const char*>(inPtr), + min(inLen, mSize - mBuffer.BytesConsumable())) : 0 + ); + + } + int CopyOut( + char* inPtr, + int inLen) + { + if (inLen <= 0 || mReadFailureFlag) { + return 0; + } + int theLen = min(inLen, GetCurRemaining()); + memcpy(inPtr, mCurPtr, theLen); + if (theLen >= inLen) { + return theLen; + } + for (It theIt = mCurIt; theLen < inLen && ++theIt != mEndIt; ) { + theLen += theIt->CopyOut(inPtr + theLen, inLen - theLen); + } + return theLen; + } + int GetCurRemaining() const + { return (int)(mEndPtr - mCurPtr); } + bool IsFailure() const + { return mReadFailureFlag; } + bool IsRequested() const + { return (mSize > 0); } + void SetRecoveryResult( + Buffer& inBuffer) + { + inBuffer.SetRecoveryResult(mBuffer); + Clear(); + } + void Clear() + { + mBuffer.Clear(); + mCurIt = mBuffer.end(); + mEndIt = mCurIt; + mCurPtr = 0; + mEndPtr = 0; + mSize = 0; + mReadFailureFlag = false; + } + private: + typedef IOBuffer::iterator It; + + IOBuffer mBuffer; + It mCurIt; + It mEndIt; + char* mCurPtr; + char* mEndPtr; + int mSize; + bool mReadFailureFlag; + /* SetPtrs: cache the current segment's readable (and, for failed stripes, writable) byte range. Cast template argument restored (stripped by extraction). */ + void SetPtrs() + { + mCurPtr = const_cast<char*>(mCurIt->Consumer()); + mEndPtr = mCurPtr + mCurIt->BytesConsumable(); + if (mReadFailureFlag) { + mEndPtr += mCurIt->SpaceAvailable(); + } + } /* MakeScratchBuffer: build an inSize-byte scratch area by replicating inBuf's space, trimming the last copy. Cast restored (stripped by extraction). */ + int MakeScratchBuffer( + const IOBufferData& inBuf, + int inSize) + { + mReadFailureFlag = true; + mSize
= inSize; + mBuffer.Clear(); + const int theAvail = inBuf.SpaceAvailable(); + QCASSERT(theAvail % kAlign == 0); + for (int theRem = mSize; theRem > 0; theRem -= theAvail) { + if (theAvail <= theRem) { + mBuffer.Append(inBuf); + } else { + char* const thePtr = const_cast<char*>(inBuf.Producer()); + mBuffer.Append( + IOBufferData(inBuf, thePtr, thePtr + theRem, thePtr)); + break; + } + } + return Reset(); + } + int Reset() + { + mCurIt = mBuffer.begin(); + mEndIt = mBuffer.end(); + if (mCurIt == mEndIt) { + mCurPtr = 0; + mEndPtr = 0; + } else { + SetPtrs(); + } + return mSize; + } + private: + BufIterator( + const BufIterator& inIt); + BufIterator& operator=( + const BufIterator& inIt); + }; + /* RecoveryInfo: remembers which stripes of the current chunk block are known bad, so subsequent requests in the same block schedule recovery reads immediately instead of failing first. */ + class RecoveryInfo + { + public: + typedef RSReadStriper Outer; + + IOBuffer mBuffer; + Offset mPos; + Offset mChunkBlockStartPos; + int mSize; + int mMissingCnt; + int mMissingIdx[kMaxRecoveryStripes]; + + RecoveryInfo() + : mBuffer(), + mPos(-1), + mChunkBlockStartPos(-1), + mSize(0), + mMissingCnt(0) + { + for (int i = 0; i < kMaxRecoveryStripes; i++) { + mMissingIdx[i] = -1; + } + } + void ClearBuffer() + { + mBuffer.Clear(); + mSize = 0; + mPos = -1; + } + void Clear() + { + ClearBuffer(); + mMissingCnt = 0; + mChunkBlockStartPos = -1; + for (int i = 0; i < kMaxRecoveryStripes; i++) { + mMissingIdx[i] = -1; + } + } /* Set: record the failed stripe indices of a finished request so later requests in the same chunk block can reuse them. */ + void Set( + Outer& inOuter, + Request& inRequest) + { + if (inRequest.IsFailed() || inRequest.mRecoverySize <= 0 || + (inRequest.mBadStripeCount <= 0 && + inOuter.mRecoverStripeIdx < inOuter.mStripeCount)) { + Clear(); + return; + } + mPos = inRequest.mPos; + mChunkBlockStartPos = mPos - mPos % inOuter.mChunkBlockSize; + mMissingCnt = 0; + const int theBufCount = inOuter.GetBufferCount(); + for (int i = 0; + i < theBufCount && mMissingCnt < kMaxRecoveryStripes; + i++) { + Buffer& theBuf = inRequest.GetBuffer(i); + if (theBuf.IsFailed()) { + mMissingIdx[mMissingCnt++] = i; + } + } + if (mMissingCnt <= 0) { + Clear(); + } + // TODO: use mBuffer to save recovery
result if request is not + // aligned. + } /* SetIfEmpty: seed recovery info for a chunk-recovery run of the block containing inPos: mark inBadStripeIdx missing and randomly choose the remaining kMaxRecoveryStripes-1 stripes to omit, keeping mMissingIdx sorted (insertion sort over a shrinking index space) so chunk-server read load spreads uniformly. No-op when info for this block already exists. */ + void SetIfEmpty( + Outer& inOuter, + Offset inPos, + int inBadStripeIdx) + { + QCASSERT( + inPos >= 0 && + inBadStripeIdx >= 0 && + inBadStripeIdx < inOuter.GetBufferCount() + ); + if (mChunkBlockStartPos >= 0 && mChunkBlockStartPos <= inPos && + inPos < mChunkBlockStartPos + inOuter.mChunkBlockSize) { + return; + } + Clear(); + mPos = inPos; + mChunkBlockStartPos = mPos - mPos % inOuter.mChunkBlockSize; + mMissingCnt = 1; + mMissingIdx[0] = inBadStripeIdx; + const int theCnt = inOuter.GetBufferCount(); + int theSwappedIdx[kMaxRecoveryStripes]; + theSwappedIdx[0] = theCnt - mMissingCnt; + // Randomly select which stripes to use for RS recovery in order to + // attempt to uniformly distribute the chunk server read load. + while (mMissingCnt < kMaxRecoveryStripes) { + int theIdx = inOuter.Rand() % (theCnt - mMissingCnt); + int i; + for (i = 0; i < mMissingCnt; i++) { + if (theIdx == mMissingIdx[i]) { + theIdx = theSwappedIdx[i]; + theSwappedIdx[i] = theCnt - (mMissingCnt + 1); + while (i > 0 && theIdx < mMissingIdx[i - 1]) { + i--; + } + } + // Insertion sort.
+ if (theIdx < mMissingIdx[i]) { + for (int k = mMissingCnt; i < k; k--) { + mMissingIdx[k] = mMissingIdx[k - 1]; + theSwappedIdx[k] = theSwappedIdx[k - 1]; + } + mMissingIdx[i] = theIdx; + mMissingCnt++; + theSwappedIdx[i] = theCnt - mMissingCnt; + break; + } + } + if (i >= mMissingCnt) { + mMissingIdx[mMissingCnt] = theIdx; + theSwappedIdx[mMissingCnt] = theCnt - (mMissingCnt + 1); + mMissingCnt++; + } + } + } /* Get: apply the remembered missing stripes to a new request in the same chunk block: mark them failed up front and schedule the matching recovery-stripe reads, so the request goes straight into recovery instead of failing first. */ + void Get( + Outer& inOuter, + Request& inRequest) + { + if (mChunkBlockStartPos < 0 || inRequest.mRecoverySize > 0) { + return; + } + if (inRequest.mPos < mChunkBlockStartPos || + inRequest.mPos >= + mChunkBlockStartPos + inOuter.mChunkBlockSize) { + Clear(); + return; + } + QCASSERT( + inRequest.mBadStripeCount == 0 && + inRequest.mInFlightCount == 0 + ); + const int theBufCount = inOuter.GetBufferCount(); + int i; + for (i = 0; ; i++) { + const int theIdx = i < mMissingCnt ? + mMissingIdx[i] : inOuter.mStripeCount; + QCASSERT(theIdx >= 0); + if (theIdx >= inOuter.mStripeCount) { + QCASSERT(theIdx < theBufCount); + if (inRequest.mRecoverySize > 0 || + (inRequest.mBadStripeCount <= 0 && + inOuter.mRecoverStripeIdx < + inOuter.mStripeCount)) { + break; + } + inRequest.InitRecovery(inOuter); + if (inRequest.mRecoverySize <= 0 || + inRequest.mBadStripeCount >= mMissingCnt) { + break; + } + i = 0; + continue; + } + QCASSERT(theIdx < inOuter.mStripeCount); + Buffer& theBuf = inRequest.GetBuffer(theIdx); + const int theSize = theBuf.MarkFailed(); + if (theSize <= 0) { + continue; + } + QCASSERT( + inRequest.mBadStripeCount < mMissingCnt && + inRequest.mPendingCount >= theSize + ); + inRequest.mPendingCount -= theSize; + inRequest.mBadStripeCount++; + KFS_LOG_STREAM_DEBUG << inOuter.mLogPrefix << + "get read recovery info:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " block: " << mChunkBlockStartPos << + " stripe: " << theIdx << + " bad: " << inRequest.mBadStripeCount << + " pending: " << inRequest.mPendingCount << + KFS_LOG_EOM; + } + if
(inRequest.mRecoverySize <= 0) { + return; + } /* Schedule recovery-stripe reads, marking remembered-missing recovery stripes failed as well. */ + for (int l = 0, k = inOuter.mStripeCount; + l < inRequest.mBadStripeCount && k < theBufCount; + l++, k++) { + Buffer& theBuf = inRequest.GetBuffer(k); + inRequest.mPendingCount += theBuf.InitRecoveryRead( + inOuter, + inRequest.mRecoveryPos + k * (Offset)CHUNKSIZE, + inRequest.mRecoverySize + ); + if (i < mMissingCnt && mMissingIdx[i] == k) { + const int theSize = theBuf.MarkFailed(); + QCASSERT( + theSize > 0 && + inRequest.mBadStripeCount < mMissingCnt && + inRequest.mPendingCount >= theSize + ); + inRequest.mPendingCount -= theSize; + inRequest.mBadStripeCount++; + i++; + KFS_LOG_STREAM_DEBUG << inOuter.mLogPrefix << + "get read recovery info:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " block: " << mChunkBlockStartPos << + " stripe: " << k << + " bad: " << inRequest.mBadStripeCount << + " pending: " << inRequest.mPendingCount << + KFS_LOG_EOM; + } + } + } + private: + RecoveryInfo( + const RecoveryInfo& inInfo); + RecoveryInfo& operator=( + const RecoveryInfo& inInfo); + }; + friend class RecoveryInfo; + + // Chunk read request split threshold.
+ const int mMaxReadSize; + const bool mUseDefaultBufferAllocatorFlag; + const bool mFailShortReadsFlag; + const int mRecoverStripeIdx; + const Offset mFileSize; + const Offset mRecoverBlockPos; + const Offset mRecoverChunkEndPos; + RecoveryInfo mRecoveryInfo; + BufIterator* mBufIteratorsPtr; + IOBufferData* mZeroBufferPtr; + Offset mPendingCount; + uint32_t mNextRand; + Request* mPendingQueue[1]; + Request* mFreeList[1]; + Request* mInFlightList[1]; + /* RSReadStriper: Reed-Solomon striped reader. The mRecover* members are derived from inRecoverChunkPos and are only meaningful in chunk-recovery mode (inRecoverChunkPos >= 0, chunk-aligned); otherwise they stay -1. mNextRand seeds the stripe-selection PRNG from the initial sequence number. */ + RSReadStriper( + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + int inMaxAtomicReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Offset inRecoverChunkPos, + Offset inFileSize, + SeqNum inInitialSeqNum, + string inFilePrefix, + Impl& inOuter) + : Striper(inOuter), + RSStriper( + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inFilePrefix), + mMaxReadSize(inMaxAtomicReadRequestSize), + mUseDefaultBufferAllocatorFlag(inUseDefaultBufferAllocatorFlag), + mFailShortReadsFlag(inFailShortReadsFlag), + mRecoverStripeIdx( + inRecoverChunkPos < 0 ? -1 : + (int)(inRecoverChunkPos / (Offset)CHUNKSIZE % + (inStripeCount + inRecoveryStripeCount)) + ), + mFileSize(inFileSize), + mRecoverBlockPos( + inRecoverChunkPos < 0 ? Offset(-1) : + inRecoverChunkPos / (Offset)CHUNKSIZE / + (inStripeCount + inRecoveryStripeCount) * + inStripeCount * (Offset)CHUNKSIZE + ), + mRecoverChunkEndPos(inRecoverChunkPos < 0 ?
Offset(-1) : + inRecoverChunkPos + + GetChunkSize(mRecoverStripeIdx, mRecoverBlockPos, mFileSize)), + mRecoveryInfo(), + mBufIteratorsPtr(0), + mZeroBufferPtr(0), + mPendingCount(0), + mNextRand((uint32_t)inInitialSeqNum) + { + QCASSERT(inRecoverChunkPos < 0 || inRecoverChunkPos % CHUNKSIZE == 0); + Requests::Init(mPendingQueue); + Requests::Init(mFreeList); + Requests::Init(mInFlightList); + } /* QueueRequest: apply any remembered missing-stripe info, then enqueue. */ + void QueueRequest( + Request& inRequest) + { + mRecoveryInfo.Get(*this, inRequest); + Requests::PushBack(mPendingQueue, inRequest); + } /* Read: move every pending request to the in-flight list and start it. */ + void Read() + { + Request* thePtr; + while((thePtr = Requests::PopFront(mPendingQueue))) { + Requests::PushBack(mInFlightList, *thePtr); + thePtr->Read(*this); + } + } /* RecoverChunk: rebuild inLength bytes of the chunk being recovered by reading the corresponding ranges of the other stripes and running RS recovery; returns inLength, or an error code on invalid parameters. Requests larger than one stripe must be stripe-aligned. */ + int RecoverChunk( + IOBuffer& inBuffer, + int inLength, + Offset inChunkOffset, + RequestId inRequestId) + { + QCASSERT( + mRecoverStripeIdx >= 0 && + mRecoverStripeIdx < mStripeCount + mRecoveryStripeCount && + mRecoverBlockPos >= 0 + ); + const int kChunkSize = (int)CHUNKSIZE; + if (inChunkOffset < 0 || inChunkOffset >= kChunkSize || + inLength > mMaxReadSize || inLength <= 0 || + (inLength > mStripeSize && (inLength % mStripeSize != 0 || + inChunkOffset % mStripeSize != 0)) || + inBuffer.begin() != inBuffer.end()) { + return kErrorParameters; + } + SetPos( + mRecoverBlockPos + + inChunkOffset / mStripeSize * mStrideSize + + inChunkOffset % mStripeSize + + ((mRecoverStripeIdx < mStripeCount && inLength <= mStripeSize) ?
+ mStripeSize * mRecoverStripeIdx : 0) + ); + Request& theRequest = GetRequest(inRequestId, GetPos(), 0); + theRequest.mRecoverySize = -inLength; + theRequest.mRecoveryPos = GetChunkBlockStartFilePos() + inChunkOffset; + mRecoveryInfo.SetIfEmpty(*this, theRequest.mPos, mRecoverStripeIdx); + Offset thePos = theRequest.mRecoveryPos; + if (mStripeSize < inLength) { + for (int i = 0; i < mStripeCount; i++) { + Buffer& theBuf = theRequest.GetBuffer(i); + theBuf.mBuf.mSize = inLength; + theBuf.mPos = thePos; + theRequest.mSize += inLength; + thePos += kChunkSize; + } + theRequest.mPendingCount = theRequest.mSize; + } else { + const int theStripeToReadIdx = + mRecoverStripeIdx < mStripeCount ? mRecoverStripeIdx : 0; + Buffer& theBuf = theRequest.GetBuffer(theStripeToReadIdx); + theBuf.mBuf.mSize = inLength; + theBuf.mPos = GetFilePos(); + theRequest.mSize += inLength; + theRequest.mPendingCount = theRequest.mSize; + if (mStripeCount <= mRecoverStripeIdx) { + mRecoveryInfo.Get(*this, theRequest); + theRequest.InitRecovery(*this); + QCASSERT(theRequest.mRecoverySize == inLength); + } + } + QueueRequest(theRequest); + Read(); + return inLength; + } /* InvalidChunkSize (3/4-arg form): locate one stripe whose observed read size contradicts inExpectedSize and mark it invalid; only a single stripe is invalidated per call. */ + void InvalidChunkSize( + Request& inRequest, + Buffer& inBuf, + int inExpectedSize, + int inPrevStripeIdx = 0) + { + if (inBuf.GetSize() > 0) { + const int theSize = inBuf.GetReadSize(); + if (theSize >= 0 && theSize < inExpectedSize) { + InvalidChunkSize(inRequest, inBuf); + return; + } + } + const int theCnt = GetBufferCount(); + for (int i = inPrevStripeIdx; i < theCnt; i++) { + Buffer& theCBuf = inRequest.GetBuffer(i); + int theSize; + if (&theCBuf == &inBuf || ((theSize = theCBuf.GetReadSize()) >= 0 && + theSize < inExpectedSize)) { + // Invalidate only one at a time. + InvalidChunkSize(inRequest, theCBuf); + return; + } + } + InvalidChunkSize(inRequest, inBuf); + } /* InvalidChunkSize (2-arg form): mark the given stripe's chunk invalid, fail the request with kErrorInvalidChunkSizes, and (outside chunk-recovery mode) report the invalid chunk to the meta server. */ + void InvalidChunkSize( + Request& inRequest, + Buffer& inBuf) + { + if (!
inBuf.IsFailed()) { + inRequest.mBadStripeCount++; + } + inRequest.mStatus = kErrorInvalidChunkSizes; + inBuf.SetSetatus(kErrorInvalidChunkSizes); + if (mRecoverStripeIdx < 0) { + ReportInvalidChunk( + inBuf.mChunkId, + inBuf.mChunkVersion, + kErrorInvalidChunkSizes, + "rs recovery: invalid chunk size" + ); + } + } /* UpdateMaxLength: clamp ioMaxLength down to inSize, rounding down to a kAlign multiple when large enough. */ + static void UpdateMaxLength( + int inSize, + int& ioMaxLength) + { + if (ioMaxLength <= inSize) { + return; + } + ioMaxLength = inSize; + if (ioMaxLength > kAlign) { + ioMaxLength -= ioMaxLength % kAlign; + } + } /* SetBufIterator: (definition continues beyond this excerpt) set up stripe inIdx's BufIterator for recovery and cross-check its read size against the other stripes; records missing stripes, shrinks the recovery size when no recovery stripe is available, and marks chunks invalid on inconsistent sizes. Returns false when the request must fail. */ + bool SetBufIterator( + Request& inRequest, + int inIdx, + int& ioMissingCnt, + int& ioRecoverySize, + int& ioMaxRd, + int& ioFirstGoodRecoveryStripeIdx, + int& ioMaxLength, + int* inMissingIdxPtr, + int& ioEndPosIdx, + int& ioEndPos, + int& ioEndPosHead) + { + BufIterator& theIt = mBufIteratorsPtr[inIdx]; + int theRdSize = 0; + Buffer& theBuf = inRequest.GetBuffer(inIdx); + if (theIt.Set(*this, theBuf, theRdSize) != inRequest.mRecoverySize && + (theIt.IsRequested() || + inIdx < mStripeCount || + ioMissingCnt >= kMaxRecoveryStripes)) { + InternalError("invalid recovery buffer length"); + inRequest.mStatus = kErrorIO; + return false; + } + if (theIt.IsFailure() || !
theIt.IsRequested()) { + if (ioMissingCnt >= kMaxRecoveryStripes) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << /* FIX(review): original streamed the pointer inMissingIdxPtr here, logging an address; the guard above compares against kMaxRecoveryStripes, so log that limit. */ + " more than " << kMaxRecoveryStripes << + " stripe reads failed" << + KFS_LOG_EOM; + inRequest.mStatus = kErrorIO; + return false; + } + inMissingIdxPtr[ioMissingCnt++] = inIdx; + if (inIdx == GetBufferCount() - 1 && + ioFirstGoodRecoveryStripeIdx < 0 && + ioMaxRd < ioRecoverySize) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "read recovery:" + " no recovery stripes available:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " size: " << ioRecoverySize << + " max read: " << ioMaxRd << + " end:" + " stripe: " << ioEndPosIdx << + " pos: " << ioEndPos << + KFS_LOG_EOM; + if (ioMaxRd < 0) { + InternalError("invalid max read size"); + inRequest.mStatus = kErrorIO; + return false; + } + ioRecoverySize = ioMaxRd; + UpdateMaxLength(ioMaxRd, ioMaxLength); + } + return true; + } + if (inIdx >= mStripeCount) { + if (ioFirstGoodRecoveryStripeIdx < 0) { + ioFirstGoodRecoveryStripeIdx = inIdx; + if (ioMaxRd < theRdSize) { + if (ioMaxRd >= 0) { + for (int i = mStripeCount - 1; i >= 0; i--) { + Buffer& theCBuf = inRequest.GetBuffer(i); + const int theSize = theCBuf.GetReadSize(); + if (theSize < 0) { + continue; + } + const int theExtraSize = + i <= 0 ? + 0 : + ((ioEndPosHead <= 0 || ioEndPosIdx < i) ? + mStripeSize : + (ioEndPosIdx == i ?
+ mStripeSize - ioEndPosHead : + 0)); + if (theSize + theExtraSize < theRdSize) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " wrong stripe read size:" + " got: " << theSize << + " extra: " << theExtraSize << + " expected: " << theRdSize << + " stripe: " << i << + " pos: " << theCBuf.mBufL.GetPos() << + " size: " << theCBuf.GetSize() << + " chunk: " << theBuf.mChunkId << + " version: " << theBuf.mChunkVersion << + " end:" + " stripe: " << ioEndPosIdx << + " size: " << ioEndPos << + " head: " << ioEndPosHead << + KFS_LOG_EOM; + InvalidChunkSize(inRequest, theCBuf); + return false; + } + } + } + ioMaxRd = theRdSize; + } + } + if (theRdSize != ioMaxRd) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " wrong recovery stripe read size:" + " got: " << theRdSize << + " expected: " << ioMaxRd << + " stripe: " << inIdx << + " pos: " << theBuf.mBufL.GetPos() << + " size: " << theBuf.GetSize() << + " first recov: " << ioFirstGoodRecoveryStripeIdx << + " chunk: " << theBuf.mChunkId << + " version: " << theBuf.mChunkVersion << + KFS_LOG_EOM; + InvalidChunkSize(inRequest, theBuf, + max(theRdSize, ioMaxRd), mStripeCount); + return false; + } + } else if (ioMaxRd < theRdSize) { + if (ioMaxRd >= 0) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " previous short read:" + " got: " << ioMaxRd << + " expected: " << theRdSize << + " stripe: " << inIdx << + " pos: " << theBuf.mBufL.GetPos() << + " size: " << theBuf.GetSize() << + " chunk: " << theBuf.mChunkId << + " version: " << theBuf.mChunkVersion << + KFS_LOG_EOM; + InvalidChunkSize(inRequest, theBuf, theRdSize); + return false; + } + ioMaxRd = theRdSize; + } else if (theRdSize + mStripeSize < ioMaxRd) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " <<
inRequest.mPos << + "," << inRequest.mSize << + " short read:" + " got: " << theRdSize << + " expected: " << ioMaxRd << + " stripe: " << inIdx << + " pos: " << theBuf.mBufL.GetPos() << + " size: " << theBuf.GetSize() << + " chunk: " << theBuf.mChunkId << + " version: " << theBuf.mChunkVersion << + KFS_LOG_EOM; + InvalidChunkSize(inRequest, theBuf); + return false; + } else if (ioEndPosHead >= 0 && theRdSize > 0 && + ioEndPos - ioEndPosHead < theRdSize) { + Buffer& theCBuf = inRequest.GetBuffer(ioEndPosIdx); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery failure:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " previous short read:" + " stripe: " << ioEndPosIdx << + " stripe head: " << ioEndPosHead << + " got: " << ioEndPos << + " expected: " << theRdSize << + " cur stripe: " << inIdx << + " pos: " << theBuf.mBufL.GetPos() << + " size: " << theBuf.GetReadSize() << + " chunk: " << theBuf.mChunkId << + " version: " << theBuf.mChunkVersion << + KFS_LOG_EOM; + InvalidChunkSize(inRequest, theCBuf); + return false; + } + if (theRdSize >= ioRecoverySize) { + return true; + } /* Track where the shortest stripe ends (ioEndPosIdx/ioEndPos/ioEndPosHead) to validate later stripes against it. */ + if (inIdx < mStripeCount) { + if (ioEndPosHead < 0) { + ioEndPosIdx = inIdx; + ioEndPos = theRdSize; + ioEndPosHead = + (inRequest.mRecoveryPos + theRdSize) % mStripeSize; + } else if (ioEndPosHead == 0 && ioMaxRd <= ioEndPos) { + if (ioEndPos != theRdSize) { + ioEndPosHead = mStripeSize - (ioEndPos - theRdSize); + if (ioEndPosHead < 0 || ioEndPosHead >= mStripeSize) { + InternalError("undetected previous short read"); + inRequest.mStatus = kErrorIO; + return false; + } + } + ioEndPosIdx = inIdx; + ioEndPos = theRdSize; + } + } else { + if (ioEndPosHead == 0) { + if (ioEndPos == theRdSize) { + // The recovery stripe ends at the stripe boundary. + if (ioEndPosIdx < mStripeCount - 1) { + // Then next stripe could be the last, and it *must* be + // missing. + ioEndPosIdx++; + if (!
inRequest.GetBuffer(ioEndPosIdx).IsFailed()) { + InternalError( + "undetected previous invalid read"); + inRequest.mStatus = kErrorIO; + return false; + } + } + } else if (ioEndPosIdx == 0 || + theRdSize < ioEndPos || + ioEndPos + mStripeSize < theRdSize || + (ioEndPos + mStripeSize != theRdSize && + ! inRequest.GetBuffer(0).IsFailed())) { + InternalError("undetected previous invalid read size"); + inRequest.mStatus = kErrorIO; + return false; + } else { + ioEndPosHead = theRdSize - ioEndPos; + if (ioEndPosHead < mStripeSize) { + ioEndPosIdx = 0; // Partial first stripe. + ioEndPos = theRdSize; + } else { + ioEndPosHead = 0; // Last stripe index doesn't change. + } + } + } else if (ioEndPosHead < 0) { + ioEndPosHead = + (inRequest.mRecoveryPos + theRdSize) % mStripeSize; + ioEndPos = theRdSize; + if (ioEndPosHead != 0) { + // Partial first stripe, otherwise recovery end is stripe + // aligned. It is possible to get here with 1+3 encoding + // (which probably makes sense for testing only) when the + // first stripe is missing. + ioEndPosIdx = 0; + } else { + ioEndPosIdx = mStripeCount - 1; + } + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "read recovery:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " size: " << ioRecoverySize << + " recovery stripe read size: " << theRdSize << + " end:" + " stripe: " << ioEndPosIdx << + " pos: " << ioEndPos << + " head: " << ioEndPosHead << + KFS_LOG_EOM; + ioRecoverySize = theRdSize; + UpdateMaxLength(theRdSize, ioMaxLength); + } + return true; + } + void InitRecoveryStripeRestore( + Request& inRequest, + int inIdx, + int inRecoverySize, + int inMissingCnt, + int& ioRRdSize) + { + // Init recovery stripe buffers and iterators, if recovery stripe itself + // needs to be computed. RecoverChunk() the above must schedule the + // first stripe read in order for "hole" / block end logic to work + // properly: the recovery stripes length is always greater or equal to + // the the first stripe. 
+ const int theBufCount = GetBufferCount(); + QCRTASSERT( + inIdx >= mStripeCount && + inIdx <= theBufCount && + inRequest.GetBuffer(0).mBuf.mSize > 0 + ); + if (ioRRdSize < 0) { + if (inMissingCnt >= inIdx + 1) { + // All preceding stripes and the current one are missing or + // absent (hole), find the size from any successfully read + // recovery stripe. + // SetBufIterator() will declare an error later if recovery + // stripe reads don't have the same size. + for (int k = inIdx + 1; k < theBufCount; k++) { + ioRRdSize = max( + ioRRdSize, + inRequest.GetBuffer(k).GetReadSize() + ); + } + } else { + ioRRdSize = inRecoverySize; + } + } + BufIterator& theIt = mBufIteratorsPtr[inIdx]; + theIt.Clear(); + if (inIdx == mRecoverStripeIdx) { + Buffer& theBuf = inRequest.GetBuffer(inIdx); + theBuf.Clear(); + theBuf.mPos = inRequest.mRecoveryPos + ioRRdSize + + inIdx * (Offset)CHUNKSIZE; + theBuf.mBufL.mSize = ioRRdSize; + theBuf.MarkFailed(); + int theRdSize = 0; + QCVERIFY(theIt.Set(*this, theBuf, theRdSize) == ioRRdSize); + } else { + theIt.MakeScratchBuffer(*this, ioRRdSize); + } + } + void FinishRecovery( + Request& inRequest) + { + if (mStripeCount <= mRecoverStripeIdx && inRequest.mRecoverySize <= 0) { + QCASSERT(mRecoverStripeIdx < GetBufferCount()); + inRequest.mRecoverySize = -inRequest.mRecoverySize; + } + if ((inRequest.mBadStripeCount <= 0 && + mRecoverStripeIdx < mStripeCount) || + inRequest.mRecoverySize <= 0) { + return; + } + if (inRequest.mBadStripeCount > mRecoveryStripeCount) { + if (inRequest.mStatus == 0) { + inRequest.mStatus = kErrorIO; + } + return; + } + const int theBufCount = GetBufferCount(); + if (! 
mBufIteratorsPtr) { + mBufIteratorsPtr = new BufIterator[theBufCount]; + } + int theMissingIdx[kMaxRecoveryStripes]; + int theMissingCnt = 0; + int theSize = inRequest.mRecoverySize; + int thePrevLen = 0; + int theMaxRd = -1; + int theRecovIdx = -1; + int theEndPosIdx = mStripeCount; + int theEndPos = theSize; + int theNextEndPos = theSize; + int theRSize = -1; + int theBufToCopyCount = mStripeCount; + int theEndPosHead = -1; + for (int thePos = 0; thePos < theSize; ) { + int theLen = theSize - thePos; + if (theLen > kAlign) { + theLen -= theLen % kAlign; + } + for (int i = 0; i < theBufCount; i++) { + if (thePos == 0 && + ! SetBufIterator( + inRequest, + i, + theMissingCnt, + theSize, + theMaxRd, + theRecovIdx, + theLen, + theMissingIdx, + theEndPosIdx, + theEndPos, + theEndPosHead)) { + QCASSERT( + mRecoverStripeIdx < mStripeCount || + inRequest.mStatus != 0 + ); + for (int k = 0; k <= i; k++) { + mBufIteratorsPtr[k].Clear(); + } + mRecoveryInfo.Set(*this, inRequest); + return; + } + BufIterator& theIt = mBufIteratorsPtr[i]; + if (i >= mStripeCount && + (theIt.IsFailure() || ! theIt.IsRequested())) { + if (mRecoverStripeIdx < mStripeCount) { + mBufPtr[i] = 0; + continue; + } + if (thePos == 0) { + InitRecoveryStripeRestore( + inRequest, i, theSize, theMissingCnt, theRSize); + theBufToCopyCount = theBufCount; + } + } + if (theSize <= 0) { + QCASSERT(thePos == 0); + mBufPtr[i] = 0; + continue; // Hole of eof, continue to check the chunk sizes. 
+ } + char* thePtr = theIt.Advance(thePrevLen); + const int theRem = theIt.GetCurRemaining(); + QCASSERT(thePtr); + if (theRem < kAlign || (thePtr - (char*)0) % kAlign != 0) { + thePtr = GetTempBufPtr(i); + if (theLen > kTempBufSize) { + theLen = kTempBufSize; + QCASSERT(theLen % kAlign == 0); + } + if (theIt.IsFailure()) { + QCASSERT( + i < mStripeCount || + mRecoverStripeIdx >= mStripeCount + ); + } else { + const int theSize = theIt.CopyOut(thePtr, theLen); + if (theSize < theLen) { + theLen = theSize; + if (theLen > kAlign) { + theLen -= theLen % kAlign; + } + } + } + } else if (theRem < theLen) { + theLen = theRem - theRem % kAlign; + } + mBufPtr[i] = thePtr; + } + const int kMissingCnt = 3; + if (thePos == 0) { + if (theEndPosHead >= 0) { + const int theChunkStridePos = + GetChunkPos(inRequest.mRecoveryPos) + theEndPos - + (theEndPosHead > 0 ? theEndPosHead : + (theEndPos < theSize ? 0 : mStripeSize)); + const Offset theBlockPos = + inRequest.mPos - inRequest.mPos % mChunkBlockSize; + const Offset theHolePos = + theBlockPos + + theChunkStridePos * mStripeCount + + theEndPosIdx * mStripeSize + + theEndPosHead; + if (! mFailShortReadsFlag) { + if (theHolePos <= inRequest.mPos) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "read recovery:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " size: " << theSize << + " hole [" << theHolePos << + "," << (theBlockPos + mChunkBlockSize) << + ")" << + KFS_LOG_EOM; + break; // Hole or eof -- nothing to do. 
+ } + } else if (theHolePos < inRequest.mPos + inRequest.mSize) { + Buffer& theBuf = inRequest.GetBuffer(theEndPosIdx); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "read recovery:" + " short read detected:" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " hole: " << theHolePos << + " size: " << theSize << + " end:" + " stripe: " << theEndPosIdx << + " size: " << theEndPos << + " read: " << theBuf.GetReadSize() << + " head: " << theEndPosHead << + KFS_LOG_EOM; + if (theEndPosIdx < mStripeCount && + ! theBuf.IsFailed()) { + InvalidChunkSize(inRequest, theBuf); + } else { + // Assume that the recovery stripe size is invalid. + InvalidChunkSize( + inRequest, + inRequest.GetBuffer(theBufCount - 1), + theSize + 1, + mStripeCount + ); + } + for (int i = 0; i < theBufCount; i++) { + mBufIteratorsPtr[i].Clear(); + } + mRecoveryInfo.Set(*this, inRequest); + return; + } + } + QCASSERT( + theMissingCnt == kMissingCnt || + inRequest.mRecoveryRound > 0 + ); + for (int i = theBufCount - 1; + theMissingCnt < kMissingCnt && i >= mStripeCount; + i--) { + if (mBufPtr[i]) { + // If recovery stripe restore requested, all recovery + // stripe buffers must be present for rs_decode3 to + // work. In this case just declare the stipe missing, + // rs_decode3 always recalculate all 3 recovery stripes. 
+ if (mRecoverStripeIdx < mStripeCount) { + mBufIteratorsPtr[i].Clear(); + mBufPtr[i] = 0; + } + theMissingIdx[theMissingCnt++] = i; + } + } + if (theEndPosIdx + 1 < mStripeCount) { + theNextEndPos = max(0, theEndPos - theEndPosHead); + } + } + QCRTASSERT( + theLen > 0 && + (theLen % kAlign == 0 || theLen < kAlign) && + theMissingCnt == kMissingCnt + ); + if (thePos == 0 || thePos + theLen >= theSize) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "read recovery" + " req: " << inRequest.mPos << + "," << inRequest.mSize << + " pos: " << inRequest.mRecoveryPos << + "+" << thePos << + " size: " << theLen << + " of: " << theSize << + KFS_LOG_EOM; + } + rs_decode3( + theBufCount, + max(theLen, (int)kAlign), + theMissingIdx[0], + theMissingIdx[1], + theMissingIdx[2], + mBufPtr + ); + for (int i = 0; i < theBufToCopyCount; i++) { + BufIterator& theIt = mBufIteratorsPtr[i]; + const int theCpLen = (i < theEndPosIdx || i >= mStripeCount) ? + theLen : + min(theLen, (i == theEndPosIdx ? + theEndPos : theNextEndPos) - thePos); + if (theCpLen <= 0) { + QCRTASSERT(i < mStripeCount); + i = mStripeCount - 1; + continue; + } + if (! theIt.IsFailure()) { + continue; + } + QCVERIFY(theIt.CopyIn(mBufPtr[i], theCpLen) == theCpLen); + } + thePos += theLen; + thePrevLen = theLen; + } + mRecoveryInfo.Set(*this, inRequest); + for (int i = 0; i < mStripeCount; i++) { + mBufIteratorsPtr[i].SetRecoveryResult(inRequest.GetBuffer(i)); + } + for (int i = mStripeCount; i < theBufCount; i++) { + if (mRecoverStripeIdx == i) { + Buffer& theBuf = inRequest.GetBuffer(i); + // Use middle buffer for RequestCompletion() to work. 
+ if (theBuf.mBuf.mSize <= 0) { + QCASSERT( + theBuf.mPos >= 0 && + theBuf.mPos >= theBuf.mBufL.mSize + ); + theBuf.mPos -= theBuf.mBufL.mSize; + theBuf.mBuf.mSize = theBuf.mBufL.mSize; + theBuf.mBuf.mBuffer.Clear(); + theBuf.mBuf.MarkFailed(); + theBuf.mBufL.mSize = 0; + theBuf.mBufL.mBuffer.Clear(); + mBufIteratorsPtr[i].SetRecoveryResult(theBuf); + } + } else { + inRequest.GetBuffer(i).Clear(); + mBufIteratorsPtr[i].Clear(); + } + } + } + template + static void IOBufferWrite( + IOBuffer& inBuffer, + const T& inVal) + { + inBuffer.CopyIn(reinterpret_cast(&inVal), sizeof(inVal)); + } + void RequestCompletion( + Request& inRequest) + { + QCASSERT(inRequest.mPendingCount == 0); + Requests::Remove(mInFlightList, inRequest); + IOBuffer theBuffer; + if (mRecoverStripeIdx >= 0) { + const int theStatus = inRequest.mStatus; + const Offset thePos = + inRequest.mRecoveryPos + mRecoverStripeIdx * (Offset)CHUNKSIZE; + const int theSize = inRequest.mRecoverySize >= 0 ? + inRequest.mRecoverySize : -inRequest.mRecoverySize; + const RequestId theId = inRequest.mRequestId; + if (theStatus == 0) { + Buffer& theBuf = inRequest.GetBuffer(mRecoverStripeIdx); + QCASSERT(thePos == theBuf.mPos); + theBuffer.Move(&theBuf.mBuf.mBuffer, + (int)(mRecoverChunkEndPos - thePos)); + } else if (theStatus == kErrorInvalidChunkSizes) { + // Report chunks with invalid sizes. 
+ const int theBufCount = GetBufferCount(); + for (int i = 0; i < theBufCount; i++) { + const Buffer& theBuf = inRequest.GetBuffer(i); + if (theBuf.GetStatus() != kErrorInvalidChunkSizes) { + continue; + } + IOBufferWrite(theBuffer, i); + IOBufferWrite(theBuffer, theBuf.mChunkId); + IOBufferWrite(theBuffer, theBuf.mChunkVersion); + } + } + PutRequest(inRequest); + ReportCompletion(theStatus, theBuffer, theSize, thePos, theId); + return; + } + SetPos(inRequest.mPos); + int theLen = inRequest.mSize; + while (theLen > 0) { + const int theSize = min(theLen, GetStripeRemaining()); + Buffer& theBuf = inRequest.GetBuffer(GetStripeIdx()); + theBuffer.MoveSpace(&theBuf.mBuf.mBuffer, theSize); + theLen -= theSize; + // The last seek sets position for the next sequential read request. + // The next SetPos() call will have nothing to do in this case. + if (SeekStripe(theSize) && theLen != 0) { + InternalError("invalid request size"); + } + } + const int theStatus = inRequest.mStatus; + const int theSize = inRequest.mSize; + const Offset thePos = inRequest.mPos; + const RequestId theId = inRequest.mRequestId; + PutRequest(inRequest); + ReportCompletion(theStatus, theBuffer, theSize, thePos, theId); + } + Request& GetRequest( + RequestId inRequestId, + Offset inPos, + int inSize) + { + if (Requests::IsEmpty(mFreeList)) { + return Request::Create(*this, inRequestId, inPos, inSize); + } + Request& theRet = *Requests::PopFront(mFreeList); + theRet.Reset(*this, inRequestId, inPos, inSize); + return theRet; + } + void PutRequest( + Request& inRequest) + { + inRequest.Reset(*this, RequestId(), 0, 0); + Requests::PushFront(mFreeList, inRequest); + } + static bool IsFailure( + int inStatus) + { return (inStatus != 0 && inStatus != kErrorNoEntry); } + void ZeroPaddTo( + IOBuffer& inBuffer, + int inSize) + { + int theRem = inSize - inBuffer.BytesConsumable(); + theRem -= inBuffer.ZeroFillSpaceAvailable(theRem); + inBuffer.RemoveSpaceAvailable(); + while (theRem > 0) { + IOBufferData 
theBuf = GetZeroBuffer(); + theRem -= theBuf.Trim(theRem); + inBuffer.Append(theBuf); + } + } + IOBufferData GetZeroBuffer() + { + if (! mZeroBufferPtr) { + if (mUseDefaultBufferAllocatorFlag) { + mZeroBufferPtr = new IOBufferData(); + QCASSERT( + (mZeroBufferPtr->Producer() - (char*)0) % kAlign == 0 && + mZeroBufferPtr->SpaceAvailable() > 0 && + mZeroBufferPtr->SpaceAvailable() % kAlign == 0 + ); + } else { + mZeroBufferPtr = new IOBufferData(NewDataBuffer(4 << 10)); + } + mZeroBufferPtr->ZeroFill(mZeroBufferPtr->SpaceAvailable()); + } + return *mZeroBufferPtr; + } + char* GetTempBufPtr( + int inIndex) + { + return GetTempBufSelfPtr(inIndex, mStripeCount + mRecoveryStripeCount); + } + int Rand() + { + mNextRand = mNextRand * 1103515245 + 12345; + return (int)((uint32_t)(mNextRand / 65536) % 32768); + } +private: + RSReadStriper( + const RSReadStriper& inRSReadStriper); + RSReadStriper& operator=( + const RSReadStriper& inRSReadStriper); +}; + +Writer::Striper* +RSStriperCreate( + Writer::Striper::StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + Writer::Striper::Offset inFileSize, + string inLogPrefix, + Writer::Striper::Impl& inOuter, + Writer::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) +{ + return RSWriteStriper::Create( + inType, + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + inFileSize, + inLogPrefix, + inOuter, + outOpenChunkBlockSize, + outErrMsg + ); +} + +Reader::Striper* +RSStriperCreate( + Reader::Striper::StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + int inMaxReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Reader::Striper::Offset inRecoverChunkPos, + Reader::Striper::Offset inFileSize, + Reader::Striper::SeqNum inInitialSeqNum, + string inLogPrefix, + Reader::Striper::Impl& inOuter, + Reader::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) +{ + return RSReadStriper::Create( + inType, 
+ inStripeCount, + inRecoveryStripeCount, + inStripeSize, + inMaxReadRequestSize, + inUseDefaultBufferAllocatorFlag, + inFailShortReadsFlag, + inRecoverChunkPos, + inFileSize, + inInitialSeqNum, + inLogPrefix, + inOuter, + outOpenChunkBlockSize, + outErrMsg + ); +} + +}} /* namespace client KFS */ diff --git a/src/cc/libclient/RSStriper.h b/src/cc/libclient/RSStriper.h new file mode 100644 index 000000000..b4da614ef --- /dev/null +++ b/src/cc/libclient/RSStriper.h @@ -0,0 +1,66 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/07/27 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef RSSTRIPER_H +#define RSSTRIPER_H + +#include "Writer.h" +#include "Reader.h" + +namespace KFS +{ +namespace client +{ +using std::string; + +Writer::Striper* RSStriperCreate( + Writer::Striper::StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + Writer::Striper::Offset inFileSize, + string inLogPrefix, + Writer::Striper::Impl& inOuter, + Writer::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg); + +Reader::Striper* RSStriperCreate( + Reader::Striper::StriperType inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + int inMaxAtomicReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Reader::Striper::Offset inRecoverChunkPos, + Reader::Striper::Offset inFileSize, + Reader::Striper::SeqNum inInitialSeqNum, + string inLogPrefix, + Reader::Striper::Impl& inOuter, + Reader::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg); +}} + +#endif /* RSSTRIPER_H */ diff --git a/src/cc/libclient/Reader.cc b/src/cc/libclient/Reader.cc new file mode 100644 index 000000000..6b7ab458f --- /dev/null +++ b/src/cc/libclient/Reader.cc @@ -0,0 +1,2392 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/07/13 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "Reader.h" + +#include +#include +#include +#include +#include +#include + +#include "kfsio/IOBuffer.h" +#include "kfsio/NetManager.h" +#include "kfsio/checksum.h" +#include "kfsio/ITimeout.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCDLList.h" +#include "KfsOps.h" +#include "utils.h" +#include "KfsClient.h" +#include "RSStriper.h" + +namespace KFS +{ +namespace client +{ + +using std::min; +using std::max; +using std::string; +using std::ostream; +using std::ostringstream; +using std::random_shuffle; +using std::vector; +using std::pair; +using std::make_pair; + +// Kfs client read state machine implementation. +class Reader::Impl : public QCRefCountedObj +{ +public: + typedef QCRefCountedObj::StRef StRef; + + enum + { + kErrorNone = 0, + kErrorParameters = -EINVAL, + kErrorIO = -EIO, + kErrorTryAgain = -EAGAIN, + kErrorNoEntry = -ENOENT, + kErrorBusy = -EBUSY, + kErrorChecksum = -EBADCKSUM, + kErrorLeaseExpired = -ELEASEEXPIRED, + kErrorFault = -EFAULT, + kErrorInvalChunkSize = -EINVALCHUNKSIZE + }; + + Impl( + Reader& inOuter, + MetaServer& inMetaServer, + Completion* inCompletionPtr, + int inMaxRetryCount, + int inTimeSecBetweenRetries, + int inOpTimeoutSec, + int inIdleTimeoutSec, + int inMaxReadSize, + int inLeaseRetryTimeout, + int inLeaseWaitTimeout, + string inLogPrefix, + int64_t inChunkServerInitialSeqNum) + : QCRefCountedObj(), + mOuter(inOuter), + mMetaServer(inMetaServer), + mPathName(), + mFileId(-1), + mClosingFlag(false), + mErrorCode(0), + mIdleTimeoutSec(inIdleTimeoutSec), + mOpTimeoutSec(inOpTimeoutSec), + mMaxRetryCount(inMaxRetryCount), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + mMaxReadSize(inMaxReadSize), + 
mLeaseRetryTimeout(inLeaseRetryTimeout), + mLeaseWaitTimeout(inLeaseWaitTimeout), + mSkipHolesFlag(false), + mFailShortReadsFlag(false), + mMaxGetAllocRetryCount(inMaxRetryCount), + mOffset(0), + mOpenChunkBlockSize(0), + mChunkServerInitialSeqNum(inChunkServerInitialSeqNum), + mCompletionPtr(inCompletionPtr), + mLogPrefix(inLogPrefix), + mStats(), + mChunkServersStats(), + mNetManager(mMetaServer.GetNetManager()), + mStriperPtr(0), + mCompletionDepthCount(0) + { Readers::Init(mReaders); } + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + bool inSkipHolesFlag, + bool inUseDefaultBufferAllocatorFlag, + Offset inRecoverChunkPos, + bool inFailShortReadsFlag) + { + const char* const theFileNamePtr = inFileNamePtr ? inFileNamePtr : ""; + if (inFileId <= 0 || (! *theFileNamePtr && inRecoverChunkPos < 0)) { + return kErrorParameters; + } + if (mFileId > 0) { + if (inFileId == mFileId && + theFileNamePtr == mPathName) { + return mErrorCode; + } + return kErrorParameters; + } + if (IsOpen() && mErrorCode != 0) { + return (mErrorCode < 0 ? mErrorCode : -mErrorCode); + } + if (mClosingFlag) { + return kErrorTryAgain; + } + QCASSERT(Readers::IsEmpty(mReaders)); + delete mStriperPtr; + string theErrMsg; + mStriperPtr = 0; + mOpenChunkBlockSize = Offset(CHUNKSIZE); + mStriperPtr = Striper::Create( + inStriperType, + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + mMaxReadSize, + inUseDefaultBufferAllocatorFlag, + inFailShortReadsFlag, + inRecoverChunkPos, + inFileSize, + mChunkServerInitialSeqNum, + mLogPrefix, + *this, + mOpenChunkBlockSize, + theErrMsg + ); + if (! theErrMsg.empty()) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + theErrMsg << + KFS_LOG_EOM; + return kErrorParameters; + } + if (! 
mStriperPtr || mOpenChunkBlockSize < Offset(CHUNKSIZE)) { + mOpenChunkBlockSize = Offset(CHUNKSIZE); + } + mStats.Clear(); + mSkipHolesFlag = inSkipHolesFlag; + mPathName = theFileNamePtr; + mErrorCode = 0; + mFileId = inFileId; + mFailShortReadsFlag = inFailShortReadsFlag; + return 0; + } + int Close() + { + if (! IsOpen()) { + return 0; + } + if (mErrorCode != 0) { + return mErrorCode; + } + if (mClosingFlag) { + return kErrorTryAgain; + } + mClosingFlag = true; + return StartRead(); + } + int Read( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId) + { + if (inOffset < 0) { + return kErrorParameters; + } + if (mErrorCode != 0) { + return mErrorCode; + } + if (mClosingFlag || ! IsOpen()) { + return kErrorParameters; + } + if (inLength <= 0) { + IOBuffer theBuf; + return ( + ReportCompletion(0, 0, 0, inOffset, &theBuf, inRequestId) ? + mErrorCode : 0 + ); + } + return StartRead(inBuffer, inLength, inOffset, inRequestId); + } + void Stop() + { + while (! Readers::IsEmpty(mReaders)) { + delete Readers::Front(mReaders); + } + mClosingFlag = false; + } + void Shutdown() + { + Stop(); + delete mStriperPtr; + mStriperPtr = 0; + mFileId = -1; + mErrorCode = 0; + } + bool IsOpen() const + { return (mFileId > 0); } + bool IsClosing() const + { return (IsOpen() && mClosingFlag); } + bool IsActive() const + { + return ( + IsOpen() && ( + ! 
Readers::IsEmpty(mReaders) || + mClosingFlag) + ); + } + void DisableCompletion() + { mCompletionPtr = 0; } + void Register( + Completion* inCompletionPtr) + { + if (inCompletionPtr == mCompletionPtr) { + return; + } + if (mCompletionPtr) { + mCompletionPtr->Unregistered(mOuter); + } + mCompletionPtr = inCompletionPtr; + } + bool Unregister( + Completion* inCompletionPtr) + { + if (inCompletionPtr != mCompletionPtr) { + return false; + } + mCompletionPtr = 0; + return true; + } + void GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) + { + outStats = mStats; + outChunkServersStats = mChunkServersStats; + } + bool GetErrorCode() const + { return mErrorCode; } + +private: + typedef KfsNetClient ChunkServer; + + class ChunkReader : private ITimeout, private KfsNetClient::OpOwner + { + public: + class ReadOp; + typedef QCDLList Queue; + typedef QCDLList Readers; + + class ReadOp : public KFS::client::ReadOp + { + public: + struct RequestEntry + { + RequestEntry( + size_t inSize = 0, + RequestId inRequestId = RequestId(), + RequestId inStriperRequestId = RequestId()) + : mSize(inSize), + mRequestId(inRequestId), + mStriperRequestId(inStriperRequestId), + mCancelFlag(false) + {} + + size_t mSize; + RequestId mRequestId; + RequestId mStriperRequestId; + bool mCancelFlag; + }; + typedef vector Requests; + + time_t mOpStartTime; + IOBuffer mBuffer; + IOBuffer mTmpBuffer; + RequestId mRequestId; + RequestId mStriperRequestId; + Requests mRequests; + bool mRetryIfFailsFlag; + bool mFailShortReadFlag; + bool mCancelFlag; + + ReadOp( + int inOpSize, + Offset inOffset, + RequestId inRequestId, + RequestId inStriperRequestId, + bool inRetryIfFailsFlag, + bool inFailShortReadFlag) + : KFS::client::ReadOp(-1, -1, -1), + mOpStartTime(0), + mBuffer(), + mTmpBuffer(), + mRequestId(inRequestId), + mStriperRequestId(inStriperRequestId), + mRequests(), + mRetryIfFailsFlag(inRetryIfFailsFlag), + mFailShortReadFlag(inFailShortReadFlag), + mCancelFlag(false) + { + 
Queue::Init(*this); + numBytes = inOpSize; + offset = inOffset; + } + void Delete( + ReadOp** inQueuePtr) + { + Queue::Remove(inQueuePtr, *this); + delete this; + } + private: + ReadOp* mPrevPtr[1]; + ReadOp* mNextPtr[1]; + + friend class QCDLListOp; + virtual ~ReadOp() + {} + private: + ReadOp( + const ReadOp& inOp); + ReadOp& operator=( + const ReadOp& inOp); + }; + + ChunkReader( + Impl& inOuter, + int64_t inSeqNum, + const string& inLogPrefix) + : ITimeout(), + KfsNetClient::OpOwner(), + mOuter(inOuter), + mChunkServer( + inOuter.mNetManager, + string(), -1, + // All chunk server retries are handled here + 0, // inMaxRetryCount + 0, // inTimeSecBetweenRetries, + inOuter.mOpTimeoutSec, + inOuter.mIdleTimeoutSec, + inSeqNum, + inLogPrefix.c_str(), + // Just fail the op. Error handler will reset connection and + // cancel all pending ops by calling Stop() + false, // inResetConnectionOnOpTimeoutFlag + // Allow some slack and ensure that content size limit is + // reasonably large. + int(min( + int64_t(inOuter.mMaxReadSize) + (64 << 10), + int64_t(std::numeric_limits::max()) + )) + ), + mErrorCode(0), + mRetryCount(0), + mOpenChunkBlockFileOffset(-1), + mOpStartTime(0), + mGetAllocOp(0, -1, -1), + mLeaseAcquireOp(0, -1, ""), + mLeaseRenewOp(0, -1, 0, ""), + mLeaseRelinquishOp(0, -1, 0), + mSizeOp(0, -1, 0), + mLastOpPtr(0), + mLastMetaOpPtr(0), + mChunkServerIdx(0), + mLeaseRenewTime(Now() - 1), + mLeaseExpireTime(mLeaseRenewTime), + mLeaseWaitStartTime(0), + mLeaseRetryCount(0), + mSleepingFlag(false), + mClosingFlag(false), + mChunkServerSetFlag(false), + mStartReadRunningFlag(false), + mRestartStartReadFlag(false), + mLogPrefix(inLogPrefix), + mOpsNoRetryCount(0), + mDeletedFlagPtr(0), + mRunningCompletionPtr(0) + { + Queue::Init(mPendingQueue); + Queue::Init(mInFlightQueue); + Queue::Init(mCompletionQueue); + Readers::Init(*this); + Readers::PushFront(mOuter.mReaders, *this); + mChunkServer.SetRetryConnectOnly(true); + mGetAllocOp.fileOffset = -1; + 
mGetAllocOp.chunkId = -1; + mLeaseAcquireOp.chunkId = -1; + mLeaseAcquireOp.leaseId = -1; + mSizeOp.size = -1; + mGetAllocOp.status = 0; + } + ~ChunkReader() + { + ChunkReader::Shutdown(); + ChunkServer::Stats theStats; + mChunkServer.GetStats(theStats); + mOuter.mChunkServersStats.Add(theStats); + Readers::Remove(mOuter.mReaders, *this); + if (mDeletedFlagPtr) { + *mDeletedFlagPtr = true; + } + StRunningCompletion::Delete(mRunningCompletionPtr); + } + Offset GetSize() const + { return mSizeOp.size; } + void CancelClose() + { + if (mClosingFlag) { + if (mLastOpPtr == &mLeaseRelinquishOp) { + mOuter.mMetaServer.Cancel(mLastOpPtr, this); + } + mClosingFlag = false; + } + } + // The QueueRead() guarantees that completion will not be invoked. + // The reads will be queued even if the reader is already in the error + // state: mErrorCode != 0. In the case of fatal error all pending writes + // are discarded when the writer gets deleted. + // + // StartRead() must be called in order to start executing pending + // reads. + // This allows the caller to properly update its state before the reads + // get executed, and the corresponding completion(s) invoked. + int QueueRead( + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + RequestId inRequestId, + RequestId inStriperRequestId, + bool inRetryIfFailsFlag, + bool inFailShortReadFlag) + { + int theSize = inSize; + if (theSize <= 0) { + return 0; + } + if (inOffset < 0) { + return kErrorParameters; + } + QCRTASSERT(inOffset >= 0 && ! 
mClosingFlag); + const Offset kMaxChunkSize = (Offset)CHUNKSIZE; + const Offset theChunkOffset = inOffset % kMaxChunkSize; + if (mGetAllocOp.fileOffset < 0) { + mGetAllocOp.fid = mOuter.mFileId; + mGetAllocOp.filename = mOuter.mPathName; + mGetAllocOp.fileOffset = inOffset - theChunkOffset; + mOpenChunkBlockFileOffset = mGetAllocOp.fileOffset - + mGetAllocOp.fileOffset % mOuter.mOpenChunkBlockSize; + } else { + QCRTASSERT(mGetAllocOp.fileOffset == inOffset - theChunkOffset); + } + Offset thePos = theChunkOffset; + theSize = min(theSize, (int)(kMaxChunkSize - thePos)); + QCASSERT(theSize > 0); + // Try to use the last pending op. + ReadOp* const theLastOpPtr = Queue::Back(mPendingQueue); + if (theLastOpPtr) { + ReadOp& theOp = *theLastOpPtr; + const int theOpSize = theOp.numBytes; + const Offset theOpPos = theOp.offset; + if (theOpPos + theOpSize == thePos && + theOp.mRetryIfFailsFlag == inRetryIfFailsFlag && + theOp.mFailShortReadFlag == inFailShortReadFlag && + theOpSize + theSize <= mOuter.mMaxReadSize) { + if (theOp.mRequests.empty()) { + theOp.mRequests.push_back(ReadOp::RequestEntry( + theOp.numBytes, + theOp.mRequestId, + theOp.mStriperRequestId + )); + } + QCVERIFY(theOpSize <= + theOp.mBuffer.EnsureSpaceAvailable(theOpSize) + ); + theOp.mRequests.push_back(ReadOp::RequestEntry( + size_t(theSize), + inRequestId, + inStriperRequestId + )); + theOp.numBytes += theSize; + thePos += theSize; + theOp.mBuffer.MoveSpaceAvailable(&inBuffer, theSize); + theSize = 0; + } + } + const int theMaxReadSize = max(1, mOuter.mMaxReadSize); + while (theSize > 0) { + const int theOpSize = min(theMaxReadSize, theSize); + ReadOp& theOp = *(new ReadOp( + theOpSize, + thePos, + inRequestId, + inStriperRequestId, + inRetryIfFailsFlag, + inFailShortReadFlag + )); + if (! 
inRetryIfFailsFlag) { + mOpsNoRetryCount++; + } + thePos += theOpSize; + theSize -= theOpSize; + theOp.mBuffer.MoveSpaceAvailable(&inBuffer, theOpSize); + Queue::PushBack(mPendingQueue, theOp); + } + QCRTASSERT(thePos <= kMaxChunkSize && theSize >= 0); + return (int)(thePos - theChunkOffset); + } + void StartRead() + { + // Unwind recursion from the possible synchronous op completion. + if (mStartReadRunningFlag) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "unwiding recursion:" << + " filepos: " << mGetAllocOp.fileOffset << + " chunkid: " << mGetAllocOp.chunkId << + " restart: " << mRestartStartReadFlag << + KFS_LOG_EOM; + mRestartStartReadFlag = true; + return; + } + mStartReadRunningFlag = true; + QCStDeleteNotifier theDeleteNotifier(mDeletedFlagPtr); + do { + mRestartStartReadFlag = false; + StartReadSelf(); + if (theDeleteNotifier.IsDeleted()) { + return; // Unwind. + } + } while (mRestartStartReadFlag); + mStartReadRunningFlag = false; + } + void Close() + { + if (! mClosingFlag && mErrorCode == 0 && IsOpen()) { + mClosingFlag = true; + StartRead(); + } + } + void Shutdown() + { + Reset(); + QCRTASSERT(Queue::IsEmpty(mInFlightQueue)); + Queue::PushBackList(mCompletionQueue, mPendingQueue); + while (! Queue::IsEmpty(mCompletionQueue)) { + Queue::Front(mCompletionQueue)->Delete(mCompletionQueue); + } + QCRTASSERT( + Queue::IsEmpty(mInFlightQueue) && + Queue::IsEmpty(mPendingQueue) && + Queue::IsEmpty(mCompletionQueue) + ); + mClosingFlag = false; + mErrorCode = 0; + mOpsNoRetryCount = 0; + } + Offset GetFileOffset() const + { + return ((mErrorCode == 0 && ! mClosingFlag) ? + mGetAllocOp.fileOffset : -1); + } + Offset GetOpenChunkBlockFileOffset() const + { + return (GetFileOffset() >= 0 ? mOpenChunkBlockFileOffset : -1); + } + bool IsIdle() const + { + return ( + Queue::IsEmpty(mPendingQueue) && + Queue::IsEmpty(mInFlightQueue) && + Queue::IsEmpty(mCompletionQueue) && + ! 
mClosingFlag + ); + } + bool IsOpen() const + { + return ( + mErrorCode == 0 && + mGetAllocOp.fileOffset >= 0 && + ! mClosingFlag + ); + } + int GetErrorCode() const + { return mErrorCode; } + void CancelRead() + { + QCASSERT(mOuter.mStriperPtr); + const bool theRestartFlag = + ! Queue::IsEmpty(mInFlightQueue) && + Queue::IsEmpty(mPendingQueue); + CancelRead(mInFlightQueue); + if (Queue::IsEmpty(mInFlightQueue)) { + mChunkServer.Stop(); // Discard replies if any. + mChunkServerSetFlag = false; + } + CancelRead(mPendingQueue); + CancelRead(mCompletionQueue); + StRunningCompletion::Cancel( + mRunningCompletionPtr, *mOuter.mStriperPtr); + if (mSleepingFlag) { + if (! CanRead()) { + Timeout(); + } + } else if (theRestartFlag && ! Queue::IsEmpty(mPendingQueue)) { + StartRead(); + } + } + ChunkReader* GetPrevPtr() + { + ChunkReader& thePrev = ReadersListOp::GetPrev(*this); + return (&thePrev == this ? 0 : &thePrev); + } + kfsChunkId_t GetChunkId() const + { return mGetAllocOp.chunkId; } + int64_t GetChunkVersion() const + { + return (mGetAllocOp.chunkId >= 0 ? + mGetAllocOp.chunkVersion : int64_t(-1)); + } + + private: + class StRunningCompletion + { + public: + StRunningCompletion( + StRunningCompletion*& inHeadPtr, + ReadOp::Requests& inRequests) + : mRequests(), + mNextPtr(0), + mHeadPtr(0) + { + if (inRequests.empty()) { + return; + } + mNextPtr = inHeadPtr; + inHeadPtr = this; + mHeadPtr = &inHeadPtr; + mRequests.swap(inRequests); + } + ~StRunningCompletion() + { + if (! 
mHeadPtr) { + return; + } + *mHeadPtr = mNextPtr; + } + static void Cancel( + StRunningCompletion* inHeadPtr, + Striper& inStriper) + { + StRunningCompletion* thePtr = inHeadPtr; + while (thePtr) { + thePtr->CancelSelf(inStriper); + thePtr = thePtr->mNextPtr; + } + } + static void Delete( + StRunningCompletion*& inHeadPtr) + { + StRunningCompletion* thePtr = inHeadPtr; + while (thePtr) { + thePtr->mHeadPtr = 0; + thePtr = thePtr->mNextPtr; + } + inHeadPtr = 0; + } + ReadOp::Requests mRequests; + private: + StRunningCompletion* mNextPtr; + StRunningCompletion** mHeadPtr; + + void CancelSelf( + Striper& inStriper) + { + for (ReadOp::Requests::iterator theIt = mRequests.begin(); + theIt != mRequests.end(); + ++theIt) { + if (! theIt->mCancelFlag && + inStriper.CanCancelRead(theIt->mStriperRequestId)) { + theIt->mCancelFlag = true; + } + } + } + private: + StRunningCompletion( + StRunningCompletion& inCompl); + StRunningCompletion& operator=( + StRunningCompletion& inCompl); + }; + + Impl& mOuter; + ChunkServer mChunkServer; + int mErrorCode; + int mRetryCount; + Offset mOpenChunkBlockFileOffset; + time_t mOpStartTime; + GetAllocOp mGetAllocOp; + LeaseAcquireOp mLeaseAcquireOp; + LeaseRenewOp mLeaseRenewOp; + LeaseRelinquishOp mLeaseRelinquishOp; + SizeOp mSizeOp; + KfsOp* mLastOpPtr; + KfsOp* mLastMetaOpPtr; + size_t mChunkServerIdx; + time_t mLeaseRenewTime; + time_t mLeaseExpireTime; + time_t mLeaseWaitStartTime; + int mLeaseRetryCount; + bool mSleepingFlag; + bool mClosingFlag; + bool mChunkServerSetFlag; + bool mStartReadRunningFlag; + bool mRestartStartReadFlag; + string const mLogPrefix; + int mOpsNoRetryCount; + bool* mDeletedFlagPtr; + StRunningCompletion* mRunningCompletionPtr; + ReadOp* mPendingQueue[1]; + ReadOp* mInFlightQueue[1]; + ReadOp* mCompletionQueue[1]; + ChunkReader* mPrevPtr[1]; + ChunkReader* mNextPtr[1]; + + friend class QCDLListOp; + typedef QCDLListOp ReadersListOp; + + bool CanRead() + { return (! 
Queue::IsEmpty(mPendingQueue)); } + bool IsMetaOp( + const KfsOp* inOpPtr) const + { + return ( + inOpPtr == &mGetAllocOp || + inOpPtr == &mLeaseAcquireOp || + inOpPtr == &mLeaseRenewOp || + inOpPtr == &mLeaseRelinquishOp + ); + } + void CancelMetaOps() + { + if (! mLastMetaOpPtr) { + return; + } + if (! mOuter.mMetaServer.Cancel(mLastMetaOpPtr, this)) { + mOuter.InternalError("failed to cancel meta op"); + } + mLastMetaOpPtr = 0; + } + void StartReadSelf() + { + if (mSleepingFlag) { + return; + } + if (mErrorCode != 0) { + mClosingFlag = false; + return; + } + if (mClosingFlag && ! CanRead()) { + if (! Queue::IsEmpty(mInFlightQueue)) { + return; + } + mChunkServer.Stop(); + if (mLeaseAcquireOp.leaseId >= 0 && + mLeaseAcquireOp.chunkId > 0) { + if (mLastOpPtr != &mLeaseRelinquishOp) { + CloseChunk(); + } + return; + } + CancelMetaOps(); + mClosingFlag = false; + mGetAllocOp.fileOffset = -1; + mGetAllocOp.chunkId = -1; + mLeaseAcquireOp.leaseId = -1; + mChunkServerSetFlag = false; + ReportCompletion(); + return; + } + if (! CanRead()) { + return; + } + if (mGetAllocOp.chunkId > 0 && mChunkServer.WasDisconnected()) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "detected chunk server disconnect: " << + mChunkServer.GetServerLocation() << + " queue: " << (Queue::IsEmpty(mPendingQueue) ? "" : "not") << + " empty" << + KFS_LOG_EOM; + Reset(); + if (! CanRead()) { + return; + } + } + // Return immediately after calling Read() and GetAlloc(), as + // these can invoke completion. Completion, in turn, can delete + // this. + // Other methods of this class have to return immediately (unwind) + // after invoking Read(). + if (mGetAllocOp.chunkId > 0) { + Read(); + } else if (mGetAllocOp.status == kErrorNoEntry) { + Done(mGetAllocOp, false, 0); + } else if (! 
mLastOpPtr) { + Reset(); + GetAlloc(); + } + } + void Read() + { + if (mLeaseAcquireOp.leaseId < 0 || + mLeaseAcquireOp.chunkId != mGetAllocOp.chunkId || + mLeaseExpireTime <= Now()) { + GetLease(); + return; + } + QCStDeleteNotifier theDeleteNotifier(mDeletedFlagPtr); + if (mLeaseRenewTime <= Now()) { + RenewLease(); + if (theDeleteNotifier.IsDeleted()) { + return; // Unwind. + } + // OK read while renew is in flight, as long as the lease hasn't + // expired yet. + if (mLeaseRenewTime <= Now()) { + if (mLeaseExpireTime <= Now()) { + GetLease(); + } + return; + } + } + if (! mChunkServerSetFlag) { + QCASSERT(mChunkServerIdx < mGetAllocOp.chunkServers.size()); + mChunkServerSetFlag = true; + mChunkServer.SetServer( + mGetAllocOp.chunkServers[mChunkServerIdx]); + } + if (mSizeOp.size < 0) { + GetChunkSize(); + return; + } + Queue::Iterator theIt(mPendingQueue); + ReadOp* theOpPtr; + while ( ! mRestartStartReadFlag && + ! mSleepingFlag && + mErrorCode == 0 && + mGetAllocOp.chunkId > 0 && + (theOpPtr = theIt.Next())) { + Read(*theOpPtr); + if (theDeleteNotifier.IsDeleted()) { + return; // Unwind. + } + } + } + void GetAlloc() + { + QCASSERT(mGetAllocOp.fileOffset >= 0 && mGetAllocOp.fid > 0); + Reset(mGetAllocOp); + mGetAllocOp.chunkServers.clear(); + mGetAllocOp.serversOrderedFlag = false; + EnqueueMeta(mGetAllocOp); + } + void Done( + GetAllocOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&inOp == &mGetAllocOp && ! inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status == kErrorNoEntry) { + // Fail all ops. + inOp.chunkId = -1; + ReportCompletionForAll(inOp.status); + return; + } + if (inOp.status != 0 || mGetAllocOp.chunkServers.empty()) { + mGetAllocOp.chunkId = -1; + HandleError(inOp); + return; + } + if (! 
mGetAllocOp.serversOrderedFlag) { + random_shuffle( + mGetAllocOp.chunkServers.begin(), + mGetAllocOp.chunkServers.end() + ); + } + mChunkServerIdx = 0; + StartRead(); + } + void GetLease() + { + QCASSERT(mGetAllocOp.fileOffset >= 0 && mGetAllocOp.fid > 0); + if (mLeaseAcquireOp.chunkId != mGetAllocOp.chunkId || + mLeaseAcquireOp.status != kErrorBusy) { + mLeaseWaitStartTime = Now(); + mLeaseRetryCount = 0; + } + CancelMetaOps(); + Reset(mLeaseAcquireOp); + mLeaseAcquireOp.chunkId = mGetAllocOp.chunkId; + mLeaseAcquireOp.pathname = mGetAllocOp.filename.c_str(); + mLeaseAcquireOp.leaseId = -1; + mLeaseExpireTime = Now() + LEASE_INTERVAL_SECS; + mLeaseRenewTime = Now() + (LEASE_INTERVAL_SECS + 1) / 2; + mOuter.mStats.mGetLeaseCount++; + EnqueueMeta(mLeaseAcquireOp); + } + void Done( + LeaseAcquireOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&inOp == &mLeaseAcquireOp && ! inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status == 0 && mLeaseExpireTime < Now()) { + inOp.status = kErrorLeaseExpired; + } + if (inOp.status != 0) { + mLeaseAcquireOp.leaseId = -1; + mLeaseRenewTime = Now() - 1; + mLeaseExpireTime = mLeaseRenewTime; + HandleError(inOp); + return; + } + StartRead(); + } + void RenewLease() + { + QCASSERT( + mGetAllocOp.fileOffset >= 0 && + mGetAllocOp.fid > 0 && + mGetAllocOp.chunkId > 0 && + mLeaseAcquireOp.leaseId >= 0 && + (! mLastMetaOpPtr || mLastMetaOpPtr == &mLeaseRenewOp) + ); + CancelMetaOps(); + Reset(mLeaseRenewOp); + mLeaseRenewOp.chunkId = mLeaseAcquireOp.chunkId; + mLeaseRenewOp.pathname = mGetAllocOp.filename.c_str(); + mLeaseRenewOp.leaseId = mLeaseAcquireOp.leaseId; + mLeaseExpireTime = Now() + LEASE_INTERVAL_SECS; + mLeaseRenewTime = Now() + (LEASE_INTERVAL_SECS + 1) / 2; + EnqueueMeta(mLeaseRenewOp); + } + void Done( + LeaseRenewOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&inOp == &mLeaseRenewOp && ! 
inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status != 0) { + mLeaseAcquireOp.leaseId = -1; + mLeaseRenewOp.leaseId = -1; + mLeaseRenewTime = Now() - 1; + mLeaseExpireTime = mLeaseRenewTime; + HandleError(inOp); + return; + } + if (Queue::IsEmpty(mInFlightQueue)) { + StartRead(); + } + } + void Read( + ReadOp& inReadOp) + { + QCASSERT( + mGetAllocOp.fileOffset >= 0 && + mGetAllocOp.fid > 0 && + mGetAllocOp.chunkId > 0 && + mLeaseAcquireOp.leaseId >= 0 && + mSizeOp.size >= 0 + ); + Reset(inReadOp); + inReadOp.mTmpBuffer.Clear(); + // Use tmp buffer until the op passes checksum verification to use + // the same buffers with retries. + inReadOp.mTmpBuffer.UseSpaceAvailable( + &inReadOp.mBuffer, inReadOp.numBytes); + inReadOp.chunkId = mGetAllocOp.chunkId; + inReadOp.chunkVersion = mGetAllocOp.chunkVersion; + inReadOp.mOpStartTime = Now(); + Queue::Remove(mPendingQueue, inReadOp); + Queue::PushBack(mInFlightQueue, inReadOp); + if (inReadOp.offset >= mSizeOp.size) { + QCASSERT(inReadOp.offset + inReadOp.numBytes <= CHUNKSIZE); + // Read after end of chunk. + inReadOp.status = 0; + inReadOp.statusMsg = "read offset past end of chunk"; + Done(inReadOp, false, &inReadOp.mTmpBuffer); + return; + } + mOuter.mStats.mOpsReadCount++; + Enqueue(inReadOp, &inReadOp.mTmpBuffer); + } + void Done( + ReadOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT( + inBufferPtr == &inOp.mTmpBuffer && + Queue::IsInList(mInFlightQueue, inOp) + ); + if (inOp.status == kErrorNoEntry && + mGetAllocOp.status != kErrorNoEntry) { + inOp.status = kErrorIO; + } + if (inCanceledFlag || inOp.status < 0 || ! VerifyChecksum(inOp) || + ! VerifyRead(inOp)) { + Queue::Remove(mInFlightQueue, inOp); + Queue::PushBack(mPendingQueue, inOp); + inOp.mTmpBuffer.Clear(); + if (inCanceledFlag) { + return; + } + mOpStartTime = inOp.mOpStartTime; + if (! 
inOp.mRetryIfFailsFlag && inOp.status != kErrorChecksum && + mChunkServerIdx + 1 >= + mGetAllocOp.chunkServers.size()) { + if (ReportCompletion(inOp, mPendingQueue)) { + StartRead(); + } + } else { + HandleError(inOp); + } + return; + } + const int theDoneCount = (int)inOp.contentLength; + QCASSERT( + theDoneCount >= 0 && + theDoneCount <= inOp.mTmpBuffer.BytesConsumable() && + inOp.contentLength <= inOp.numBytes + ); + mOuter.mStats.mReadByteCount += theDoneCount; + if (theDoneCount < inOp.mTmpBuffer.BytesConsumable()) { + // Move available space, if any, to the end of the short read. + IOBuffer theBuf; + theBuf.MoveSpaceAvailable(&inOp.mBuffer, theDoneCount); + inOp.mTmpBuffer.RemoveSpaceAvailable(); + inOp.mTmpBuffer.Trim(theDoneCount); + inOp.mTmpBuffer.MoveSpaceAvailable( + &inOp.mBuffer, (int)inOp.numBytes - theDoneCount); + } + inOp.mBuffer.RemoveSpaceAvailable(); + inOp.mBuffer.Move(&inOp.mTmpBuffer); + QCASSERT(theDoneCount == inOp.mBuffer.BytesConsumable()); + if (ReportCompletion(inOp, mInFlightQueue)) { + StartRead(); + } + } + bool ReportCompletion( + ReadOp& inOp, + ReadOp** inQueuePtr) + { + IOBuffer theBuffer; + theBuffer.Move(&inOp.mBuffer); + StRunningCompletion theCompl(mRunningCompletionPtr, inOp.mRequests); + const int theSize = (int)inOp.numBytes; + const RequestId theRequestId = inOp.mRequestId; + const RequestId theStriperRequestId = inOp.mStriperRequestId; + const int theStatus = min(0, inOp.status); + Offset theOffset = mGetAllocOp.fileOffset + inOp.offset; + if (mOpsNoRetryCount > 0 && + ! inOp.mRetryIfFailsFlag && + (inQueuePtr == mPendingQueue || + inQueuePtr == mInFlightQueue)) { + mOpsNoRetryCount--; + } + inOp.Delete(inQueuePtr); + if (theCompl.mRequests.empty()) { + if (! 
ReportCompletion( + theStatus, + theOffset, + theSize, + &theBuffer, + theRequestId, + theStriperRequestId)) { + return false; + } + } else { + for (ReadOp::Requests::iterator + theIt = theCompl.mRequests.begin(); + theIt != theCompl.mRequests.end(); + ++theIt) { + IOBuffer theReqBuf; + theReqBuf.MoveSpace(&theBuffer, theIt->mSize); + if (! theIt->mCancelFlag) { + theIt->mCancelFlag = true; + if (! ReportCompletion( + theStatus, + theOffset, + theIt->mSize, + &theReqBuf, + theIt->mRequestId, + theIt->mStriperRequestId)) { + return false; + } + } + theOffset += theIt->mSize; + } + } + return true; + } + bool VerifyChecksum( + ReadOp& inOp) + { + if (inOp.contentLength <= 0 && inOp.checksums.empty()) { + return true; + } + const vector theChecksums = + ComputeChecksums(&inOp.mTmpBuffer, inOp.contentLength); + if (theChecksums == inOp.checksums) { + return true; + } + if (theChecksums.size() != inOp.checksums.size()) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "checksum vector length mismatch:" + " chunk: " << inOp.chunkId << + " version: " << inOp.chunkVersion << + " offset: " << inOp.offset << + " expected: " << theChecksums.size() << + " got: " << inOp.checksums.size() << + KFS_LOG_EOM; + } else { + for (size_t i = 0; i < theChecksums.size(); i++) { + if (inOp.checksums[i] == theChecksums[i]) { + continue; + } + KFS_LOG_STREAM_ERROR << mLogPrefix << + "checksum mismatch:" + " chunk: " << inOp.chunkId << + " version: " << inOp.chunkVersion << + " offset: " << inOp.offset << + "+" << i << "*" << CHECKSUM_BLOCKSIZE << + " expected: " << theChecksums[i] << + " got: " << inOp.checksums[i] << + KFS_LOG_EOM; + } + } + inOp.status = kErrorChecksum; + inOp.statusMsg = "received checksum mismatch"; + return false; + } + bool VerifyRead( + ReadOp& inOp) + { + if (inOp.status < 0) { + return false; + } + const bool theShortReadExpectedFlag = + inOp.offset + (Offset)inOp.numBytes > mSizeOp.size; + if ((! 
inOp.mFailShortReadFlag && theShortReadExpectedFlag) || + inOp.contentLength >= inOp.numBytes) { + return true; + } + if (theShortReadExpectedFlag) { + inOp.status = kErrorInvalChunkSize; + inOp.statusMsg = "short read detected"; + } else { + inOp.status = kErrorIO; + inOp.statusMsg = "incomplete read detected"; + } + KFS_LOG_STREAM_ERROR << mLogPrefix << + inOp.statusMsg << ":" + " chunk: " << inOp.chunkId << + " version: " << inOp.chunkVersion << + " server: " << mChunkServer.GetServerLocation() << + " pos: " << inOp.offset << + " requested: " << inOp.numBytes << + " returned: " << inOp.contentLength << + " size: " << mSizeOp.size << + KFS_LOG_EOM; + if (inOp.chunkId > 0 && theShortReadExpectedFlag) { + // Report short chunk to the meta server. + mOuter.ReportInvalidChunk(inOp.chunkId, inOp.chunkVersion, + kErrorInvalChunkSize, inOp.statusMsg.c_str()); + } + return false; + } + void CloseChunk() + { + QCASSERT( + mLeaseAcquireOp.chunkId > 0 && + mLeaseAcquireOp.leaseId >= 0 && + &mLeaseRelinquishOp != mLastMetaOpPtr + ); + // Cancel in flight lease renew if any. + CancelMetaOps(); + Reset(mLeaseRelinquishOp); + mLeaseRelinquishOp.chunkId = mLeaseAcquireOp.chunkId; + mLeaseRelinquishOp.leaseId = mLeaseAcquireOp.leaseId; + mLeaseAcquireOp.leaseId = -1; + EnqueueMeta(mLeaseRelinquishOp); + } + void Done( + LeaseRelinquishOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&mLeaseRelinquishOp == &inOp && ! 
inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status != 0) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "lease relinquish failure, status: " << inOp.status << + " " << inOp.statusMsg << + " ignored" << + KFS_LOG_EOM; + } + Reset(); + mLeaseRenewOp.leaseId = -1; + mLeaseAcquireOp.leaseId = -1; + mLeaseAcquireOp.chunkId = -1; + mGetAllocOp.fileOffset = -1; + StartRead(); + } + void GetChunkSize() + { + QCASSERT( + mGetAllocOp.chunkId > 0 && + mLeaseAcquireOp.chunkId > 0 && + mLeaseAcquireOp.leaseId >= 0 + ); + Reset(mSizeOp); + mSizeOp.chunkId = mGetAllocOp.chunkId; + mSizeOp.chunkVersion = mGetAllocOp.chunkVersion; + Enqueue(mSizeOp); + } + void Done( + SizeOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&mSizeOp == &inOp && ! inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status != 0) { + HandleError(inOp); + return; + } + StartRead(); + } + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (inOpPtr) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "<- " << (inCanceledFlag ? "canceled " : "") << + inOpPtr->Show() << + " status: " << inOpPtr->status << + " msg: " << inOpPtr->statusMsg << + " seq: " << inOpPtr->seq << + " len: " << inOpPtr->contentLength << + " buffer: " << static_cast(inBufferPtr) << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "<- " << (inCanceledFlag ? "canceled " : "") << + "NULL operation completion?" << + " buffer: " << static_cast(inBufferPtr) << + "/" << (inBufferPtr ? 
inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + } + if (inCanceledFlag && IsMetaOp(inOpPtr)) { + mOuter.mStats.mMetaOpsCancelledCount++; + } + if (mLastOpPtr == inOpPtr) { + mLastOpPtr = 0; + } + if (mLastMetaOpPtr == inOpPtr) { + mLastMetaOpPtr = 0; + } + if (&mGetAllocOp == inOpPtr) { + Done(mGetAllocOp, inCanceledFlag, inBufferPtr); + } else if (&mLeaseAcquireOp == inOpPtr) { + Done(mLeaseAcquireOp, inCanceledFlag, inBufferPtr); + } else if (&mLeaseRenewOp == inOpPtr) { + Done(mLeaseRenewOp, inCanceledFlag, inBufferPtr); + } else if (&mLeaseRelinquishOp == inOpPtr) { + Done(mLeaseRelinquishOp, inCanceledFlag, inBufferPtr); + } else if (&mSizeOp == inOpPtr) { + Done(mSizeOp, inCanceledFlag, inBufferPtr); + } else if (inOpPtr->op == CMD_READ) { + Done(*static_cast(inOpPtr), + inCanceledFlag, inBufferPtr); + } else { + mOuter.InternalError("unexpected operation completion"); + } + } + void Enqueue( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, &mChunkServer); } + void EnqueueMeta( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, 0); } + void Reset() + { + CancelMetaOps(); + mLastOpPtr = 0; + mChunkServer.Stop(); + mChunkServerSetFlag = false; + QCASSERT(Queue::IsEmpty(mInFlightQueue)); + if (mSleepingFlag) { + mOuter.mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + } + static void Reset( + KfsOp& inOp) + { + inOp.seq = 0; + inOp.status = 0; + inOp.statusMsg.clear(); + inOp.checksum = 0; + inOp.contentLength = 0; + inOp.contentBufLen = 0; + delete [] inOp.contentBuf; + inOp.contentBuf = 0; + } + int GetTimeToNextRetry() const + { + return max(mRetryCount > 1 ? 
1 : 0, + mOuter.mTimeSecBetweenRetries - int(Now() - mOpStartTime)); + } + void HandleError( + KfsOp& inOp) + { + ostringstream theOStream; + inOp.Request(theOStream); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation" + " failure, seq: " << inOp.seq << + " status: " << inOp.status << + " msg: " << inOp.statusMsg << + " op: " << inOp.Show() << + " current chunk server: " << mChunkServer.GetServerLocation() << + " chunkserver: " << (mChunkServer.IsDataSent() ? + (mChunkServer.IsAllDataSent() ? "all" : "partial") : + "no") << " data sent" << + "\nRequest:\n" << theOStream.str() << + KFS_LOG_EOM; + int theTimeToNextRetry = 0; + bool theFailFlag = false; + bool theReadLeaseOtherFalureFlag = false; + if ((&mLeaseRenewOp == &inOp || &mLeaseAcquireOp == &inOp) && + ! (theReadLeaseOtherFalureFlag = + inOp.status != kErrorBusy && + inOp.status != kErrorLeaseExpired && + inOp.status != KfsNetClient::kErrorMaxRetryReached)) { + mOuter.mStats.mGetLeaseRetryCount++; + mLeaseRetryCount++; + theTimeToNextRetry = max(1, min( + mOuter.mLeaseRetryTimeout * max(1, mLeaseRetryCount) - + int(Now() - mOpStartTime), + int(mLeaseWaitStartTime + mOuter.mLeaseWaitTimeout - Now()) + )); + // Meta ops communication failures are automatically + // retried, declare failure if it isn't lease busy error. + theFailFlag = (inOp.status != kErrorBusy && + (&mLeaseRenewOp != &inOp || + inOp.status != kErrorLeaseExpired)) || + mLeaseWaitStartTime + mOuter.mLeaseWaitTimeout <= Now(); + } else if (&mGetAllocOp == &inOp) { + if (inOp.status == kErrorTryAgain) { + // No servers with this chunk available. + // Read should not be in flight, as the chunk id has not + // been determined yet. + QCASSERT(Queue::IsEmpty(mInFlightQueue)); + if (++mRetryCount >= mOuter.mMaxGetAllocRetryCount) { + theFailFlag = true; + } else { + theTimeToNextRetry = GetTimeToNextRetry(); + } + } else { + // Meta ops communication failures are automatically + // retried, declare failure. 
+ // Either chunk does not exists, or meta comm. failure has + // been declared. + theFailFlag = true; + } + } else { + mOuter.mStats.mRetriesCount++; + if (inOp.op == CMD_READ || &mSizeOp == &inOp || + theReadLeaseOtherFalureFlag) { + if (theReadLeaseOtherFalureFlag || + ++mChunkServerIdx >= mGetAllocOp.chunkServers.size()) { + mChunkServerIdx = 0; + if (inOp.op != CMD_READ || + inOp.status != kErrorChecksum) { + theTimeToNextRetry = GetTimeToNextRetry(); + } + mRetryCount++; + // Restart from get alloc, chunk might have been moved + // or re-replicated. + mGetAllocOp.status = 0; + mGetAllocOp.chunkId = -1; + } + // Always restart from get chunk size, [first] read failure + // might imply that reported chunk size wasn't valid. + // Chunk servers don't initially load chunk headers, instead + // stat() system call is used to compute chunk size. + mSizeOp.size = -1; + } else { + theTimeToNextRetry = GetTimeToNextRetry(); + } + } + if (mRetryCount >= mOuter.mMaxRetryCount || theFailFlag) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "giving up, retry count: " << mRetryCount << + KFS_LOG_EOM; + // Fail all ops. + ReportCompletionForAll( + inOp.status < 0 ? inOp.status : kErrorIO); + return; + } + if (&mGetAllocOp == &inOp || &mSizeOp == &inOp || + theReadLeaseOtherFalureFlag) { + if (! ReportCompletionForPendingWithNoRetryOnly( + inOp.status < 0 ? inOp.status : kErrorIO)) { + return; // Unwind. + } + if (Queue::IsEmpty(mPendingQueue) && + Queue::IsEmpty(mInFlightQueue)) { + Reset(); + mRetryCount = 0; + StartRead(); + return; + } + } + // Retry. + KFS_LOG_STREAM_INFO << mLogPrefix << + "scheduling retry: " << mRetryCount << + " of " << mOuter.mMaxRetryCount << + " in " << theTimeToNextRetry << " sec." << + " op: " << inOp.Show() << + KFS_LOG_EOM; + mErrorCode = 0; + Reset(); + Sleep(theTimeToNextRetry); + if (! 
mSleepingFlag) { + Timeout(); + } + } + bool ReportCompletionForPendingWithNoRetryOnly( + int inStatus) + { + if (mOpsNoRetryCount <= 0) { + return true; + } + mOpsNoRetryCount = 0; + Queue::Iterator theIt(mPendingQueue); + ReadOp* theOpPtr; + while ((theOpPtr = theIt.Next())) { + if (! theOpPtr->mRetryIfFailsFlag) { + Queue::Remove(mPendingQueue, *theOpPtr); + Queue::PushBack(mCompletionQueue, *theOpPtr); + } + } + return RunCompletionQueue(inStatus); + } + bool ReportCompletionForAll( + int inStatus) + { + Reset(); + QCRTASSERT(Queue::IsEmpty(mInFlightQueue)); + mOpsNoRetryCount = 0; + Queue::PushBackList(mCompletionQueue, mPendingQueue); + if (Queue::IsEmpty(mCompletionQueue)) { + return ReportCompletion(); + } + return RunCompletionQueue(inStatus); + } + bool RunCompletionQueue( + int inStatus) + { + const int theStatus = (inStatus == kErrorNoEntry && + mGetAllocOp.status != kErrorNoEntry) ? kErrorIO : inStatus; + ReadOp* theOpPtr; + while ((theOpPtr = Queue::Front(mCompletionQueue))) { + theOpPtr->status = theStatus; + if (theOpPtr->mFailShortReadFlag && theStatus == kErrorNoEntry) { + ReadOp& theOp = *theOpPtr; + theOp.status = kErrorInvalChunkSize; + theOp.statusMsg = "no such chunk -- hole"; + KFS_LOG_STREAM_ERROR << mLogPrefix << + theOp.statusMsg << ":" + " pos: " << mGetAllocOp.fileOffset << + " + " << theOp.offset << + " requested: " << theOp.numBytes << + KFS_LOG_EOM; + } + if (! ReportCompletion(*theOpPtr, mCompletionQueue)) { + return false; + } + } + return true; + } + bool Sleep( + int inSec) + { + if (inSec <= 0 || mSleepingFlag) { + return false; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "sleeping: " << inSec << + (mRestartStartReadFlag ? 
"resetting restart flag" : "") << + KFS_LOG_EOM; + mRestartStartReadFlag = false; + mSleepingFlag = true; + mOuter.mStats.mSleepTimeSec += inSec; + const bool kResetTimerFlag = true; + SetTimeoutInterval(inSec * 1000, kResetTimerFlag); + mOuter.mNetManager.RegisterTimeoutHandler(this); + return true; + } + virtual void Timeout() + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << "timeout" << + KFS_LOG_EOM; + if (mSleepingFlag) { + mOuter.mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + StartRead(); + } + bool ReportCompletion( + int inStatus = 0, + Offset inOffset = 0, + Offset inSize = 0, + IOBuffer* inBufferPtr = 0, + RequestId inRequestId = RequestId(), + RequestId inStriperRequestId = RequestId()) + { + if (mErrorCode == 0 && + (inStatus >= 0 || inStatus == kErrorNoEntry)) { + // Reset retry counts on successful completion. + mRetryCount = 0; + } + return mOuter.ReportCompletion( + inStatus, + this, + inOffset, + inSize, + inBufferPtr, + inRequestId, + inStriperRequestId + ); + } + time_t Now() const + { return mOuter.mNetManager.Now(); } + void EnqueueSelf( + KfsOp& inOp, + IOBuffer* inBufferPtr, + KfsNetClient* inServerPtr) + { + mLastOpPtr = &inOp; + mOpStartTime = Now(); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> " << (inServerPtr ? "" : "meta ") << inOp.Show() << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + if (inServerPtr) { + mOuter.mStats.mChunkOpsQueuedCount++; + } else { + if (mLastMetaOpPtr) { + mOuter.InternalError("more than one meta op in flight"); + } + mLastMetaOpPtr = &inOp; + mOuter.mStats.mMetaOpsQueuedCount++; + } + if (! (inServerPtr ? *inServerPtr : mOuter.mMetaServer).Enqueue( + &inOp, this, inBufferPtr)) { + mOuter.InternalError(inServerPtr ? 
+ "chunk op enqueue failure" : + "meta op enqueue failure" + ); + inOp.status = kErrorFault; + OpDone(&inOp, false, inBufferPtr); + } + } + void CancelRead( + ReadOp** inQueuePtr) + { + Queue::Iterator theIt(inQueuePtr); + ReadOp* theOpPtr; + while ((theOpPtr = theIt.Next())) { + QCASSERT(mOuter.mStriperPtr); + if (CancelRead(inQueuePtr, *theOpPtr)) { + if (inQueuePtr == mInFlightQueue) { + // Cancel will move the request into the pending queue. + mChunkServer.Cancel(theOpPtr, this); + } else { + theOpPtr->Delete(inQueuePtr); + } + } + } + } + bool CancelRead( + ReadOp** inQueuePtr, + ReadOp& inOp) + { + QCASSERT(mOuter.mStriperPtr); + if (inOp.mCancelFlag) { + return true; + } + if (inOp.mRequests.empty()) { + if (mOuter.mStriperPtr->CanCancelRead(inOp.mStriperRequestId)) { + inOp.mCancelFlag = true; + } + return inOp.mCancelFlag; + } + size_t theCanceledCnt = 0; + for (ReadOp::Requests::iterator theIt = inOp.mRequests.begin(); + theIt != inOp.mRequests.end(); + ++theIt) { + if (theIt->mCancelFlag) { + theCanceledCnt++; + } else if (mOuter.mStriperPtr->CanCancelRead( + theIt->mStriperRequestId)) { + theIt->mCancelFlag = true; + theCanceledCnt++; + } + } + if (theCanceledCnt <= 0) { + return false; + } + if (theCanceledCnt == inOp.mRequests.size()) { + inOp.mCancelFlag = true; + return true; + } + if (inQueuePtr == mCompletionQueue) { + // Report completion uses mCancelFlag to skip over it. + return false; + } + if (inQueuePtr == mInFlightQueue) { + // The request has to be moved into the pending queue first, + // then this method has to be called again. + return true; + } + ReadOp* theDeleteQueue[1]; + Queue::Init(theDeleteQueue); + if (inQueuePtr == mPendingQueue) { + // Remove from pending to prevent adding pieces back to it. + Queue::Remove(mPendingQueue, inOp); + Queue::PushBack(theDeleteQueue, inOp); + } + // Re-queue the left over pieces. 
+ IOBuffer theBuf; + Offset theOffset = mGetAllocOp.fileOffset + inOp.offset; + for (ReadOp::Requests::iterator theIt = inOp.mRequests.begin(); + theIt != inOp.mRequests.end(); + ++theIt) { + if (theIt->mCancelFlag) { + theBuf.MoveSpace(&inOp.mTmpBuffer, theIt->mSize); + theBuf.Clear(); + } else { + QCVERIFY((int)theIt->mSize == + QueueRead( + inOp.mTmpBuffer, + theIt->mSize, + theOffset, + theIt->mRequestId, + theIt->mStriperRequestId, + inOp.mRetryIfFailsFlag, + inOp.mFailShortReadFlag + )); + } + theOffset += theIt->mSize; + } + if (inQueuePtr == mPendingQueue) { + inOp.Delete(theDeleteQueue); + return false; // The original request canceled. + } + return true; // Cancel the original request. + } + private: + ChunkReader( + const ChunkReader& inChunkReader); + ChunkReader& operator=( + const ChunkReader& inChunkReader); + }; + class ReportInvalidChunkOp : public CreateOp + { + public: + ReportInvalidChunkOp( + kfsChunkId_t inChunkId, + int64_t inVersion) + : CreateOp(0, ROOTFID, 0, 1, true), + mPath(MakePathName(inChunkId, inVersion)) + { + filename = mPath.c_str(); + } + private: + const string mPath; + + static string MakePathName( + kfsChunkId_t inChunkId, + int64_t inVersion) + { + ostringstream theStream; + theStream << "/proc/invalid_chunks/" << + inChunkId << "." 
<< inVersion; + return theStream.str(); + } + private: + ReportInvalidChunkOp( + const ReportInvalidChunkOp& inOp); + ReportInvalidChunkOp& operator=( + const ReportInvalidChunkOp& inOp); + }; + friend class ChunkReader; + friend class Striper; + + typedef ChunkReader::Readers Readers; + + Reader& mOuter; + MetaServer& mMetaServer; + string mPathName; + kfsFileId_t mFileId; + bool mClosingFlag; + int mErrorCode; + const int mIdleTimeoutSec; + const int mOpTimeoutSec; + const int mMaxRetryCount; + const int mTimeSecBetweenRetries; + const int mMaxReadSize; + const int mLeaseRetryTimeout; + const int mLeaseWaitTimeout; + bool mSkipHolesFlag; + bool mFailShortReadsFlag; + int mMaxGetAllocRetryCount; + Offset mOffset; + Offset mOpenChunkBlockSize; + int64_t mChunkServerInitialSeqNum; + Completion* mCompletionPtr; + string const mLogPrefix; + Stats mStats; + KfsNetClient::Stats mChunkServersStats; + NetManager& mNetManager; + Striper* mStriperPtr; + int mCompletionDepthCount; + ChunkReader* mReaders[1]; + + void InternalError( + const char* inMsgPtr = 0) + { + if (inMsgPtr) { + KFS_LOG_STREAM_FATAL << inMsgPtr << KFS_LOG_EOM; + } + MsgLogger::Stop(); + abort(); + } + + virtual ~Impl() + { + DisableCompletion(); + Impl::Shutdown(); + } + int StartRead( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId) + { + mOffset = inOffset; + int theRem = inLength; + while (theRem > 0) { + const int thePrevRefCount = GetRefCount(); + const int theRet = QueueRead( + inBuffer, theRem, mOffset, inRequestId); + if (thePrevRefCount > GetRefCount()) { + return mErrorCode; // Unwind. + } + if (theRet <= 0) { + QCASSERT(theRet < 0); + if (mErrorCode == 0) { + mErrorCode = theRet; + } + break; + } + theRem -= theRet; + mOffset += theRet; + } + return StartRead(); + } + int StartRead() + { + if (! mClosingFlag) { + return mErrorCode; + } + if (Readers::IsEmpty(mReaders)) { + return ((! ReportCompletion()) ? 
0 : mErrorCode); + } + Readers::Iterator theIt(mReaders); + ChunkReader* thePtr; + while ((thePtr = theIt.Next())) { + if (! thePtr->IsOpen()) { + continue; + } + const int thePrevRefCount = GetRefCount(); + thePtr->Close(); + if (thePrevRefCount > GetRefCount()) { + return mErrorCode; // Unwind. + } + // Restart from the beginning as close can invoke completion + // and remove or close more than one reader in TryToCloseIdle(). + theIt.Reset(); + } + return mErrorCode; + } + int QueueRead( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId) + { + if (mStriperPtr) { + return mStriperPtr->Process( + inBuffer, inLength, inOffset, inRequestId); + } + const int theQueuedCount = QueueChunkRead( + inBuffer, inLength, inOffset, inRequestId, + RequestId(), true, mFailShortReadsFlag); + if (theQueuedCount > 0) { + StartQueuedRead(theQueuedCount); + } + return theQueuedCount; + } + int QueueChunkRead( + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + RequestId inRequestId, + RequestId inStriperRequestId, + bool inRetryIfFailsFlag, + bool inFailShortReadFlag) + { + QCASSERT(inOffset >= 0); + if (inSize <= 0) { + return 0; + } + const Offset theFileOffset = inOffset - inOffset % CHUNKSIZE; + Readers::Iterator theIt(mReaders); + ChunkReader* thePtr; + while ((thePtr = theIt.Next())) { + if (thePtr->GetFileOffset() == theFileOffset) { + break; + } + } + if (thePtr) { + Readers::PushFront(mReaders, *thePtr); + thePtr->CancelClose(); + } else { + mChunkServerInitialSeqNum += 10000; + thePtr = new ChunkReader( + *this, mChunkServerInitialSeqNum, mLogPrefix); + } + QCASSERT(Readers::Front(mReaders) == thePtr); + return thePtr->QueueRead( + inBuffer, + inSize, + inOffset, + inRequestId, + inStriperRequestId, + inRetryIfFailsFlag, + inFailShortReadFlag + ); + } + void StartQueuedRead( + int inQueuedCount) + { + if (inQueuedCount <= 0) { + return; + } + QCASSERT(! 
Readers::IsEmpty(mReaders)); + Readers::Front(mReaders)->StartRead(); + } + void CancelRead() + { + Readers::Iterator theIt(mReaders); + ChunkReader* thePtr; + while ((thePtr = theIt.Next())) { + thePtr->CancelRead(); + } + } + void FatalError( + int inErrorCode = 0) + { + if (mErrorCode == 0) { + mErrorCode = inErrorCode; + } + if (mErrorCode == 0) { + mErrorCode = -1; + } + mClosingFlag = false; + ReportCompletion(mErrorCode); + } + bool CanClose( + ChunkReader& inReader) + { + if (! inReader.IsIdle()) { + return false; + } + if (! inReader.IsOpen() || (mClosingFlag && ! mStriperPtr)) { + return true; + } + // The most recently used should always be first. + const ChunkReader* const thePtr = Readers::Front(mReaders); + if (! thePtr) { + return true; + } + if (thePtr == &inReader) { + return false; + } + const Offset theLeftEdge = thePtr->GetOpenChunkBlockFileOffset(); + if (theLeftEdge < 0) { + return false; + } + const Offset theRightEdge = theLeftEdge + mOpenChunkBlockSize; + const Offset theOffset = inReader.GetFileOffset(); + return (theOffset < theLeftEdge || theRightEdge <= theOffset); + } + bool TryToCloseIdle( + ChunkReader* inReaderPtr) + { + ChunkReader* thePtr = Readers::Back(mReaders); + if (! thePtr) { + thePtr = inReaderPtr; + } + bool theRetFlag = true; + while (thePtr) { + ChunkReader& theReader = *thePtr; + thePtr = (thePtr == Readers::Front(mReaders)) ? + 0 : theReader.GetPrevPtr(); + if (CanClose(theReader)) { + const bool theOpenFlag = theReader.IsOpen(); + if (theOpenFlag) { + theReader.Close(); + } + // Handle "synchronous" Close(). ReportCompletion, calls + // this method only when mCompletionDepthCount <= 1 + if (! theOpenFlag || + (! theReader.IsOpen() && CanClose(theReader))) { + if (&theReader == inReaderPtr) { + theRetFlag = false; + } + delete &theReader; + } + } else if (theReader.IsIdle() && theReader.IsOpen()) { + // Stop at the first idle that can not be closed. 
+ break; + } + } + return theRetFlag; + } + bool ReportCompletion( + int inStatus = 0, + ChunkReader* inReaderPtr = 0, + Offset inOffset = 0, + Offset inSize = 0, + IOBuffer* inBufferPtr = 0, + RequestId inRequestId = RequestId(), + RequestId inStriperRequestId = RequestId(), + bool inStiperDoneFlag = false) + { + // Order matters here, as StRef desctructor can delete this. + StRef theRef(*this); + QCStValueIncrementor theIncrement(mCompletionDepthCount, 1); + + if (inReaderPtr && mErrorCode == 0) { + mErrorCode = inReaderPtr->GetErrorCode(); + } + const int thePrevRefCount = GetRefCount(); + if (mStriperPtr && inReaderPtr && inBufferPtr && ! inStiperDoneFlag) { + // The following can (and normally will) recursively this method + // with inStiperDoneFlag set + mStriperPtr->ReadCompletion( + inStatus, + *inBufferPtr, + (int)inSize, + inOffset, + inRequestId, + inStriperRequestId, + inReaderPtr->GetChunkId(), + inReaderPtr->GetChunkVersion() + ); + } + if ((! mStriperPtr || inStiperDoneFlag) && + (! mClosingFlag || inBufferPtr)) { + int theStatus = mErrorCode == 0 ? inStatus : mErrorCode; + if (! mSkipHolesFlag && inBufferPtr && mErrorCode == 0 && + (inStatus == kErrorNoEntry || inStatus == 0)) { + const int theLen = inBufferPtr->BytesConsumable(); + if (theLen < inSize) { + inBufferPtr->ZeroFill(inSize - theLen); + } + theStatus = 0; + } + if (mCompletionPtr) { + mCompletionPtr->Done( + mOuter, + theStatus, + inOffset, + inSize, + inBufferPtr, + inRequestId + ); + } + } + bool theRetFlag = true; + if (mCompletionDepthCount <= 1 && thePrevRefCount <= GetRefCount()) { + theRetFlag = TryToCloseIdle(inReaderPtr); + if (mClosingFlag && + Readers::IsEmpty(mReaders) && + ! inStiperDoneFlag) { + mClosingFlag = false; + mFileId = -1; + Striper* const theStriperPtr = mStriperPtr; + mStriperPtr = 0; + QCASSERT(! 
IsOpen()); + delete theStriperPtr; + theRetFlag = false; + if (mCompletionPtr) { + mCompletionPtr->Done( + mOuter, + mErrorCode, + 0, + 0, + 0, + RequestId() + ); + } + } + } + return (theRetFlag && thePrevRefCount <= GetRefCount()); + } + void ReportInvalidChunk( + kfsChunkId_t inChunkId, + int64_t inChunkVersion, + int inStatus, + const char* inStatusMsgPtr) + { + KFS_LOG_STREAM_WARN << mLogPrefix << + "invalid" + " chunk: " << inChunkId << + " version: " << inChunkVersion << + " status: " << inStatus << + ((inStatusMsgPtr && *inStatusMsgPtr) ? " msg: " : "") << + ((inStatusMsgPtr && *inStatusMsgPtr) ? inStatusMsgPtr : "") << + KFS_LOG_EOM; + mMetaServer.Enqueue( + new ReportInvalidChunkOp(inChunkId, inChunkVersion), + 0 + ); + } +private: + Impl( + const Impl& inReader); + Impl& operator=( + const Impl& inReader); +}; + +/* static */ Reader::Striper* +Reader::Striper::Create( + int inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + int inMaxAtomicReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Reader::Striper::Offset inRecoverChunkPos, + Reader::Striper::Offset inFileSize, + Reader::Striper::SeqNum inInitialSeqNum, + string inLogPrefix, + Reader::Striper::Impl& inOuter, + Reader::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) +{ + switch (inType) { + case kStriperTypeNone: + outOpenChunkBlockSize = Offset(CHUNKSIZE); + return 0; + case kStriperTypeRS: + return RSStriperCreate( + kStriperTypeRS, + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + inMaxAtomicReadRequestSize, + inUseDefaultBufferAllocatorFlag, + inFailShortReadsFlag, + inRecoverChunkPos, + inFileSize, + inInitialSeqNum, + inLogPrefix, + inOuter, + outOpenChunkBlockSize, + outErrMsg + ); + default: + outErrMsg = "unsupported striper type"; + break; + } + return 0; +} + +int +Reader::Striper::QueueRead( + IOBuffer& inBuffer, + int inSize, + Reader::Striper::Offset inOffset, + Reader::Striper::RequestId 
inOriginalRequestId, + Reader::Striper::RequestId inRequestId, + bool inRetryIfFailsFlag, + bool inFailShortReadFlag) +{ + return mOuter.QueueChunkRead( + inBuffer, + inSize, + inOffset, + inOriginalRequestId, + inRequestId, + inRetryIfFailsFlag, + inFailShortReadFlag + ); +} + +void +Reader::Striper::StartQueuedRead( + int inQueuedCount) +{ + mOuter.StartQueuedRead(inQueuedCount); +} + +void +Reader::Striper::CancelRead() +{ + mOuter.CancelRead(); +} + +bool +Reader::Striper::ReportCompletion( + int inStatus, + IOBuffer& inBuffer, + int inLength, + Reader::Striper::Offset inOffset, + Reader::Striper::RequestId inRequestId) +{ + return mOuter.ReportCompletion( + inStatus, + 0, + inOffset, + inLength, + &inBuffer, + inRequestId, + RequestId(), + true + ); +} + +void +Reader::Striper::ReportInvalidChunk( + kfsChunkId_t inChunkId, + int64_t inChunkVersion, + int inStatus, + const char* inStatusMsgPtr) +{ + mOuter.ReportInvalidChunk( + inChunkId, + inChunkVersion, + inStatus, + inStatusMsgPtr + ); +} + +Reader::Reader( + Reader::MetaServer& inMetaServer, + Reader::Completion* inCompletionPtr /* = 0 */, + int inMaxRetryCount /* = 6 */, + int inTimeSecBetweenRetries /* = 15 */, + int inOpTimeoutSec /* = 30 */, + int inIdleTimeoutSec /* = 5 * 30 */, + int inMaxReadSize /* = 1 << 20 */, + int inLeaseRetryTimeout /* = 3 */, + int inLeaseWaitTimeout /* = 900 */, + const char* inLogPrefixPtr /* = 0 */, + int64_t inChunkServerInitialSeqNum /* = 1 */) + : mImpl(*new Reader::Impl( + *this, + inMetaServer, + inCompletionPtr, + inMaxRetryCount, + inTimeSecBetweenRetries, + inOpTimeoutSec, + inIdleTimeoutSec, + inMaxReadSize, + inLeaseRetryTimeout, + inLeaseWaitTimeout, + (inLogPrefixPtr && inLogPrefixPtr[0]) ? 
+ (inLogPrefixPtr + string(" ")) : string(), + inChunkServerInitialSeqNum + )) +{ + mImpl.Ref(); +} + +/* virtual */ +Reader::~Reader() +{ + mImpl.DisableCompletion(); + mImpl.UnRef(); +} + +int +Reader::Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Reader::Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + bool inSkipHolesFlag, + bool inUseDefaultBufferAllocatorFlag, + Reader::Offset inRecoverChunkPos, + bool inFailShortReadsFlag) +{ + Impl::StRef theRef(mImpl); + return mImpl.Open( + inFileId, + inFileNamePtr, + inFileSize, + inStriperType, + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inSkipHolesFlag, + inUseDefaultBufferAllocatorFlag, + inRecoverChunkPos, + inFailShortReadsFlag + ); +} + +int +Reader::Close() +{ + Impl::StRef theRef(mImpl); + return mImpl.Close(); +} + +int +Reader::Read( + IOBuffer& inBuffer, + int inLength, + Reader::Offset inOffset, + Reader::RequestId inRequestId) +{ + Impl::StRef theRef(mImpl); + return mImpl.Read(inBuffer, inLength, inOffset, inRequestId); +} + +void +Reader::Stop() +{ + Impl::StRef theRef(mImpl); + mImpl.Stop(); +} + +void +Reader::Shutdown() +{ + Impl::StRef theRef(mImpl); + mImpl.Shutdown(); +} + +bool +Reader::IsOpen() const +{ + Impl::StRef theRef(mImpl); + return (mImpl.IsOpen() && ! 
IsClosing()); +} + +bool +Reader::IsClosing() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsClosing(); +} + +bool +Reader::IsActive() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsActive(); +} + +int +Reader::GetErrorCode() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetErrorCode(); +} + +void +Reader::Register( + Reader::Completion* inCompletionPtr) +{ + Impl::StRef theRef(mImpl); + mImpl.Register(inCompletionPtr); +} + +bool +Reader::Unregister( + Reader::Completion* inCompletionPtr) +{ + Impl::StRef theRef(mImpl); + return mImpl.Unregister(inCompletionPtr); +} + +void +Reader::GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) +{ + Impl::StRef theRef(mImpl); + mImpl.GetStats(outStats, outChunkServersStats); +} + +}} diff --git a/src/cc/libclient/Reader.h b/src/cc/libclient/Reader.h new file mode 100644 index 000000000..e0fc12485 --- /dev/null +++ b/src/cc/libclient/Reader.h @@ -0,0 +1,288 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/08/13 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// +//---------------------------------------------------------------------------- + +#ifndef READER_H +#define READER_H + +#include "KfsNetClient.h" +#include "common/kfstypes.h" + +#include <string> +#include <ostream> + +namespace KFS +{ +class IOBuffer; + +namespace client +{ + +using std::string; +using std::ostream; + +// Kfs client file read state machine. +class Reader +{ +public: + typedef int64_t Offset; + union RequestId + { + int64_t mId; + void* mPtr; + }; + class Impl; + + class Completion + { + public: + virtual void Done( + Reader& inReader, + int inStatusCode, + Offset inOffset, + Offset inSize, + IOBuffer* inBufferPtr, + RequestId inRequestId) = 0; + virtual void Unregistered( + Reader& /* inReader */) + {} + protected: + Completion() + {} + Completion( + const Completion&) + {} + virtual ~Completion() + {} + }; + struct Stats + { + typedef int64_t Counter; + Stats() + : mMetaOpsQueuedCount(0), + mMetaOpsCancelledCount(0), + mChunkOpsQueuedCount(0), + mSleepTimeSec(0), + mGetLeaseCount(0), + mGetLeaseRetryCount(0), + mOpsReadCount(0), + mRetriesCount(0), + mReadCount(0), + mReadByteCount(0) + {} + void Clear() + { *this = Stats(); } + Stats& Add( + const Stats& inStats) + { + mMetaOpsQueuedCount += inStats.mMetaOpsQueuedCount; + mMetaOpsCancelledCount += inStats.mMetaOpsCancelledCount; + mChunkOpsQueuedCount += inStats.mChunkOpsQueuedCount; + mSleepTimeSec += inStats.mSleepTimeSec; + mGetLeaseCount += inStats.mGetLeaseCount; + mGetLeaseRetryCount += inStats.mGetLeaseRetryCount; + mOpsReadCount += inStats.mOpsReadCount; + mRetriesCount += inStats.mRetriesCount; + mReadCount += inStats.mReadCount; + mReadByteCount += inStats.mReadByteCount; + return *this; + } + ostream& Display( + ostream& inStream, + const char* inSeparatorPtr = 0, + const char* inDelimiterPtr = 0) const + { + const char* const theSeparatorPtr = + inSeparatorPtr ? inSeparatorPtr : " "; + const char* const theDelimiterPtr = + inDelimiterPtr ?
inDelimiterPtr : ": "; + inStream << + "MetaOpsQueued" << theDelimiterPtr << + mMetaOpsQueuedCount << theSeparatorPtr << + "MetaOpsCancelled" << theDelimiterPtr << + mMetaOpsCancelledCount << theSeparatorPtr << + "ChunkOpsQueued" << theDelimiterPtr << + mChunkOpsQueuedCount << theSeparatorPtr << + "SleepTimeSec" << theDelimiterPtr << + mSleepTimeSec << theSeparatorPtr << + "GetLeaseCount" << theDelimiterPtr << + mGetLeaseCount << theSeparatorPtr << + "OpsRead" << theDelimiterPtr << + mOpsReadCount << theSeparatorPtr << + "Retries" << theDelimiterPtr << + mRetriesCount << theSeparatorPtr << + "ReadCount" << theDelimiterPtr << + mReadCount << theSeparatorPtr << + "ReadByteCount" << theDelimiterPtr << + mReadByteCount + ; + return inStream; + } + Counter mMetaOpsQueuedCount; + Counter mMetaOpsCancelledCount; + Counter mChunkOpsQueuedCount; + Counter mSleepTimeSec; + Counter mGetLeaseCount; + Counter mGetLeaseRetryCount; + Counter mOpsReadCount; + Counter mRetriesCount; + Counter mReadCount; + Counter mReadByteCount; + }; + class Striper + { + public: + typedef Reader::Impl Impl; + typedef Reader::Offset Offset; + typedef Reader::RequestId RequestId; + typedef int64_t SeqNum; + enum StriperType + { + kStriperTypeNone = KFS::KFS_STRIPED_FILE_TYPE_NONE, + kStriperTypeRS = KFS::KFS_STRIPED_FILE_TYPE_RS + }; + static Striper* Create( + int inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + int inMaxAtomicReadRequestSize, + bool inUseDefaultBufferAllocatorFlag, + bool inFailShortReadsFlag, + Offset inRecoverChunkPos, + Offset inFileSize, + SeqNum inInitialSeqNum, + string inLogPrefix, + Impl& inOuter, + Offset& outOpenChunkBlockSize, + string& outErrMsg); + virtual ~Striper() + {} + virtual int Process( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId) = 0; + virtual void ReadCompletion( + int inStatus, + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId, + RequestId 
inStriperRequestId, + kfsChunkId_t inChunkId, + int64_t inChunkVersion) = 0; + virtual bool CanCancelRead( + RequestId inStriperRequestId) = 0; + protected: + Striper( + Impl& inOuter) + : mOuter(inOuter) + {} + int QueueRead( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inOriginalRequestId, + RequestId inRequestId, + bool inRetryIfFailsFlag, + bool inFailShortReadFlag); + void StartQueuedRead( + int inQueuedCount); + bool ReportCompletion( + int inStatus, + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId); + void CancelRead(); + void ReportInvalidChunk( + kfsChunkId_t inChunkId, + int64_t inChunkVersion, + int inStatus, + const char* inStatusMsgPtr); + private: + Impl& mOuter; + private: + Striper( + const Striper& inStriper); + Striper& operator=( + const Striper& inStipter); + }; + typedef KfsNetClient MetaServer; + Reader( + MetaServer& inMetaServer, + Completion* inCompletionPtr = 0, + int inMaxRetryCount = 6, + int inTimeSecBetweenRetries = 15, + int inOpTimeoutSec = 30, + int inIdleTimeoutSec = 5 * 30, + int inMaxReadSize = 1 << 20, + int inLeaseRetryTimeout = 3, + int inLeaseWaitTimeout = 900, + const char* inLogPrefixPtr = 0, + int64_t inChunkServerInitialSeqNum = 1); + virtual ~Reader(); + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + bool inSkipHolesFlag, + bool inUseDefaultBufferAllocatorFlag = false, + Offset inRecoverChunkPos = -1, + bool inFailShortReadsFlag = false); + int Close(); + int Read( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + RequestId inRequestId); + void Stop(); + void Shutdown(); + bool IsOpen() const; + bool IsClosing() const; + bool IsActive() const; + int GetErrorCode() const; + void Register( + Completion* inCompletionPtr); + bool Unregister( + Completion* inCompletionPtr); + void GetStats( + Stats& outStats, + KfsNetClient::Stats& 
outChunkServersStats); +private: + Impl& mImpl; +private: + Reader( + const Reader& inReader); + Reader& operator=( + const Reader& inReader); +}; +}} + +#endif /* READER_H */ diff --git a/src/cc/libclient/WriteAppender.cc b/src/cc/libclient/WriteAppender.cc new file mode 100644 index 000000000..abcda891b --- /dev/null +++ b/src/cc/libclient/WriteAppender.cc @@ -0,0 +1,1710 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/05/20 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "WriteAppender.h" + +#include <algorithm> +#include <string> +#include <vector> +#include <deque> +#include <sstream> + +#include "kfsio/IOBuffer.h" +#include "kfsio/NetManager.h" +#include "kfsio/Globals.h" +#include "kfsio/checksum.h" +#include "kfsio/ITimeout.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include "KfsOps.h" +#include "utils.h" +#include "KfsClient.h" +#include "ClientPool.h" + +namespace KFS +{ +namespace client +{ +using std::max; +using std::min; +using std::string; +using std::vector; +using std::deque; +using std::istringstream; +using std::ostringstream; + +// Kfs client write append state machine.
+class WriteAppender::Impl : private ITimeout, private KfsNetClient::OpOwner +{ +public: + Impl( + WriteAppender& inOuter, + MetaServer& inMetaServer, + Completion* inCompletionPtr, + int inMaxRetryCount, + int inWriteThreshold, + int inTimeSecBetweenRetries, + int inDefaultSpaceReservationSize, + int inPreferredAppendSize, + int inMaxPartialBuffersCount, + int inOpTimeoutSec, + int inIdleTimeoutSec, + bool inPreAllocationFlag, + string inLogPrefix, + int64_t inChunkServerInitialSeqNum, + ClientPool* inClientPoolPtr) + : ITimeout(), + KfsNetClient::OpOwner(), + mOuter(inOuter), + mMetaServer(inMetaServer), + mChunkServer( + mMetaServer.GetNetManager(), + "", -1, + // All chunk server retries are handled here + 0, // inMaxRetryCount + 0, // inTimeSecBetweenRetries, + inOpTimeoutSec, + inIdleTimeoutSec, + inChunkServerInitialSeqNum, + inLogPrefix.c_str() + ), + mPathName(), + mFileName(), + mWriteIds(), + mCanceledFlag(false), + mSleepingFlag(false), + mOpenFlag(false), + mOpeningFlag(false), + mClosingFlag(false), + mMakeDirsFlag(false), + mPreAllocationFlag(inPreAllocationFlag), + mErrorCode(0), + mSpaceAvailable(0), + mRetryCount(0), + mAppendRestartRetryCount(0), + mWriteThreshold(inWriteThreshold), + mNumReplicas(0), + mPartialBuffersCount(0), + mAppendLength(0), + mForcedAllocationInterval(0), + mOpTimeoutSec(inOpTimeoutSec), + mMaxRetryCount(inMaxRetryCount), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + mDefaultSpaceReservationSize( + min((int)KFS::CHUNKSIZE, inDefaultSpaceReservationSize)), + mMaxPartialBuffersCount(inMaxPartialBuffersCount), + mPreferredAppendSize(min((int)KFS::CHUNKSIZE, inPreferredAppendSize)), + mPathNamePos(0), + mOpStartTime(0), + mCurOpPtr(0), + mCompletionPtr(inCompletionPtr), + mBuffer(), + mWriteQueue(), + mLookupOp(0, 0, ""), + mMkdirOp(0, 0, ""), + mCreateOp(0, 0, "", mNumReplicas, false), + mLookupPathOp(0, 0, ""), + mAllocOp(0, 0, ""), + mCloseOp(0, 0), + mWriteIdAllocOp(0, 0, 0, 0, 0), + mSpaceReserveOp(0, 0, 0, 
mWriteIds, 0), + mRecAppendOp(0, 0, 0, -1, mWriteIds), + mSpaceReleaseOp(0, 0, 0, mWriteIds, 0), + mGetRecordAppendOpStatusOp(0, 0, 0), + mPrevRecordAppendOpSeq(-1), + mGetRecordAppendOpStatusIndex(0u), + mLogPrefix(inLogPrefix), + mStats(), + mLastAppendActivityTime(0), + mClientPoolPtr(inClientPoolPtr), + mChunkServerPtr(0), + mNetManager(mMetaServer.GetNetManager()) + { + Impl::Reset(); + mChunkServer.SetRetryConnectOnly(true); + } + ~Impl() + { + mMetaServer.Cancel(mCurOpPtr, this); + StopChunkServer(); + Impl::Register(0); + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + } + } + int Open( + const char* inFileNamePtr, + int inNumReplicas, + bool inMakeDirsFlag) + { + if (! inFileNamePtr || ! *inFileNamePtr) { + return -EINVAL; + } + if (mOpenFlag) { + if (inFileNamePtr == mPathName && + inNumReplicas == mNumReplicas) { + return mErrorCode; + } + return -EINVAL; + } + if (mErrorCode) { + return mErrorCode; + } + if (mClosingFlag || mOpeningFlag || mSleepingFlag) { + return -EAGAIN; + } + mBuffer.Clear(); + mStats.Clear(); + mPartialBuffersCount = 0; + mOpeningFlag = true; + mNumReplicas = inNumReplicas; + mPathName = inFileNamePtr; + mErrorCode = 0; + mPathNamePos = 0; + mSpaceReserveOp.status = 0; // Do allocate with append flag. + mMakeDirsFlag = inMakeDirsFlag; + assert(! mPathName.empty()); + LookupPath(); + return mErrorCode; + } + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr) + { + if (inFileId <= 0 || ! inFileNamePtr || ! *inFileNamePtr) { + return -EINVAL; + } + if (mOpenFlag) { + if (inFileId == mLookupOp.fattr.fileId && + inFileNamePtr == mPathName) { + return mErrorCode; + } + return -EINVAL; + } + if (mErrorCode) { + return mErrorCode; + } + if (mClosingFlag || mOpeningFlag || mSleepingFlag) { + return -EAGAIN; + } + mBuffer.Clear(); + mStats.Clear(); + mPartialBuffersCount = 0; + mPathName = inFileNamePtr; + mErrorCode = 0; + mPathNamePos = 0; + mSpaceReserveOp.status = 0; // Do allocate with append flag. 
+ mMakeDirsFlag = false; + mNumReplicas = 0; // Do not create if doesn't exist. + assert(! mPathName.empty()); + mLookupOp.parentFid = -1; // Input, not known, and not needed. + mLookupOp.status = 0; + if (inFileId > 0) { + mLookupOp.fattr.fileId = inFileId; + mLookupOp.fattr.isDirectory = false; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + StartAppend(); + } else { + mOpeningFlag = true; + LookupPath(); + } + return mErrorCode; + } + int Close() + { + if (! mOpenFlag) { + if (mOpeningFlag) { + mClosingFlag = true; + } + return 0; + } + if (mErrorCode) { + return mErrorCode; + } + if (mClosingFlag) { + return -EAGAIN; + } + mClosingFlag = true; + if (! mCurOpPtr) { + StartAppend(); + } + return mErrorCode; + } + int Append( + IOBuffer& inBuffer, + int inLength) + { + if (mErrorCode) { + return mErrorCode; + } + if (mClosingFlag || (! mOpenFlag && ! mOpeningFlag)) { + return -EINVAL; + } + if (inLength <= 0) { + return 0; + } + if (mMaxPartialBuffersCount == 0 || + inLength < IOBufferData::GetDefaultBufferSize() * 2) { + // If record is too small, just copy it into the last buffer. + mBuffer.ReplaceKeepBuffersFull(&inBuffer, + mBuffer.BytesConsumable(), inLength); + } else { + if (mBuffer.IsEmpty()) { + mPartialBuffersCount = 0; + } + mBuffer.Move(&inBuffer, inLength); + mPartialBuffersCount++; + if (mMaxPartialBuffersCount >= 0 && + mPartialBuffersCount >= mMaxPartialBuffersCount) { + mBuffer.MakeBuffersFull(); + mPartialBuffersCount = 0; + mStats.mBufferCompactionCount++; + } + } + const int kMinWriteQueueEntrySize = 256; + if (mWriteQueue.empty() || + mWriteQueue.back() > kMinWriteQueueEntrySize) { + mWriteQueue.push_back(inLength); + } else { + mWriteQueue.back() += inLength; + } + if (! mCurOpPtr && mOpenFlag) { + StartAppend(); + } + return (mErrorCode ? + (mErrorCode < 0 ? 
mErrorCode : - mErrorCode) : inLength); + } + void Shutdown() + { + Reset(); + StopChunkServer(); + mMetaServer.Cancel(mCurOpPtr, this); + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + mClosingFlag = false; + mOpeningFlag = false; + mOpenFlag = false; + mErrorCode = 0; + mWriteQueue.clear(); + mBuffer.Clear(); + } + bool IsOpen() const + { return (mOpenFlag && ! mClosingFlag); } + bool IsOpening() const + { return (! mOpenFlag && mOpeningFlag); } + bool IsClosing() const + { return (mOpenFlag && mClosingFlag); } + bool IsSleeping() const + { return ((mOpenFlag || mOpeningFlag) && mSleepingFlag); } + bool IsActive() const + { return (mOpenFlag || mOpeningFlag); } + int GetPendingSize() const + { return mBuffer.BytesConsumable(); } + string GetServerLocation() const + { return GetChunkServer().GetServerLocation(); } + int SetWriteThreshold( + int inThreshold) + { + const bool theStartAppendFlag = mWriteThreshold > inThreshold; + mWriteThreshold = inThreshold; + if (theStartAppendFlag && ! mCurOpPtr && mOpenFlag && + mErrorCode == 0 && ! mWriteQueue.empty()) { + StartAppend(); + } + return mErrorCode; + } + void Register( + Completion* inCompletionPtr) + { + if (inCompletionPtr == mCompletionPtr) { + return; + } + if (mCompletionPtr) { + mCompletionPtr->Unregistered(mOuter); + } + mCompletionPtr = inCompletionPtr; + } + bool Unregister( + Completion* inCompletionPtr) + { + if (inCompletionPtr != mCompletionPtr) { + return false; + } + mCompletionPtr = 0; + return true; + } + void GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) + { + outStats = mStats; + mChunkServer.GetStats(outChunkServersStats); + } + int SetPreAllocation( + bool inFlag) + { + if (inFlag == mPreAllocationFlag) { + return mErrorCode; + } + mPreAllocationFlag = inFlag; + if (mPreAllocationFlag && ! mCurOpPtr && mOpenFlag && + mErrorCode == 0 && ! 
mWriteQueue.empty()) { + StartAppend(); + } + return mErrorCode; + } + bool GetPreAllocation() const + { return mPreAllocationFlag; } + bool GetErrorCode() const + { return mErrorCode; } + void SetForcedAllocationInterval( + int inInterval) + { mForcedAllocationInterval = inInterval; } + +protected: + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (mCurOpPtr != inOpPtr && ! mErrorCode) { + abort(); + } + if (inOpPtr) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "<- " << inOpPtr->Show() << + (inCanceledFlag ? " canceled" : "") << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + " status: " << inOpPtr->status << + " seq: " << inOpPtr->seq << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "NULL operation completion? " << + (inCanceledFlag ? " canceled" : "") << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + } + bool theOpFoundFlag; + if (mErrorCode || inCanceledFlag) { + NopDispatch theNopDispatch; + theOpFoundFlag = Dispatch(theNopDispatch, inOpPtr, inBufferPtr); + if (theOpFoundFlag) { + if (inCanceledFlag) { + HandleCancel(); + } else { + mCurOpPtr = 0; + } + } + } else { + theOpFoundFlag = Dispatch(*this, inOpPtr, inBufferPtr); + } + assert(theOpFoundFlag); + if (! 
theOpFoundFlag) { + abort(); + } + } + +private: + enum + { + kErrorAppenderBase = 100000, + kErrorOpCanceled = -(kErrorAppenderBase + 1), + kErrorMetaEnqueue = -(kErrorAppenderBase + 2), + kErrorChunkEnqueue = -(kErrorAppenderBase + 3) + }; + enum { kAgainRetryMinTime = 4 }; + enum { kGetStatusOpMinTime = 16 }; + enum { kAppendInactivityCheckTimeout = 3 * 60 }; + + typedef KfsNetClient ChunkServer; + typedef vector WriteIds; + typedef deque WriteQueue; + typedef string::size_type StringPos; + struct NopDispatch + { + void Done( + KfsOp& inOpPtr, + IOBuffer* inBufferPtr) {} + }; + + WriteAppender& mOuter; + MetaServer& mMetaServer; + ChunkServer mChunkServer; + string mPathName; + string mFileName; + WriteIds mWriteIds; + bool mCanceledFlag; + bool mSleepingFlag; + bool mOpenFlag; + bool mOpeningFlag; + bool mClosingFlag; + bool mMakeDirsFlag; + bool mPreAllocationFlag; + int mErrorCode; + int mSpaceAvailable; + int mRetryCount; + int mAppendRestartRetryCount; + int mWriteThreshold; + int mNumReplicas; + int mPartialBuffersCount; + int mAppendLength; + int mForcedAllocationInterval; + const int mOpTimeoutSec; + const int mMaxRetryCount; + const int mTimeSecBetweenRetries; + const int mDefaultSpaceReservationSize; + const int mMaxPartialBuffersCount; + const int mPreferredAppendSize; + StringPos mPathNamePos; + time_t mOpStartTime; + KfsOp* mCurOpPtr; + Completion* mCompletionPtr; + IOBuffer mBuffer; + WriteQueue mWriteQueue; + LookupOp mLookupOp; + MkdirOp mMkdirOp; + CreateOp mCreateOp; + LookupPathOp mLookupPathOp; + AllocateOp mAllocOp; + CloseOp mCloseOp; + WriteIdAllocOp mWriteIdAllocOp; + ChunkSpaceReserveOp mSpaceReserveOp; + RecordAppendOp mRecAppendOp; + ChunkSpaceReleaseOp mSpaceReleaseOp; + GetRecordAppendOpStatus mGetRecordAppendOpStatusOp; + int64_t mPrevRecordAppendOpSeq; + unsigned int mGetRecordAppendOpStatusIndex; + string const mLogPrefix; + Stats mStats; + time_t mLastAppendActivityTime; + ClientPool* mClientPoolPtr; + ChunkServer* 
mChunkServerPtr; + NetManager& mNetManager; + + template bool Dispatch( + T& inObj, + KfsOp* inOpPtr, + IOBuffer* inBufferPtr) + { + if (&mWriteIdAllocOp == inOpPtr) { + inObj.Done(mWriteIdAllocOp, inBufferPtr); + } else if (&mSpaceReserveOp == inOpPtr) { + inObj.Done(mSpaceReserveOp, inBufferPtr); + } else if (&mSpaceReleaseOp == inOpPtr) { + inObj.Done(mSpaceReleaseOp, inBufferPtr); + } else if (&mRecAppendOp == inOpPtr) { + inObj.Done(mRecAppendOp, inBufferPtr); + } else if (&mLookupOp == inOpPtr) { + inObj.Done(mLookupOp, inBufferPtr); + } else if (&mMkdirOp == inOpPtr) { + inObj.Done(mMkdirOp, inBufferPtr); + } else if (&mCreateOp == inOpPtr) { + inObj.Done(mCreateOp, inBufferPtr); + } else if (&mLookupPathOp == inOpPtr) { + inObj.Done(mLookupPathOp, inBufferPtr); + } else if (&mAllocOp == inOpPtr) { + inObj.Done(mAllocOp, inBufferPtr); + } else if (&mCloseOp == inOpPtr) { + inObj.Done(mCloseOp, inBufferPtr); + } else if (&mGetRecordAppendOpStatusOp == inOpPtr) { + inObj.Done(mGetRecordAppendOpStatusOp, inBufferPtr); + } else { + return false; + } + return true; + } + void StopChunkServer() + { + if (mChunkServerPtr && mChunkServerPtr != &mChunkServer) { + mChunkServerPtr->Cancel(mCurOpPtr, this); + } + mChunkServerPtr = 0; + mChunkServer.Stop(); + } + bool WasChunkServerDisconnected() + { + return (mClientPoolPtr ? + (! mChunkServerPtr || mChunkServerPtr->WasDisconnected()) : + mChunkServer.WasDisconnected() + ); + } + void StartAppend() + { + if (mSleepingFlag || mErrorCode) { + return; + } + mCurOpPtr = 0; + if (mClosingFlag && mWriteQueue.empty()) { + if (! WasChunkServerDisconnected()) { + if (mAllocOp.chunkId > 0 && mSpaceAvailable > 0) { + SpaceRelease(); + return; + } + if (mAllocOp.chunkId > 0) { + CloseChunk(); + return; + } + } + StopChunkServer(); + mMetaServer.Cancel(mCurOpPtr, this); + mClosingFlag = false; + mOpeningFlag = false; + mOpenFlag = false; + ReportCompletion(); + return; + } + if ((mDefaultSpaceReservationSize <= 0 || ! 
mPreAllocationFlag) && + ! CanAppend()) { + return; + } + if (mAllocOp.chunkId > 0 && WasChunkServerDisconnected()) { + // When chunk server disconnects it automatically cleans up + // space reservation and write appenders. Start from the + // beginning -- chunk allocation. + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "detected chunk server disconnect: " << GetServerLocation() << + " starting from chunk allocation, pending:" << + " queue: " << mWriteQueue.size() << + " bytes: " << mBuffer.BytesConsumable() << + KFS_LOG_EOM; + Reset(); + if (! CanAppend()) { + // Do not try to preallocate chunk and reserve space + // after inactivity timeout or error, if no data pending. + return; + } + } + if (mAllocOp.chunkId > 0 && mSpaceReserveOp.status == -ENOSPC) { + if (mSpaceAvailable > 0) { + SpaceRelease(); + } else { + CloseChunk(); + } + return; + } + if (mAllocOp.chunkId > 0 && ! mWriteIds.empty()) { + ReserveSpace(); + } else { + Reset(); + AllocateChunk(); + } + } + void Lookup() + { + mCurOpPtr = &mLookupOp; // For HandleError() below to work. + const bool theStartFlag = mPathNamePos == 0; + if (theStartFlag) { + mFileName.clear(); + mCreateOp.status = 0; + } else if (mFileName.empty()) { + mLookupOp.status = -ENOENT; + HandleError(); + return; + } else if (mLookupOp.status == -ENOENT && mMakeDirsFlag) { + mLookupOp.status = 0; + Mkdir(); + return; + } else if (mLookupOp.status != 0) { + HandleError(); + return; + } + kfsFileId_t const theParentFid = theStartFlag ? 
+ KFS::ROOTFID : mLookupOp.fattr.fileId; + const string theFileName = mFileName; + + Reset(mLookupOp); + mLookupOp.filename = 0; + mLookupOp.parentFid = theParentFid; + StringPos theNext = string::npos; + StringPos const theEnd = mPathName.length(); + const char theSeparator = '/'; + while (mPathNamePos < theEnd && + (theNext = mPathName.find(theSeparator, mPathNamePos)) != + string::npos && + theNext == mPathNamePos) { + mPathNamePos++; + } + if (theNext == string::npos) { + theNext = theEnd; + } + if (mPathNamePos >= theEnd) { + mFileName.clear(); + } else { + mFileName = mPathName.substr(mPathNamePos, theNext - mPathNamePos); + } + if (theNext - mPathNamePos > KFS::MAX_FILENAME_LEN) { + mLookupOp.status = -ENAMETOOLONG; + HandleError(); + return; + } + mPathNamePos = theNext; + if (theNext == theEnd) { + if (! mFileName.empty()) { + Create(); + return; + } + if (mCreateOp.status == -EEXIST && ! theFileName.empty()) { + mCreateOp.status = 0; + mFileName = theFileName; + mLookupOp.fattr.isDirectory = true; + } + } + if (! theStartFlag && + mLookupOp.fattr.isDirectory == mFileName.empty()) { + mLookupOp.status = mFileName.empty() ? -ENOENT : -ENOTDIR; + HandleError(); + return; + } + if (mFileName.empty()) { + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + StartAppend(); + return; + } + mLookupOp.filename = mFileName.c_str(); + assert(*mLookupOp.filename); + EnqueueMeta(mLookupOp); + } + void Done( + LookupOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mLookupOp == &inOp && ! inBufferPtr); + Lookup(); + } + void Mkdir() + { + assert(mLookupOp.parentFid > 0 && ! mFileName.empty()); + Reset(mMkdirOp); + mMkdirOp.parentFid = mLookupOp.parentFid; + mMkdirOp.dirname = mLookupOp.filename; + EnqueueMeta(mMkdirOp); + } + void Done( + MkdirOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mMkdirOp == &inOp && ! inBufferPtr); + if (inOp.status == -EEXIST) { + // Just re-queue the lookup op, it should succeed now. 
+ assert(mLookupOp.parentFid == mMkdirOp.parentFid && + mMkdirOp.dirname == mLookupOp.filename); + EnqueueMeta(mLookupOp); + return; + } + if (inOp.status != 0) { + mAllocOp.chunkId = 0; + HandleError(); + return; + } + assert(mLookupOp.parentFid == mMkdirOp.parentFid); + mLookupOp.fattr.fileId = mMkdirOp.fileId; + mLookupOp.fattr.isDirectory = true; + mLookupOp.status = 0; + Lookup(); + } + void Create() + { + assert(mLookupOp.parentFid > 0 && ! mFileName.empty()); + Reset(mCreateOp); + mCreateOp.parentFid = mLookupOp.parentFid; + mCreateOp.filename = mFileName.c_str(); + mCreateOp.numReplicas = mNumReplicas; + // With false it deletes the file then creates it again. + mCreateOp.exclusive = true; + EnqueueMeta(mCreateOp); + } + void Done( + CreateOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mCreateOp == &inOp && ! inBufferPtr); + if (inOp.status == -EEXIST) { + Lookup(); + return; + } + if (inOp.status != 0) { + mAllocOp.chunkId = 0; + HandleError(); + return; + } + mLookupOp.parentFid = inOp.parentFid; + mLookupOp.status = inOp.status; + mLookupOp.fattr.fileId = inOp.fileId; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + StartAppend(); + } + void LookupPath() + { + Reset(mLookupPathOp); + mLookupPathOp.rootFid = KFS::ROOTFID; + mLookupPathOp.filename = mPathName.c_str(); + assert(*mLookupPathOp.filename); + EnqueueMeta(mLookupPathOp); + } + void Done( + LookupPathOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mLookupPathOp == &inOp && ! inBufferPtr); + if (inOp.status == KfsNetClient::kErrorMaxRetryReached) { + HandleError(); + return; + } + if (inOp.status != 0 && mNumReplicas > 0) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "lookup path failed: " << inOp.status << + " falling back to open" << + KFS_LOG_EOM; + Lookup(); + return; + } + if (inOp.fattr.isDirectory) { + inOp.status = -EISDIR; + HandleError(); + return; + } + inOp.filename = ""; // Reset just in case. + // Copy result into lookup op. 
+ mLookupOp.parentFid = -1; // Input, not known, and not needed. + mLookupOp.status = inOp.status; + mLookupOp.fattr = inOp.fattr; + mOpenFlag = true; + mOpeningFlag = false; + ReportCompletion(); + StartAppend(); + } + void AllocateChunk() + { + assert(mLookupOp.fattr.fileId > 0); + Reset(mAllocOp); + mSpaceAvailable = 0; + chunkOff_t theOffset; + if (mSpaceReserveOp.status == -ENOSPC) { + theOffset = (mAllocOp.fileOffset + KFS::CHUNKSIZE) / + KFS::CHUNKSIZE * KFS::CHUNKSIZE; + mSpaceReserveOp.status = 0; + } else { + theOffset = -1; + } + mAllocOp = AllocateOp(0, mLookupOp.fattr.fileId, mPathName); + mAllocOp.append = true; + mAllocOp.chunkId = 0; + mAllocOp.fileOffset = theOffset; + mAllocOp.spaceReservationSize = max( + mClosingFlag ? 0 : mDefaultSpaceReservationSize, + mBuffer.BytesConsumable() + ); + mAllocOp.maxAppendersPerChunk = mDefaultSpaceReservationSize > 0 ? + (KFS::CHUNKSIZE / mDefaultSpaceReservationSize) : 64; + mStats.mChunkAllocCount++; + EnqueueMeta(mAllocOp); + } + void Done( + AllocateOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mAllocOp == &inOp && ! inBufferPtr); + if (inOp.status != 0 || mAllocOp.chunkServers.empty()) { + mAllocOp.chunkId = 0; + HandleError(); + return; + } + AllocateWriteId(); + } + void CloseChunk() + { + assert(mAllocOp.chunkId > 0); + Reset(mCloseOp); + mCloseOp.chunkId = mAllocOp.chunkId; + mCloseOp.writeInfo = mWriteIds; + if (mCloseOp.writeInfo.empty()) { + mCloseOp.chunkServerLoc = mAllocOp.chunkServers; + } else { + mCloseOp.chunkServerLoc.clear(); + } + Enqueue(mCloseOp); + } + void Done( + CloseOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mCloseOp == &inOp && ! inBufferPtr); + if (mCloseOp.status != 0) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "chunk close failure, status: " << mCloseOp.status << + " ignored" << + KFS_LOG_EOM; + StopChunkServer(); + } + mCurOpPtr = 0;// Graceful close, do not reset chunk server's connection. + Reset(); + StartAppend(); + } + bool CanAppend() + { + return ( + ! 
mWriteQueue.empty() && + (mClosingFlag || mBuffer.BytesConsumable() >= mWriteThreshold) + ); + } + bool ReserveSpace( + bool inCheckAppenderFlag = false) + { + assert(mAllocOp.chunkId > 0 && ! mWriteIds.empty()); + const int theSpaceNeeded = mWriteQueue.empty() ? + ((mSpaceAvailable <= 0 && ! mClosingFlag) ? + mDefaultSpaceReservationSize : 0) : + mWriteQueue.front(); + if (! inCheckAppenderFlag && theSpaceNeeded <= mSpaceAvailable) { + if (CanAppend()) { + Append(); + return true; + } else { + return false; // Nothing to do. + } + } + Reset(mSpaceReserveOp); + mSpaceReserveOp.chunkId = mAllocOp.chunkId; + mSpaceReserveOp.chunkVersion = mAllocOp.chunkVersion, + mSpaceReserveOp.writeInfo = mWriteIds; + mSpaceReserveOp.numBytes = theSpaceNeeded <= mSpaceAvailable ? + size_t(0) : + size_t(max( + mClosingFlag ? 0 : mDefaultSpaceReservationSize, + max(theSpaceNeeded, min( + max(mPreferredAppendSize, mDefaultSpaceReservationSize), + mBuffer.BytesConsumable()))) - + mSpaceAvailable + ); + mStats.mReserveSpaceCount++; + Enqueue(mSpaceReserveOp); + return true; + } + void Done( + ChunkSpaceReserveOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mSpaceReserveOp == &inOp && ! inBufferPtr); + if (inOp.status != 0) { + if (inOp.status == -ENOSPC) { + mStats.mReserveSpaceDeniedCount++; + if (mSpaceAvailable > 0) { + SpaceRelease(); + } else { + CloseChunk(); + } + return; + } + HandleError(); + return; + } + mSpaceAvailable += inOp.numBytes; + mLastAppendActivityTime = Now(); + StartAppend(); + } + void AllocateWriteId() + { + assert(mAllocOp.chunkId > 0 && ! 
mAllocOp.chunkServers.empty()); + Reset(mWriteIdAllocOp); + mWriteIdAllocOp.chunkId = mAllocOp.chunkId; + mWriteIdAllocOp.chunkVersion = mAllocOp.chunkVersion; + mWriteIdAllocOp.isForRecordAppend = true; + mWriteIdAllocOp.chunkServerLoc = mAllocOp.chunkServers; + mWriteIdAllocOp.offset = 0; + mWriteIdAllocOp.numBytes = 0; + if (mClientPoolPtr) { + mChunkServerPtr = &mClientPoolPtr->Get(mAllocOp.chunkServers[0]); + } else { + mChunkServerPtr = 0; + if (! mChunkServer.SetServer(mAllocOp.chunkServers[0])) { + mCurOpPtr = &mWriteIdAllocOp; + HandleError(); + return; + } + } + Enqueue(mWriteIdAllocOp); + } + void Done( + WriteIdAllocOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mWriteIdAllocOp == &inOp && ! inBufferPtr); + mWriteIds.clear(); + if (inOp.status < 0) { + HandleError(); + return; + } + const size_t theServerCount = inOp.chunkServerLoc.size(); + mWriteIds.reserve(theServerCount); + istringstream theStream(inOp.writeIdStr); + for (size_t i = 0; i > + theWInfo.serverLoc.hostname >> + theWInfo.serverLoc.port >> + theWInfo.writeId)) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "write id alloc: invalid response: " << inOp.writeIdStr << + KFS_LOG_EOM; + break; + } + mWriteIds.push_back(theWInfo); + } + if (theServerCount != mWriteIds.size()) { + HandleError(); + return; + } + mPrevRecordAppendOpSeq = inOp.seq; + if (! ReserveSpace()) { + StartAppend(); + } + } + void Append() + { + while (! mWriteQueue.empty() && mWriteQueue.front() <= 0) { + assert(! "invalid write queue"); + mWriteQueue.pop_front(); + } + if (mWriteQueue.empty()) { + assert(mBuffer.IsEmpty()); + StartAppend(); // Nothing to append yet. 
+ return; + } + bool theCheckAppenderFlag = false; + if (mWriteQueue.front() > mSpaceAvailable || + (theCheckAppenderFlag = mLastAppendActivityTime + + kAppendInactivityCheckTimeout <= Now())) { + const bool theOpQueuedFlag = ReserveSpace(theCheckAppenderFlag); + QCRTASSERT(theOpQueuedFlag); + return; + } + const int theTotal = mBuffer.BytesConsumable(); + const int thePreferredAppendSize = min(mSpaceAvailable, + (mPreferredAppendSize < theTotal && + (theTotal >> 1) < mPreferredAppendSize && + theTotal - mPreferredAppendSize >= mWriteThreshold) ? + theTotal : mPreferredAppendSize + ); + int theSum; + while (mWriteQueue.size() > 1 && + (theSum = mWriteQueue[0] + mWriteQueue[1]) <= + thePreferredAppendSize) { + mWriteQueue.pop_front(); + mWriteQueue.front() = theSum; + } + mAppendLength = mWriteQueue.front(); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "append: " << mAppendLength << + " pending: queue: " << mWriteQueue.size() << + " bytes: " << theTotal << + " wthresh: " << mWriteThreshold << + KFS_LOG_EOM; + assert(mBuffer.BytesConsumable() >= mAppendLength); + Reset(mRecAppendOp); + mRecAppendOp.chunkId = mAllocOp.chunkId; + mRecAppendOp.chunkVersion = mAllocOp.chunkVersion; + mRecAppendOp.offset = -1; // Let chunk server pick offset. + mRecAppendOp.writeInfo = mWriteIds; + mRecAppendOp.contentLength = size_t(mAppendLength); + mRecAppendOp.checksum = + ComputeBlockChecksum(&mBuffer, mAppendLength); + mStats.mOpsRecAppendCount++; + Enqueue(mRecAppendOp, &mBuffer); + } + void Done( + RecordAppendOp& inOp, + IOBuffer* inBufferPtr, + bool inResetFlag = false) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "append done: " << + (mWriteQueue.empty() ? -1 : mWriteQueue.front()) << + " pending: queue: " << mWriteQueue.size() << + " bytes: " << mBuffer.BytesConsumable() << + " wthresh: " << mWriteThreshold << + KFS_LOG_EOM; + assert(&mRecAppendOp == &inOp && inBufferPtr == &mBuffer && + ! 
mWriteQueue.empty()); + if (inOp.status != 0 || mWriteQueue.empty()) { + HandleError(); + return; + } + const int theConsumed = mBuffer.Consume(mAppendLength); + QCRTASSERT(mAppendLength > 0 && theConsumed == mAppendLength && + mSpaceAvailable >= mAppendLength); + mSpaceAvailable -= mAppendLength; + // The queue can change in the case if it had only one record when + // append started, and then the next record arrived and the two + // (short) records were coalesced into one. + while (mAppendLength > 0) { + assert(! mWriteQueue.empty()); + int& theLen = mWriteQueue.front(); + if (mAppendLength >= theLen) { + mAppendLength -= theLen; + mWriteQueue.pop_front(); + } else { + theLen -= mAppendLength; + mAppendLength = 0; + } + } + mLastAppendActivityTime = Now(); + mPrevRecordAppendOpSeq = inOp.seq; + mStats.mAppendCount++; + mStats.mAppendByteCount += theConsumed; + ReportCompletion(); + if (inResetFlag || (mForcedAllocationInterval > 0 && + (mStats.mOpsRecAppendCount % mForcedAllocationInterval) == 0)) { + Reset(); + } + StartAppend(); + } + void SpaceRelease() + { + if (mSpaceAvailable <= 0) { + StartAppend(); + return; + } + Reset(mSpaceReleaseOp); + mSpaceReleaseOp.chunkId = mAllocOp.chunkId; + mSpaceReleaseOp.chunkVersion = mAllocOp.chunkVersion, + mSpaceReleaseOp.writeInfo = mWriteIds; + mSpaceReleaseOp.numBytes = size_t(mSpaceAvailable); + Enqueue(mSpaceReleaseOp); + } + void Done( + ChunkSpaceReleaseOp& inOp, + IOBuffer* inBufferPtr) + { + assert(&mSpaceReleaseOp == &inOp && ! 
inBufferPtr); + if (inOp.status != 0) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "space release error: " << inOp.status << + " msg: " << inOp.statusMsg << + " ignored; op: " << + inOp.Show() << + KFS_LOG_EOM; + Reset(); + // HandleError(); + // return; + } else { + assert(size_t(mSpaceAvailable) == mSpaceReleaseOp.numBytes); + mSpaceAvailable = 0; + } + StartAppend(); + } + void GetLastRecordAppendOpStatus() + { + const unsigned int theIndex = mGetRecordAppendOpStatusIndex; + assert(theIndex >= 0 && theIndex < mWriteIds.size()); + Reset(mGetRecordAppendOpStatusOp); + mGetRecordAppendOpStatusOp.chunkId = mAllocOp.chunkId; + mGetRecordAppendOpStatusOp.writeId = mWriteIds[theIndex].writeId; + assert(mChunkServer.GetMaxRetryCount() <= 1); + // <= 0 -- infinite timeout + // For record append status always use separate / dedicated connection. + mChunkServerPtr = 0; + mChunkServer.SetOpTimeoutSec( + max(int(kGetStatusOpMinTime), mOpTimeoutSec / 8)); + mChunkServer.SetServer(mWriteIds[theIndex].serverLoc); + Enqueue(mGetRecordAppendOpStatusOp); + } + void Done( + GetRecordAppendOpStatus& inOp, + IOBuffer* inBufferPtr) + { + assert( + &mGetRecordAppendOpStatusOp == &inOp && + ! inBufferPtr && + mGetRecordAppendOpStatusIndex < mWriteIds.size() + ); + if (inOp.status != 0) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation" + " failure, seq: " << inOp.seq << + " status: " << inOp.status << + " msg: " << inOp.statusMsg << + " chunk server: " << GetServerLocation() << + " op: " << inOp.Show() << + KFS_LOG_EOM; + } + // Restore chunk server settings. + mChunkServer.SetOpTimeoutSec(mOpTimeoutSec); + if (inOp.status != 0) { + // If he doesn't know about this chunk and write id, then it is + // possible that he has restarted, or protocol state got purged. + // Do not waste time retrying in case of network errors, the + // protocol state might get purged. + // Move to the next chunk server. 
+ if (++mGetRecordAppendOpStatusIndex < mWriteIds.size()) { + GetLastRecordAppendOpStatus(); + } else { + // Tried all servers. + // Use normal retry mecanism to schedule another round of + // status recovery. + mCurOpPtr = &mRecAppendOp; + const bool kResetFlag = true; + Done(mRecAppendOp, &mBuffer, kResetFlag); + } + return; + } + KFS_LOG_STREAM_INFO << mLogPrefix << + "record append seq:" + " prev: " << mPrevRecordAppendOpSeq << + " cur: " << mRecAppendOp.seq << + " recovered last record append status: " << + inOp.Show() << + KFS_LOG_EOM; + if (inOp.opSeq != mRecAppendOp.seq && + inOp.opSeq != mPrevRecordAppendOpSeq) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + " status op: unexpected sequence number: " + " got: " << inOp.opSeq << + " expected: " << mPrevRecordAppendOpSeq << + " or " << mRecAppendOp.seq << + KFS_LOG_EOM; + FatalError(-EINVAL); + return; + } + if (inOp.chunkVersion != mAllocOp.chunkVersion) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + " status op: chunk version mismatch: " + " got: " << inOp.chunkVersion << + " expected: " << mAllocOp.chunkVersion << + KFS_LOG_EOM; + FatalError(-EINVAL); + return; + } + const int theStatus = inOp.opSeq == mRecAppendOp.seq ? + inOp.opStatus : (inOp.widReadOnlyFlag ? -EFAULT : -EAGAIN); + if (theStatus == -EAGAIN) { + if (mRetryCount > 1 && + ++mGetRecordAppendOpStatusIndex < mWriteIds.size()) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "server: " << + GetServerLocation() << + " status \"in progress\", trying next server" << + KFS_LOG_EOM; + // If this is *not* the first recovery round, try to find + // the server that hasn't received the append in question. + // + // The only reason not to do this for the first recovery + // round is to prevent short timeout to cause problems by + // failing replications down the replication chain, in the + // case when replication is still in flight, but hasn't + // reached or hasn't been processed yet by the "downstream" + // participants. 
+ GetLastRecordAppendOpStatus(); + return; + } + } else if (mRetryCount == mMaxRetryCount && mRetryCount > 0) { + // Give one more chance to do append seq. without a failure. + mRetryCount--; + } + mRecAppendOp.status = theStatus; + mCurOpPtr = &mRecAppendOp; + const bool kResetFlag = true; + Done(mRecAppendOp, &mBuffer, kResetFlag); + } + void Enqueue( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, false); } + void EnqueueMeta( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, true); } + time_t Now() const + { return mNetManager.Now(); } + KfsNetClient& GetChunkServer() + { return (mChunkServerPtr ? *mChunkServerPtr : mChunkServer); } + const KfsNetClient& GetChunkServer() const + { return (mChunkServerPtr ? *mChunkServerPtr : mChunkServer); } + void EnqueueSelf( + KfsOp& inOp, + IOBuffer* inBufferPtr, + bool inMetaOpFlag) + { + mCurOpPtr = &inOp; + mOpStartTime = Now(); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> " << (inMetaOpFlag ? "meta" : "" ) << + " " << inOp.Show() << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + if (inMetaOpFlag) { + mStats.mMetaOpsQueuedCount++; + if (! mMetaServer.Enqueue(&inOp, this, inBufferPtr)) { + inOp.status = kErrorMetaEnqueue; + HandleEnqueueError(); + } + } else if (! 
(GetChunkServer().Enqueue(&inOp, this, inBufferPtr))) { + inOp.status = kErrorChunkEnqueue; + HandleEnqueueError(); + } + } + void Reset( + KfsOp& inOp) + { + inOp.seq = 0; + inOp.status = 0; + inOp.statusMsg.clear(); + inOp.checksum = 0; + inOp.contentLength = 0; + inOp.contentBufLen = 0; + delete [] inOp.contentBuf; + inOp.contentBuf = 0; + } + void Reset() + { + if (mCurOpPtr) { + StopChunkServer(); + mMetaServer.Cancel(mCurOpPtr, this); + } + Reset(mAllocOp); + mWriteIds.clear(); + assert(mSpaceAvailable >= 0); + mSpaceAvailable = 0; + mAllocOp.chunkId = 0; + mCurOpPtr = 0; + mAppendLength = 0; + } + void HandleEnqueueError() + { HandleError(true); } + int GetTimeToNextRetry( + int inTimeSecBetweenRetries) const + { + return max(0, inTimeSecBetweenRetries - int(Now() - mOpStartTime)); + } + void HandleError( + bool inEnqueueErrorFlag = false) + { + if (mCurOpPtr) { + ostringstream theOStream; + mCurOpPtr->Request(theOStream); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation" << (inEnqueueErrorFlag ? " enqueue" : "") << + " failure, seq: " << mCurOpPtr->seq << + " status: " << mCurOpPtr->status << + " msg: " << mCurOpPtr->statusMsg << + " op: " << mCurOpPtr->Show() << + " current chunk server: " << GetServerLocation() << + " chunkserver: " << (GetChunkServer().IsDataSent() ? + (GetChunkServer().IsAllDataSent() ? "all" : "partial") : + "no") << " data sent" << + "\nRequest:\n" << theOStream.str() << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "NULL operation " << + (inEnqueueErrorFlag ? "enqueue" : "") << " failure" << + KFS_LOG_EOM; + } + if (! (mErrorCode = mCurOpPtr ? mCurOpPtr->status : -1)) { + mErrorCode = -1; + } + // Meta operations are automatically retried by MetaServer. + // Declare fatal error in the case of meta op failure. 
+ if (&mLookupOp == mCurOpPtr || &mCreateOp == mCurOpPtr || + &mMkdirOp == mCurOpPtr || &mLookupPathOp == mCurOpPtr) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "meta operation failed, giving up" << + KFS_LOG_EOM; + } else if (mRetryCount >= mMaxRetryCount) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "max retry reached: " << mRetryCount << ", giving up" << + KFS_LOG_EOM; + } else if (! mOpenFlag) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "appender closed, giving up" << + KFS_LOG_EOM; + } else if (&mRecAppendOp == mCurOpPtr && ( + (mRecAppendOp.status == KfsNetClient::kErrorMaxRetryReached && + (mClientPoolPtr || GetChunkServer().IsAllDataSent()) + ) || + mRecAppendOp.status == -EAGAIN) + ) { + mRetryCount++; + mErrorCode = 0; + mGetRecordAppendOpStatusIndex = 0; + if (mRecAppendOp.status == -EAGAIN) { + const int theTimeToNextRetry = GetTimeToNextRetry( + min(4, mRetryCount - 1) * kAgainRetryMinTime + + max(int(kAgainRetryMinTime), mTimeSecBetweenRetries) + ); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "record append operation status unknown," + " schedule to get status in " << + theTimeToNextRetry << " sec" << + KFS_LOG_EOM; + mCurOpPtr = &mGetRecordAppendOpStatusOp; + Sleep(theTimeToNextRetry); + } else { + // From now on for recovery purposes threat as undefined status: + // mChunkServer.IsAllDataSent() during the recovery corresponds + // to the "get op status", instead of "record append", and from + // now on the retry timeout needs to be enforced. + // For debugging set status message accordingly: + mRecAppendOp.statusMsg = "all data sent, but no ack received"; + mRecAppendOp.status = -EAGAIN; + } + if (! 
mSleepingFlag) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "record append operation failed to receive ack," + " trying to get status" << + KFS_LOG_EOM; + GetLastRecordAppendOpStatus(); + } + return; + } else { + mStats.mRetriesCount++; + mRetryCount++; + int theTimeToNextRetry = GetTimeToNextRetry(mTimeSecBetweenRetries); + // Treat alloc failure the same as chunk server failure. + if (&mAllocOp == mCurOpPtr) { + mStats.mAllocRetriesCount++; + } else if (&mWriteIdAllocOp == mCurOpPtr || + &mRecAppendOp == mCurOpPtr || + (&mSpaceReserveOp == mCurOpPtr && + mSpaceReserveOp.status != -ENOSPC)) { + if (++mAppendRestartRetryCount == 2 || + mAppendRestartRetryCount == 5 || + mAppendRestartRetryCount == 15) { + // When write id or append fails the second, fifth, and + // fifteen times tell meta server to allocate new chunk to + // paper over bugs, and network connectivity problems by + // pretending that space reservation have failed. + KFS_LOG_STREAM_INFO << + "force new chunk allocation" + " retry: " << mAppendRestartRetryCount << + KFS_LOG_EOM; + mSpaceReserveOp.status = -ENOSPC; + theTimeToNextRetry = 0; + } else if (mAppendRestartRetryCount <= 1) { + theTimeToNextRetry = 0; + } + } + // Retry. + KFS_LOG_STREAM_INFO << mLogPrefix << + "scheduling retry: " << mRetryCount << + " of " << mMaxRetryCount << + " in " << theTimeToNextRetry << " sec." << + " op: " << + (mCurOpPtr ? mCurOpPtr->Show() : string("NULL")) << + KFS_LOG_EOM; + mErrorCode = 0; + if (&mGetRecordAppendOpStatusOp != mCurOpPtr) { + Reset(); + } + Sleep(theTimeToNextRetry); + if (! 
mSleepingFlag) { + Timeout(); + } + return; + } + FatalError(); + } + void FatalError( + int inErrorCode = 0) + { + if (inErrorCode != 0) { + mErrorCode = inErrorCode; + } + if (mErrorCode == 0) { + mErrorCode = -1; + } + mOpenFlag = false; + mOpeningFlag = false; + mClosingFlag = false; + mCurOpPtr = 0; + ReportCompletion(); + } + void HandleCancel() + { + if (&mAllocOp == mCurOpPtr || + &mLookupOp == mCurOpPtr || + &mCreateOp == mCurOpPtr) { + mStats.mMetaOpsCancelledCount++; + } + if (! mCurOpPtr) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "NULL operation canceled" << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation canceled " << mCurOpPtr->Show() << + KFS_LOG_EOM; + } + mCurOpPtr = 0; + mErrorCode = kErrorOpCanceled; + } + void ReportCompletion() + { + if (mErrorCode == 0) { + // Reset retry counts on successful completion. + mRetryCount = 0; + mAppendRestartRetryCount = 0; + } + if (mCompletionPtr) { + mCompletionPtr->Done(mOuter, mErrorCode); + } + } + bool Sleep(int inSec) + { + if (inSec <= 0 || mSleepingFlag) { + return false; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "sleeping: " << inSec << + " append: " << mWriteQueue.front() << + " pending: queue: " << mWriteQueue.size() << + " bytes: " << mBuffer.BytesConsumable() << + " cur op: " << + (mCurOpPtr ? mCurOpPtr->Show() : string("none")) << + KFS_LOG_EOM; + mSleepingFlag = true; + mStats.mSleepTimeSec += inSec; + const bool kResetTimerFlag = true; + SetTimeoutInterval(inSec * 1000, kResetTimerFlag); + mNetManager.RegisterTimeoutHandler(this); + return true; + } + virtual void Timeout() + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "timeout: " + " append: " << mWriteQueue.front() << + " pending: queue: " << mWriteQueue.size() << + " bytes: " << mBuffer.BytesConsumable() << + " cur op: " << + (mCurOpPtr ? 
mCurOpPtr->Show() : string("none")) << + KFS_LOG_EOM; + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + if (&mGetRecordAppendOpStatusOp == mCurOpPtr) { + GetLastRecordAppendOpStatus(); + } else { + StartAppend(); + } + } +private: + Impl( + const Impl& inAppender); + Impl& operator=( + const Impl& inAppender); +}; + +WriteAppender::WriteAppender( + MetaServer& inMetaServer, + Completion* inCompletionPtr /* = 0 */, + int inMaxRetryCount /* = 6 */, + int inWriteThreshold /* = KFS::CHECKSUM_BLOCKSIZE */, + int inTimeSecBetweenRetries /* = 15 */, + int inDefaultSpaceReservationSize /* = 1 << 20 */, + int inPreferredAppendSize /* = KFS::CHECKSUM_BLOCKSIZE */, + int inMaxPartialBuffersCount /* = 16 */, + int inOpTimeoutSec /* = 30 */, + int inIdleTimeoutSec /* = 5 * 30 */, + const char* inLogPrefixPtr /* = 0 */, + int64_t inChunkServerInitialSeqNum /* = 1 */, + bool inPreAllocationFlag /* = true */, + ClientPool* inClientPoolPtr /* = 0 */) + : mImpl(*new WriteAppender::Impl( + *this, + inMetaServer, + inCompletionPtr, + inMaxRetryCount, + inWriteThreshold, + inTimeSecBetweenRetries, + inDefaultSpaceReservationSize, + inPreferredAppendSize, + inMaxPartialBuffersCount, + inOpTimeoutSec, + inIdleTimeoutSec, + inPreAllocationFlag, + (inLogPrefixPtr && inLogPrefixPtr[0]) ? 
+ (inLogPrefixPtr + string(" ")) : string(), + inChunkServerInitialSeqNum, + inClientPoolPtr + )) +{ +} + +/* virtual */ +WriteAppender::~WriteAppender() +{ + delete &mImpl; +} + +int +WriteAppender::Open( + const char* inFileNamePtr, + int inNumReplicas /* = 3 */, + bool inMakeDirsFlag /* = false */) +{ + return mImpl.Open(inFileNamePtr, inNumReplicas, inMakeDirsFlag); +} + +int +WriteAppender::Open( + kfsFileId_t inFileId, + const char* inFileNamePtr) +{ + return mImpl.Open(inFileId, inFileNamePtr); +} + +int +WriteAppender::Close() +{ + return mImpl.Close(); +} + +int +WriteAppender::Append( + IOBuffer& inBuffer, + int inLength) +{ + return mImpl.Append(inBuffer, inLength); +} + +void +WriteAppender::Shutdown() +{ + mImpl.Shutdown(); +} + +bool +WriteAppender::IsOpen() const +{ + return mImpl.IsOpen(); +} + +bool +WriteAppender::IsOpening() const +{ + return mImpl.IsOpening(); +} + +bool +WriteAppender::IsClosing() const +{ + return mImpl.IsClosing(); +} + +bool +WriteAppender::IsSleeping() const +{ + return mImpl.IsSleeping(); +} + +bool +WriteAppender::IsActive() const +{ + return mImpl.IsActive(); +} + +int +WriteAppender::GetPendingSize() const +{ + return mImpl.GetPendingSize(); +} + +int +WriteAppender::GetErrorCode() const +{ + return mImpl.GetErrorCode(); +} + +int +WriteAppender::SetWriteThreshold( + int inThreshold) +{ + return mImpl.SetWriteThreshold(inThreshold); +} + +void +WriteAppender::Register( + Completion* inCompletionPtr) +{ + mImpl.Register(inCompletionPtr); +} + +bool +WriteAppender::Unregister( + Completion* inCompletionPtr) +{ + return mImpl.Unregister(inCompletionPtr); +} + +void +WriteAppender::GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) +{ + mImpl.GetStats(outStats, outChunkServersStats); +} + +string +WriteAppender::GetServerLocation() const +{ + return mImpl.GetServerLocation(); +} + +int +WriteAppender::SetPreAllocation( + bool inFlag) +{ + return mImpl.SetPreAllocation(inFlag); +} + +bool 
+WriteAppender::GetPreAllocation() const +{ + return mImpl.GetPreAllocation(); +} + +void +WriteAppender::SetForcedAllocationInterval( + int inInterval) +{ + return mImpl.SetForcedAllocationInterval(inInterval); +} + +} +} diff --git a/src/cc/libclient/WriteAppender.h b/src/cc/libclient/WriteAppender.h new file mode 100644 index 000000000..ea9f909bf --- /dev/null +++ b/src/cc/libclient/WriteAppender.h @@ -0,0 +1,215 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/05/20 +// Author: Mike Ovsiannikov +// +// Copyright 2009-2011 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef WRITE_APPENDER_H +#define WRITE_APPENDER_H + +#include "KfsNetClient.h" +#include "common/kfstypes.h" +#include "kfsio/checksum.h" + +#include +#include + +namespace KFS +{ +class IOBuffer; + +namespace client +{ + +class ClientPool; + +// Kfs client write append state machine. 
+class WriteAppender +{ +public: + class Completion + { + public: + virtual void Done( + WriteAppender& inAppender, + int inStatusCode) = 0; + virtual void Unregistered( + WriteAppender& /* inAppender */) + {} + protected: + Completion() + {} + Completion( + const Completion&) + {} + virtual ~Completion() + {} + }; + struct Stats + { + typedef int64_t Counter; + Stats() + : mMetaOpsQueuedCount(0), + mMetaOpsCancelledCount(0), + mSleepTimeSec(0), + mChunkAllocCount(0), + mReserveSpaceCount(0), + mReserveSpaceDeniedCount(0), + mOpsRecAppendCount(0), + mAllocRetriesCount(0), + mRetriesCount(0), + mBufferCompactionCount(0), + mAppendCount(0), + mAppendByteCount(0) + {} + void Clear() + { *this = Stats(); } + Stats& Add( + const Stats& inStats) + { + mMetaOpsQueuedCount += inStats.mMetaOpsQueuedCount; + mMetaOpsCancelledCount += inStats.mMetaOpsCancelledCount; + mSleepTimeSec += inStats.mSleepTimeSec; + mChunkAllocCount += inStats.mChunkAllocCount; + mReserveSpaceCount += inStats.mReserveSpaceCount; + mReserveSpaceDeniedCount += inStats.mReserveSpaceDeniedCount; + mOpsRecAppendCount += inStats.mOpsRecAppendCount; + mAllocRetriesCount += inStats.mAllocRetriesCount; + mRetriesCount += inStats.mRetriesCount; + mBufferCompactionCount += inStats.mBufferCompactionCount; + mAppendCount += inStats.mAppendCount; + mAppendByteCount += inStats.mAppendByteCount; + return *this; + } + std::ostream& Display( + std::ostream& inStream, + const char* inSeparatorPtr = 0, + const char* inDelimiterPtr = 0) const + { + const char* const theSeparatorPtr = + inSeparatorPtr ? inSeparatorPtr : " "; + const char* const theDelimiterPtr = + inDelimiterPtr ? 
inDelimiterPtr : ": "; + inStream << + "MetaOpsQueued" << theDelimiterPtr << + mMetaOpsQueuedCount << theSeparatorPtr << + "MetaOpsCancelled" << theDelimiterPtr << + mMetaOpsCancelledCount << theSeparatorPtr << + "SleepTimeSec" << theDelimiterPtr << + mSleepTimeSec << theSeparatorPtr << + "ChunkAlloc" << theDelimiterPtr << + mChunkAllocCount << theSeparatorPtr << + "ReserveSpace" << theDelimiterPtr << + mReserveSpaceCount << theSeparatorPtr << + "ReserveSpaceDenied" << theDelimiterPtr << + mReserveSpaceDeniedCount << theSeparatorPtr << + "OpsRecAppend" << theDelimiterPtr << + mOpsRecAppendCount << theSeparatorPtr << + "AllocRetries" << theDelimiterPtr << + mAllocRetriesCount << theSeparatorPtr << + "Retries" << theDelimiterPtr << + mRetriesCount << theSeparatorPtr << + "BufferCompaction" << theDelimiterPtr << + mBufferCompactionCount << theSeparatorPtr << + "AppendCount" << theDelimiterPtr << + mAppendCount << theSeparatorPtr << + "AppendByteCount" << theDelimiterPtr << + mAppendByteCount + ; + return inStream; + } + Counter mMetaOpsQueuedCount; + Counter mMetaOpsCancelledCount; + Counter mSleepTimeSec; + Counter mChunkAllocCount; + Counter mReserveSpaceCount; + Counter mReserveSpaceDeniedCount; + Counter mOpsRecAppendCount; + Counter mAllocRetriesCount; + Counter mRetriesCount; + Counter mBufferCompactionCount; + Counter mAppendCount; + Counter mAppendByteCount; + }; + typedef KfsNetClient MetaServer; + WriteAppender( + MetaServer& inMetaServer, + Completion* inCompletionPtr = 0, + int inMaxRetryCount = 6, + int inWriteThreshold = KFS::CHECKSUM_BLOCKSIZE, + int inTimeSecBetweenRetries = 15, + int inDefaultSpaceReservationSize = 1 << 20, + int inPreferredAppendSize = KFS::CHECKSUM_BLOCKSIZE, + int inMaxPartialBuffersCount = 16, + int inOpTimeoutSec = 30, + int inIdleTimeoutSec = 5 * 30, + const char* inLogPrefixPtr = 0, + int64_t inChunkServerInitialSeqNum = 1, + bool inPreAllocationFlag = true, + ClientPool* inClientPoolPtr = 0); + virtual ~WriteAppender(); + int 
Open( + const char* inFileNamePtr, + int inNumReplicas = 3, + bool inMakeDirsFlag = false); + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr); + int Close(); + int Append( + IOBuffer& inBuffer, + int inLength); + void Shutdown(); + bool IsOpen() const; + bool IsOpening() const; + bool IsClosing() const; + bool IsSleeping() const; + bool IsActive() const; + int GetPendingSize() const; + int GetErrorCode() const; + int SetWriteThreshold( + int inThreshold); + void Register( + Completion* inCompletionPtr); + bool Unregister( + Completion* inCompletionPtr); + void GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats); + std::string GetServerLocation() const; + int SetPreAllocation( + bool inFlag); + bool GetPreAllocation() const; + void SetForcedAllocationInterval( + int inInterval); +private: + class Impl; + Impl& mImpl; +private: + WriteAppender( + const WriteAppender& inAppender); + WriteAppender& operator=( + const WriteAppender& inAppender); +}; +}} + +#endif /* WRITE_APPENDER_H */ diff --git a/src/cc/libclient/Writer.cc b/src/cc/libclient/Writer.cc new file mode 100644 index 000000000..d51dc7d2a --- /dev/null +++ b/src/cc/libclient/Writer.cc @@ -0,0 +1,1903 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/06/11 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#include "Writer.h" + +#include +#include +#include +#include +#include +#include + +#include "kfsio/IOBuffer.h" +#include "kfsio/NetManager.h" +#include "kfsio/checksum.h" +#include "kfsio/ITimeout.h" +#include "common/kfsdecls.h" +#include "common/MsgLogger.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCDLList.h" +#include "RSStriper.h" +#include "KfsOps.h" +#include "utils.h" +#include "KfsClient.h" + +namespace KFS +{ +namespace client +{ + +using std::min; +using std::max; +using std::string; +using std::ostream; +using std::ostringstream; +using std::istringstream; + +// Kfs client write state machine implementation. +class Writer::Impl : + public QCRefCountedObj, + private ITimeout, + private KfsNetClient::OpOwner +{ +public: + typedef QCRefCountedObj::StRef StRef; + + enum + { + kErrorNone = 0, + kErrorParameters = -EINVAL, + kErrorTryAgain = -EAGAIN, + kErrorFault = -EFAULT, + kErrorNoEntry = -ENOENT + }; + + Impl( + Writer& inOuter, + MetaServer& inMetaServer, + Completion* inCompletionPtr, + int inMaxRetryCount, + int inWriteThreshold, + int inMaxPartialBuffersCount, + int inTimeSecBetweenRetries, + int inOpTimeoutSec, + int inIdleTimeoutSec, + int inMaxWriteSize, + string inLogPrefix, + int64_t inChunkServerInitialSeqNum) + : QCRefCountedObj(), + ITimeout(), + KfsNetClient::OpOwner(), + mOuter(inOuter), + mMetaServer(inMetaServer), + mPathName(), + mFileId(-1), + mClosingFlag(false), + mSleepingFlag(false), + mErrorCode(0), + mWriteThreshold(max(0, inWriteThreshold)), + mPartialBuffersCount(0), + mPendingCount(0), + mIdleTimeoutSec(inIdleTimeoutSec), + mOpTimeoutSec(inOpTimeoutSec), + mMaxRetryCount(inMaxRetryCount), + mTimeSecBetweenRetries(inTimeSecBetweenRetries), + 
mMaxPartialBuffersCount(inMaxPartialBuffersCount), + mMaxWriteSize(min((int)CHUNKSIZE, + (int)((max(0, inMaxWriteSize) + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE * CHECKSUM_BLOCKSIZE))), + mReplicaCount(-1), + mRetryCount(0), + mFileSize(0), + mOffset(0), + mOpenChunkBlockSize(CHUNKSIZE), + mChunkServerInitialSeqNum(inChunkServerInitialSeqNum), + mCompletionPtr(inCompletionPtr), + mBuffer(), + mLogPrefix(inLogPrefix), + mStats(), + mNetManager(mMetaServer.GetNetManager()), + mTruncateOp(0, 0, -1, 0), + mOpStartTime(0), + mCompletionDepthCount(0), + mStriperProcessCount(0), + mStriperPtr(0) + { Writers::Init(mWriters); } + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + int inReplicaCount) + { + if (inFileId <= 0 || ! inFileNamePtr || ! *inFileNamePtr) { + return kErrorParameters; + } + if (mFileId > 0) { + if (inFileId == mFileId && + inFileNamePtr == mPathName) { + return mErrorCode; + } + return kErrorParameters; + } + if (IsOpen() && mErrorCode != 0) { + return (mErrorCode < 0 ? mErrorCode : -mErrorCode); + } + if (mClosingFlag || mSleepingFlag) { + return kErrorTryAgain; + } + delete mStriperPtr; + string theErrMsg; + mStriperPtr = 0; + mOpenChunkBlockSize = Offset(CHUNKSIZE); + mStriperPtr = Striper::Create( + inStriperType, + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + inFileSize, + mLogPrefix, + *this, + mOpenChunkBlockSize, + theErrMsg + ); + if (! theErrMsg.empty()) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + theErrMsg << + KFS_LOG_EOM; + return kErrorParameters; + } + if (! 
mStriperPtr || mOpenChunkBlockSize < Offset(CHUNKSIZE)) { + mOpenChunkBlockSize = Offset(CHUNKSIZE); + } + mBuffer.Clear(); + mStats.Clear(); + mReplicaCount = inReplicaCount; + mFileSize = inFileSize; + mPartialBuffersCount = 0; + mPathName = inFileNamePtr; + mErrorCode = 0; + mFileId = inFileId; + mTruncateOp.fid = -1; + mTruncateOp.pathname = 0; + mTruncateOp.fileOffset = mFileSize; + mRetryCount = 0; + return StartWrite(); + } + int Close() + { + if (! IsOpen()) { + return 0; + } + if (mErrorCode != 0) { + return mErrorCode; + } + if (mClosingFlag) { + return kErrorTryAgain; + } + mClosingFlag = true; + return StartWrite(); + } + int Write( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + bool inFlushFlag, + int inWriteThreshold) + { + if (inOffset < 0) { + return kErrorParameters; + } + if (mErrorCode != 0) { + return (mErrorCode < 0 ? mErrorCode : -mErrorCode); + } + if (mClosingFlag || ! IsOpen()) { + return kErrorParameters; + } + if (inLength <= 0) { + return ( + (ReportCompletion(0, inLength, inOffset) && inFlushFlag) ? + StartWrite(true) : 0 + ); + } + if (inOffset != mOffset + mBuffer.BytesConsumable()) { + // Just flush for now, do not try to optimize buffer rewrite. + const int thePrevRefCount = GetRefCount(); + const int theRet = Flush(); + if (theRet < 0) { + return theRet; + } + if (thePrevRefCount > GetRefCount()) { + return (mErrorCode < 0 ? mErrorCode : -mErrorCode); + } + mOffset = inOffset; + } + if (mMaxPartialBuffersCount == 0 || + inLength < IOBufferData::GetDefaultBufferSize() * 2) { + // If write size is small, then copy it into the last buffer. 
+ mBuffer.ReplaceKeepBuffersFull( + &inBuffer, mBuffer.BytesConsumable(), inLength); + } else { + if (mBuffer.IsEmpty()) { + mPartialBuffersCount = 0; + } + mBuffer.Move(&inBuffer, inLength); + mPartialBuffersCount++; + if (mMaxPartialBuffersCount >= 0 && + mPartialBuffersCount >= mMaxPartialBuffersCount) { + mBuffer.MakeBuffersFull(); + mPartialBuffersCount = 0; + mStats.mBufferCompactionCount++; + } + } + if (inWriteThreshold >= 0) { + mWriteThreshold = inWriteThreshold; + } + const int theErrorCode = StartWrite(inFlushFlag); + return (theErrorCode == 0 ? inLength : + (theErrorCode < 0 ? theErrorCode : -theErrorCode)); + } + int Flush() + { + const int theErrorCode = StartWrite(true); + return (theErrorCode < 0 ? theErrorCode : -theErrorCode); + } + void Stop() + { + while (! Writers::IsEmpty(mWriters)) { + delete Writers::Front(mWriters); + } + if (mTruncateOp.fid >= 0) { + mMetaServer.Cancel(&mTruncateOp, this); + } + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + mClosingFlag = false; + mBuffer.Clear(); + } + void Shutdown() + { + Stop(); + mFileId = -1; + mErrorCode = 0; + } + bool IsOpen() const + { return (mFileId > 0); } + bool IsClosing() const + { return (IsOpen() && mClosingFlag); } + bool IsActive() const + { + return ( + IsOpen() && ( + ! mBuffer.IsEmpty() || + ! Writers::IsEmpty(mWriters) || + mClosingFlag) + ); + } + Offset GetPendingSize() const + { return (GetPendingSizeSelf() + mPendingCount); } + int SetWriteThreshold( + int inThreshold) + { + const int theThreshold = max(0, inThreshold); + const bool theStartWriteFlag = mWriteThreshold > theThreshold; + mWriteThreshold = theThreshold; + return ((theStartWriteFlag && IsOpen() && mErrorCode == 0) ? 
+ StartWrite() : mErrorCode + ); + } + void DisableCompletion() + { mCompletionPtr = 0; } + void Register( + Completion* inCompletionPtr) + { + if (inCompletionPtr == mCompletionPtr) { + return; + } + if (mCompletionPtr) { + mCompletionPtr->Unregistered(mOuter); + } + mCompletionPtr = inCompletionPtr; + } + bool Unregister( + Completion* inCompletionPtr) + { + if (inCompletionPtr != mCompletionPtr) { + return false; + } + mCompletionPtr = 0; + return true; + } + void GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) + { + outStats = mStats; + outChunkServersStats = mChunkServersStats; + } + bool GetErrorCode() const + { return mErrorCode; } + +private: + typedef KfsNetClient ChunkServer; + + class ChunkWriter : private ITimeout, private KfsNetClient::OpOwner + { + public: + struct WriteOp; + typedef QCDLList Queue; + typedef QCDLList Writers; + + struct WriteOp : public KfsOp + { + WritePrepareOp mWritePrepareOp; + WriteSyncOp mWriteSyncOp; + IOBuffer mBuffer; + size_t mBeginBlock; + size_t mEndBlock; + time_t mOpStartTime; + bool mChecksumValidFlag; + WriteOp* mPrevPtr[1]; + WriteOp* mNextPtr[1]; + + WriteOp() + : KfsOp(CMD_WRITE, 0), + mWritePrepareOp(0, 0, 0), + mWriteSyncOp(), + mBuffer(), + mBeginBlock(0), + mEndBlock(0), + mOpStartTime(0), + mChecksumValidFlag(false) + { Queue::Init(*this); } + void Delete( + WriteOp** inListPtr) + { + Queue::Remove(inListPtr, *this); + delete this; + } + virtual void Request( + ostream& inStream) + { + if (mWritePrepareOp.replyRequestedFlag) { + mWritePrepareOp.seq = seq; + } else { + mWriteSyncOp.seq = seq; + mWritePrepareOp.seq = seq + 1; + } + mWritePrepareOp.Request(inStream); + } + virtual bool NextRequest( + kfsSeq_t inSeqNum, + ostream& inStream) + { + if (mWritePrepareOp.replyRequestedFlag) { + return false; + } + QCASSERT(seq <= inSeqNum && inSeqNum <= mWritePrepareOp.seq + 1); + if (mWritePrepareOp.seq < inSeqNum) { + return false; + } + mWriteSyncOp.Request(inStream); + return true; + } + 
virtual string Show() const + { + string theRet = mWritePrepareOp.Show(); + if (! mWritePrepareOp.replyRequestedFlag) { + theRet += " "; + theRet += mWriteSyncOp.Show(); + } + return theRet; + } + virtual void ParseResponseHeaderSelf( + const Properties& inProps) + { + if (contentLength > 0) { + KFS_LOG_STREAM_ERROR << + "invalid response content length: " << contentLength << + " " << mWriteSyncOp.Show() << + KFS_LOG_EOM; + contentLength = 0; + } + mWritePrepareOp.status = status; + mWritePrepareOp.statusMsg = statusMsg; + mWriteSyncOp.status = status; + mWriteSyncOp.statusMsg = statusMsg; + if (mWritePrepareOp.replyRequestedFlag) { + mWritePrepareOp.ParseResponseHeaderSelf(inProps); + } else { + mWriteSyncOp.ParseResponseHeaderSelf(inProps); + } + } + void InitBlockRange() + { + QCASSERT( + mWritePrepareOp.offset >= 0 && + mWritePrepareOp.offset + + mBuffer.BytesConsumable() <= (Offset)CHUNKSIZE + ); + mBeginBlock = mWritePrepareOp.offset / CHECKSUM_BLOCKSIZE; + mEndBlock = mBeginBlock + + (mBuffer.BytesConsumable() + CHECKSUM_BLOCKSIZE - 1) / + CHECKSUM_BLOCKSIZE; + } + private: + virtual ~WriteOp() + {} + WriteOp( + const WriteOp& inWriteOp); + WriteOp& operator=( + const WriteOp& inWriteOp); + }; + + ChunkWriter( + Impl& inOuter, + int64_t inSeqNum, + const string& inLogPrefix) + : ITimeout(), + KfsNetClient::OpOwner(), + mOuter(inOuter), + mChunkServer( + inOuter.mNetManager, + string(), -1, + // All chunk server retries are handled here + 0, // inMaxRetryCount + 0, // inTimeSecBetweenRetries, + inOuter.mOpTimeoutSec, + inOuter.mIdleTimeoutSec, + inSeqNum, + inLogPrefix.c_str(), + // Just fail the op. 
Error handler will reset connection and + // cancel all pending ops by calling Stop() + false // inResetConnectionOnOpTimeoutFlag + ), + mErrorCode(0), + mRetryCount(0), + mPendingCount(0), + mOpenChunkBlockFileOffset(-1), + mOpStartTime(0), + mWriteIds(), + mAllocOp(0, 0, ""), + mWriteIdAllocOp(0, 0, 0, 0, 0), + mCloseOp(0, 0), + mLastOpPtr(0), + mSleepingFlag(false), + mClosingFlag(false), + mLogPrefix(inLogPrefix), + mOpDoneFlagPtr(0), + mInFlightBlocks() + { + Queue::Init(mPendingQueue); + Queue::Init(mInFlightQueue); + Writers::Init(*this); + Writers::PushFront(mOuter.mWriters, *this); + mChunkServer.SetRetryConnectOnly(true); + mAllocOp.fileOffset = -1; + mAllocOp.invalidateAllFlag = false; + } + ~ChunkWriter() + { + ChunkWriter::Shutdown(); + ChunkServer::Stats theStats; + mChunkServer.GetStats(theStats); + mOuter.mChunkServersStats.Add(theStats); + Writers::Remove(mOuter.mWriters, *this); + } + void CancelClose() + { mClosingFlag = false; } + // The QueueWrite() guarantees that completion will not be invoked. + // The writes will be queued even if the writer is already in the error + // state: mErrorCode != 0. In the case of fatal error all pending writes + // are discarded when the writer gets deleted. + // + // StartWrite() must be called in order to start executing pending + // writes. + // This allows the caller to properly update its state before the writes + // get executed, and the corresponding completion(s) invoked. + int QueueWrite( + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + int inWriteThreshold) + { + int theSize = min(inBuffer.BytesConsumable(), inSize); + if (theSize <= 0) { + return 0; + } + const Offset kChunkSize = (Offset)CHUNKSIZE; + const int kChecksumBlockSize = (int)CHECKSUM_BLOCKSIZE; + QCRTASSERT(inOffset >= 0 && ! 
mClosingFlag); + const Offset theChunkOffset = inOffset % kChunkSize; + if (mAllocOp.fileOffset < 0) { + mAllocOp.fileOffset = inOffset - theChunkOffset; + mOpenChunkBlockFileOffset = mAllocOp.fileOffset - + mAllocOp.fileOffset % mOuter.mOpenChunkBlockSize; + } else { + QCRTASSERT(mAllocOp.fileOffset == inOffset - theChunkOffset); + } + theSize = min(theSize, (int)(kChunkSize - theChunkOffset)); + QCASSERT(theSize > 0); + Offset thePos = theChunkOffset; + // Try to append to the last pending op. + WriteOp* const theLastOpPtr = Queue::Back(mPendingQueue); + if (theLastOpPtr) { + WriteOp& theOp = *theLastOpPtr; + const int theOpSize = theOp.mBuffer.BytesConsumable(); + const Offset theOpPos = theOp.mWritePrepareOp.offset; + if (theOpPos + theOpSize == thePos) { + const int theHead = (int)(theOpPos % kChecksumBlockSize); + int theNWr = min(theSize, + (theHead == 0 ? + mOuter.mMaxWriteSize : + kChecksumBlockSize - theHead + ) - theOpSize + ); + if (theNWr > 0 && + theOpSize + theNWr > kChecksumBlockSize) { + theNWr -= (theOpSize + theNWr) % kChecksumBlockSize; + } + if (theNWr > 0) { + theOp.mBuffer.Move(&inBuffer, theNWr); + // Force checksum recomputation. + theOp.mChecksumValidFlag = false; + theOp.mWritePrepareOp.checksums.clear(); + // Update last the block index. + const int theCurBegin = theOp.mBeginBlock; + theOp.InitBlockRange(); + theOp.mBeginBlock = theCurBegin; + // The op is already in the pending queue. + theSize -= theNWr; + thePos += theNWr; + } + } + } + const int theWriteThreshold = thePos + theSize >= kChunkSize ? 
+ 1 : max(inWriteThreshold, 1); + const int theBlockOff = (int)(thePos % kChecksumBlockSize); + if (theBlockOff > 0 && (theSize >= theWriteThreshold || + theBlockOff + theSize >= kChecksumBlockSize)) { + WriteOp* const theWriteOpPtr = new WriteOp(); + theWriteOpPtr->mWritePrepareOp.offset = thePos; + const int theNWr = theWriteOpPtr->mBuffer.Move( + &inBuffer, + min(theSize, kChecksumBlockSize - theBlockOff) + ); + theSize -= theNWr; + thePos += theNWr; + theWriteOpPtr->InitBlockRange(); + Queue::PushBack(mPendingQueue, *theWriteOpPtr); + } + while (theSize >= theWriteThreshold) { + int theOpSize = min(mOuter.mMaxWriteSize, theSize); + if (theOpSize > kChecksumBlockSize) { + theOpSize -= theOpSize % kChecksumBlockSize; + } + WriteOp* const theWriteOpPtr = new WriteOp(); + theWriteOpPtr->mWritePrepareOp.offset = thePos; + const int theNWr = + theWriteOpPtr->mBuffer.Move(&inBuffer, theOpSize); + theSize -= theNWr; + thePos += theNWr; + theWriteOpPtr->InitBlockRange(); + Queue::PushBack(mPendingQueue, *theWriteOpPtr); + } + QCRTASSERT(thePos <= kChunkSize && theSize >= 0); + const Offset theNWr = thePos - theChunkOffset; + // The following must be updated before invoking StartWrite(), + // as it could invoke completion immediately (in the case of + // failure). + mPendingCount += theNWr; + return theNWr; + } + void StartWrite() + { + if (mSleepingFlag) { + return; + } + if (mErrorCode != 0 && ! mAllocOp.invalidateAllFlag) { + if (mLastOpPtr) { + Reset(); + } + mClosingFlag = false; + return; + } + if (mClosingFlag && ! CanWrite()) { + if (! Queue::IsEmpty(mInFlightQueue)) { + return; + } + if (mLastOpPtr == &mCloseOp) { + return; + } + // Try to close chunk even if chunk server disconnected, to + // release the write lease. 
+ if (mAllocOp.chunkId > 0) { + CloseChunk(); + return; + } + mChunkServer.Stop(); + if (mLastOpPtr == &mAllocOp) { + mOuter.mMetaServer.Cancel(mLastOpPtr, this); + } + mClosingFlag = false; + mAllocOp.fileOffset = -1; + mAllocOp.chunkId = -1; + ReportCompletion(); + return; + } + if (! CanWrite()) { + return; + } + if (mAllocOp.chunkId > 0 && mChunkServer.WasDisconnected()) { + // When chunk server disconnects it might clean up write lease. + // Start from the beginning -- chunk allocation. + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "detected chunk server disconnect: " << + mChunkServer.GetServerLocation() << + " starting from chunk allocation, pending:" << + " queue: " << (Queue::IsEmpty(mPendingQueue) ? "" : "not") << + " empty" << + KFS_LOG_EOM; + Reset(); + if (! CanWrite()) { + // Do not try to preallocate chunk after inactivity timeout + // or error, if no data pending. + return; + } + } + // Return immediately after calling Write() and AllocateChunk(), as + // these can invoke completion. Completion, in turn, can delete + // this. + // Other methods of this class have to return immediately (unwind) + // after invoking StartWrite(). + if (mAllocOp.chunkId > 0 && ! mWriteIds.empty()) { + Write(); + } else if (! mLastOpPtr) { // Close can be in flight. + Reset(); + AllocateChunk(); + } + } + void Close() + { + if (! mClosingFlag && IsOpen()) { + mClosingFlag = true; + StartWrite(); + } + } + void Shutdown() + { + Reset(); + QCRTASSERT(Queue::IsEmpty(mInFlightQueue)); + while (! Queue::IsEmpty(mPendingQueue)) { + Queue::Front(mPendingQueue)->Delete(mPendingQueue); + } + mClosingFlag = false; + mErrorCode = 0; + mPendingCount = 0; + } + Offset GetFileOffset() const + { + return (mErrorCode == 0 ? mAllocOp.fileOffset : -1); + } + bool IsIdle() const + { + return ( + Queue::IsEmpty(mPendingQueue) && + Queue::IsEmpty(mInFlightQueue) && + ! mClosingFlag + ); + } + bool IsOpen() const + { + return ( + mErrorCode == 0 && + mAllocOp.fileOffset >= 0 && + ! 
mClosingFlag + ); + } + int GetErrorCode() const + { return mErrorCode; } + Offset GetPendingCount() const + { return mPendingCount; } + ChunkWriter* GetPrevPtr() + { + ChunkWriter& thePrev = ChunkWritersListOp::GetPrev(*this); + return (&thePrev == this ? 0 : &thePrev); + } + Offset GetOpenChunkBlockFileOffset() const + { + return (mAllocOp.fileOffset >= 0 ? mOpenChunkBlockFileOffset : -1); + } + + private: + typedef std::vector WriteIds; + typedef std::bitset ChecksumBlocks; + + Impl& mOuter; + ChunkServer mChunkServer; + int mErrorCode; + int mRetryCount; + Offset mPendingCount; + Offset mOpenChunkBlockFileOffset; + time_t mOpStartTime; + WriteIds mWriteIds; + AllocateOp mAllocOp; + WriteIdAllocOp mWriteIdAllocOp; + CloseOp mCloseOp; + KfsOp* mLastOpPtr; + bool mSleepingFlag; + bool mClosingFlag; + string const mLogPrefix; + bool* mOpDoneFlagPtr; + ChecksumBlocks mInFlightBlocks; + WriteOp* mPendingQueue[1]; + WriteOp* mInFlightQueue[1]; + ChunkWriter* mPrevPtr[1]; + ChunkWriter* mNextPtr[1]; + + friend class QCDLListOp; + typedef QCDLListOp ChunkWritersListOp; + + void AllocateChunk() + { + QCASSERT( + mOuter.mFileId > 0 && + mAllocOp.fileOffset >= 0 && + ! Queue::IsEmpty(mPendingQueue) + ); + Reset(mAllocOp); + mAllocOp.fid = mOuter.mFileId; + mAllocOp.pathname = mOuter.mPathName; + mAllocOp.append = false; + mAllocOp.chunkId = -1; + mAllocOp.chunkVersion = -1; + mAllocOp.spaceReservationSize = 0; + mAllocOp.maxAppendersPerChunk = 0; + mAllocOp.chunkServers.clear(); + mOuter.mStats.mChunkAllocCount++; + EnqueueMeta(mAllocOp); + } + void Done( + AllocateOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&mAllocOp == &inOp && ! inBufferPtr); + if (inCanceledFlag) { + return; + } + if (inOp.status != 0 || (mAllocOp.chunkServers.empty() && + ! mAllocOp.invalidateAllFlag)) { + mAllocOp.chunkId = 0; + HandleError(inOp); + return; + } + if (mAllocOp.invalidateAllFlag) { + // Report all writes completed. 
Completion does not expect the + // offset to match the original write offset with striper. + KFS_LOG_STREAM_INFO << mLogPrefix << + "invalidate done:" + " chunk: " << mAllocOp.chunkId << + " offset: " << mAllocOp.fileOffset << + " status: " << inOp.status << + " pending: " << mPendingCount << + " w-empty: " << Queue::IsEmpty(mPendingQueue) << + KFS_LOG_EOM; + const int theSize = mPendingCount; + const Offset theOffset = theSize > 0 ? mAllocOp.fileOffset : 0; + mAllocOp.invalidateAllFlag = false; + Shutdown(); + ReportCompletion(theOffset, theSize); + return; + } + AllocateWriteId(); + } + bool CanWrite() + { + return ( + ! Queue::IsEmpty(mPendingQueue) || + mAllocOp.invalidateAllFlag + ); + } + void AllocateWriteId() + { + QCASSERT(mAllocOp.chunkId > 0 && ! mAllocOp.chunkServers.empty()); + Reset(mWriteIdAllocOp); + mWriteIdAllocOp.chunkId = mAllocOp.chunkId; + mWriteIdAllocOp.chunkVersion = mAllocOp.chunkVersion; + mWriteIdAllocOp.isForRecordAppend = false; + mWriteIdAllocOp.chunkServerLoc = mAllocOp.chunkServers; + mWriteIdAllocOp.offset = 0; + mWriteIdAllocOp.numBytes = 0; + mWriteIdAllocOp.writePrepReplySupportedFlag = false; + if (mChunkServer.SetServer(mAllocOp.chunkServers[0])) { + Enqueue(mWriteIdAllocOp); + } else { + HandleError(mWriteIdAllocOp); + } + } + void Done( + WriteIdAllocOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&mWriteIdAllocOp == &inOp && ! inBufferPtr); + mWriteIds.clear(); + if (inCanceledFlag) { + return; + } + if (inOp.status < 0) { + HandleError(inOp); + return; + } + const size_t theServerCount = inOp.chunkServerLoc.size(); + mWriteIds.reserve(theServerCount); + istringstream theStream(inOp.writeIdStr); + for (size_t i = 0; i < theServerCount; i++) { + WriteInfo theWInfo; + if (! 
(theStream >> + theWInfo.serverLoc.hostname >> + theWInfo.serverLoc.port >> + theWInfo.writeId)) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "write id alloc:" + " at index: " << i << + " of: " << theServerCount << + " invalid response: " << inOp.writeIdStr << + KFS_LOG_EOM; + break; + } + mWriteIds.push_back(theWInfo); + } + if (theServerCount != mWriteIds.size()) { + HandleError(inOp); + return; + } + StartWrite(); + } + void Write() + { + if (mOpDoneFlagPtr) { + return; + } + bool theOpDoneFlag = false; + mOpDoneFlagPtr = &theOpDoneFlag; + Queue::Iterator theIt(mPendingQueue); + WriteOp* theOpPtr; + while (! mSleepingFlag && + mErrorCode == 0 && + mAllocOp.chunkId > 0 && + (theOpPtr = theIt.Next())) { + Write(*theOpPtr); + if (theOpDoneFlag) { + return; // Unwind. "this" might be deleted. + } + } + mOpDoneFlagPtr = 0; + } + void Write( + WriteOp& inWriteOp) + { + while (inWriteOp.mBeginBlock < inWriteOp.mEndBlock) { + if (mInFlightBlocks.test(inWriteOp.mBeginBlock)) { + return; // Wait until the in flight write done. + } + mInFlightBlocks.set(inWriteOp.mBeginBlock++, 1); + } + Reset(inWriteOp); + inWriteOp.contentLength = + size_t(inWriteOp.mBuffer.BytesConsumable()); + inWriteOp.mWritePrepareOp.chunkId = mAllocOp.chunkId; + inWriteOp.mWritePrepareOp.chunkVersion = + mAllocOp.chunkVersion; + inWriteOp.mWritePrepareOp.writeInfo = mWriteIds; + inWriteOp.mWritePrepareOp.contentLength = + inWriteOp.contentLength; + inWriteOp.mWritePrepareOp.numBytes = + inWriteOp.contentLength; + inWriteOp.mWritePrepareOp.replyRequestedFlag = + mWriteIdAllocOp.writePrepReplySupportedFlag; + // No need to recompute checksums on retry. Presently the buffer + // remains the unchanged. + if (inWriteOp.mWritePrepareOp.replyRequestedFlag) { + if (! 
inWriteOp.mChecksumValidFlag) { + inWriteOp.mWritePrepareOp.checksum = ComputeBlockChecksum( + &inWriteOp.mBuffer, + inWriteOp.mWritePrepareOp.numBytes + ); + inWriteOp.mChecksumValidFlag = true; + } + inWriteOp.mWritePrepareOp.checksums.clear(); + } else { + if (inWriteOp.mWritePrepareOp.checksums.empty()) { + inWriteOp.mWritePrepareOp.checksums = ComputeChecksums( + &inWriteOp.mBuffer, + inWriteOp.mWritePrepareOp.numBytes, + &inWriteOp.mWritePrepareOp.checksum + ); + inWriteOp.mChecksumValidFlag = true; + } + inWriteOp.mWriteSyncOp.chunkId = + inWriteOp.mWritePrepareOp.chunkId; + inWriteOp.mWriteSyncOp.chunkVersion = + inWriteOp.mWritePrepareOp.chunkVersion; + inWriteOp.mWriteSyncOp.offset = + inWriteOp.mWritePrepareOp.offset; + inWriteOp.mWriteSyncOp.numBytes = + inWriteOp.mWritePrepareOp.numBytes; + inWriteOp.mWriteSyncOp.writeInfo = + inWriteOp.mWritePrepareOp.writeInfo; + inWriteOp.mWriteSyncOp.checksums = + inWriteOp.mWritePrepareOp.checksums; + } + inWriteOp.mOpStartTime = Now(); + Queue::Remove(mPendingQueue, inWriteOp); + Queue::PushBack(mInFlightQueue, inWriteOp); + mOuter.mStats.mOpsWriteCount++; + Enqueue(inWriteOp, &inWriteOp.mBuffer); + } + void Done( + WriteOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT( + inBufferPtr == &inOp.mBuffer && + Queue::IsInList(mInFlightQueue, inOp) + ); + inOp.InitBlockRange(); + for (size_t i = inOp.mBeginBlock; i < inOp.mEndBlock; i++) { + mInFlightBlocks.set(i, 0); + } + if (inCanceledFlag || inOp.status < 0) { + Queue::Remove(mInFlightQueue, inOp); + Queue::PushBack(mPendingQueue, inOp); + if (! inCanceledFlag) { + mOpStartTime = inOp.mOpStartTime; + HandleError(inOp); + } + return; + } + const Offset theOffset = inOp.mWritePrepareOp.offset; + const Offset theDoneCount = inOp.mBuffer.BytesConsumable(); + QCASSERT( + theDoneCount >= 0 && + mPendingCount >= theDoneCount + ); + mPendingCount -= theDoneCount; + inOp.Delete(mInFlightQueue); + if (! 
ReportCompletion(theOffset, theDoneCount)) { + return; + } + StartWrite(); + } + void CloseChunk() + { + QCASSERT(mAllocOp.chunkId > 0); + Reset(mCloseOp); + mCloseOp.chunkId = mAllocOp.chunkId; + mCloseOp.writeInfo = mWriteIds; + if (mCloseOp.writeInfo.empty()) { + mCloseOp.chunkServerLoc = mAllocOp.chunkServers; + } else { + mCloseOp.chunkServerLoc.clear(); + } + mWriteIds.clear(); + mAllocOp.chunkId = -1; + Enqueue(mCloseOp); + } + void Done( + CloseOp& inOp, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(&mCloseOp == &inOp && ! inBufferPtr); + if (inCanceledFlag) { + return; + } + if (mCloseOp.status != 0) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "chunk close failure, status: " << mCloseOp.status << + " ignored" << + KFS_LOG_EOM; + } + Reset(); + StartWrite(); + } + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (mOpDoneFlagPtr) { + *mOpDoneFlagPtr = true; + mOpDoneFlagPtr = 0; + } + if (inOpPtr) { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "<- " << (inCanceledFlag ? "canceled " : "") << + inOpPtr->Show() << + " status: " << inOpPtr->status << + " msg: " << inOpPtr->statusMsg << + " seq: " << inOpPtr->seq << + " len: " << inOpPtr->contentLength << + " buffer: " << static_cast(inBufferPtr) << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "<- " << (inCanceledFlag ? "canceled " : "") << + "NULL operation completion?" << + " buffer: " << static_cast(inBufferPtr) << + "/" << (inBufferPtr ? 
inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + } + if (inCanceledFlag && inOpPtr == &mAllocOp) { + mOuter.mStats.mMetaOpsCancelledCount++; + } + if (mLastOpPtr == inOpPtr) { + mLastOpPtr = 0; + } + if (&mAllocOp == inOpPtr) { + Done(mAllocOp, inCanceledFlag, inBufferPtr); + } else if (&mWriteIdAllocOp == inOpPtr) { + Done(mWriteIdAllocOp, inCanceledFlag, inBufferPtr); + } else if (&mAllocOp == inOpPtr) { + Done(mAllocOp, inCanceledFlag, inBufferPtr); + } else if (&mCloseOp == inOpPtr) { + Done(mCloseOp, inCanceledFlag, inBufferPtr); + } else if (inOpPtr->op == CMD_WRITE) { + Done(*static_cast(inOpPtr), + inCanceledFlag, inBufferPtr); + } else { + mOuter.InternalError("unexpected operation completion"); + } + } + void Enqueue( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, &mChunkServer); } + void EnqueueMeta( + KfsOp& inOp, + IOBuffer* inBufferPtr = 0) + { EnqueueSelf(inOp, inBufferPtr, 0); } + void Reset() + { + if (mLastOpPtr == &mAllocOp) { + mOuter.mMetaServer.Cancel(mLastOpPtr, this); + } + Reset(mAllocOp); + mWriteIds.clear(); + mAllocOp.chunkId = 0; + mLastOpPtr = 0; + mChunkServer.Stop(); + QCASSERT(Queue::IsEmpty(mInFlightQueue)); + if (mSleepingFlag) { + mOuter.mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + } + static void Reset( + KfsOp& inOp) + { + inOp.seq = 0; + inOp.status = 0; + inOp.statusMsg.clear(); + inOp.checksum = 0; + inOp.contentLength = 0; + inOp.contentBufLen = 0; + delete [] inOp.contentBuf; + inOp.contentBuf = 0; + } + int GetTimeToNextRetry() const + { + return max(mRetryCount >= 1 ? 
1 : 0, + mOuter.mTimeSecBetweenRetries - int(Now() - mOpStartTime)); + } + void HandleError( + KfsOp& inOp) + { + ostringstream theOStream; + inOp.Request(theOStream); + KFS_LOG_STREAM_ERROR << mLogPrefix << + "operation" + " failure, seq: " << inOp.seq << + " status: " << inOp.status << + " msg: " << inOp.statusMsg << + " op: " << inOp.Show() << + " current chunk server: " << mChunkServer.GetServerLocation() << + " chunkserver: " << (mChunkServer.IsDataSent() ? + (mChunkServer.IsAllDataSent() ? "all" : "partial") : + "no") << " data sent" << + "\nRequest:\n" << theOStream.str() << + KFS_LOG_EOM; + int theStatus = inOp.status; + if (&inOp == &mAllocOp && theStatus == kErrorNoEntry) { + // File deleted, and lease expired or meta server restarted. + KFS_LOG_STREAM_ERROR << mLogPrefix << + "file does not exist, giving up" << + KFS_LOG_EOM; + mErrorCode = theStatus; + Reset(); + mOuter.FatalError(theStatus); + return; + } + if (mOuter.mStriperPtr && ! mAllocOp.invalidateAllFlag && + mAllocOp.fileOffset >= 0 && + ! mOuter.mStriperPtr->IsWriteRetryNeeded( + mAllocOp.fileOffset, + mRetryCount, + mOuter.mMaxRetryCount, + theStatus)) { + KFS_LOG_STREAM_INFO << mLogPrefix << + "invalidate:" + " chunk: " << mAllocOp.chunkId << + " offset: " << mAllocOp.fileOffset << + " status: " << inOp.status << + " => " << theStatus << + " pending: " << mPendingCount << + " w-empty: " << Queue::IsEmpty(mPendingQueue) << + KFS_LOG_EOM; + mErrorCode = theStatus; + mAllocOp.invalidateAllFlag = true; + mRetryCount = 0; + Reset(); + QCASSERT(CanWrite()); + StartWrite(); + return; + } + if (++mRetryCount > mOuter.mMaxRetryCount) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "max retry reached: " << mRetryCount << ", giving up" << + KFS_LOG_EOM; + mErrorCode = theStatus < 0 ? theStatus : -1; + Reset(); + mOuter.FatalError(theStatus < 0 ? theStatus : -1); + return; + } + // Treat alloc failure the same as chunk server failure. 
+ if (&mAllocOp == mLastOpPtr) { + mOuter.mStats.mAllocRetriesCount++; + } + mOuter.mStats.mRetriesCount++; + const int theTimeToNextRetry = GetTimeToNextRetry(); + // Retry. + KFS_LOG_STREAM_INFO << mLogPrefix << + "scheduling retry: " << mRetryCount << + " of " << mOuter.mMaxRetryCount << + " in " << theTimeToNextRetry << " sec." << + " op: " << inOp.Show() << + KFS_LOG_EOM; + mErrorCode = 0; + Reset(); + Sleep(theTimeToNextRetry); + if (! mSleepingFlag) { + Timeout(); + } + } + bool Sleep( + int inSec) + { + if (inSec <= 0 || mSleepingFlag) { + return false; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "sleeping: " << inSec << + KFS_LOG_EOM; + mSleepingFlag = true; + mOuter.mStats.mSleepTimeSec += inSec; + const bool kResetTimerFlag = true; + SetTimeoutInterval(inSec * 1000, kResetTimerFlag); + mOuter.mNetManager.RegisterTimeoutHandler(this); + return true; + } + virtual void Timeout() + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << "timeout" << + KFS_LOG_EOM; + if (mSleepingFlag) { + mOuter.mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + StartWrite(); + } + bool ReportCompletion( + Offset inOffset = 0, + Offset inSize = 0) + { + if (mErrorCode == 0) { + // Reset retry counts on successful completion. + mRetryCount = 0; + } + return mOuter.ReportCompletion(this, inOffset, inSize); + } + time_t Now() const + { return mOuter.mNetManager.Now(); } + void EnqueueSelf( + KfsOp& inOp, + IOBuffer* inBufferPtr, + KfsNetClient* inServerPtr) + { + mLastOpPtr = &inOp; + mOpStartTime = Now(); + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> " << (inServerPtr ? "" : "meta ") << inOp.Show() << + " buffer: " << (void*)inBufferPtr << + "/" << (inBufferPtr ? inBufferPtr->BytesConsumable() : 0) << + KFS_LOG_EOM; + if (inServerPtr) { + mOuter.mStats.mChunkOpsQueuedCount++; + } else { + mOuter.mStats.mMetaOpsQueuedCount++; + } + if (! (inServerPtr ? *inServerPtr : mOuter.mMetaServer).Enqueue( + &inOp, this, inBufferPtr)) { + mOuter.InternalError(inServerPtr ? 
+ "chunk op enqueue failure" : + "meta op enqueue failure" + ); + inOp.status = kErrorFault; + OpDone(&inOp, false, inBufferPtr); + } + } + private: + ChunkWriter( + const ChunkWriter& inChunkWriter); + ChunkWriter& operator=( + const ChunkWriter& inChunkWriter); + }; + friend class ChunkWriter; + friend class Striper; + + typedef ChunkWriter::Writers Writers; + + Writer& mOuter; + MetaServer& mMetaServer; + string mPathName; + kfsFileId_t mFileId; + bool mClosingFlag; + bool mSleepingFlag; + int mErrorCode; + int mWriteThreshold; + int mPartialBuffersCount; + Offset mPendingCount; + const int mIdleTimeoutSec; + const int mOpTimeoutSec; + const int mMaxRetryCount; + const int mTimeSecBetweenRetries; + const int mMaxPartialBuffersCount; + const int mMaxWriteSize; + int mReplicaCount; + int mRetryCount; + Offset mFileSize; + Offset mOffset; + Offset mOpenChunkBlockSize; + int64_t mChunkServerInitialSeqNum; + Completion* mCompletionPtr; + IOBuffer mBuffer; + string const mLogPrefix; + Stats mStats; + KfsNetClient::Stats mChunkServersStats; + NetManager& mNetManager; + TruncateOp mTruncateOp; + time_t mOpStartTime; + int mCompletionDepthCount; + int mStriperProcessCount; + Striper* mStriperPtr; + ChunkWriter* mWriters[1]; + + void InternalError( + const char* inMsgPtr = 0) + { + if (inMsgPtr) { + KFS_LOG_STREAM_FATAL << inMsgPtr << KFS_LOG_EOM; + } + MsgLogger::Stop(); + abort(); + } + + virtual ~Impl() + { + Impl::DisableCompletion(); + Impl::Shutdown(); + delete mStriperPtr; + } + Offset GetPendingSizeSelf() const + { + return (mBuffer.BytesConsumable() + + (mStriperPtr ? + max(Offset(0), mStriperPtr->GetPendingSize()) : + Offset(0) + )); + } + int StartWrite( + bool inFlushFlag = false) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "start write:" << + " pending: " << GetPendingSizeSelf() << + " thresh: " << mWriteThreshold << + " flush: " << inFlushFlag << + (mSleepingFlag ? 
" SLEEPING" : "") << + KFS_LOG_EOM; + + if (mSleepingFlag) { + return mErrorCode; + } + const bool theFlushFlag = inFlushFlag || mClosingFlag; + const int theWriteThreshold = + max(1, theFlushFlag ? 1 : mWriteThreshold); + while (mErrorCode == 0 && GetPendingSizeSelf() >= theWriteThreshold) { + const int thePrevRefCount = GetRefCount(); + QueueWrite(theWriteThreshold); + if (thePrevRefCount > GetRefCount()) { + return mErrorCode; // Unwind + } + if (mBuffer.IsEmpty()) { + break; + } + } + if (! mClosingFlag) { + return mErrorCode; + } + if (Writers::IsEmpty(mWriters)) { + ReportCompletion(); + return mErrorCode; + } + Writers::Iterator theIt(mWriters); + ChunkWriter* thePtr; + while ((thePtr = theIt.Next())) { + if (! thePtr->IsOpen()) { + continue; + } + const int thePrevRefCount = GetRefCount(); + thePtr->Close(); + if (thePrevRefCount > GetRefCount()) { + return mErrorCode; // Unwind + } + // Restart from the beginning as close can invoke completion + // and remove or close more than one writer in TryToCloseIdle(). + theIt.Reset(); + } + if (Writers::IsEmpty(mWriters) && mClosingFlag) { + SetFileSize(); + } + return mErrorCode; + } + void SetFileSize() + { + if (! mStriperPtr || mErrorCode != 0 || mTruncateOp.fid >= 0) { + return; + } + const Offset theSize = mStriperPtr->GetFileSize(); + if (theSize < 0 || theSize <= mTruncateOp.fileOffset) { + return; + } + mOpStartTime = mNetManager.Now(); + mTruncateOp.pathname = mPathName.c_str(); + mTruncateOp.fid = mFileId; + mTruncateOp.fileOffset = theSize; + mTruncateOp.status = 0; + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "meta +> " << mTruncateOp.Show() << + KFS_LOG_EOM; + if (! mMetaServer.Enqueue(&mTruncateOp, this, 0)) { + InternalError("meta truncate enqueue failure"); + mTruncateOp.status = kErrorFault; + OpDone(&mTruncateOp, false, 0); + } + } + virtual void OpDone( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "meta <- " << (inOpPtr ? 
inOpPtr->Show() : string("null")) << + (inCanceledFlag ? " canceled" : "") << + " status: " << (inOpPtr ? inOpPtr->status : 0) << + " " << (inOpPtr ? inOpPtr->statusMsg : string()) << + KFS_LOG_EOM; + QCASSERT(inOpPtr == &mTruncateOp); + if (inOpPtr != &mTruncateOp) { + return; + } + mTruncateOp.pathname = 0; + mTruncateOp.fid = -1; + if (inCanceledFlag) { + mTruncateOp.fileOffset = -1; + return; + } + if (mTruncateOp.status != 0) { + KFS_LOG_STREAM_ERROR << mLogPrefix << + "set size failure:" + " offset: " << mTruncateOp.fileOffset << + " status: " << mTruncateOp.status << + " msg: " << mTruncateOp.statusMsg << + " retry: " << mRetryCount << + " of: " << mMaxRetryCount << + KFS_LOG_EOM; + mTruncateOp.fileOffset = -1; + if (++mRetryCount < mMaxRetryCount) { + Sleep(max( + mRetryCount > 1 ? 1 : 0, + mTimeSecBetweenRetries - + int(mNetManager.Now() - mOpStartTime) + )); + if (! mSleepingFlag) { + StartWrite(); + } + } else { + FatalError(mTruncateOp.status); + } + } else { + mRetryCount = 0; + ReportCompletion(); + } + } + bool Sleep( + int inSec) + { + if (inSec <= 0 || mSleepingFlag) { + return false; + } + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "sleeping: " << inSec << + KFS_LOG_EOM; + mSleepingFlag = true; + mStats.mSleepTimeSec += inSec; + const bool kResetTimerFlag = true; + SetTimeoutInterval(inSec * 1000, kResetTimerFlag); + mNetManager.RegisterTimeoutHandler(this); + return true; + } + virtual void Timeout() + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << "timeout" << + KFS_LOG_EOM; + if (mSleepingFlag) { + mNetManager.UnRegisterTimeoutHandler(this); + mSleepingFlag = false; + } + StartWrite(); + } + void QueueWrite( + int inWriteThreshold) + { + if (mStriperPtr) { + QCStValueIncrementor theIncrement(mStriperProcessCount, 1); + const int theErrCode = + mStriperPtr->Process(mBuffer, mOffset, inWriteThreshold); + if (theErrCode != 0 && mErrorCode == 0) { + mErrorCode = theErrCode; + } + return; + } + const int theQueuedCount = QueueWrite( + mBuffer, + 
mBuffer.BytesConsumable(), + mOffset, + inWriteThreshold + ); + if (theQueuedCount > 0) { + mOffset += theQueuedCount; + StartQueuedWrite(theQueuedCount); + } + } + int QueueWrite( + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + int inWriteThreshold) + { + QCASSERT(inOffset >= 0); + if (inSize <= 0 || inBuffer.BytesConsumable() <= 0) { + return 0; + } + const Offset theFileOffset = inOffset - inOffset % CHUNKSIZE; + Writers::Iterator theIt(mWriters); + ChunkWriter* thePtr; + while ((thePtr = theIt.Next())) { + if (thePtr->GetFileOffset() == theFileOffset) { + break; + } + } + if (thePtr) { + Writers::PushFront(mWriters, *thePtr); + thePtr->CancelClose(); + } else { + mChunkServerInitialSeqNum += 10000; + thePtr = new ChunkWriter( + *this, mChunkServerInitialSeqNum, mLogPrefix); + } + const int theQueuedCount = thePtr->QueueWrite( + inBuffer, inSize, inOffset, inWriteThreshold); + QCASSERT(Writers::Front(mWriters) == thePtr); + return theQueuedCount; + } + void StartQueuedWrite( + int inQueuedCount) + { + if (inQueuedCount <= 0) { + return; + } + QCASSERT(! Writers::IsEmpty(mWriters)); + mPendingCount += inQueuedCount; + Writers::Front(mWriters)->StartWrite(); + } + void FatalError( + int inErrorCode = 0) + { + if (mErrorCode == 0) { + mErrorCode = inErrorCode; + } + if (mErrorCode == 0) { + mErrorCode = -1; + } + mClosingFlag = false; + ReportCompletion(); + } + bool CanClose( + ChunkWriter& inWriter) + { + if (! inWriter.IsIdle()) { + return false; + } + if (! inWriter.IsOpen() || mClosingFlag) { + return true; + } + // The most recently used should always be first. + const ChunkWriter* const thePtr = Writers::Front(mWriters); + if (! 
thePtr) { + return true; + } + if (thePtr == &inWriter) { + return false; + } + const Offset theLeftEdge = thePtr->GetOpenChunkBlockFileOffset(); + if (theLeftEdge < 0) { + return false; + } + const Offset theRightEdge = theLeftEdge + mOpenChunkBlockSize; + const Offset theOffset = inWriter.GetFileOffset(); + return (theOffset < theLeftEdge || theRightEdge <= theOffset); + } + bool TryToCloseIdle( + ChunkWriter* inWriterPtr) + { + ChunkWriter* thePtr = Writers::Back(mWriters); + if (! thePtr) { + thePtr = inWriterPtr; + } + bool theRetFlag = true; + while (thePtr) { + ChunkWriter& theWriter = *thePtr; + thePtr = (thePtr == Writers::Front(mWriters)) ? + 0 : theWriter.GetPrevPtr(); + if (CanClose(theWriter)) { + const bool theOpenFlag = theWriter.IsOpen(); + if (theOpenFlag) { + theWriter.Close(); + } + // Handle "synchronous" Close(). ReportCompletion, calls + // this method only when mCompletionDepthCount <= 1 + if (! theOpenFlag || + (! theWriter.IsOpen() && CanClose(theWriter))) { + if (&theWriter == inWriterPtr) { + theRetFlag = false; + } + delete &theWriter; + } + } else if (theWriter.IsIdle() && theWriter.IsOpen()) { + // Stop at the first idle that can not be closed. + break; + } + } + return theRetFlag; + } + bool ReportCompletion( + ChunkWriter* inWriterPtr = 0, + Offset inOffset = 0, + Offset inSize = 0) + { + // Order matters here, as StRef desctructor can delete this. + StRef theRef(*this); + QCStValueIncrementor theIncrement(mCompletionDepthCount, 1); + + QCRTASSERT(mPendingCount >= 0 && mPendingCount >= inSize); + mPendingCount -= inSize; + if (inWriterPtr && mErrorCode == 0) { + mErrorCode = inWriterPtr->GetErrorCode(); + } + const int thePrevRefCount = GetRefCount(); + if (mCompletionPtr) { + mCompletionPtr->Done(mOuter, mErrorCode, inOffset, inSize); + } + bool theRet = true; + if (mCompletionDepthCount <= 1 && mStriperProcessCount <= 0) { + theRet = TryToCloseIdle(inWriterPtr); + if (mClosingFlag && Writers::IsEmpty(mWriters) && ! 
mSleepingFlag) { + SetFileSize(); + if (mTruncateOp.fid < 0 && ! mSleepingFlag) { + mClosingFlag = false; + mFileId = -1; + Striper* const theStriperPtr = mStriperPtr; + mStriperPtr = 0; + delete theStriperPtr; + theRet = false; + if (mCompletionPtr) { + mCompletionPtr->Done(mOuter, mErrorCode, 0, 0); + } + } + } + } + return (theRet && thePrevRefCount <= GetRefCount()); + } +private: + Impl( + const Impl& inWriter); + Impl& operator=( + const Impl& inWriter); +}; + +/* static */ Writer::Striper* +Writer::Striper::Create( + int inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + Writer::Striper::Offset inFileSize, + string inLogPrefix, + Writer::Striper::Impl& inOuter, + Writer::Striper::Offset& outOpenChunkBlockSize, + string& outErrMsg) +{ + switch (inType) { + case kStriperTypeNone: + outOpenChunkBlockSize = Offset(CHUNKSIZE); + break; + case kStriperTypeRS: + return RSStriperCreate( + kStriperTypeRS, + inStripeCount, + inRecoveryStripeCount, + inStripeSize, + inFileSize, + inLogPrefix, + inOuter, + outOpenChunkBlockSize, + outErrMsg + ); + default: + outErrMsg = "unsupported striper type"; + break; + } + return 0; +} + +int +Writer::Striper::QueueWrite( + IOBuffer& inBuffer, + int inSize, + Writer::Offset inOffset, + int inWriteThreshold) +{ + const int theQueuedCount = mOuter.QueueWrite( + inBuffer, inSize, inOffset, inWriteThreshold); + mWriteQueuedFlag = theQueuedCount > 0; + return theQueuedCount; +} + +void +Writer::Striper::StartQueuedWrite( + int inQueuedCount) +{ + if (! 
mWriteQueuedFlag) { + return; + } + mWriteQueuedFlag = false; + mOuter.StartQueuedWrite(inQueuedCount); +} + +Writer::Writer( + Writer::MetaServer& inMetaServer, + Writer::Completion* inCompletionPtr /* = 0 */, + int inMaxRetryCount /* = 6 */, + int inWriteThreshold /* = 1 << 20 */, + int inMaxPartialBuffersCount /* = 16 */, + int inTimeSecBetweenRetries /* = 15 */, + int inOpTimeoutSec /* = 30 */, + int inIdleTimeoutSec /* = 5 * 30 */, + int inMaxWriteSize /* = 1 << 20 */, + const char* inLogPrefixPtr /* = 0 */, + int64_t inChunkServerInitialSeqNum /* = 1 */) + : mImpl(*new Writer::Impl( + *this, + inMetaServer, + inCompletionPtr, + inMaxRetryCount, + inWriteThreshold, + inMaxPartialBuffersCount, + inTimeSecBetweenRetries, + inOpTimeoutSec, + inIdleTimeoutSec, + inMaxWriteSize, + (inLogPrefixPtr && inLogPrefixPtr[0]) ? + (inLogPrefixPtr + string(" ")) : string(), + inChunkServerInitialSeqNum + )) +{ + mImpl.Ref(); +} + +/* virtual */ +Writer::~Writer() +{ + mImpl.DisableCompletion(); + mImpl.UnRef(); +} + +int +Writer::Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + int inReplicaCount) +{ + Impl::StRef theRef(mImpl); + return mImpl.Open( + inFileId, + inFileNamePtr, + inFileSize, + inStriperType, + inStripeSize, + inStripeCount, + inRecoveryStripeCount, + inReplicaCount + ); +} + +int +Writer::Close() +{ + Impl::StRef theRef(mImpl); + return mImpl.Close(); +} + +int +Writer::Write( + IOBuffer& inBuffer, + int inLength, + Writer::Offset inOffset, + bool inFlushFlag, + int inWriteThreshold) +{ + Impl::StRef theRef(mImpl); + return mImpl.Write( + inBuffer, inLength, inOffset, inFlushFlag, inWriteThreshold); +} + +bool +Writer::IsOpen() const +{ + Impl::StRef theRef(mImpl); + return (mImpl.IsOpen() && ! 
IsClosing()); +} + +bool +Writer::IsClosing() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsClosing(); +} + +bool +Writer::IsActive() const +{ + Impl::StRef theRef(mImpl); + return mImpl.IsActive(); +} + +Writer::Offset +Writer::GetPendingSize() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetPendingSize(); +} + +int +Writer::GetErrorCode() const +{ + Impl::StRef theRef(mImpl); + return mImpl.GetErrorCode(); +} + +int +Writer::SetWriteThreshold( + int inThreshold) +{ + Impl::StRef theRef(mImpl); + return mImpl.SetWriteThreshold(inThreshold); +} + +int +Writer::Flush() +{ + Impl::StRef theRef(mImpl); + return mImpl.Flush(); +} + +void +Writer::Stop() +{ + Impl::StRef theRef(mImpl); + mImpl.Stop(); +} + +void +Writer::Shutdown() +{ + Impl::StRef theRef(mImpl); + mImpl.Shutdown(); +} + +void +Writer::Register( + Writer::Completion* inCompletionPtr) +{ + Impl::StRef theRef(mImpl); + mImpl.Register(inCompletionPtr); +} + +bool +Writer::Unregister( + Writer::Completion* inCompletionPtr) +{ + Impl::StRef theRef(mImpl); + return mImpl.Unregister(inCompletionPtr); +} + +void +Writer::GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats) +{ + Impl::StRef theRef(mImpl); + mImpl.GetStats(outStats, outChunkServersStats); +} + +}} diff --git a/src/cc/libclient/Writer.h b/src/cc/libclient/Writer.h new file mode 100644 index 000000000..4cbebac51 --- /dev/null +++ b/src/cc/libclient/Writer.h @@ -0,0 +1,267 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2010/06/22 +// Author: Mike Ovsiannikov +// +// Copyright 2010-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +//---------------------------------------------------------------------------- + +#ifndef WRITER_H +#define WRITER_H + +#include "KfsNetClient.h" +#include "common/kfstypes.h" + +#include +#include + +namespace KFS +{ + +class IOBuffer; + +namespace client +{ +// Kfs client write protocol state machine. +class Writer +{ +public: + typedef int64_t Offset; + class Impl; + + class Completion + { + public: + // For striped files inOffset and inSize parameters are the file + // offset and the size of the stripe (possibly recovery stripe), not + // the enqueued write request file offset and size. 
+ virtual void Done( + Writer& inWriter, + int inStatusCode, + Offset inOffset, + Offset inSize) = 0; + virtual void Unregistered( + Writer& /* inWriter */) + {} + protected: + Completion() + {} + Completion( + const Completion&) + {} + virtual ~Completion() + {} + }; + struct Stats + { + typedef int64_t Counter; + Stats() + : mMetaOpsQueuedCount(0), + mMetaOpsCancelledCount(0), + mChunkOpsQueuedCount(0), + mSleepTimeSec(0), + mChunkAllocCount(0), + mOpsWriteCount(0), + mAllocRetriesCount(0), + mRetriesCount(0), + mWriteCount(0), + mWriteByteCount(0), + mBufferCompactionCount(0) + {} + void Clear() + { *this = Stats(); } + Stats& Add( + const Stats& inStats) + { + mMetaOpsQueuedCount += inStats.mMetaOpsQueuedCount; + mMetaOpsCancelledCount += inStats.mMetaOpsCancelledCount; + mChunkOpsQueuedCount += inStats.mChunkOpsQueuedCount; + mSleepTimeSec += inStats.mSleepTimeSec; + mChunkAllocCount += inStats.mChunkAllocCount; + mOpsWriteCount += inStats.mOpsWriteCount; + mAllocRetriesCount += inStats.mAllocRetriesCount; + mRetriesCount += inStats.mRetriesCount; + mWriteCount += inStats.mWriteCount; + mWriteByteCount += inStats.mWriteByteCount; + mBufferCompactionCount += inStats.mBufferCompactionCount; + return *this; + } + std::ostream& Display( + std::ostream& inStream, + const char* inSeparatorPtr = 0, + const char* inDelimiterPtr = 0) const + { + const char* const theSeparatorPtr = + inSeparatorPtr ? inSeparatorPtr : " "; + const char* const theDelimiterPtr = + inDelimiterPtr ? 
inDelimiterPtr : ": "; + inStream << + "MetaOpsQueued" << theDelimiterPtr << + mMetaOpsQueuedCount << theSeparatorPtr << + "MetaOpsCancelled" << theDelimiterPtr << + mMetaOpsCancelledCount << theSeparatorPtr << + "ChunkOpsQueued" << theDelimiterPtr << + mChunkOpsQueuedCount << theSeparatorPtr << + "SleepTimeSec" << theDelimiterPtr << + mSleepTimeSec << theSeparatorPtr << + "ChunkAlloc" << theDelimiterPtr << + mChunkAllocCount << theSeparatorPtr << + "OpsWrite" << theDelimiterPtr << + mOpsWriteCount << theSeparatorPtr << + "AllocRetries" << theDelimiterPtr << + mAllocRetriesCount << theSeparatorPtr << + "BufferCompactionCount" << theDelimiterPtr << + mBufferCompactionCount << theSeparatorPtr << + "Retries" << theDelimiterPtr << + mRetriesCount << theSeparatorPtr << + "WriteCount" << theDelimiterPtr << + mWriteCount << theSeparatorPtr << + "WriteByteCount" << theDelimiterPtr << + mWriteByteCount + ; + return inStream; + } + Counter mMetaOpsQueuedCount; + Counter mMetaOpsCancelledCount; + Counter mChunkOpsQueuedCount; + Counter mSleepTimeSec; + Counter mChunkAllocCount; + Counter mOpsWriteCount; + Counter mAllocRetriesCount; + Counter mRetriesCount; + Counter mWriteCount; + Counter mWriteByteCount; + Counter mBufferCompactionCount; + }; + class Striper + { + public: + typedef Writer::Impl Impl; + typedef Writer::Offset Offset; + enum StriperType + { + kStriperTypeNone = KFS::KFS_STRIPED_FILE_TYPE_NONE, + kStriperTypeRS = KFS::KFS_STRIPED_FILE_TYPE_RS + }; + static Striper* Create( + int inType, + int inStripeCount, + int inRecoveryStripeCount, + int inStripeSize, + Offset inFileSize, + string inLogPrefix, + Impl& inOuter, + Offset& outOpenChunkBlockSize, + std::string& outErrMsg); + virtual ~Striper() + {} + virtual int Process( + IOBuffer& inBuffer, + Offset& ioOffset, + int inWriteThreshold) = 0; + virtual Offset GetPendingSize() const = 0; + virtual bool IsWriteRetryNeeded( + Offset inChunkOffset, + int inRetryCount, + int inMaxRetryCount, + int& ioStatus) = 0; + 
virtual Offset GetFileSize() const = 0; + protected: + Striper( + Impl& inOuter) + : mOuter(inOuter), + mWriteQueuedFlag(false) + {} + int QueueWrite( + IOBuffer& inBuffer, + int inSize, + Offset inOffset, + int inWriteThreshold); + void StartQueuedWrite( + int inQueuedCount); + bool IsWriteQueued() const + { return mWriteQueuedFlag; } + private: + Impl& mOuter; + bool mWriteQueuedFlag; + private: + Striper( + const Striper& inStriper); + Striper& operator=( + const Striper& inStipter); + }; + typedef KfsNetClient MetaServer; + Writer( + MetaServer& inMetaServer, + Completion* inCompletionPtr = 0, + int inMaxRetryCount = 6, + int inWriteThreshold = 1 << 20, + int inMaxPartialBuffersCount = 16, + int inTimeSecBetweenRetries = 15, + int inOpTimeoutSec = 30, + int inIdleTimeoutSec = 5 * 30, + int inMaxWriteSize = 1 << 20, + const char* inLogPrefixPtr = 0, + int64_t inChunkServerInitialSeqNum = 1); + virtual ~Writer(); + int Open( + kfsFileId_t inFileId, + const char* inFileNamePtr, + Offset inFileSize, + int inStriperType, + int inStripeSize, + int inStripeCount, + int inRecoveryStripeCount, + int inReplicaCount); + int Close(); + int Write( + IOBuffer& inBuffer, + int inLength, + Offset inOffset, + bool inFlushFlag, + int inWriteThreshold = -1); + int SetWriteThreshold( + int inThreshold); + int Flush(); + void Stop(); + void Shutdown(); + bool IsOpen() const; + bool IsClosing() const; + bool IsActive() const; + Offset GetPendingSize() const; + int GetErrorCode() const; + void Register( + Completion* inCompletionPtr); + bool Unregister( + Completion* inCompletionPtr); + void GetStats( + Stats& outStats, + KfsNetClient::Stats& outChunkServersStats); +private: + Impl& mImpl; +private: + Writer( + const Writer& inWriter); + Writer& operator=( + const Writer& inWriter); +}; +}} + +#endif /* WRITER_H */ diff --git a/src/cc/libclient/kfsglob.cc b/src/cc/libclient/kfsglob.cc new file mode 100644 index 000000000..d8d281c6a --- /dev/null +++ b/src/cc/libclient/kfsglob.cc @@ 
-0,0 +1,455 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/08/18 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Kfs glob() equivalent. +// +//---------------------------------------------------------------------------- + +#include "kfsglob.h" + +#include "KfsClient.h" + +#include "qcdio/QCMutex.h" +#include "qcdio/qcstutils.h" +#include "qcdio/qcdebug.h" +#include "qcdio/QCUtils.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +// Enabling thread local assumes that the libc glob() implementation is +// re-entrant. +#if ! defined(KFS_GLOB_USE_THREAD_LOCAL) && defined(__GNUC__) && \ + (! defined(QC_OS_NAME_DARWIN) || ((defined(__SIZEOF_POINTER__) && \ + __SIZEOF_POINTER__ == 8))) +# define KFS_GLOB_USE_THREAD_LOCAL +#endif + +namespace KFS +{ +namespace client +{ +using std::vector; +using std::string; +using std::find; + +class KfsOpenDir +{ +private: + typedef struct stat StatBuf; +public: + class Glob : private QCRefCountedObj + { + public: + static int Expand( + KfsClient& inClient, + const char* inGlobPtr, + int inGlobFlags, + int (*inErrorHandlerPtr)(const char* inErrPathPtr, int inError), + glob_t* inResultPtr) + { + if (! 
inResultPtr) { + errno = EFAULT; + return GLOB_ABORTED; + } + if ((inGlobFlags & GLOB_ALTDIRFUNC) != 0) { + errno = EINVAL; + return GLOB_ABORTED; + } + + Glob& theInstance = Instance(); + QCStMutexLocker theLock(GetMutexPtr()); + StRef theRef(theInstance); + // The mutex doesn't exists or cannot be deleted when "theRef" + // destructor is invoked as the condition in the following + // assertion shows, therefore no problem exists with "theLock" + // destructor releasing mutex after de-referencing. + // Such order of constructors and destructors ensures the + // reference counter access serialization with thread local + // disabled. + // With thread local disabled the instance must be constructed + // prior to entering this method, therefore this method must not + // be invoked before entering the "main()", otherwise the following + // assertion will fail. + // The assertion assumes no recursion, as GLOB_ALTDIRFUNC is not + // allowed. + QCRTASSERT(theInstance.GetRefCount() == (GetMutexPtr() ? 2 : 1)); + return theInstance.ExpandSelf(inClient, inGlobPtr, inGlobFlags, + inErrorHandlerPtr, inResultPtr); + } + private: + Glob() + : mClientPtr(0), + mDirToReusePtr(0), + mOpenDirs(), + mError(0), + mCwd(), + mTmpName() + { + // Insure that the mutex constructor is invoked. + GetMutexPtr(); + } + virtual ~Glob() + { + QCStMutexLocker theLock(GetMutexPtr()); + if (this == sInstancePtr) { + sInstancePtr = 0; + } + QCASSERT(mOpenDirs.empty()); + Reset(); + delete mDirToReusePtr; + } + static Glob& Instance() + { + if (! sInstancePtr) { + sInstancePtr = new Glob(); + } + return *sInstancePtr; + } + bool IsMutexOwner() const + { + QCMutex* const theMutexPtr = GetMutexPtr(); + return (! 
theMutexPtr || theMutexPtr->IsOwned()); + } + int ExpandSelf( + KfsClient& inClient, + const char* inGlobPtr, + int inGlobFlags, + int (*inErrorHandlerPtr)(const char* inErrPathPtr, int inError), + glob_t* inResultPtr) + { + QCASSERT(IsMutexOwner() && mOpenDirs.empty()); + Reset(); + + mClientPtr = &inClient; + mCwd = mClientPtr->GetCwd(); + if (*mCwd.rbegin() != '/') { + mCwd += "/"; + } + inResultPtr->gl_closedir = &Glob::CloseDir; + inResultPtr->gl_readdir = &Glob::ReadDir; + inResultPtr->gl_opendir = &Glob::OpenDir; + inResultPtr->gl_lstat = &Glob::LStat; + inResultPtr->gl_stat = &Glob::Stat; + return glob(inGlobPtr, inGlobFlags | GLOB_ALTDIRFUNC, + inErrorHandlerPtr, inResultPtr); + } + static void* OpenDir( + const char* inPathNamePtr) + { return Instance().OpenDirSelf(inPathNamePtr); } + static void CloseDir( + void* inDirPtr) + { Instance().CloseDirSelf(inDirPtr); } + static struct dirent* ReadDir( + void* inDirPtr) + { return Instance().ReadDirSelf(inDirPtr); } + static int Stat( + const char* inPathNamePtr, + StatBuf* inStatPtr) + { return Instance().StatSelf(inPathNamePtr, inStatPtr); } + // For now same as stat as sym links aren't supported. + static int LStat( + const char* inPathNamePtr, + StatBuf* inStatPtr) + { return Instance().StatSelf(inPathNamePtr, inStatPtr); } + void* OpenDirSelf( + const char* inPathNamePtr) + { + QCASSERT(mClientPtr && IsMutexOwner()); + + if (! inPathNamePtr || ! *inPathNamePtr) { + errno = EINVAL; + return 0; + } + // Use absolute path names as concurrent threads can change + // client's current working directory. + const char* const theDirNamePtr = GetAbsPathName(inPathNamePtr); + KfsOpenDir& theDir = mDirToReusePtr ? + mDirToReusePtr->Clear() : *(new KfsOpenDir()); + mDirToReusePtr = 0; + const bool kComputeFileSizeFlag = false; + mError = mClientPtr->ReaddirPlus( + theDirNamePtr, + theDir.mDirContent, + kComputeFileSizeFlag + ); + if (mError != 0) { + mDirToReusePtr = &(theDir.Clear()); + errno = mError < 0 ? 
-mError : mError; + return 0; + } + theDir.Reset(theDirNamePtr); + if (mOpenDirs.capacity() == 0) { + mOpenDirs.reserve(16); + } + mOpenDirs.push_back(&theDir); + return &theDir; + } + void CloseDirSelf( + void* inDirPtr) + { + QCASSERT(mClientPtr && IsMutexOwner() && ! mOpenDirs.empty()); + + if (! inDirPtr) { + return; + } + KfsOpenDir* const theDirPtr = reinterpret_cast(inDirPtr); + if (theDirPtr == mOpenDirs.back()) { + mOpenDirs.pop_back(); + } else { + OpenDirs::iterator const theIt = find( + mOpenDirs.begin(), mOpenDirs.end(), theDirPtr); + if (theIt == mOpenDirs.end()) { + QCRTASSERT(! "invalid CloseDir argument"); + } + mOpenDirs.erase(theIt); + } + if (mDirToReusePtr) { + delete theDirPtr; + } else { + mDirToReusePtr = &(theDirPtr->Clear()); + } + } + struct dirent* ReadDirSelf( + void* inDirPtr) + { + QCASSERT(mClientPtr && IsMutexOwner()); + + if (! inDirPtr) { + return 0; + } + return reinterpret_cast(inDirPtr)->ReadDirSelf(); + } + int StatSelf( + const char* inPathNamePtr, + StatBuf* inStatBufPtr) + { + QCASSERT(mClientPtr && IsMutexOwner()); + + if (! inPathNamePtr || ! *inPathNamePtr || ! inStatBufPtr) { + errno = EINVAL; + return -1; + } + const char* const theAbsPathNamePtr = GetAbsPathName(inPathNamePtr); + if (mOpenDirs.empty() || ! mOpenDirs.back()->Stat( + theAbsPathNamePtr, *inStatBufPtr)) { + KfsFileAttr theAttr; + const bool kComputeFileSizesFlag = false; + mError = mClientPtr->Stat( + theAbsPathNamePtr, theAttr, kComputeFileSizesFlag); + if (mError != 0) { + errno = mError < 0 ? 
-mError : mError; + return -1; + } + theAttr.ToStat(*inStatBufPtr); + } + return 0; + } + private: + typedef vector OpenDirs; + + KfsClient* mClientPtr; + KfsOpenDir* mDirToReusePtr; + OpenDirs mOpenDirs; + int mError; + string mCwd; + string mTmpName; + +#ifdef KFS_GLOB_USE_THREAD_LOCAL + static __thread Glob* sInstancePtr; + static QCMutex* GetMutexPtr() + { return 0; } +#else + static Glob* sInstancePtr; + static StRef sInstanceRef; + static QCMutex* GetMutexPtr() + { + static QCMutex sMutex; + return &sMutex; + } +#endif + void Reset() + { + for (OpenDirs::const_iterator theIt = mOpenDirs.begin(); + theIt != mOpenDirs.end(); + ++theIt) { + delete *theIt; + } + mOpenDirs.clear(); + mError = 0; + mCwd.clear(); + mTmpName.clear(); + mClientPtr = 0; + } + const char* GetAbsPathName( + const char* inPathNamePtr) + { + const char* theAbsPathNamePtr = inPathNamePtr; + if (*theAbsPathNamePtr != '/') { + mTmpName.reserve(mCwd.length() + 1024); + mTmpName = mCwd; + mTmpName += theAbsPathNamePtr; + theAbsPathNamePtr = mTmpName.c_str(); + } + return theAbsPathNamePtr; + } + private: + Glob(const Glob& inGlob); + Glob& operator=(const Glob& inGlob); + }; + friend class Glob; +private: + typedef vector DirConent; + class DirEntry : public dirent + { + public: + DirEntry() + : dirent() + { d_name[0] = 0; } + private: + DirEntry(const DirEntry& inDirEntry); + DirEntry& operator=(const DirEntry& inDirEntry); + }; + + DirConent mDirContent; + DirEntry mCurEntry; + const KfsFileAttr* mCurPtr; + DirConent::const_iterator mNextIt; + string mDirName; + + KfsOpenDir() + : mDirContent(), + mCurPtr(0), + mNextIt(mDirContent.begin()), + mDirName() + { + mDirContent.reserve(256); + mNextIt = mDirContent.begin(); + mDirName.reserve(1024); + } + void Reset( + const char* inDirNamePtr) + { + mNextIt = mDirContent.begin(); + mDirName = inDirNamePtr; + mCurPtr = 0; + if (mDirName.empty()) { + mDirName = "./"; + } else if (*mDirName.rbegin() != '/') { + mDirName += "/"; + } + } + KfsOpenDir& 
Clear() + { + mDirName.clear(); + mDirContent.clear(); + mNextIt = mDirContent.begin(); + mCurPtr = 0; + return *this; + } + DirEntry* ReadDirSelf() + { + for (; ;) { + if (mNextIt == mDirContent.end()) { + mCurPtr = 0; + return 0; + } + mCurPtr = &(*mNextIt); + ++mNextIt; + const size_t theLen = mCurPtr->filename.length() + 1; + const size_t kMaxLen = sizeof(mCurEntry.d_name); + if (theLen > kMaxLen) { + // Skip / hide entries that exceed max length for now. + // Other way to handle this is to scan all entries in OpenDir or + // modify ReadDirplus to do this, and return ENAMETOOLONG. + // Not adequate name length is not expected be a problem with + // the modern os / libc / glibc. There seems to be no reliable + // way to let glob know about the problem at this exact point. + continue; + } + memcpy(mCurEntry.d_name, mCurPtr->filename.c_str(), theLen); + mCurEntry.d_ino = (ino_t)mCurPtr->fileId; + if (mCurEntry.d_ino == 0) { + // I-node must be non 0 for glob to treat as directory entry. + mCurEntry.d_ino = 1; + } +#ifndef QC_OS_NAME_CYGWIN + // Cygwin has no d_type, all other supported platforms have the file + // type, and glob uses this field instead of invoking stat. + mCurEntry.d_type = mCurPtr->isDirectory ? DT_DIR : DT_REG; +#endif + break; + } + return &mCurEntry; + } + bool Stat( + const char* inPathNamePtr, + StatBuf& outStatBuf) + { + if (! 
mCurPtr) { + return false; + } + const size_t theLen = mDirName.length(); + if (mDirName.compare(0, theLen, inPathNamePtr) != 0 || + mCurPtr->filename.compare(inPathNamePtr + theLen) != 0) { + return false; + } + mCurPtr->ToStat(outStatBuf); + return true; + } +private: + KfsOpenDir( + const KfsOpenDir& inOpenDir); + KfsOpenDir& operator=( + const KfsOpenDir& inOpenDir); +}; +#ifdef KFS_GLOB_USE_THREAD_LOCAL +__thread KfsOpenDir::Glob* KfsOpenDir::Glob::sInstancePtr = 0; +#else +KfsOpenDir::Glob* KfsOpenDir::Glob::sInstancePtr = 0; +KfsOpenDir::Glob::StRef KfsOpenDir::Glob::sInstanceRef( + KfsOpenDir::Glob::Instance()); +#endif + +} // namespace client + +int +KfsGlob( + KfsClient& inClient, + const char* inGlobPtr, + int inGlobFlags, + int (*inErrorHandlerPtr)(const char* inErrPathPtr, int inError), + glob_t* inResultPtr) +{ + return client::KfsOpenDir::Glob::Expand( + inClient, inGlobPtr, inGlobFlags, inErrorHandlerPtr, inResultPtr); +} + +} diff --git a/src/cc/libclient/kfsglob.h b/src/cc/libclient/kfsglob.h new file mode 100644 index 000000000..703fe4292 --- /dev/null +++ b/src/cc/libclient/kfsglob.h @@ -0,0 +1,45 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/08/18 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Kfs glob() equivalent. 
+// +//---------------------------------------------------------------------------- + +#ifndef LIBCLIENT_KFSGLOB_H +#define LIBCLIENT_KFSGLOB_H + +#include <glob.h> + +namespace KFS +{ +class KfsClient; + +int KfsGlob( + KfsClient& inClient, + const char* inGlobPtr, + int inGlobFlags, + int (*inErrorHandlerPtr)(const char* inErrPathPtr, int inError), + glob_t* inResultPtr); + +} + +#endif /* LIBCLIENT_KFSGLOB_H */ diff --git a/src/cc/libclient/utils.cc b/src/cc/libclient/utils.cc new file mode 100644 index 000000000..72c5793c9 --- /dev/null +++ b/src/cc/libclient/utils.cc @@ -0,0 +1,89 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/08/31 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief Miscellaneous utility functions. 
+// +//---------------------------------------------------------------------------- + +#include "utils.h" +#include "common/RequestParser.h" + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/select.h> +#include <string.h> + +namespace KFS +{ +namespace client +{ + +void +Sleep(int secs) +{ + if (secs <= 0) { + return; + } + struct timeval end; + gettimeofday(&end, 0); + end.tv_sec += secs; + struct timeval tm; + tm.tv_sec = secs; + tm.tv_usec = 0; + for(; ;) { + if (select(0, 0, 0, 0, &tm) == 0) { + break; + } + gettimeofday(&tm, 0); + if (tm.tv_sec + secs + 1 < end.tv_sec || // backward clock jump + end.tv_sec < tm.tv_sec || + (end.tv_sec == tm.tv_sec && + end.tv_usec <= tm.tv_usec + 10000)) { + break; + } + if (end.tv_usec < tm.tv_usec) { + tm.tv_sec = end.tv_sec - tm.tv_sec - 1; + tm.tv_usec = 1000000 - tm.tv_usec + end.tv_usec; + } else { + tm.tv_sec = end.tv_sec - tm.tv_sec; + tm.tv_usec = end.tv_usec - tm.tv_usec; + } + } +} + +void +GetTimeval(const char* s, struct timeval& tv) +{ + const char* p = s; + const char* const e = p + (p ? strlen(s) : 0); + if (! p || + ! DecIntParser::Parse(p, e - p, tv.tv_sec) || + ! DecIntParser::Parse(p, e - p, tv.tv_usec)) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } +} + +} +} diff --git a/src/cc/libclient/utils.h b/src/cc/libclient/utils.h new file mode 100644 index 000000000..b0ff93e85 --- /dev/null +++ b/src/cc/libclient/utils.h @@ -0,0 +1,42 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/08/31 +// Author: Sriram Rao +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief utils.h: Miscellaneous utility functions. +// +//---------------------------------------------------------------------------- + +#ifndef LIBKFSCLIENT_UTILS_H +#define LIBKFSCLIENT_UTILS_H + +#include <sys/time.h> + +namespace KFS { +namespace client { + +void GetTimeval(const char* s, struct timeval& tv); +void Sleep(int secs); + +} +} + +#endif // LIBKFSCLIENT_UTILS_H diff --git a/src/cc/meta/AuditLog.cc b/src/cc/meta/AuditLog.cc new file mode 100644 index 000000000..15001dc7f --- /dev/null +++ b/src/cc/meta/AuditLog.cc @@ -0,0 +1,193 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/04/10 +// Author: Mike Ovsiannikov. +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \file AuditLog.cc +// \brief Kfs meta server audit log implementation. 
+// +//---------------------------------------------------------------------------- + +#include "AuditLog.h" +#include "MetaRequest.h" +#include "kfsio/IOBuffer.h" +#include "common/BufferedLogWriter.h" + +#include <ostream> + +namespace KFS +{ + +using std::ostream; +using std::streambuf; + +class AuditLogWriter : public BufferedLogWriter::Writer +{ +public: + AuditLogWriter( + const MetaRequest& inOp) + : mOp(inOp) + {} + virtual ~AuditLogWriter() + {} + virtual int Write( + char* inBufferPtr, + int inBufferSize) + { + if (inBufferSize <= 0) { + return 0; + } + int theNWr = max(0, + mOp.reqHeaders.CopyOut(inBufferPtr, inBufferSize)); + char* const theEndPtr = inBufferPtr + inBufferSize; + // This is not re-entrant, and it doesn't have to be re-entrant due to + // serialization in BufferedLogWriter::Append(). + // Creating and destroying output stream on every invocation is + // rather cpu expensive most likely due to c++ lib allocations. + static OutStream sStream; + sStream.Set(inBufferPtr + theNWr, theEndPtr) << + "Client-ip: " << mOp.clientIp << "\r\n" + "Status: " << mOp.status + ; + // Put terminal 0 -- record separator. 
+ char* const thePtr = min(theEndPtr - 1, sStream.GetCurPtr()); + *thePtr = 0; + return (int)(thePtr + 1 - inBufferPtr); + + } + virtual int GetMsgLength() + { + return (mOp.reqHeaders.BytesConsumable() + 256); + } +private: + class OutStream : + private streambuf, + public ostream + { + public: + OutStream() + : streambuf(), + ostream(this) + {} + ostream& Set( + char* inBufferPtr, + char* inBufferEndPtr) + { + setp(inBufferPtr, inBufferEndPtr); + Clear(); + return *this; + } + virtual streamsize xsputn( + const char* inBufPtr, + streamsize inLength) + { + char* const theEndPtr = epptr(); + char* const theCurPtr = pptr(); + const streamsize theSize( + min(max(streamsize(0), inLength), + streamsize(theEndPtr - theCurPtr))); + memcpy(theCurPtr, inBufPtr, theSize); + pbump(theSize); + return theSize; + } + char* GetCurPtr() const + { return pptr(); } + void Clear() + { + ostream::clear(); + ostream::flags(ostream::dec | ostream::skipws); + ostream::precision(6); + ostream::width(0); + ostream::fill(' '); + ostream::tie(0); + } + private: + OutStream( + const OutStream&); + OutStream& operator=( + const OutStream&); + }; + const MetaRequest& mOp; +private: + AuditLogWriter( + const AuditLogWriter&); + AuditLogWriter& operator=( + const AuditLogWriter&); +}; + +static const BufferedLogWriter* sBufferedLogWriterForGdbToFindPtr = 0; + +static BufferedLogWriter& +GetAuditMsgWriter() +{ + static BufferedLogWriter sAuditMsgWriter; + if (! sBufferedLogWriterForGdbToFindPtr) { + sBufferedLogWriterForGdbToFindPtr = &sAuditMsgWriter; + } + return sAuditMsgWriter; +} + +/* static */ void +AuditLog::Log( + const MetaRequest& inOp) +{ + AuditLogWriter theWriter(inOp); + GetAuditMsgWriter().Append( + inOp.status >= 0 ? 
+ BufferedLogWriter::kLogLevelINFO : + BufferedLogWriter::kLogLevelERROR, + theWriter + ); +} + +/* static */ void +AuditLog::SetParameters( + const Properties& inProps) +{ + GetAuditMsgWriter().SetParameters(inProps, + "metaServer.auditLogWriter."); +} + +/* static */ void +AuditLog::Stop() +{ + GetAuditMsgWriter().Stop(); +} + +/* static */ void +AuditLog::PrepareToFork() +{ + GetAuditMsgWriter().PrepareToFork(); +} + +/* static */ void +AuditLog::ForkDone() +{ + GetAuditMsgWriter().ForkDone(); +} + +/* static */ void +AuditLog::ChildAtFork() +{ + GetAuditMsgWriter().ChildAtFork(); +} + +} diff --git a/src/cc/meta/AuditLog.h b/src/cc/meta/AuditLog.h new file mode 100644 index 000000000..9fc551c48 --- /dev/null +++ b/src/cc/meta/AuditLog.h @@ -0,0 +1,52 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/04/10 +// Author: Mike Ovsiannikov. +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \file AuditLog.h +// \brief Kfs meta server audit log interface. Writes every client request into +// audit log file. 
+// +//---------------------------------------------------------------------------- + +#ifndef AUDIT_LOG_H +#define AUDIT_LOG_H + +namespace KFS +{ + +struct MetaRequest; +class Properties; + +class AuditLog +{ +public: + static void Log(const MetaRequest& inOp); + static void SetParameters(const Properties& inProps); + static void Stop(); + static void PrepareToFork(); + static void ForkDone(); + static void ChildAtFork(); +}; + +}; + +#endif /* AUDIT_LOG_H */ diff --git a/src/cc/meta/CMakeLists.txt b/src/cc/meta/CMakeLists.txt new file mode 100644 index 000000000..ea873c052 --- /dev/null +++ b/src/cc/meta/CMakeLists.txt @@ -0,0 +1,90 @@ +# +# $Id$ +# +# Created 2006 +# Author: Sriram Rao (Kosmix Corp) +# +# Copyright 2006 Kosmix Corp. +# +# This file is part of Kosmos File System (KFS). +# +# Licensed under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# + +# +# For the library take everything except the *_main.cc files +# +set (lib_srcs +AuditLog.cc +Checkpoint.cc +ChunkServer.cc +ChildProcessTracker.cc +ClientSM.cc +DiskEntry.cc +kfsops.cc +kfstree.cc +LayoutManager.cc +Logger.cc +meta.cc +MetaRequest.cc +NetDispatch.cc +Replay.cc +Restorer.cc +util.cc +) + +add_library (kfsMeta STATIC ${lib_srcs}) +add_library (kfsMeta-shared SHARED ${lib_srcs} layoutmanager_instance.cc) +set_target_properties (kfsMeta PROPERTIES OUTPUT_NAME "kfs_meta") +set_target_properties (kfsMeta-shared PROPERTIES OUTPUT_NAME "kfs_meta") +set_target_properties (kfsMeta PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties (kfsMeta-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +set (exe_files metaserver logcompactor filelister kfsfsck) +foreach (exe_file ${exe_files}) + add_executable (${exe_file} ${exe_file}_main.cc layoutmanager_instance.cc) + if (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsMeta kfsIO kfsCommon qcdio pthread crypto) + add_dependencies (${exe_file} kfsCommon kfsIO kfsMeta qcdio) + else (USE_STATIC_LIB_LINKAGE) + target_link_libraries (${exe_file} kfsMeta-shared kfsIO-shared kfsCommon-shared qcdio-shared pthread crypto) + add_dependencies (${exe_file} kfsCommon-shared kfsIO-shared kfsMeta-shared qcdio-shared) + endif (USE_STATIC_LIB_LINKAGE) +endforeach (exe_file) + +if (APPLE OR CYGWIN) + target_link_libraries(kfsMeta-shared kfsCommon-shared kfsIO-shared crypto) +endif (APPLE OR CYGWIN) + +if (NOT APPLE) + target_link_libraries(kfsMeta rt) + target_link_libraries(metaserver rt) +endif (NOT APPLE) + +if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + # mtmalloc seemed to worsen metaserver startup time; it took + # 4 mins for fsck to load checkpoint from WORM, where as 30 for metaserver. 
+ # So, switch to umem + target_link_libraries(kfsMeta umem) + target_link_libraries(metaserver umem) +endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS") + +# +# Install them +# +install (TARGETS ${exe_files} kfsMeta kfsMeta-shared + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) + diff --git a/src/cc/meta/CSMap.h b/src/cc/meta/CSMap.h new file mode 100644 index 000000000..41e9e2e71 --- /dev/null +++ b/src/cc/meta/CSMap.h @@ -0,0 +1,1386 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2011/02/03 +// Author: Mike Ovsiannikov +// +// Copyright 2011-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file CSMap.h +// \brief Chunk ids to chunk server, file id, and replication state table. 
+// +//---------------------------------------------------------------------------- + +#ifndef CS_MAP_H +#define CS_MAP_H + +#include "qcdio/QCDLList.h" +#include "common/LinearHash.h" +#include "common/PoolAllocator.h" +#include "common/StdAllocator.h" +#include "kfstypes.h" +#include "meta.h" +#include "util.h" +#include "MetaRequest.h" +#include "ChunkServer.h" + +#include +#include +#include +#include + +namespace KFS +{ +using std::vector; + +// chunkid to server(s) map +class CSMap +{ +public: + class Entry; +private: + typedef QCDLListOp EList; +public: + typedef MetaRequest::Servers Servers; + + class Entry : private MetaChunkInfo + { + public: + enum State + { + // Order is important: the lists are scanned in this + // order. + kStateNone = 0, + kStateCheckReplication = 1, + kStatePendingReplication = 2, + kStateNoDestination = 3, + kStatePendingRecovery = 4, + kStateDelayedRecovery = 5, + kStateCount + }; + + explicit Entry(MetaFattr* fattr = 0, chunkOff_t offset = 0, + chunkId_t chunkId = 0, seq_t chunkVersion = 0) + : MetaChunkInfo(fattr, offset, chunkId, chunkVersion), + mIdxData(0) + { + EList::Init(*this); + BOOST_STATIC_ASSERT(sizeof(void*) <= sizeof(mIdxData)); + } + explicit Entry(chunkId_t chunkId, const Entry& entry) + : MetaChunkInfo(entry.fattr, + entry.offset, entry.chunkId, entry.chunkVersion), + mIdxData(0) + { + assert(chunkId == this->chunkId && entry.mIdxData == 0); + EList::Init(*this); + } + ~Entry() + { + EList::Remove(*this); + if (IsAddr()) { + AddrClear(); + } + } + const chunkId_t& GetChunkId() const { return chunkId; } + fid_t GetFileId() const { return id(); } + MetaFattr* GetFattr() const { return fattr; } + void SetFattr(MetaFattr* fa) { + if (! 
fa) { + panic("SetFattr: null argument", false); + return; + } + fattr = fa; + } + virtual void destroy(); + MetaChunkInfo* GetChunkInfo() const + { return const_cast(this); } + size_t ServerCount(const CSMap& map) const { + return map.ServerCount(*this); + } + ChunkServerPtr GetServer(const CSMap& map) const { + return map.GetServer(*this); + } + Servers GetServers(const CSMap& map) const { + return map.GetServers(*this); + } + void SetServers(CSMap& map, const Servers& servers) { + return map.SetServers(servers, *this); + } + bool Add(const CSMap& map, const ChunkServerPtr& server) { + return map.AddServer(server, *this); + } + bool Remove(const CSMap& map, const ChunkServerPtr& server) { + return map.RemoveServer(server, *this); + } + void RemoveAllServers(const CSMap& map) { + map.RemoveHosted(*this); + ClearServers(); + } + bool HasServer(const CSMap& map, + const ServerLocation& loc) const { + return map.HasServer(loc, *this); + } + bool HasServer(const CSMap& map, + const ChunkServerPtr& srv) const { + return map.HasServer(srv, *this); + } + ChunkServerPtr GetServer(const CSMap& map, + const ServerLocation& loc) const { + return map.GetServer(loc, *this); + } + static size_t GetAllocBlockCount() { + return GetAllocator().GetBlockCount(); + } + static size_t GetAllocByteCount() { + return GetAllocator().GetByteCount(); + } + static const Entry& GetCsEntry(const MetaChunkInfo& chunkInfo) { + return static_cast(chunkInfo); + } + static Entry& GetCsEntry(MetaChunkInfo& chunkInfo) { + return static_cast(chunkInfo); + } + static Entry* GetCsEntry(MetaChunkInfo* chunkInfo) { + return static_cast(chunkInfo); + } + private: + typedef uint64_t IdxData; + typedef uint16_t AllocIdx; + enum + { + kNumStateBits = 3, + kIdxBits = 15, + kIdxMask = (1 << kIdxBits) - 1, + kFirstIdxShift = sizeof(IdxData) * 8 - kIdxBits, + kMaxNonAllocSrvs = sizeof(IdxData) * 8 / kIdxBits, + kMaxServers = kIdxMask, // sentinel is 0 + kSentinel = 0, // sentinel entry + kStateMask = (1 << 
kNumStateBits) - 1, + kAllocated = 1 << kNumStateBits, + kOtherBitsMask = kStateMask | kAllocated, + kAddrAlign = kOtherBitsMask + 1, + // kNumStateBits + 1 low order bits are used, address + // must be aligned accordingly. + // Allocated array positions. + kAddrCapacityPos = 0, + kAddrSizePos = 1, + kAddrIdxPos = 2 + }; + BOOST_STATIC_ASSERT( + kStateCount <= kStateMask + 1 && + kIdxBits * kMaxNonAllocSrvs + kNumStateBits < + sizeof(IdxData) * 8 && + kAddrAlign % sizeof(AllocIdx) == 0 + ); + class Allocator + { + public: + Allocator() + : mBlockCount(0), + mByteCount(0) + {} + char* Allocate(size_t n) { + assert(n > 0); + char* const ret = mAlloc.allocate(n); + if (! ret) { + panic("allocation failure", true); + return 0; + } + mBlockCount++; + mByteCount += n; + return ret; + } + void Deallocate(char* ptr, size_t n) { + if (! ptr) { + return; + } + assert( + n > 0 && + mByteCount >= n && mBlockCount > 0 + ); + mAlloc.deallocate(ptr, n); + mByteCount -= n; + mBlockCount--; + } + size_t GetBlockCount() const { + return mBlockCount; + } + size_t GetByteCount() const { + return mByteCount; + } + private: + StdAllocator mAlloc; + size_t mBlockCount; + size_t mByteCount; + }; + + IdxData mIdxData; + Entry* mPrevPtr[1]; + Entry* mNextPtr[1]; + + static Allocator& GetAllocator() { + static Allocator alloc; + return alloc; + } + size_t ServerCount() const { + if (IsAddr()) { + return AddrCount(); + } + size_t count = 0; + IdxData mask = IdxData(kIdxMask) << kFirstIdxShift; + while ((mask & mIdxData) != 0) { + count++; + mask >>= kIdxBits; + mask &= ~IdxData(kOtherBitsMask); + } + return count; + } + bool HasServers() const { + return (IsAddr() || (mIdxData & + (IdxData(kIdxMask) << kFirstIdxShift)) != 0); + } + State GetState() const { + return (State)(mIdxData & kStateMask); + } + void SetState(State state) { + mIdxData &= ~IdxData(kStateMask); + mIdxData |= IdxData(kStateMask & state); + } + bool IsAddr() const { + return ((mIdxData & kAllocated) != 0); + } + void 
ClearServers() + { + if (IsAddr()) { + AddrClear(); + } else { + mIdxData &= IdxData(kStateMask); + } + } + bool HasIndex(size_t idx) const { + if (idx >= kMaxServers) { + return false; + } + if (IsAddr()) { + return AddrHasIndex((AllocIdx)(idx + 1)); + } + for (IdxData id = IdxData(idx + 1) << + kFirstIdxShift, + data = mIdxData & + ~IdxData(kOtherBitsMask), + mask = IdxData(kIdxMask) << + kFirstIdxShift; + (data & mask) != 0; + id >>= kIdxBits, + mask >>= kIdxBits) { + if ((data & mask) == id) { + return true; + } + } + return false; + } + bool AddIndex(size_t idx) { + if (idx >= kMaxServers) { + return false; + } + const AllocIdx index = (AllocIdx)(idx + 1); + if (IsAddr()) { + return AddrAddIndex(index); + } + for (IdxData id = IdxData(index) << + kFirstIdxShift, + mask = IdxData(kIdxMask) << + kFirstIdxShift; + mask >= IdxData(kIdxMask); + id >>= kIdxBits, + mask >>= kIdxBits) { + if ((mIdxData & mask) == 0) { + mIdxData |= id; + return true; + } + if ((mIdxData & mask) == id) { + return false; + } + } + AllocateIndexes(index, kMaxNonAllocSrvs + 1); + return true; + } + bool RemoveIndex(size_t idx) { + if (idx >= kMaxServers) { + return false; + } + const AllocIdx index = (AllocIdx)(idx + 1); + if (IsAddr()) { + return AddrRemoveIndex(index); + } + int shift = kFirstIdxShift; + for (IdxData id = IdxData(index) << + kFirstIdxShift, + data = mIdxData & + ~IdxData(kOtherBitsMask), + mask = IdxData(kIdxMask) << + kFirstIdxShift; + (data & mask) != 0; + id >>= kIdxBits, + mask >>= kIdxBits, + shift -= kIdxBits) { + if ((data & mask) == id) { + const IdxData lm = + (IdxData(1) << shift) - 1; + mIdxData = + (data & ~(mask | lm)) | + ((data & lm) << kIdxBits) | + (mIdxData & kOtherBitsMask); + return true; + } + } + return false; + } + size_t IndexAt(size_t idx) const { + if (IsAddr()) { + return AddrIndexAt(idx); + } + return ((size_t)((mIdxData >> + (kFirstIdxShift - idx * kIdxBits)) & kIdxMask) + - 1); + } + size_t AddrIndexAt(size_t idx) const { + return 
(AddrGetArray()[idx + kAddrIdxPos] - 1); + } + AllocIdx* AddrGetIdxPtr() const { + return reinterpret_cast((char*)0 + + (mIdxData & ~IdxData(kOtherBitsMask))); + } + void AddrSet(AllocIdx* addr) { + const IdxData idxAddr = (IdxData) + (reinterpret_cast(addr) - (char*)0); + assert((idxAddr & kOtherBitsMask) == 0); + mIdxData &= IdxData(kStateMask); + mIdxData |= idxAddr; + mIdxData |= kAllocated; + } + AllocIdx* AddrGetArray() const { + AllocIdx* p = AddrGetIdxPtr(); + if ((p[0] & ~AllocIdx(kIdxMask)) != 0) { + p++; // First slot is alignment + } + return p; + } + size_t AddrCount() const { + return (AddrGetArray()[kAddrSizePos] - kAddrIdxPos); + } + void AddrClear() { + AllocIdx* p = AddrGetIdxPtr(); + size_t size; + const AllocIdx kMask(kIdxMask); + if ((p[0] & ~kMask) != 0) { + const size_t off = p[0] & kMask; + size = kAddrAlign + (p[1] + 1) * sizeof(p[1]); + p = reinterpret_cast( + reinterpret_cast(p) - off); + } else { + size = p[0] * sizeof(p[0]); + } + GetAllocator().Deallocate( + reinterpret_cast(p), size); + mIdxData &= IdxData(kStateMask); + } + void AllocateIndexes(AllocIdx idx, size_t capacity) { + const size_t kQuantum = 4; + size_t alloccap = (capacity + 3 + kQuantum - 1) / + kQuantum * kQuantum; + // Glibc malloc is expected to return 2 * sizeof(size_t) + // aligned address. + // The low order bits are used, ensure that the address + // aligned accordingly. + char* const alloc = GetAllocator().Allocate( + kAddrAlign + alloccap * sizeof(AllocIdx)); + size_t const align = (AllocIdx)((alloc - (char*)0) & + size_t(kAddrAlign - 1)); + AllocIdx const off = (AllocIdx) (align > 0 ? 
+ (kAddrAlign - align) : 0); + AllocIdx* const idxAddr = reinterpret_cast( + alloc + off); + AllocIdx* indexes = idxAddr; + if (off != 0) { + *indexes++ = off | (AllocIdx(1) << kIdxBits); + alloccap--; + } else { + alloccap += kAddrAlign / sizeof(AllocIdx); + } + indexes[kAddrCapacityPos] = alloccap; + AllocIdx& size = indexes[kAddrSizePos]; + size = kAddrIdxPos; + if (IsAddr()) { + AllocIdx* const prev = AddrGetArray(); + assert(prev[kAddrSizePos] <= alloccap); + memcpy(indexes + kAddrSizePos, + prev + kAddrSizePos, + (prev[kAddrSizePos] - 1) * + sizeof(prev[0])); + if (idx != kSentinel) { + assert(size < alloccap); + indexes[size++] = idx; + } + AddrClear(); + AddrSet(idxAddr); + return; + } + assert(alloccap > kMaxNonAllocSrvs + kAddrIdxPos && + idx != kSentinel); + for (int shift = kFirstIdxShift; + shift >= 0; + shift -= kIdxBits) { + const AllocIdx id = (AllocIdx) + ((mIdxData >> shift) & kIdxMask); + assert(id > 0); + indexes[size++] = id; + } + indexes[size++] = idx; + AddrSet(idxAddr); + } + bool AddrHasIndex(AllocIdx idx) const { + AllocIdx const* const indexes = AddrGetArray(); + AllocIdx size = indexes[kAddrSizePos]; + for (size_t i = kAddrIdxPos; i < size; i++) { + if (idx == indexes[i]) { + return true; + } + } + return false; + } + bool AddrAddIndex(AllocIdx idx) { + if (AddrHasIndex(idx)) { + return false; + } + AllocIdx* const indexes = AddrGetArray(); + AllocIdx& capacity = indexes[kAddrCapacityPos]; + AllocIdx& size = indexes[kAddrSizePos]; + if (capacity > size) { + indexes[size++] = idx; + } else { + AllocateIndexes(idx, capacity + 1); + } + return true; + } + bool AddrRemoveIndex(AllocIdx idx) { + AllocIdx* const indexes = AddrGetArray(); + AllocIdx& capacity = indexes[kAddrCapacityPos]; + AllocIdx& size = indexes[kAddrSizePos]; + size_t i; + for (i = kAddrIdxPos; i < size && idx != indexes[i]; i++) + {} + if (i >= indexes[1]) { + return false; + } + for (size_t k = i + 1; k < size; k++, i++) { + indexes[i] = indexes[k]; + } + size--; + if 
(size <= kAddrIdxPos) { + // 0 indexes + AddrClear(); + return true; + } + if (size <= kMaxNonAllocSrvs + kAddrIdxPos) { + // indexes fit into mIdxData + IdxData data = 0; + int shift = kFirstIdxShift; + for (i = kAddrIdxPos; i < size; i++) { + data |= IdxData(indexes[i]) << shift; + shift -= kIdxBits; + } + AddrClear(); + mIdxData |= data; + return true; + } + if (capacity / 2 >= size) { + // Reallocate. + AllocateIndexes(kSentinel, capacity / 2); + } + return true; + } + friend class QCDLListOp; + friend class CSMap; + private: + Entry(const Entry& entry); + Entry& operator=(const Entry& entry); + }; + + CSMap() + : mMap(), + mServers(), + mPendingRemove(), + mNullSlots(), + mServerCount(0), + mHibernatedCount(0), + mRemoveServerScanPtr(0), + mCachedEntry(0), + mCachedChunkId(-1), + mDebugValidateFlag(false) + { + for (int i = 0; i < Entry::kStateCount; i++) { + mCounts[i] = 0; + mPrevPtr[i] = 0; + mNextPtr[i] = 0; + mLists[i].SetState(Entry::State(i)); + mNextEnd[i].SetState(Entry::State(i)); + EList::Insert(mLists[i+1], mLists[i]); + } + mMap.SetDeleteObserver(this); + memset(mHibernatedIndexes, 0, sizeof(mHibernatedIndexes)); + } + ~CSMap() + { + mMap.SetDeleteObserver(0); + } + bool SetDebugValidate(bool flag) { + if (GetServerCount() > 0) { + return (mDebugValidateFlag == flag); + } + mDebugValidateFlag = flag; + return true; + } + bool Validate(const ChunkServerPtr& server) const { + if (! server) { + return false; + } + const int idx = server->GetIndex(); + return (idx >= 0 && idx < (int)mServers.size() && + mServers[idx] == server); + } + bool AddServer(const ChunkServerPtr& server) { + if (! 
server || Validate(server)) { + return false; + } + if (mServerCount + mPendingRemove.size() >= + Entry::kMaxServers) { + return false; + } + if (mNullSlots.empty()) { + server->SetIndex(mServers.size(), mDebugValidateFlag); + mServers.push_back(server); + } else { + Entry::AllocIdx const idx = mNullSlots.back(); + mNullSlots.pop_back(); + if (idx >= mServers.size() || mServers[idx]) { + InternalError("invalid null slots"); + return false; + } + mServers[idx] = server; + server->SetIndex(idx, mDebugValidateFlag); + } + server->ClearHosted(); + mServerCount++; + Validate(); + return true; + } + bool RemoveServer(const ChunkServerPtr& server) { + if (! server || ! Validate(server)) { + return false; + } + Validate(); + mServers[server->GetIndex()].reset(); + mPendingRemove.push_back(server->GetIndex()); + server->SetIndex(-1, mDebugValidateFlag); + mServerCount--; + server->ClearHosted(); + // Start or restart full scan. + RemoveServerScanFirst(); + return true; + } + bool SetHibernated(const ChunkServerPtr& server, size_t& idx) { + if (! server || ! Validate(server) || + ! SetHibernated(server->GetIndex())) { + return false; + } + mServers[server->GetIndex()].reset(); + idx = server->GetIndex(); + server->SetIndex(-1, mDebugValidateFlag); + Validate(); + return true; + } + bool RemoveHibernatedServer(size_t idx) { + if (/* idx < 0 ||*/ idx >= Entry::kMaxServers) { + return false; + } + Validate(); + if (! ClearHibernated(idx)) { + return false; + } + assert(! mServers[idx] && mServerCount > 0); + mPendingRemove.push_back(idx); + mServerCount--; + // Start or restart full scan. 
+ RemoveServerScanFirst(); + return true; + } + size_t GetServerCount() const { + return mServerCount; + } + size_t GetHibernatedCount() const { + return mHibernatedCount; + } + size_t ServerCount(const Entry& entry) const { + if (mRemoveServerScanPtr) { + return CleanupStaleServers(entry); + } + ValidateHosted(entry); + if (mHibernatedCount > 0) { + size_t ret = 0; + for (size_t i = 0, e = entry.ServerCount(); + i < e; + i++) { + if (IsHibernated(entry.IndexAt(i))) { + continue; + } + ret++; + } + return ret; + } + return entry.ServerCount(); + } + bool HasServers(chunkId_t chunkId) const { + const Entry* const entry = Find(chunkId); + return (entry && HasServers(*entry)); + } + bool HasServers(const Entry& entry) const { + if (mRemoveServerScanPtr) { + return (CleanupStaleServers(entry) > 0); + } + if (mHibernatedCount > 0) { + return (ServerCount(entry) > 0); + } + ValidateHosted(entry); + return entry.HasServers(); + } + Servers GetServers(chunkId_t chunkId) const { + const Entry* const entry = Find(chunkId); + return (entry ? 
GetServers(*entry) : Servers()); + } + Servers GetServers(const Entry& entry) const { + Servers servers; + GetServers(entry, servers); + return servers; + } + Servers GetServers(const Entry& entry, size_t& hibernatedCount) const { + Servers servers; + GetServers(entry, servers, hibernatedCount); + return servers; + } + size_t GetServers(const Entry& entry, Servers& servers) const { + size_t hibernatedCount = 0; + return GetServers(entry, servers, hibernatedCount); + } + size_t GetServers(const Entry& entry, Servers& servers, + size_t& hibernatedCount) const { + hibernatedCount = 0; + if (mRemoveServerScanPtr) { + return CleanupStaleServers(entry, &servers, + hibernatedCount); + } + ValidateHosted(entry); + size_t count = 0; + for (size_t i = 0, e = entry.ServerCount(); i < e; i++) { + const ChunkServerPtr& srv = mServers[entry.IndexAt(i)]; + if (srv) { + servers.push_back(srv); + count++; + } else { + if (mHibernatedCount <= 0) { + panic("invalid server index", false); + } + hibernatedCount++; + } + } + return count; + } + ChunkServerPtr GetServer(const Entry& entry) const { + if (mRemoveServerScanPtr) { + Servers srvs; + return (CleanupStaleServers(entry, &srvs) > 0 ? + srvs[0] : ChunkServerPtr()); + } + ValidateHosted(entry); + if (mHibernatedCount > 0) { + for (size_t i = 0, e = entry.ServerCount(); + i < e; + i++) { + const ChunkServerPtr& srv = + mServers[entry.IndexAt(i)]; + if (srv) { + return srv; + } + } + return ChunkServerPtr(); + } + return (entry.HasServers() ? + mServers[entry.IndexAt(0)] : ChunkServerPtr()); + } + void SetServers(const Servers& servers, Entry& entry) { + ValidateHosted(entry); + entry.RemoveAllServers(*this); + for (Servers::const_iterator it = servers.begin(); + it != servers.end(); + ++it) { + AddServer(*it, entry); + } + } + bool AddServer(const ChunkServerPtr& server, Entry& entry) const { + if (! 
Validate(server)) { + return false; + } + if (mRemoveServerScanPtr) { + CleanupStaleServers(entry); + } + ValidateHosted(entry); + if (! entry.AddIndex(server->GetIndex())) { + return false; + } + AddHosted(server, entry); + ValidateServers(entry); + return true; + } + bool RemoveServer(const ChunkServerPtr& server, Entry& entry) const { + if (! Validate(server)) { + return false; + } + ValidateHosted(entry); + if (! entry.RemoveIndex(server->GetIndex())) { + return false; + } + RemoveHosted(server, entry); + ValidateHosted(entry); + return true; + } + bool HasServer(const ChunkServerPtr& server, const Entry& entry) const { + if (! Validate(server)) { + return false; + } + if (mRemoveServerScanPtr) { + CleanupStaleServers(entry); + } else { + ValidateHosted(entry); + } + return entry.HasIndex(server->GetIndex()); + } + ChunkServerPtr GetServer(const ServerLocation& loc, + const Entry& entry) const { + ValidateHosted(entry); + for (size_t i = 0, e = entry.ServerCount(); i < e; i++) { + const ChunkServerPtr& srv = mServers[entry.IndexAt(i)]; + if (srv && loc == srv->GetServerLocation()) { + return srv; + } + } + return ChunkServerPtr(); + } + bool HasServer(const ServerLocation& loc, const Entry& entry) const { + return GetServer(loc, entry); + } + Entry::State GetState(const Entry& entry) const { + Validate(entry); + return entry.GetState(); + } + bool SetState(chunkId_t chunkId, Entry::State state) { + Entry* const entry = Find(chunkId); + return (entry && SetState(*entry, state)); + } + bool SetState(Entry& entry, Entry::State state) { + if (! Validate(entry) || ! Validate(state)) { + return false; + } + SetStateSelf(entry, state); + if (mRemoveServerScanPtr) { + // The entry can potentially be missed by the + // lazy full scan due to its list position change. + // Do cleanup here. 
+ CleanupStaleServers(entry); + } else { + ValidateHosted(entry); + } + return true; + } + Entry* Next(Entry& entry) const { + Entry* ret = &EList::GetNext(entry); + if (IsNextEnd(*ret)) { + ret = &EList::GetNext(*ret); + } + return ((ret == &entry || IsHead(*ret)) ? 0 : ret); + } + const Entry* Next(const Entry& entry) const { + const Entry* ret = &EList::GetNext(entry); + if (IsNextEnd(*ret)) { + ret = &EList::GetNext(*ret); + } + return ((ret == &entry || IsHead(*ret)) ? 0 : ret); + } + Entry* Prev(Entry& entry) const { + Entry* ret = &EList::GetPrev(entry); + if (IsNextEnd(*ret)) { + ret = &EList::GetPrev(*ret); + } + return ((ret == &entry || IsHead(*ret)) ? 0 : ret); + } + const Entry* Prev(const Entry& entry) const { + const Entry* ret = &EList::GetPrev(entry); + if (IsNextEnd(*ret)) { + ret = &EList::GetPrev(*ret); + } + return ((ret == &entry || IsHead(*ret)) ? 0 : ret); + } + Entry* Find(chunkId_t chunkId) { + if (mCachedChunkId == chunkId && mCachedEntry) { + return mCachedEntry; + } + Entry* const entry = mMap.Find(chunkId); + if (entry) { + mCachedEntry = entry; + mCachedChunkId = chunkId; + } + return entry; + } + const Entry* Find(chunkId_t chunkId) const { + return const_cast(this)->Find(chunkId); + } + size_t Erase(chunkId_t chunkId) { + return mMap.Erase(chunkId); + } + Entry* Insert(MetaFattr* fattr, chunkOff_t offset, chunkId_t chunkId, + seq_t chunkVersion, bool& newEntryFlag) { + newEntryFlag = false; + Entry* const entry = mMap.Insert( + chunkId, + Entry(fattr, offset, chunkId, chunkVersion), + newEntryFlag); + if (newEntryFlag) { + const Entry::State state = entry->GetState(); + mCounts[state]++; + assert(mCounts[state] > 0); + EList::Insert(*entry, + EList::GetPrev(mLists[state + 1])); + } else if (entry) { + entry->offset = offset; + entry->chunkVersion = chunkVersion; + entry->SetFattr(fattr); + } + if (entry) { + mCachedEntry = entry; + mCachedChunkId = chunkId; + } + return entry; + } + void First() { + mMap.First(); + } + const Entry* 
Next() { + return mMap.Next(); + } + size_t Size() const { + return mMap.GetSize(); + } + void Clear() { + mMap.Clear(); + RemoveServerCleanup(0); + } + bool CanAddServer(const ChunkServerPtr& server) const { + return (mServerCount + mPendingRemove.size() < + Entry::kMaxServers && + server && ! Validate(server) + ); + } + bool Validate(const Entry& entry) const { + const char* const reason = ValidateSelf(entry); + if (reason) { + InternalError(reason); + } + return (! reason); + } + bool Validate(Entry::State state) const { + if (state < 0 || state >= Entry::kStateCount) { + InternalError("invalid state"); + return false; + } + return true; + } + void First(Entry::State state) { + if (Validate(state)) { + mNextPtr[state] = Next(mLists[state]); + // Insert or move iteration delimiter at the present + // list end. + // Set state inserts items before mLists[state + 1], + // this prevents iterating over newly inserted entries, + // and the endless loops with the reordering withing the + // same list. + EList::Insert(mNextEnd[state], + EList::GetPrev(mLists[state + 1])); + } + } + Entry* Next(Entry::State state) { + if (! Validate(state)) { + return 0; + } + Entry* const ret = mNextPtr[state]; + if (ret) { + SetNextPtr(mNextPtr[state]); + } + return ret; + } + void Last(Entry::State state) { + if (Validate(state)) { + mPrevPtr[state] = Prev(mLists[state + 1]); + } + } + Entry* Prev(Entry::State state) { + if (! Validate(state)) { + return 0; + } + Entry* const ret = mPrevPtr[state]; + if (ret) { + mPrevPtr[state] = Prev(*ret); + } + return ret; + } + Entry* Front(Entry::State state) { + if (! Validate(state)) { + return 0; + } + return Next(mLists[state]); + } + const Entry* Front(Entry::State state) const { + if (! 
Validate(state)) {
        return 0;
    }
    return Next(mLists[state]);
}
// Lazily finish a pending "remove server" scan.
// Walks at most maxScanCount entries (0 means no limit -- see note below),
// calling CleanupStaleServers() on each to drop indexes of removed servers.
// Returns true while more entries remain to be scanned.
bool RemoveServerCleanup(size_t maxScanCount) {
    RemoveServerScanCur();
    for (size_t i = 0;
            mRemoveServerScanPtr &&
            // NOTE: maxScanCount is unsigned, so "<= 0" is really "== 0";
            // a zero limit therefore means "scan until done".
            (i < maxScanCount || maxScanCount <= 0);
            i++) {
        Entry& entry = *mRemoveServerScanPtr;
        // Advance the scan cursor before cleanup: the list walk goes
        // backwards (GetPrev) so newly moved/added entries are not revisited.
        mRemoveServerScanPtr = &EList::GetPrev(entry);
        CleanupStaleServers(entry);
        RemoveServerScanCur();
    }
    return (mRemoveServerScanPtr != 0);
}
// Number of entries currently in the given state's list; 0 for an
// invalid state (Validate() reports the internal error).
size_t GetCount(Entry::State state) const {
    return (Validate(state) ? mCounts[state] : size_t(0));
}
private:
// Adapter that lets Entry itself serve as the hash table's key/value
// record: the chunk id is the key, the whole Entry is the value.
struct KeyVal : public Entry
{
    typedef chunkId_t Key;
    typedef Entry     Val;

    // NOTE(review): forwards to an Entry(key, val) constructor declared
    // outside this chunk -- presumably a copy-with-key constructor; confirm.
    KeyVal(const Key& key, const Val& val)
        : Entry(key, val)
        {}
    KeyVal(const KeyVal& kv)
        : Entry(kv.GetKey(), kv.GetVal())
        {}
    const Key& GetKey() const { return GetChunkId(); }
    const Val& GetVal() const { return *this; }
    Val& GetVal()             { return *this; }
private:
    // Assignment intentionally disabled; records are only copy-constructed.
    KeyVal& operator=(const KeyVal&);
};
public:
// Public only to avoid declaring LinearHash as a friend.
+ void operator()(KeyVal& keyVal) { + Erasing(keyVal.GetVal()); + } +private: + template + class Allocator + { + public: + T* allocate(size_t n) { + if (n != 1) { + panic("alloc n != 1 not implemented", false); + return 0; + } + return reinterpret_cast(GetAllocator().Allocate()); + } + void deallocate(T* ptr, size_t n) { + if (n != 1) { + panic("dealloc n != 1 not implemented", false); + return; + } + GetAllocator().Deallocate(ptr); + } + static void construct(T* ptr, const T& other) { + new (ptr) T(other); + } + static void destroy(T* ptr) { + ptr->~T(); + } + template + struct rebind { + typedef Allocator other; + }; + typedef PoolAllocator< + sizeof(T), // size_t TItemSize, + size_t(8) << 20, // size_t TMinStorageAlloc, + size_t(128) << 20, // size_t TMaxStorageAlloc, + false // bool TForceCleanupFlag + > Alloc; + const Alloc& GetAllocator() const { + return alloc; + } + private: + Alloc alloc; + Alloc& GetAllocator() { + return alloc; + } + }; + typedef LinearHash< + KeyVal, + KeyCompare, + DynamicArray< + SingleLinkedList*, + 24 // 2^24 * sizeof(void*) => 128 MB + >, + Allocator, + CSMap + > Map; +public: + typedef Map::Allocator::Alloc PAllocator; + const PAllocator& GetAllocator() const { + return mMap.GetAllocator().GetAllocator(); + } +private: + typedef vector SlotIndexes; + typedef uint8_t HibernatedBits; + enum + { + kHibernatedBitShift = 3, + kHibernatedBitMask = (1 << kHibernatedBitShift) - 1 + }; + + Map mMap; + Servers mServers; + SlotIndexes mPendingRemove; + SlotIndexes mNullSlots; + size_t mServerCount; + size_t mHibernatedCount; + Entry* mRemoveServerScanPtr; + Entry* mCachedEntry; + chunkId_t mCachedChunkId; + bool mDebugValidateFlag; + Entry* mPrevPtr[Entry::kStateCount]; + Entry* mNextPtr[Entry::kStateCount]; + size_t mCounts[Entry::kStateCount]; + Entry mLists[Entry::kStateCount + 1]; + Entry mNextEnd[Entry::kStateCount]; + HibernatedBits mHibernatedIndexes[ + (Entry::kMaxServers + kHibernatedBitMask) / + (1 << kHibernatedBitShift)]; + + 
void Erasing(Entry& entry) { + if (EList::IsInList(entry)) { + const Entry::State state = entry.GetState(); + assert(mCounts[state] > 0); + mCounts[state]--; + if (&entry == mNextPtr[state]) { + SetNextPtr(mNextPtr[state]); + } + if (&entry == mPrevPtr[state]) { + mPrevPtr[state] = Prev(entry); + } + if (&entry == mRemoveServerScanPtr) { + // Do not call RemoveServerScanNext() + // Validate() will fail since the size of the + // list won't match the size of hash table, as + // the entry has already been removed. + mRemoveServerScanPtr = &EList::GetPrev(entry); + } + RemoveHosted(entry); + if (mCachedEntry == &entry) { + mCachedEntry = 0; + mCachedChunkId = -1; + } + // Only update counters here. + // Entry dtor removes it from the list. + } + } + bool IsHead(const Entry& entry) const { + return (&entry == &mLists[entry.GetState()] || + &entry == &mLists[Entry::kStateCount]); + } + bool IsNextEnd(const Entry& entry) const { + return (&entry == &mNextEnd[entry.GetState()]); + } + void SetNextPtr(Entry*& next) { + next = &EList::GetNext(*next); + if (IsHead(*next) || IsNextEnd(*next)) { + next = 0; + } + } + const char* ValidateSelf(const Entry& entry) const { + if (! EList::IsInList(entry)) { + return "not in list"; + } + const int state = entry.GetState(); + if (state < 0 || state >= Entry::kStateCount) { + return "invalid state"; + } + if (IsHead(entry)) { + return "list head"; + } + if (IsNextEnd(entry)) { + return "next end"; + } + return 0; + } + bool ValidateServers(const Entry& entry, + bool ignoreScanFlag = false) const { + if (! mDebugValidateFlag) { + return true; + } + for (size_t i = 0, e = entry.ServerCount(); i < e; i++) { + const size_t idx = entry.IndexAt(i); + const ChunkServerPtr& srv = mServers[idx]; + if (srv) { + const int* const cur = srv->HostedIdx( + entry.GetChunkId()); + if (! 
cur || *cur != (int)idx) { + InternalError("hosted mismatch"); + return false; + } + } else { + if (idx >= mServers.size()) { + InternalError("invalid index"); + return false; + } + if ((ignoreScanFlag || + ! mRemoveServerScanPtr) && + ! IsHibernated(idx)) { + InternalError("no server"); + return false; + } + } + } + return true; + } + bool ValidateServersNoScan(const Entry& entry) { + return ValidateServers(entry, true); + } + void RemoveHosted(Entry& entry) const { + for (size_t i = 0, e = entry.ServerCount(); i < e; i++) { + const size_t idx = entry.IndexAt(i); + const ChunkServerPtr& srv = mServers[idx]; + if (srv) { + if (mDebugValidateFlag) { + srv->RemoveHosted( + entry.GetChunkId(), idx); + } else { + srv->RemoveHosted(); + } + } + } + } + void AddHosted(const ChunkServerPtr& server, const Entry& entry) const { + if (mDebugValidateFlag) { + server->AddHosted( + entry.GetChunkId(), server->GetIndex()); + } else { + server->AddHosted(); + } + } + void RemoveHosted(const ChunkServerPtr& server, + const Entry& entry) const { + if (mDebugValidateFlag) { + server->RemoveHosted( + entry.GetChunkId(), server->GetIndex()); + } else { + server->RemoveHosted(); + } + } + bool Validate() const { + if (! mDebugValidateFlag) { + return true; + } + size_t cnt = 0; + for (const Entry* entry = &mLists[Entry::kStateNone]; ; ) { + entry = &EList::GetNext(*entry); + if (entry == &mLists[Entry::kStateNone]) { + break; + } + if (! IsNextEnd(*entry) && ! IsHead(*entry)) { + cnt++; + if (! ValidateServers(*entry)) { + return false; + } + } + } + if (cnt != mMap.GetSize()) { + InternalError("invalid entry count"); + return false; + } + return true; + } + bool ValidateHosted(const Entry& entry) const { + return (! 
mDebugValidateFlag || + (Validate(entry) && ValidateServers(entry))); + } + size_t CleanupStaleServers(const Entry& entry, + Servers* servers = 0) const { + size_t hibernatedCount = 0; + return CleanupStaleServers(entry, servers, hibernatedCount); + } + size_t CleanupStaleServers(const Entry& entry, + Servers* servers, size_t& hibernatedCount) const { + return const_cast(this)->CleanupStaleServers( + const_cast(entry), servers, hibernatedCount); + } + size_t CleanupStaleServers(Entry& entry, Servers* servers = 0) { + size_t hibernatedCount = 0; + return CleanupStaleServers(entry, servers, hibernatedCount); + } + size_t CleanupStaleServers(Entry& entry, Servers* servers, + size_t& hibernatedCount) { + ValidateHosted(entry); + size_t cnt = entry.ServerCount(); + const size_t prev = cnt; + size_t ret = 0; + for (size_t i = 0; i < cnt; ) { + const size_t idx = entry.IndexAt(i); + const ChunkServerPtr& server = mServers[idx]; + if (server) { + if (servers) { + servers->push_back(server); + } + ret++; + } else if (IsHibernated(idx)) { + hibernatedCount++; + } else { + entry.RemoveIndex(idx); + cnt--; + continue; + } + i++; + } + ValidateServersNoScan(entry); + // Enqueue replication check if servers were removed. + if (prev != cnt && entry.GetState() == Entry::kStateNone) { + SetStateSelf(entry, Entry::kStateCheckReplication); + } + return ret; + } + void RemoveServerScanFirst() { + // Scan backwards to avoid scanning the newly added entries, + // or entries that have been moved. 
+ mRemoveServerScanPtr = &mLists[Entry::kStateCount]; + RemoveServerScanNext(); + } + void RemoveServerScanNext() { + for (; ;) { + if (&mLists[Entry::kStateNone] == + mRemoveServerScanPtr) { + mRemoveServerScanPtr = 0; + Validate(); + if (mNullSlots.empty()) { + mNullSlots.swap(mPendingRemove); + } else { + mNullSlots.insert( + mNullSlots.end(), + mPendingRemove.begin(), + mPendingRemove.end()); + mPendingRemove.clear(); + } + return; + } + mRemoveServerScanPtr = &EList::GetPrev( + *mRemoveServerScanPtr); + if (! IsHead(*mRemoveServerScanPtr) && + ! IsNextEnd(*mRemoveServerScanPtr)) { + break; + } + } + } + void RemoveServerScanCur() { + if (mRemoveServerScanPtr && + (IsHead(*mRemoveServerScanPtr) || + IsNextEnd(*mRemoveServerScanPtr))) { + RemoveServerScanNext(); + } + } + void SetStateSelf(Entry& entry, Entry::State state) { + const Entry::State prev = entry.GetState(); + assert(mCounts[prev] > 0); + mCounts[prev]--; + if (&entry == mNextPtr[prev]) { + SetNextPtr(mNextPtr[prev]); + } + if (&entry == mPrevPtr[prev]) { + mPrevPtr[prev] = Prev(entry); + } + if (&entry == mRemoveServerScanPtr) { + // Do not call RemoveServerScanNext(), SetState() + // cleans the entry only if mRemoveServerScanPtr != 0 + mRemoveServerScanPtr = &EList::GetPrev(entry); + } + entry.SetState(state); + mCounts[state]++; + assert(mCounts[state] > 0); + EList::Insert(entry, EList::GetPrev(mLists[state + 1])); + } + bool IsHibernated(size_t idx) const { + return (mHibernatedIndexes[idx >> kHibernatedBitShift] & + (HibernatedBits(1) << (idx & kHibernatedBitMask))) != 0; + } + bool SetHibernated(size_t idx) { + HibernatedBits& bits = mHibernatedIndexes[ + idx >> kHibernatedBitShift]; + const HibernatedBits bit = HibernatedBits(1) << + (idx & kHibernatedBitMask); + if ((bits & bit) != 0) { + return false; + } + bits |= bit; + mHibernatedCount++; + assert(mHibernatedCount > 0); + return true; + } + bool ClearHibernated(size_t idx) { + HibernatedBits& bits = mHibernatedIndexes[ + idx >> 
kHibernatedBitShift]; + const HibernatedBits bit = HibernatedBits(1) << + (idx & kHibernatedBitMask); + if ((bits & bit) == 0) { + return false; + } + bits &= ~bit; + assert(mHibernatedCount > 0); + mHibernatedCount--; + return true; + } + static void InternalError(const char* errMsg) { + panic(errMsg ? errMsg : "internal error", false); + } +private: + CSMap(const CSMap&); + CSMap& operator=(const CSMap&); +}; + +} // namespace KFS + +#endif /* CS_MAP_H */ diff --git a/src/cc/meta/Checkpoint.cc b/src/cc/meta/Checkpoint.cc new file mode 100644 index 000000000..14715e18f --- /dev/null +++ b/src/cc/meta/Checkpoint.cc @@ -0,0 +1,193 @@ +/*! + * $Id$ + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * + * \file checkpoint.cc + * \brief KFS metadata checkpointing + * \author Sriram Rao and Blake Lewis + * + * The metaserver during its normal operation writes out log records. Every + * N minutes, the metaserver rolls over the log file. Periodically, a sequence + * of log files are compacted to create a checkpoint: a previous checkpoint is + * loaded and subsequent log files are replayed to update the tree. At the end + * of replay, a checkpoint is saved to disk. To save a checkpoint, we iterate + * through the leaf nodes of the tree copying the contents of each node to a + * checkpoint file. 
+ */ + +#include "Checkpoint.h" +#include "kfstree.h" +#include "MetaRequest.h" +#include "Logger.h" +#include "util.h" +#include "LayoutManager.h" +#include "common/MdStream.h" +#include "common/FdWriter.h" + +#include +#include +#include +#include +#include +#include + +namespace KFS +{ +using std::hex; +using std::dec; + +// default values +string CPDIR("./kfscp"); //!< directory for CP files +string LASTCP(CPDIR + "/latest"); //!< most recent CP file (link) + +Checkpoint cp(CPDIR); + +int +Checkpoint::write_leaves(ostream& os) +{ + LeafIter li(metatree.firstLeaf(), 0); + Node *p = li.parent(); + Meta *m = li.current(); + int status = 0; + while (status == 0 && m) { + if (m->skip()) { + m->clearskip(); + } else { + status = m->checkpoint(os); + } + li.next(); + p = li.parent(); + m = p ? li.current() : 0; + } + return status; +} + +/* + * At system startup, take a CP if the file that corresponds to the + * latest CP doesn't exist. +*/ +void +Checkpoint::initial_CP() +{ + if (file_exists(LASTCP)) { + return; + } + do_CP(); +} + +int +Checkpoint::do_CP() +{ + if (oplog.name().empty()) { + return -EINVAL; + } + seq_t highest = oplog.checkpointed(); + cpname = cpfile(highest); + const char* const suffix = ".tmp.XXXXXX"; + char* const tmpname = new char[cpname.length() + strlen(suffix) + 1]; + memcpy(tmpname, cpname.c_str(), cpname.length()); + strcpy(tmpname + cpname.length(), suffix); + int fd = mkstemp(tmpname); + int status = 0; + if (fd < 0) { + status = errno > 0 ? -errno : -EIO; + } else { + close(fd); + fd = open(tmpname, O_WRONLY | (writesync ? O_SYNC : 0)); + if (fd < 0) { + status = errno > 0 ? 
-errno : -EIO; + unlink(tmpname); + } + } + if (status == 0) { + FdWriter fdw(fd); + const bool kSyncFlag = false; + MdStreamT os(&fdw, kSyncFlag, string(), writebuffersize); + os << dec; + os << "checkpoint/" << highest << '\n'; + os << "checksum/last-line\n"; + os << "version/" << VERSION << '\n'; + os << "fid/" << fileID.getseed() << '\n'; + os << "chunkId/" << chunkID.getseed() << '\n'; + os << "chunkVersionInc/1\n"; + os << "time/" << DisplayIsoDateTime() << '\n'; + os << "setintbase/16\n" << hex; + os << "log/" << oplog.name() << "\n\n"; + status = write_leaves(os); + if (status == 0 && os) { + status = gLayoutManager.WritePendingMakeStable(os); + } + if (status == 0 && os) { + status = gLayoutManager.WritePendingChunkVersionChange(os); + } + if (status == 0) { + os << "time/" << DisplayIsoDateTime() << '\n'; + const string md = os.GetMd(); + os << "checksum/" << md << '\n'; + os.SetStream(0); + if ((status = fdw.GetError()) != 0) { + if (status > 0) { + status = -status; + } + } else if (! os) { + status = -EIO; + } + } + if (status == 0) { + if (close(fd)) { + status = errno > 0 ? -errno : -EIO; + } else { + if (rename(tmpname, cpname.c_str())) { + status = errno > 0 ? -errno : -EIO; + } else { + fd = -1; + status = link_latest(cpname, LASTCP); + } + } + } else { + close(fd); + } + } + if (status != 0 && fd >= 0) { + unlink(tmpname); + } + ++cpcount; + delete [] tmpname; + return status; +} + +void +checkpointer_setup_paths(const string& cpdir) +{ + if (! cpdir.empty()) { + CPDIR = cpdir; + LASTCP = cpdir + "/latest"; + cp.setCPDir(cpdir); + } +} + +void +checkpointer_init() +{ + // start a CP on restart. + cp.initial_CP(); +} + +} diff --git a/src/cc/meta/Checkpoint.h b/src/cc/meta/Checkpoint.h new file mode 100644 index 000000000..7cbfa0296 --- /dev/null +++ b/src/cc/meta/Checkpoint.h @@ -0,0 +1,104 @@ +/* + * $Id$ + * + * \file Checkpoint.h + * \brief KFS checkpointer + * \author Blake Lewis (Kosmix Corp.) + * + * Copyright 2008-2012 Quantcast Corp. 
+ * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ +#if !defined(KFS_CHECKPOINT_H) +#define KFS_CHECKPOINT_H + +#include + +#include "kfstypes.h" +#include "util.h" + +namespace KFS { +using std::string; + +/*! + * \brief keeps track of checkpoint status + * + * This class records the current state of the metadata server + * with respect to checkpoints---the name of the checkpoint file, + * whether the CP is running, whether the server has any recent + * updates that would make a new CP worthwhile, etc. + * + * Writing out a checkpoint involves walking the leaves of the + * metatree and saving them to disk. The on-disk checkpoint file stores + * the name of the log that contains all the operations after the checkpoint + * was taken. For failure recovery, there is a notion of a "LATEST" checkpoint + * file (created via a hardlink) that identifies the checkpoint that should be + * used for restore purposes. 
+ * + */ +class Checkpoint +{ +public: + static const int VERSION = 1; + Checkpoint(const string& d = string()) + : cpdir(d), + cpname(), + mutations(0), + cpcount(0), + writesync(true), + writebuffersize(16 << 20) + {} + void setCPDir(const string& d) + { cpdir = d; } + const string name() const { return cpname; } + //!< return true if a CP will be taken + bool isCPNeeded() { return mutations != 0; } + void initial_CP(); //!< schedule a checkpoint on startup if needed + int do_CP(); //!< do the actual work + void note_mutation() { ++mutations; } + void resetMutationCount() { mutations = 0; } + bool getWriteSyncFlag() const { return writesync; } + void setWriteSyncFlag(bool flag) { writesync = flag; } + size_t getWriteBufferSize() const { return writebuffersize; } + void setWriteBufferSize(size_t size) { writebuffersize = size; } +private: + string cpdir; //!< dir for CP files + string cpname; //!< name of CP file + int mutations; //!< changes since last CP + int cpcount; //!< number of CP's since startup + bool writesync; + size_t writebuffersize; + + string cpfile(seq_t highest) //!< generate the next file name + { return makename(cpdir, "chkpt", highest); } + int write_leaves(ostream& os); +private: + // No copy. + Checkpoint(const Checkpoint&); + Checkpoint& operator=(const Checkpoint&); +}; + +extern string CPDIR; //!< directory for CP files +extern string LASTCP; //!< most recent CP file (link) + +extern Checkpoint cp; +extern void checkpointer_setup_paths(const string &cpdir); +extern void checkpointer_init(); + +} + +#endif // !defined(KFS_CHECKPOINT_H) diff --git a/src/cc/meta/ChildProcessTracker.cc b/src/cc/meta/ChildProcessTracker.cc new file mode 100644 index 000000000..662c64782 --- /dev/null +++ b/src/cc/meta/ChildProcessTracker.cc @@ -0,0 +1,102 @@ + +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/04/30 +// Author: Sriram Rao, Mike Ovsiannikov +// +// Copyright 2009-2012 Quantcast Corp. 
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file ChildProcessTracker.cc +// \brief Handler for tracking child process that are forked off, retrieve +// their exit status. +// +//---------------------------------------------------------------------------- + +#include +#include + +#include "MetaRequest.h" +#include "Logger.h" +#include "ChildProcessTracker.h" +#include "kfsio/Globals.h" +#include "common/MsgLogger.h" + +namespace KFS +{ +using std::vector; +using std::pair; +using std::back_inserter; +using std::make_pair; + +ChildProcessTrackingTimer gChildProcessTracker; + +void ChildProcessTrackingTimer::Track(pid_t pid, MetaRequest *r) +{ + if (mPending.empty()) { + libkfsio::globalNetManager().RegisterTimeoutHandler(this); + } + mPending.insert(make_pair(pid, r)); +} + +void ChildProcessTrackingTimer::Timeout() +{ + while (! mPending.empty()) { + int status = 0; + const pid_t pid = waitpid(-1, &status, WNOHANG); + if (pid <= 0) { + return; + } + pair const range = + mPending.equal_range(pid); + if (range.first == range.second) { + // Assume that all children are reaped here. 
+ KFS_LOG_STREAM_ERROR << + "untracked child exited:" + " pid: " << pid << + " status: " << status << + KFS_LOG_EOM; + continue; + } + typedef vector > Requests; + Requests reqs; + copy(range.first, range.second, back_inserter(reqs)); + mPending.erase(range.first, range.second); + const bool lastReqFlag = mPending.empty(); + if (lastReqFlag) { + libkfsio::globalNetManager().UnRegisterTimeoutHandler(this); + } + for (Requests::const_iterator it = reqs.begin(); it != reqs.end(); ++it) { + MetaRequest* const req = it->second; + req->status = WIFEXITED(status) ? WEXITSTATUS(status) : + (WIFSIGNALED(status) ? -WTERMSIG(status) : -11111); + req->suspended = false; + KFS_LOG_STREAM_INFO << + "child exited:" + " pid: " << pid << + " status: " << req->status << + " request: " << req->Show() << + KFS_LOG_EOM; + submit_request(req); + } + if (lastReqFlag) { + return; + } + } +} + +} diff --git a/src/cc/meta/ChildProcessTracker.h b/src/cc/meta/ChildProcessTracker.h new file mode 100644 index 000000000..16ef00e39 --- /dev/null +++ b/src/cc/meta/ChildProcessTracker.h @@ -0,0 +1,65 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2009/04/30 +// +// Copyright 2009-2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \brief A timer that periodically tracks child process that have been spawned +// and retrieves their completion status. 
+// +//---------------------------------------------------------------------------- + +#ifndef META_CHILDPROCESSTRACKER_H +#define META_CHILDPROCESSTRACKER_H + +#include "kfsio/ITimeout.h" + +#include +#include + +#include + +namespace KFS +{ + +using std::multimap; +struct MetaRequest; + +class ChildProcessTrackingTimer : public ITimeout +{ +public: + ChildProcessTrackingTimer(int timeoutMilliSec = 500) { + SetTimeoutInterval(timeoutMilliSec); + }; + // On a timeout check the child processes for exit status + virtual void Timeout(); + // track the process with pid and return the exit status to MetaRequest + void Track(pid_t pid, MetaRequest *r); + size_t GetProcessCount() const { + return mPending.size(); + } +private: + typedef multimap Pending; + Pending mPending; +}; + +extern ChildProcessTrackingTimer gChildProcessTracker; + +} + +#endif // META_CHILDPROCESSTRACKER_H diff --git a/src/cc/meta/ChunkPlacement.h b/src/cc/meta/ChunkPlacement.h new file mode 100644 index 000000000..d0dda4b15 --- /dev/null +++ b/src/cc/meta/ChunkPlacement.h @@ -0,0 +1,824 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2012/02/06 +// Author: Mike Ovsiannikov +// +// Copyright 2012 Quantcast Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+// +// \file ChunkPlacement.h +// +//---------------------------------------------------------------------------- + +#ifndef CHUNK_PLACEMENT_H +#define CHUNK_PLACEMENT_H + +#include "common/StdAllocator.h" +#include "common/MsgLogger.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace KFS +{ +using std::vector; +using std::pair; +using std::make_pair; +using std::find_if; +using std::iter_swap; +using std::sort; +using boost::bind; + +/* + * Rack aware chunk placement. + * + * The long term placement goal is to keep at most 1 replica of a chunk per + * rack, or all chunks from Reed-Solomon recovery block with replication 1 on + * different racks. + * + * The likelihood of a particular rack to be selected for placement is + * proportional to the number of chunk servers that can be used in each rack + * multiplied by the rack weight. The rack weight can be configured, otherwise + * it assumed to be 1.0. + * + * The likelihood of a particular server to be chosen from a rack is + * proportional to the to the reciprocal of the "write load" or available space. + * For more details look at GetLoad() and LayoutManager::IsCandidateServer(). + * + * The available space is used for space re-balancing, and write load is used + * for the initial chunk placement, unless using available space is forced by + * the meta server configuration for initial chunk placement. + * + * To minimize network transfers between the rack the re-replication and + * re-balancing attempts to choose re-replication source and destination withing + * the same rack. If not enough different racks available, put chunk replicas + * into the rack that has the least chunk replicas or lest chunks for a given rs + * block. When more racks become available later, then re-balancing will move + * the replicas accordingly. 
+ * + */ + +template +class ChunkPlacement +{ +public: + typedef typename LayoutManager::Servers Servers; + typedef typename Servers::value_type ChunkServerPtr; + typedef typename ChunkServerPtr::element_type ChunkServer; + typedef typename LayoutManager::RackInfos RackInfos; + typedef typename RackInfos::value_type RackInfo; + typedef typename RackInfo::RackId RackId; + typedef std::size_t size_t; + + ChunkPlacement( + LayoutManager& layoutManager) + : mLayoutManager(layoutManager), + mRacks(mLayoutManager.GetRacks()), + mRackExcludes(), + mServerExcludes(), + mCandidateRacks(), + mCandidates(), + mLoadAvgSum(0), + mRackPos(0), + mCandidatePos(0), + mCurRackId(-1), + mCandidatesInRacksCount(0), + mMaxReplicationsPerNode(0), + mMaxSpaceUtilizationThreshold(0), + mForReplicationFlag(false), + mUsingRackExcludesFlag(false), + mUsingServerExcludesFlag(false), + mSortBySpaceUtilizationFlag(false), + mSortCandidatesByLoadAvgFlag(false), + mUseTotalFsSpaceFlag(false) + {} + void Reset() + { + mLoadAvgSum = 0; + mRackPos = 0; + mCandidatePos = 0; + mCurRackId = -1; + mUsingRackExcludesFlag = false; + mUsingServerExcludesFlag = false; + mCandidateRacks.clear(); + mCandidates.clear(); + } + void clear() + { + Reset(); + mRackExcludes.Clear(); + mServerExcludes.Clear(); + } + void FindCandidates( + bool forReplicationFlag = false, + RackId rackIdToUse = -1) + { + Reset(); + mForReplicationFlag = forReplicationFlag; + mSortBySpaceUtilizationFlag = + mLayoutManager.GetSortCandidatesBySpaceUtilizationFlag(); + mSortCandidatesByLoadAvgFlag = + mLayoutManager.GetSortCandidatesByLoadAvgFlag(); + mUseTotalFsSpaceFlag = + mLayoutManager.GetUseFsTotalSpaceFlag(); + mMaxSpaceUtilizationThreshold = + mLayoutManager.GetMaxSpaceUtilizationThreshold(); + mMaxReplicationsPerNode = + mLayoutManager.GetMaxConcurrentWriteReplicationsPerNode(); + FindCandidatesSelf(rackIdToUse); + } + void FindCandidatesInRack( + RackId rackIdToUse) + { FindCandidates(false, rackIdToUse); } + + void 
FindCandidatesForReplication() + { FindCandidates(true); } + + void FindRebalanceCandidates( + double maxUtilization, + RackId rackIdToUse = -1) + { + Reset(); + mForReplicationFlag = true; + mSortBySpaceUtilizationFlag = true; + mSortCandidatesByLoadAvgFlag = false; + mUseTotalFsSpaceFlag = + mLayoutManager.GetUseFsTotalSpaceFlag(); + mMaxSpaceUtilizationThreshold = min(maxUtilization, + mLayoutManager.GetMaxSpaceUtilizationThreshold()); + mMaxReplicationsPerNode = + mLayoutManager.GetMaxConcurrentWriteReplicationsPerNode(); + FindCandidatesSelf(rackIdToUse); + } + + size_t GetCandidateRackCount() const + { return mCandidateRacks.size(); } + + bool HasCandidateRacks() + { + bool anyAvailableFlag = false; + FindCandidateRacks(&anyAvailableFlag); + return anyAvailableFlag; + } + + bool SearchCandidateRacks() + { + Reset(); + bool anyAvailableFlag = false; + FindCandidateRacks(&anyAvailableFlag); + return anyAvailableFlag; + } + + ChunkServerPtr GetNext( + bool canIgnoreServerExcludesFlag) + { + if (mUsingServerExcludesFlag) { + if (! canIgnoreServerExcludesFlag) { + return ChunkServerPtr(); + } + while (mCandidatePos < mServerExcludes.Size()) { + ChunkServer& srv = *(mServerExcludes.Get( + mCandidatePos++)); + if (IsCandidateServer(srv)) { + return srv.shared_from_this(); + } + } + return ChunkServerPtr(); + } + if (mCandidatePos <= 0) { + if (! canIgnoreServerExcludesFlag || + ! mUsingRackExcludesFlag || + mRackPos <= mRackExcludes.Size()) { + return ChunkServerPtr(); + } + mServerExcludes.SortByCount(); + mCandidatePos = 0; + mUsingServerExcludesFlag = true; + // Tail recursion. + return GetNext(canIgnoreServerExcludesFlag); + } + // Random shuffle chosen servers, such that the servers with + // smaller "load" go before the servers with larger load. 
+ if (mCandidatePos == 1) { + return mCandidates[--mCandidatePos + ].second->shared_from_this(); + } + assert(mLoadAvgSum > 0); + int64_t rnd = Rand(mLoadAvgSum); + size_t ri = mCandidatePos--; + int64_t load; + do { + --ri; + load = mCandidates[ri].first; + rnd -= load; + } while (rnd >= 0 && ri > 0); + iter_swap(mCandidates.begin() + mCandidatePos, + mCandidates.begin() + ri); + mLoadAvgSum -= load; + return mCandidates[mCandidatePos].second->shared_from_this(); + } + + bool IsUsingServerExcludes() const + { return mUsingServerExcludesFlag; } + + bool IsUsingRackExcludes() const + { return mUsingRackExcludesFlag; } + + bool NextRack() + { + for ( ; ; ) { + if (mUsingRackExcludesFlag) { + mCurRackId = -1; + const size_t size = mRackExcludes.Size(); + if (size < mRackPos) { + return false; + } + if (size == mRackPos) { + // The last attempt -- put it somewhere. + FindCandidateServers( + mLayoutManager.GetChunkServers() + ); + mRackPos++; + break; + } + const RackId rackId = + mRackExcludes.Get(mRackPos++); + typename RackInfos::const_iterator const it = + find_if( + mRacks.begin(), mRacks.end(), + bind(&RackInfo::id, _1) == rackId + ); + if (it == mRacks.end()) { + KFS_LOG_STREAM_ERROR << + " invalid rack id: " << + rackId << + KFS_LOG_EOM; + continue; + } + FindCandidateServers(it->getServers()); + if (mCandidates.empty()) { + continue; + } + mCurRackId = rackId; + break; + } + if (mRackPos >= mCandidateRacks.size()) { + // Use rack excludes. + // Put the racks with least chunk replicas + // first, to put the same number of replicas on + // each rack. + mRackExcludes.SortByCount(); + mRackPos = 0; + mUsingRackExcludesFlag = true; + continue; + } + // Random shuffle chosen racks, such that the racks with + // larger number of possible allocation candidates have + // higher probability to go before the racks with lesser + // number of candidates. 
+ if (mRackPos + 1 < mCandidateRacks.size()) { + assert(mCandidatesInRacksCount > 0); + int64_t rnd = Rand(mCandidatesInRacksCount); + size_t ri = mRackPos; + for (; ;) { + const int64_t sz = + mCandidateRacks[ri].first; + if ((rnd -= sz) < 0) { + mCandidatesInRacksCount -= sz; + break; + } + ri++; + } + iter_swap(mCandidateRacks.begin() + mRackPos, + mCandidateRacks.begin() + ri); + } + const RackInfo& rack = *(mCandidateRacks[ + mRackPos++].second); + FindCandidateServers(rack.getServers()); + if (! mCandidates.empty()) { + mCurRackId = rack.id(); + break; + } + } + return (! mCandidates.empty()); + } + + bool ExcludeServer( + ChunkServer& srv) + { + return mServerExcludes.Insert(&srv); + } + + bool ExcludeServer( + const ChunkServerPtr& srv) + { + return ExcludeServer(*srv); + } + + template + bool ExcludeServer( + IT start, + IT end) + { + bool ret = false; + while (start != end) { + ret = ExcludeServer(*start++) || ret; + } + return ret; + } + + bool ExcludeServer( + const Servers& servers) + { + return ExcludeServer(servers.begin(), servers.end()); + } + + bool ExcludeRack( + RackId rackId) + { + return mRackExcludes.Insert(rackId); + } + + bool ExcludeRack( + ChunkServer& srv, + chunkId_t chunkId = -1) + { + const RackId rackId = srv.GetRack(); + if (chunkId >= 0 && srv.IsEvacuationScheduled(chunkId)) { + return (! 
mRackExcludes.Find(rackId)); + } + return ExcludeRack(rackId); + } + + bool ExcludeRack( + const ChunkServerPtr& srv, + chunkId_t chunkId = -1) + { + return ExcludeRack(*srv, chunkId); + } + + template + bool ExcludeRack( + IT start, + IT end, + chunkId_t chunkId = -1) + { + bool ret = false; + while (start != end) { + ret = ExcludeRack(*start++, chunkId) || ret; + } + return ret; + } + + bool ExcludeRack( + const Servers& servers, + chunkId_t chunkId = -1) + { + return ExcludeRack(servers.begin(), servers.end(), chunkId); + } + + bool ExcludeServerAndRack( + ChunkServer& srv, + chunkId_t chunkId = -1) + { + mServerExcludes.Insert(&srv); + return ExcludeRack(srv, chunkId); + } + + bool ExcludeServerAndRack( + const ChunkServerPtr& srv, + chunkId_t chunkId = -1) + { + return ExcludeServerAndRack(*srv, chunkId); + } + + template + bool ExcludeServerAndRack( + IT start, + IT end, + chunkId_t chunkId = -1) + { + bool ret = false; + while (start != end) { + ret = ExcludeServerAndRack(*start++, chunkId) || ret; + } + return ret; + } + + bool ExcludeServerAndRack( + const Servers& servers, + chunkId_t chunkId = -1) + { + return ExcludeServerAndRack( + servers.begin(), servers.end(), chunkId); + } + + bool IsServerExcluded( + const ChunkServer& srv) const + { + return mServerExcludes.Find(&srv); + } + + bool IsServerExcluded( + const ChunkServerPtr& srv) const + { + return IsServerExcluded(*srv); + } + + bool IsRackExcluded( + const ChunkServer& srv) const + { + return mRackExcludes.Find(srv.GetRack()); + } + + bool IsRackExcluded( + const ChunkServerPtr& srv) const + { + return IsRackExcluded(*srv); + } + + bool IsExcluded( + const ChunkServer& srv) const + { + return ( + IsRackExcluded(srv) || + IsServerExcluded(srv) + ); + } + + bool IsExcluded( + const ChunkServerPtr& srv) const + { + return IsExcluded(*srv); + } + + bool CanBeUsed( + const ChunkServer& srv) + { + return ( + IsCandidateServer(srv) && + ! 
IsExcluded(srv) + ); + } + + bool CanBeUsed( + const ChunkServerPtr& srv) + { + return CanBeUsed(*srv); + } + + size_t GetExcludedServersCount() const + { return mServerExcludes.Size(); } + + size_t GetExcludedRacksCount() const + { return mRackExcludes.Size(); } + + size_t GetTotalExcludedServersCount() const + { return mServerExcludes.GetTotal(); } + + size_t GetTotalExcludedRacksCount() const + { return mRackExcludes.GetTotal(); } + + size_t GetExcludedServersMaxCount() const + { return mServerExcludes.GetMaxCount(); } + + size_t GetExcludedRacksMaxCount() const + { return mRackExcludes.GetMaxCount(); } + + RackId GetRackId() const + { return mCurRackId; } + + bool IsLastRack() const + { + return ( + mUsingRackExcludesFlag && + mRackExcludes.Size() < mRackPos + ); + } + + void Reserve( + size_t count) + { + mServerExcludes.Reserve(count); + mRackExcludes.Reserve(count); + } + + RackId GetMostUsedRackId() const + { return mRackExcludes.GetMaxId(); } + +private: + template + class IdSet + { + public: + IdSet() + : mIds(), + mMaxId(), + mTotal(0), + mMaxCount(0) + {} + bool Find( + const IdConstT& id) const + { + for (typename Ids::const_iterator it = mIds.begin(); + it != mIds.end(); + ++it) { + if (it->first == id) { + return true; + } + } + return false; + } + bool Insert( + const IdT& id) + { + mTotal++; + for (typename Ids::iterator it = mIds.begin(); + it != mIds.end(); + ++it) { + if (it->first == id) { + if (++(it->second) > mMaxCount) { + mMaxCount = it->second; + mMaxId = id; + } + return false; + } + } + if (mMaxCount <= 0) { + mMaxCount = size_t(1); + mMaxId = id; + } + mIds.push_back(make_pair(id, CountT(1))); + return true; + } + void SortByCount() + { + sort(mIds.begin(), mIds.end(), + bind(&Ids::value_type::second, _1) < + bind(&Ids::value_type::second, _2) + ); + } + size_t GetMaxCount() const + { return mMaxCount; } + const IdT& Get(size_t pos) const + { return mIds[pos].first; } + size_t Size() const + { return mIds.size(); } + bool IsEmpty() 
const + { return mIds.empty(); } + void Clear() + { + mIds.clear(); + mTotal = 0; + mMaxCount = 0; + mMaxId = IdT(); + } + size_t GetTotal() const + { return mTotal; } + void Reserve( + size_t count) + { mIds.reserve(count); } + const IdT& GetMaxId() const + { return mMaxId; } + private: + typedef vector< + pair, + StdFastAllocator > + > Ids; + Ids mIds; + IdT mMaxId; + size_t mTotal; + size_t mMaxCount; + private: + IdSet(const IdSet&); + IdSet& operator=(const IdSet&); + }; + class RackIdSet + { + public: + enum { kMaxId = (RackId(1) << 16) - 1 }; + + bool Find( + RackId id) const + { + if (id < 0 || id > kMaxId) { + return false; + } + return mIds.Find((int16_t)id); + } + bool Insert( + RackId id) + { + if (id < 0 || id > kMaxId) { + return false; + } + return mIds.Insert((int16_t)id); + } + void SortByCount() + { mIds.SortByCount(); } + size_t GetMaxCount() const + { return mIds.GetMaxCount(); } + RackId Get(size_t pos) const + { return RackId(mIds.Get(pos)); } + size_t Size() const + { return mIds.Size(); } + bool IsEmpty() const + { return mIds.IsEmpty(); } + void Clear() + { mIds.Clear(); } + void Reserve( + size_t count) + { mIds.Reserve(count); } + RackId GetMaxId() const + { + return (mIds.GetMaxCount() <= 0 ? 
+ RackId(-1) : mIds.GetMaxId()); + } + private: + typedef IdSet RackIds; + RackIds mIds; + }; + typedef IdSet ServerExcludes; + typedef vector< + pair, + StdAllocator > + > CandidateRacks; + typedef vector< + pair, + StdAllocator > + > Candidates; + typedef Servers Sources; + enum { kSlaveScaleFracBits = LayoutManager::kSlaveScaleFracBits }; + + LayoutManager& mLayoutManager; + const RackInfos& mRacks; + RackIdSet mRackExcludes; + ServerExcludes mServerExcludes; + CandidateRacks mCandidateRacks; + Candidates mCandidates; + int64_t mLoadAvgSum; + size_t mRackPos; + size_t mCandidatePos; + RackId mCurRackId; + int64_t mCandidatesInRacksCount; + int mMaxReplicationsPerNode; + double mMaxSpaceUtilizationThreshold; + bool mForReplicationFlag; + bool mUsingRackExcludesFlag; + bool mUsingServerExcludesFlag; + bool mSortBySpaceUtilizationFlag; + bool mSortCandidatesByLoadAvgFlag; + bool mUseTotalFsSpaceFlag; + + int64_t Rand( + int64_t interval) + { return mLayoutManager.Rand(interval); } + void FindCandidatesSelf( + RackId rackIdToUse = -1) + { + FindCandidateRacks(0, rackIdToUse); + if (rackIdToUse >= 0 && mCandidateRacks.size() > 1) { + for (size_t ri = 0; ri < mCandidateRacks.size(); ri++) { + if (mCandidateRacks[ri].second->id() + != rackIdToUse) { + continue; + } + mCandidatesInRacksCount -= + mCandidateRacks[ri].first; + iter_swap(mCandidateRacks.begin() + mRackPos, + mCandidateRacks.begin() + ri); + const RackInfo& rack = *(mCandidateRacks[ + mRackPos++].second); + FindCandidateServers(rack.getServers()); + if (! mCandidates.empty()) { + mCurRackId = rack.id(); + return; + } + } + } + NextRack(); + } + void FindCandidateRacks( + bool* anyAvailableFlag = 0, + RackId rackIdToUse = -1) + { + if (anyAvailableFlag) { + if ((*anyAvailableFlag = ! 
mCandidateRacks.empty())) { + return; + } + } else { + mCandidateRacks.clear(); + } + if (mRacks.empty() || mRacks.size() <= mRackExcludes.Size()) { + return; + } + mCandidatesInRacksCount = 0; + for (typename RackInfos::const_iterator it = mRacks.begin(); + it != mRacks.end(); + ++it) { + const RackInfo& rack = *it; + const RackId id = rack.id(); + if (id < 0 || id > RackIdSet::kMaxId) { + continue; // Invalid rack id. + } + if (id != rackIdToUse && mRackExcludes.Find(id)) { + continue; + } + const int64_t cnt = + rack.getWeightedPossibleCandidatesCount(); + if (cnt <= 0) { + continue; + } + if (anyAvailableFlag) { + *anyAvailableFlag = true; + break; + } + mCandidatesInRacksCount += cnt; + mCandidateRacks.push_back(make_pair(cnt, &rack)); + } + } + int64_t GetLoad( + const ChunkServer& srv) const + { + const int64_t kLoadAvgFloor = 1; + if (mSortBySpaceUtilizationFlag) { + return ((int64_t)(srv.GetSpaceUtilization( + mUseTotalFsSpaceFlag) * + (int64_t(1) << (10 + kSlaveScaleFracBits))) + + kLoadAvgFloor); + } + if (mSortCandidatesByLoadAvgFlag) { + int64_t load = srv.GetLoadAvg(); + if (! srv.CanBeChunkMaster()) { + load = (load * + mLayoutManager.GetSlavePlacementScale() + ) >> kSlaveScaleFracBits; + } + return (load + kLoadAvgFloor); + } + return kLoadAvgFloor; + } + bool IsCandidateServer( + const ChunkServer& srv) const + { + return ( + mLayoutManager.IsCandidateServer(srv) && + srv.GetSpaceUtilization(mUseTotalFsSpaceFlag) < + mMaxSpaceUtilizationThreshold && + (! mForReplicationFlag || + srv.GetNumChunkReplications() < + mMaxReplicationsPerNode) + ); + } + void FindCandidateServers( + const Sources& sources) + { + mLoadAvgSum = 0; + mCandidatePos = 0; + mCandidates.clear(); + for (typename Servers::const_iterator it = sources.begin(); + it != sources.end(); + ++it) { + ChunkServer& srv = **it; + if (! 
IsCandidateServer(srv) || + mServerExcludes.Find(&srv)) { + continue; + } + const int64_t load = GetLoad(srv); + assert(mLoadAvgSum < mLoadAvgSum + load); + mLoadAvgSum += load; + mCandidates.push_back(make_pair(load, &srv)); + } + mCandidatePos = mCandidates.size(); + } +private: + ChunkPlacement(const ChunkPlacement&); + ChunkPlacement& operator=(const ChunkPlacement&); +public: + enum { kMaxRackId = RackIdSet::kMaxId }; +}; + +} +#endif /* CHUNK_PLACEMENT_H */ diff --git a/src/cc/meta/ChunkServer.cc b/src/cc/meta/ChunkServer.cc new file mode 100644 index 000000000..305098f71 --- /dev/null +++ b/src/cc/meta/ChunkServer.cc @@ -0,0 +1,1748 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/06 +// Author: Sriram Rao, Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// Chunk server state machine implementation. 
+// +//---------------------------------------------------------------------------- + +#include "ChunkServer.h" +#include "LayoutManager.h" +#include "NetDispatch.h" +#include "util.h" +#include "kfsio/Globals.h" +#include "qcdio/QCUtils.h" +#include "common/MsgLogger.h" + +#include + +#include +#include +#include +#include +#include + +namespace KFS +{ + +using std::string; +using std::istream; +using std::max; +using std::make_pair; +using std::pair; +using std::hex; +using std::numeric_limits; +using libkfsio::globalNetManager; + +static inline time_t TimeNow() +{ + return globalNetManager().Now(); +} + +class HelloBufferQueueRunner : public ITimeout +{ +public: + static void Schedule() + { Instance().ScheduleSelf(); } + virtual void Timeout() + { + mWokenFlag = false; + if (ChunkServer::RunHelloBufferQueue() || ! mRegisteredFlag) { + return; + } + mRegisteredFlag = false; + globalNetManager().UnRegisterTimeoutHandler(this); + } +private: + bool mRegisteredFlag; + bool mWokenFlag; + + HelloBufferQueueRunner() + : ITimeout(), + mRegisteredFlag(false), + mWokenFlag(false) + {} + virtual ~HelloBufferQueueRunner() + { + if (! mRegisteredFlag) { + return; + } + mRegisteredFlag = false; + globalNetManager().UnRegisterTimeoutHandler(this); + } + void ScheduleSelf() + { + if (! 
mWokenFlag) { + mWokenFlag = true; + globalNetManager().Wakeup(); + } + if (mRegisteredFlag) { + return; + } + mRegisteredFlag = true; + globalNetManager().RegisterTimeoutHandler(this); + } + static HelloBufferQueueRunner& Instance() + { + static HelloBufferQueueRunner sHelloBufferQueueRunner; + return sHelloBufferQueueRunner; + } +private: + HelloBufferQueueRunner(const HelloBufferQueueRunner&); + HelloBufferQueueRunner& operator=(const HelloBufferQueueRunner&); +}; + +int ChunkServer::sHeartbeatTimeout = 60; +int ChunkServer::sHeartbeatInterval = 20; +int ChunkServer::sHeartbeatLogInterval = 1000; +int ChunkServer::sChunkAllocTimeout = 40; +int ChunkServer::sChunkReallocTimeout = 75; +int ChunkServer::sMakeStableTimeout = 330; +int ChunkServer::sReplicationTimeout = 330; +int ChunkServer::sRequestTimeout = 600; +int ChunkServer::sMetaClientPort = 0; +size_t ChunkServer::sMaxChunksToEvacuate = 2 << 10; // Max queue size +// sHeartbeatInterval * sSrvLoadSamplerSampleCount -- boxcar FIR filter +// if sSrvLoadSamplerSampleCount > 0 +int ChunkServer::sSrvLoadSamplerSampleCount = 0; +string ChunkServer::sSrvLoadPropName("Buffer-usec-wait-avg"); +bool ChunkServer::sRestartCSOnInvalidClusterKeyFlag = false; +ChunkServer::ChunkOpsInFlight ChunkServer::sChunkOpsInFlight; +ChunkServer* ChunkServer::sChunkServersPtr[kChunkSrvListsCount] = { 0, 0 }; +int ChunkServer::sChunkServerCount = 0; +int ChunkServer::sPendingHelloCount = 0; +int ChunkServer::sMinHelloWaitingBytes = 0; +int64_t ChunkServer::sHelloBytesCommitted = 0; +int64_t ChunkServer::sHelloBytesInFlight = 0; +int64_t ChunkServer::sMaxHelloBufferBytes = 256 << 20; +int ChunkServer::sEvacuateRateUpdateInterval = 120; + +const int kMaxReadAhead = 4 << 10; +// Bigger than the default MAX_RPC_HEADER_LEN: max heartbeat size. 
+const int kMaxRequestResponseHeader = 64 << 10; + +void ChunkServer::SetParameters(const Properties& prop, int clientPort) +{ + sHeartbeatTimeout = prop.getValue( + "metaServer.chunkServer.heartbeatTimeout", + sHeartbeatTimeout); + sHeartbeatInterval = max(3, prop.getValue( + "metaServer.chunkServer.heartbeatInterval", + sHeartbeatInterval)); + sHeartbeatLogInterval = prop.getValue( + "metaServer.chunkServer.heartbeatLogInterval", + sHeartbeatLogInterval); + sChunkAllocTimeout = prop.getValue( + "metaServer.chunkServer.chunkAllocTimeout", + sChunkAllocTimeout); + sChunkReallocTimeout = prop.getValue( + "metaServer.chunkServer.chunkReallocTimeout", + sChunkReallocTimeout); + sMakeStableTimeout = prop.getValue( + "metaServer.chunkServer.makeStableTimeout", + sMakeStableTimeout); + sReplicationTimeout = prop.getValue( + "metaServer.chunkServer.replicationTimeout", + sReplicationTimeout); + sRequestTimeout = prop.getValue( + "metaServer.chunkServer.requestTimeout", + sRequestTimeout); + sRequestTimeout = prop.getValue( + "metaServer.chunkServer.requestTimeout", + sRequestTimeout); + sSrvLoadSamplerSampleCount = prop.getValue( + "metaServer.chunkServer.srvLoadSampler.sampleCount", + sSrvLoadSamplerSampleCount); + sSrvLoadPropName = prop.getValue( + "metaServer.chunkServer.srvLoadPropName", + sSrvLoadPropName); + sMaxChunksToEvacuate = max(size_t(1), prop.getValue( + "metaServer.chunkServer.maxChunksToEvacuate", + sMaxChunksToEvacuate)); + if (clientPort > 0) { + sMetaClientPort = clientPort; + } + sRestartCSOnInvalidClusterKeyFlag = prop.getValue( + "metaServer.chunkServer.restartOnInvalidClusterKey", + sRestartCSOnInvalidClusterKeyFlag ? 1 : 0) != 0; +} + +static seq_t RandomSeqNo() +{ + seq_t ret = 0; + RAND_pseudo_bytes( + reinterpret_cast(&ret), int(sizeof(ret))); + return ((ret < 0 ? 
-ret : ret) >> 1); +} + +inline void +ChunkServerRequest(MetaChunkRequest& req, ostream& os, IOBuffer& buf) +{ + req.request(os, buf); +} + +inline void +ChunkServer::UpdateChunkWritesPerDrive( + int numChunkWrites, int numWritableDrives) +{ + const int deltaChunkWrites = numChunkWrites - mNumChunkWrites; + const int deltaWritableDrives = numWritableDrives - mNumWritableDrives; + mNumChunkWrites = numChunkWrites; + mNumWritableDrives = numWritableDrives; + gLayoutManager.UpdateChunkWritesPerDrive(*this, + deltaChunkWrites, deltaWritableDrives); + +} + +ChunkServer::ChunkServer(const NetConnectionPtr& conn, const string& peerName) + : KfsCallbackObj(), + CSMapServerInfo(), + mSeqNo(RandomSeqNo()), + mNetConnection(conn), + mHelloDone(false), + mDown(false), + mHeartbeatSent(false), + mHeartbeatSkipped(false), + mLastHeartbeatSent(TimeNow()), + mCanBeChunkMaster(false), + mIsRetiring(false), + mRetireStartTime(0), + mLastHeard(), + mChunksToMove(), + mChunksToEvacuate(), + mLocation(), + mRackId(-1), + mNumCorruptChunks(0), + mTotalSpace(0), + mPrevTotalSpace(0), + mTotalFsSpace(0), + mPrevTotalFsSpace(0), + mOneOverTotalSpace(0), + mOneOverTotalFsSpace(0), + mUsedSpace(0), + mAllocSpace(0), + mNumChunks(0), + mNumDrives(0), + mNumWritableDrives(0), + mNumChunkWrites(0), + mNumAppendsWithWid(0), + mNumChunkWriteReplications(0), + mNumChunkReadReplications(0), + mDispatchedReqs(), + mReqsTimeoutQueue(), + mLostChunks(0), + mUptime(0), + mHeartbeatProperties(), + mRestartScheduledFlag(false), + mRestartQueuedFlag(false), + mRestartScheduledTime(0), + mLastHeartBeatLoggedTime(0), + mDownReason(), + mOstream(), + mRecursionCount(0), + mHelloOp(0), + mSelfPtr(), + mSrvLoadSampler(sSrvLoadSamplerSampleCount, 0, TimeNow()), + mLoadAvg(0), + mCanBeCandidateServerFlag(false), + mStaleChunksHexFormatFlag(false), + mIStream(), + mEvacuateCnt(0), + mEvacuateBytes(0), + mEvacuateDoneCnt(0), + mEvacuateDoneBytes(0), + mEvacuateInFlight(0), + mPrevEvacuateDoneCnt(0), + 
mPrevEvacuateDoneBytes(0), + mEvacuateLastRateUpdateTime(TimeNow()), + mEvacuateCntRate(0.), + mEvacuateByteRate(0.), + mLostChunkDirs(), + mPeerName(peerName) +{ + assert(mNetConnection); + ChunkServersList::Init(*this); + PendingHelloList::Init(*this); + ChunkServersList::PushBack(sChunkServersPtr, *this); + SET_HANDLER(this, &ChunkServer::HandleRequest); + mNetConnection->SetInactivityTimeout(sHeartbeatInterval); + mNetConnection->SetMaxReadAhead(kMaxReadAhead); + sChunkServerCount++; + KFS_LOG_STREAM_INFO << + "new ChunkServer " << (const void*)this << " " << + GetPeerName() << + " total: " << sChunkServerCount << + KFS_LOG_EOM; +} + +ChunkServer::~ChunkServer() +{ + assert(! mSelfPtr); + KFS_LOG_STREAM_DEBUG << ServerID() << + " ~ChunkServer " << (const void*)this << + " total: " << sChunkServerCount << + KFS_LOG_EOM; + if (mNetConnection) { + mNetConnection->Close(); + } + RemoveFromPendingHelloList(); + delete mHelloOp; + ChunkServersList::Remove(sChunkServersPtr, *this); + sChunkServerCount--; +} + +void +ChunkServer::AddToPendingHelloList() +{ + if (PendingHelloList::IsInList(sChunkServersPtr, *this)) { + return; + } + if (! mHelloOp || mHelloOp->contentLength <= 0) { + sMinHelloWaitingBytes = 0; + } else if (sPendingHelloCount <= 0 || + mHelloOp->contentLength < sMinHelloWaitingBytes) { + sMinHelloWaitingBytes = mHelloOp->contentLength; + } + sPendingHelloCount++; + assert(sPendingHelloCount > 0); + PendingHelloList::PushBack(sChunkServersPtr, *this); + HelloBufferQueueRunner::Schedule(); +} + +void +ChunkServer::RemoveFromPendingHelloList() +{ + if (! 
PendingHelloList::IsInList(sChunkServersPtr, *this)) { + PutHelloBytes(mHelloOp); + return; + } + assert(sPendingHelloCount > 0); + sPendingHelloCount--; + PendingHelloList::Remove(sChunkServersPtr, *this); +} + +/* static */ bool +ChunkServer::RunHelloBufferQueue() +{ + if (PendingHelloList::IsEmpty(sChunkServersPtr)) { + return false; + } + int maxScan = sPendingHelloCount; + int minHelloSize = numeric_limits::max(); + PendingHelloList::Iterator it(sChunkServersPtr); + do { + const int64_t bytesAvail = GetHelloBytes(); + if (bytesAvail < sMinHelloWaitingBytes) { + break; + } + ChunkServer* ptr; + while ((ptr = it.Next()) && + ptr->mNetConnection && + ptr->mNetConnection->IsGood() && + ptr->mHelloOp && + bytesAvail < ptr->mHelloOp->contentLength) { + if (ptr->mHelloOp->contentLength < minHelloSize) { + minHelloSize = ptr->mHelloOp->contentLength; + } + --maxScan; + } + if (! ptr) { + sMinHelloWaitingBytes = minHelloSize; + break; + } + ptr->RemoveFromPendingHelloList(); + if (ptr->mNetConnection) { + if (GetHelloBytes(ptr->mHelloOp) < 0) { + ptr->mNetConnection->SetMaxReadAhead(0); + ptr->AddToPendingHelloList(); + break; + } + ptr->HandleRequest( + EVENT_NET_READ, + &ptr->mNetConnection->GetInBuffer() + ); + } else { + ptr->PutHelloBytes(ptr->mHelloOp); + } + } while (--maxScan > 0); + return (! PendingHelloList::IsEmpty(sChunkServersPtr)); +} + +/* static */ int64_t +ChunkServer::GetHelloBytes(MetaHello* req /* = 0 */) +{ + const int64_t avail = min( + sHelloBytesInFlight + gLayoutManager.GetFreeIoBufferByteCount(), + sMaxHelloBufferBytes) - sHelloBytesCommitted; + if (! req) { + return avail; + } + if (req->contentLength <= 0) { + return 0; + } + if (req->bytesReceived < 0) { + req->bytesReceived = 0; + } + if (avail <= 0) { + return avail; + } + if (avail >= req->contentLength) { + sHelloBytesCommitted += req->contentLength; + } + return (avail - req->contentLength); +} + +/* static */ void +ChunkServer::PutHelloBytes(MetaHello* req) +{ + if (! 
req || req->bytesReceived < 0) { + return; + } + if (sHelloBytesCommitted < req->contentLength) { + panic("invalid hello request byte counter", false); + sHelloBytesCommitted = 0; + } else { + sHelloBytesCommitted -= req->contentLength; + } + if (sHelloBytesInFlight < req->bytesReceived) { + panic("invalid hello received byte counter", false); + sHelloBytesInFlight = 0; + } else { + sHelloBytesInFlight -= req->bytesReceived; + } + // For debugging store the number of bytes received, just change + // the sign bit. + req->bytesReceived = -(req->bytesReceived + 1); + if (PendingHelloList::IsEmpty(sChunkServersPtr)) { + return; + } + HelloBufferQueueRunner::Schedule(); +} + +/// +/// Generic event handler. Decode the event that occurred and +/// appropriately extract out the data and deal with the event. +/// @param[in] code: The type of event that occurred +/// @param[in] data: Data being passed in relative to the event that +/// occurred. +/// @retval 0 to indicate successful event handling; -1 otherwise. +/// +int +ChunkServer::HandleRequest(int code, void *data) +{ + ChunkServerPtr const refSelf(mSelfPtr); + + mRecursionCount++; + switch (code) { + case EVENT_NET_READ: { + // We read something from the network. It is + // either an RPC (such as hello) or a reply to + // an RPC we sent earlier. + assert(mNetConnection); + IOBuffer& iobuf = mNetConnection->GetInBuffer(); + assert(&iobuf == data); + bool gotMsgHdr; + int msgLen = 0; + while ((gotMsgHdr = mHelloOp || IsMsgAvail(&iobuf, &msgLen))) { + const int retval = HandleMsg(&iobuf, msgLen); + if (retval < 0) { + iobuf.Clear(); + Error(mHelloDone ? + "request or response parse error" : + "failed to parse hello message"); + break; + } + if (retval > 0) { + break; // Need more data + } + } + if (! mDown && ! gotMsgHdr && iobuf.BytesConsumable() > + kMaxRequestResponseHeader) { + iobuf.Clear(); + Error(mHelloDone ? 
+ "request or response header length" + " exceeds max allowed" : + "hello message header length" + " exceeds max allowed"); + } + break; + } + + case EVENT_CMD_DONE: { + assert(mHelloDone && data); + MetaRequest* const op = reinterpret_cast(data); + if (! mDown) { + SendResponse(op); + } + // nothing left to be done...get rid of it + delete op; + break; + } + + case EVENT_NET_WROTE: + if (! mHelloDone && + mNetConnection && + ! mNetConnection->IsWriteReady()) { + Error("hello error " + "cluster key or md5sum mismatch"); + } + // Something went out on the network. + break; + + case EVENT_INACTIVITY_TIMEOUT: + // Check heartbeat timeout. + if (mHelloDone || mLastHeartbeatSent + sHeartbeatTimeout > + TimeNow()) { + break; + } + Error("hello timeout"); + break; + + case EVENT_NET_ERROR: + Error("communication error"); + break; + + default: + assert(!"Unknown event"); + break; + } + if (mRecursionCount <= 1 && ! mDown && mNetConnection) { + mNetConnection->StartFlush(); + } + if (mHelloDone) { + if (mRecursionCount <= 1) { + const int hbTimeout = Heartbeat(); + const int opTimeout = TimeoutOps(); + if (! mDown && + mNetConnection && + mNetConnection->IsGood()) { + mNetConnection->SetInactivityTimeout( + NetManager::Timer::MinTimeout( + hbTimeout, opTimeout)); + } + + } + } else if (code != EVENT_INACTIVITY_TIMEOUT) { + mLastHeartbeatSent = TimeNow(); + } + assert(mRecursionCount > 0); + mRecursionCount--; + return 0; +} + +void +ChunkServer::ForceDown() +{ + if (mDown) { + return; + } + KFS_LOG_STREAM_WARN << + "forcing chunk server " << ServerID() << + "/" << (mNetConnection ? 
GetPeerName() : + string("not connected")) << + " down" << + KFS_LOG_EOM; + if (mNetConnection) { + mNetConnection->Close(); + mNetConnection->GetInBuffer().Clear(); + mNetConnection.reset(); + } + RemoveFromPendingHelloList(); + delete mHelloOp; + mHelloOp = 0; + mDown = true; + // Take out the server from write-allocation + mTotalSpace = 0; + mTotalFsSpace = 0; + mAllocSpace = 0; + mUsedSpace = 0; + const int64_t delta = -mLoadAvg; + mLoadAvg = 0; + gLayoutManager.UpdateSrvLoadAvg(*this, delta); + UpdateChunkWritesPerDrive(0, 0); + FailDispatchedOps(); + mSelfPtr.reset(); // Unref / delete self +} + +void +ChunkServer::SetCanBeChunkMaster(bool flag) +{ + if (mCanBeChunkMaster == flag) { + return; + } + const int64_t delta = -mLoadAvg; + mLoadAvg = 0; + const bool kCanBeCandidateFlag = false; + gLayoutManager.UpdateSrvLoadAvg(*this, delta, kCanBeCandidateFlag); + mCanBeChunkMaster = flag; + mLoadAvg = -delta; + gLayoutManager.UpdateSrvLoadAvg(*this, mLoadAvg); +} + +void +ChunkServer::Error(const char* errorMsg) +{ + const int socketErr = (mNetConnection && mNetConnection->IsGood()) ? + mNetConnection->GetSocketError() : 0; + KFS_LOG_STREAM_ERROR << + "chunk server " << ServerID() << + "/" << (mNetConnection ? GetPeerName() : + string("not connected")) << + " down" << + (mRestartQueuedFlag ? " restart" : "") << + " reason: " << (errorMsg ? 
errorMsg : "unspecified") << + " socket error: " << QCUtils::SysError(socketErr) << + KFS_LOG_EOM; + if (mNetConnection) { + mNetConnection->Close(); + mNetConnection->GetInBuffer().Clear(); + mNetConnection.reset(); + } + if (mDownReason.empty() && mRestartQueuedFlag) { + mDownReason = "restart"; + } + RemoveFromPendingHelloList(); + delete mHelloOp; + mHelloOp = 0; + mDown = true; + // Take out the server from write-allocation + mTotalSpace = 0; + mTotalFsSpace = 0; + mAllocSpace = 0; + mUsedSpace = 0; + const int64_t delta = -mLoadAvg; + mLoadAvg = 0; + gLayoutManager.UpdateSrvLoadAvg(*this, delta); + UpdateChunkWritesPerDrive(0, 0); + FailDispatchedOps(); + if (mHelloDone) { + // force the server down thru the main loop to avoid races + MetaBye* const mb = new MetaBye(0, shared_from_this()); + mb->clnt = this; + submit_request(mb); + } + mSelfPtr.reset(); // Unref / delete self +} + +/// +/// We have a message from the chunk server. The message we got is one +/// of: +/// - a HELLO message from the server +/// - it is a response to some command we previously sent +/// - is an RPC from the chunkserver +/// +/// Of these, for the first and third case,create an op and +/// send that down the pike; in the second case, retrieve the op from +/// the pending list, attach the response, and push that down the pike. +/// +/// @param[in] iobuf: Buffer containing the command +/// @param[in] msgLen: Length of the command in the buffer +/// @retval 0 if we handled the message properly; -1 on error; +/// 1 if there is more data needed for this message and we haven't +/// yet received the data. +int +ChunkServer::HandleMsg(IOBuffer *iobuf, int msgLen) +{ + if (! 
mHelloDone) { + return HandleHelloMsg(iobuf, msgLen); + } + char buf[3]; + if (iobuf->CopyOut(buf, 3) == 3 && + buf[0] == 'O' && buf[1] == 'K' && (buf[2] & 0xFF) <= ' ') { + return HandleReply(iobuf, msgLen); + } + return HandleCmd(iobuf, msgLen); +} + +void +ChunkServer::ShowLines(MsgLogger::LogLevel logLevel, const string& prefix, + IOBuffer& iobuf, int len, int linesToShow /* = 64 */) +{ + istream& is = mIStream.Set(iobuf, len); + int maxLines = linesToShow; + string line; + while (--maxLines >= 0 && getline(is, line)) { + string::iterator last = line.end(); + if (last != line.begin() && *--last == '\r') { + line.erase(last); + } + KFS_LOG_STREAM(logLevel) << + prefix << line << + KFS_LOG_EOM; + } + mIStream.Reset(); +} + +MetaRequest* +ChunkServer::GetOp(IOBuffer& iobuf, int msgLen, const char* errMsgPrefix) +{ + MetaRequest* op = 0; + if (ParseCommand(iobuf, msgLen, &op) >= 0) { + op->setChunkServer(shared_from_this()); + return op; + } + ShowLines(MsgLogger::kLogLevelERROR, + ServerID() + "/" + GetPeerName() + " " + + (errMsgPrefix ? errMsgPrefix : "") + ": ", + iobuf, + msgLen + ); + iobuf.Consume(msgLen); + return 0; +} + +class HexChunkInfoParser +{ +public: + typedef MetaHello::ChunkInfo ChunkInfo; + + HexChunkInfoParser(const IOBuffer& buf) + : mIt(buf), + mCur(), + mVal(0), + mPrevSpaceFlag(true), + mFieldIdx(0) + {} + const ChunkInfo* Next() + { + const unsigned char kInvalHex = 0xFF; + // All ids are expected to be positive. + const unsigned char* p; + while ((p = reinterpret_cast( + mIt.Next()))) { + if (*p <= ' ') { + if (mPrevSpaceFlag) { + continue; + } + mPrevSpaceFlag = true; + switch (mFieldIdx) { + case 0: mCur.allocFileId = mVal; break; + case 1: mCur.chunkId = mVal; break; + case 2: mCur.chunkVersion = mVal; + mFieldIdx = 0; + mVal = 0; + return &mCur; + default: assert(! 
"invald field index"); + return 0; + } + mFieldIdx++; + mVal = 0; + continue; + } + mPrevSpaceFlag = false; + const unsigned char h = sC2HexTable[*p]; + if (h == kInvalHex) { + return 0; + } + mVal = (mVal << 4) | h; + } + if (mFieldIdx == 2 && ! mPrevSpaceFlag) { + mCur.chunkVersion = mVal; + mFieldIdx = 0; + mVal = 0; + return &mCur; + } + return 0; + } + bool IsError() const { return (mFieldIdx != 0); } +private: + IOBuffer::ByteIterator mIt; + ChunkInfo mCur; + int64_t mVal; + bool mPrevSpaceFlag; + int mFieldIdx; + + static const unsigned char* const sC2HexTable; +}; +const unsigned char* const HexChunkInfoParser::sC2HexTable = char2HexTable(); + +/// Case #1: Handle Hello message from a chunkserver that +/// just connected to us. +int +ChunkServer::HandleHelloMsg(IOBuffer* iobuf, int msgLen) +{ + assert(!mHelloDone); + + const bool hasHelloOpFlag = mHelloOp != 0; + if (! hasHelloOpFlag) { + MetaRequest * const op = GetOp(*iobuf, msgLen, "invalid hello"); + if (! op) { + return -1; + } + if (op->op != META_HELLO) { + ShowLines(MsgLogger::kLogLevelERROR, + GetPeerName() + " ", + *iobuf, msgLen); + } else { + ShowLines(MsgLogger::kLogLevelINFO, + "new: " + GetPeerName() + " ", + *iobuf, msgLen); + } + iobuf->Consume(msgLen); + // We should only get a HELLO message here; anything else is + // invalid. + if (op->op != META_HELLO) { + KFS_LOG_STREAM_ERROR << GetPeerName() << + " unexpected request, expected hello" << + KFS_LOG_EOM; + delete op; + return -1; + } + mHelloOp = static_cast(op); + if (! gLayoutManager.Validate(*mHelloOp)) { + KFS_LOG_STREAM_ERROR << GetPeerName() << + " hello" + " location: " << mHelloOp->ToString() << + " error: " << op->status << + " " << op->statusMsg << + KFS_LOG_EOM; + if (mHelloOp->status != -EBADCLUSTERKEY) { + mHelloOp = 0; + delete op; + return -1; + } + if (! 
sRestartCSOnInvalidClusterKeyFlag) { + iobuf->Clear(); + mNetConnection->SetMaxReadAhead(0); + mNetConnection->SetInactivityTimeout( + sRequestTimeout); + mOstream.Set(mNetConnection->GetOutBuffer()); + mHelloOp->response(mOstream); + mOstream.Reset(); + delete mHelloOp; + mHelloOp = 0; + // Do not declare error, hello reply still + // pending. + return 0; + } + } + if (mHelloOp->status == 0 && + sMaxHelloBufferBytes < mHelloOp->contentLength) { + KFS_LOG_STREAM_ERROR << GetPeerName() << + " hello content length: " << + mHelloOp->contentLength << + " exceeds max. allowed: " << + sMaxHelloBufferBytes << + KFS_LOG_EOM; + mHelloOp = 0; + delete op; + return -1; + } + if (mHelloOp->status == 0 && + mHelloOp->contentLength > 0 && + iobuf->BytesConsumable() < + mHelloOp->contentLength && + GetHelloBytes(mHelloOp) < 0) { + KFS_LOG_STREAM_INFO << GetPeerName() << + " hello content length: " << + mHelloOp->contentLength << + " adding to pending list" + " ops: " << sPendingHelloCount << + " bytes:" + " committed: " << sHelloBytesCommitted << + " received: " << sHelloBytesInFlight << + KFS_LOG_EOM; + mNetConnection->SetMaxReadAhead(0); + AddToPendingHelloList(); + return 1; + } else { + // Hello didn't go through the buffer commit process, + // mark it such that PutHelloBytes() if invoked has no + // effect. + mHelloOp->bytesReceived = -1; + } + } + // make sure we have the chunk ids... + if (mHelloOp->contentLength > 0) { + const int nAvail = iobuf->BytesConsumable(); + if (nAvail < mHelloOp->contentLength) { + // need to wait for data... + if (mHelloOp->status != 0) { + mHelloOp->contentLength -= nAvail; + iobuf->Clear(); // Discard content. + } else if (nAvail > mHelloOp->bytesReceived) { + sHelloBytesInFlight += + nAvail - mHelloOp->bytesReceived; + mHelloOp->bytesReceived = nAvail; + } + mNetConnection->SetMaxReadAhead(max(kMaxReadAhead, + mHelloOp->status == 0 ? 
+ mHelloOp->contentLength - nAvail : + kMaxRequestResponseHeader + )); + return 1; + } + const int contentLength = mHelloOp->contentLength; + if (hasHelloOpFlag && mHelloOp->status == 0) { + // Emit log message to have time stamp of when hello is + // fully received, and parsing of chunk lists starts. + KFS_LOG_STREAM_INFO << GetPeerName() << + " receiving hello: " << + contentLength << + " bytes done" << + KFS_LOG_EOM; + PutHelloBytes(mHelloOp); + } + mHelloOp->chunks.clear(); + mHelloOp->notStableChunks.clear(); + mHelloOp->notStableAppendChunks.clear(); + if (mHelloOp->status == 0) { + const size_t numStable(max(0, mHelloOp->numChunks)); + mHelloOp->chunks.reserve(mHelloOp->numChunks); + const size_t nonStableAppendNum( + max(0, mHelloOp->numNotStableAppendChunks)); + mHelloOp->notStableAppendChunks.reserve(nonStableAppendNum); + const size_t nonStableNum(max(0, mHelloOp->numNotStableChunks)); + mHelloOp->notStableChunks.reserve(nonStableNum); + // get the chunkids + istream& is = mIStream.Set(iobuf, contentLength); + HexChunkInfoParser hexParser(*iobuf); + for (int j = 0; j < 3; ++j) { + MetaHello::ChunkInfos& chunks = j == 0 ? + mHelloOp->chunks : (j == 1 ? + mHelloOp->notStableAppendChunks : + mHelloOp->notStableChunks); + int i = j == 0 ? + mHelloOp->numChunks : (j == 1 ? + mHelloOp->numNotStableAppendChunks : + mHelloOp->numNotStableChunks); + if (mHelloOp->contentIntBase == 16) { + const MetaHello::ChunkInfo* c; + while (i-- > 0 && (c = hexParser.Next())) { + chunks.push_back(*c); + } + } else { + MetaHello::ChunkInfo c; + while (i-- > 0) { + if (! 
(is >> c.allocFileId + >> c.chunkId + >> c.chunkVersion)) { + break; + } + chunks.push_back(c); + } + } + } + mIStream.Reset(); + iobuf->Consume(contentLength); + if (mHelloOp->chunks.size() != numStable || + mHelloOp->notStableAppendChunks.size() != + nonStableAppendNum || + mHelloOp->notStableChunks.size() != + nonStableNum) { + KFS_LOG_STREAM_ERROR << GetPeerName() << + " invalid or short chunk list:" + " expected: " << mHelloOp->numChunks << + "/" << mHelloOp->numNotStableAppendChunks << + "/" << mHelloOp->numNotStableChunks << + " got: " << mHelloOp->chunks.size() << + "/" << + mHelloOp->notStableAppendChunks.size() << + "/" << mHelloOp->notStableChunks.size() << + " last good chunk: " << + (mHelloOp->chunks.empty() ? -1 : + mHelloOp->chunks.back().chunkId) << + "/" << (mHelloOp->notStableAppendChunks.empty() ? -1 : + mHelloOp->notStableAppendChunks.back().chunkId) << + "/" << (mHelloOp->notStableChunks.empty() ? -1 : + mHelloOp->notStableChunks.back().chunkId) << + " content length: " << contentLength << + KFS_LOG_EOM; + delete mHelloOp; + mHelloOp = 0; + return -1; + } + } + } + if (mHelloOp->status != 0) { + iobuf->Clear(); + if (! mNetConnection) { + delete mHelloOp; + mHelloOp = 0; + return -1; + } + KFS_LOG_STREAM_INFO << + mHelloOp->ToString() << + " " << mHelloOp->Show() << + " status: " << mHelloOp->status << + " msg: " << mHelloOp->statusMsg << + " initiating chunk server restart" << + KFS_LOG_EOM; + // Tell him hello is OK in order to make the restart + // work. 
+ mHelloOp->status = 0; + IOBuffer& ioBuf = mNetConnection->GetOutBuffer(); + mOstream.Set(ioBuf); + mHelloOp->response(mOstream); + if (gLayoutManager.IsRetireOnCSRestart()) { + MetaChunkRetire retire( + NextSeq(), shared_from_this()); + ChunkServerRequest(retire, mOstream, ioBuf); + } else { + MetaChunkServerRestart restart( + NextSeq(), shared_from_this()); + ChunkServerRequest(restart, mOstream, ioBuf); + } + mOstream.Reset(); + mNetConnection->SetMaxReadAhead(0); + mNetConnection->SetInactivityTimeout(sRequestTimeout); + delete mHelloOp; + mHelloOp = 0; + // Do not declare error, outbound data still pending. + // Create response and set timeout, the chunk server + // should disconnect when it restarts. + return 0; + } + + mNetConnection->SetMaxReadAhead(kMaxReadAhead); + // Hello done. + mHelloOp->peerName = GetPeerName(); + mHelloOp->clnt = this; + mHelloOp->server = shared_from_this(); + mHelloDone = true; + mLastHeard = TimeNow(); + mUptime = mHelloOp->uptime; + mNumAppendsWithWid = mHelloOp->numAppendsWithWid; + mStaleChunksHexFormatFlag = mHelloOp->staleChunksHexFormatFlag; + UpdateChunkWritesPerDrive((int)(mHelloOp->notStableAppendChunks.size() + + mHelloOp->notStableChunks.size()), mNumWritableDrives); + mLastHeartbeatSent = mLastHeard; + mHeartbeatSent = true; + Enqueue(new MetaChunkHeartbeat(NextSeq(), shared_from_this(), + IsRetiring() ? int64_t(1) : + (int64_t)mChunksToEvacuate.Size()), + 2 * sHeartbeatTimeout); + // Emit message to time parse. + KFS_LOG_STREAM_INFO << GetPeerName() << + " submit hello" << + KFS_LOG_EOM; + MetaRequest* const op = mHelloOp; + mHelloOp = 0; + submit_request(op); + return 0; +} + +/// +/// Case #2: Handle an RPC from a chunkserver. +/// +int +ChunkServer::HandleCmd(IOBuffer *iobuf, int msgLen) +{ + assert(mHelloDone); + + MetaRequest * const op = GetOp(*iobuf, msgLen, "invalid request"); + if (! op) { + return -1; + } + // Message is ready to be pushed down. So remove it. 
+ iobuf->Consume(msgLen); + op->clnt = this; + submit_request(op); + return 0; +} + +void +ChunkServer::UpdateSpace(MetaChunkEvacuate& op) +{ + if (op.totalSpace >= 0) { + mTotalSpace = op.totalSpace; + } + if (op.totalFsSpace >= 0) { + mTotalFsSpace = op.totalFsSpace; + } + if (op.usedSpace >= 0) { + mUsedSpace = op.usedSpace; + } + if (op.numWritableDrives >= 0) { + UpdateChunkWritesPerDrive( + mNumChunkWrites, op.numWritableDrives); + } + if (op.numDrives >= 0) { + mNumDrives = op.numDrives; + UpdateChunkWritesPerDrive( + mNumChunkWrites, min(mNumWritableDrives, mNumDrives)); + } + if (op.numEvacuateInFlight == 0) { + mChunksToEvacuate.Clear(); + mEvacuateCnt = 0; + } else if (op.numEvacuateInFlight > 0) { + mEvacuateInFlight = op.numEvacuateInFlight; + } +} + +/// +/// Case #3: Handle a reply from a chunkserver to an RPC we +/// previously sent. +/// +int +ChunkServer::HandleReply(IOBuffer* iobuf, int msgLen) +{ + assert(mHelloDone); + + // We got a response for a command we previously + // sent. So, match the response to its request and + // resume request processing. + Properties prop; + const bool ok = ParseResponse(mIStream.Set(iobuf, msgLen), prop); + mIStream.Reset(); + if (! ok) { + return -1; + } + // Message is ready to be pushed down. So remove it. + iobuf->Consume(msgLen); + + const seq_t cseq = prop.getValue("Cseq", (seq_t) -1); + MetaChunkRequest* const op = FindMatchingRequest(cseq); + if (! op) { + // Most likely op was timed out, or chunk server sent response + // after re-connect. 
+ KFS_LOG_STREAM_INFO << ServerID() << + " unable to find command for response cseq: " << cseq << + KFS_LOG_EOM; + return 0; + } + + mLastHeard = TimeNow(); + op->statusMsg = prop.getValue("Status-message", ""); + op->status = prop.getValue("Status", -1); + op->handleReply(prop); + if (op->op == META_CHUNK_HEARTBEAT) { + mTotalSpace = prop.getValue("Total-space", int64_t(0)); + mTotalFsSpace = prop.getValue("Total-fs-space", int64_t(-1)); + mUsedSpace = prop.getValue("Used-space", int64_t(0)); + mNumChunks = prop.getValue("Num-chunks", 0); + mNumDrives = prop.getValue("Num-drives", 0); + mUptime = prop.getValue("Uptime", int64_t(0)); + mLostChunks = prop.getValue("Chunk-lost", int64_t(0)); + mNumCorruptChunks = max(mNumCorruptChunks, + prop.getValue("Chunk-corrupted", int64_t(0))); + mNumAppendsWithWid = prop.getValue("Num-appends-with-wids", int64_t(0)); + mEvacuateCnt = prop.getValue("Evacuate", int64_t(-1)); + mEvacuateBytes = prop.getValue("Evacuate-bytes", int64_t(-1)); + mEvacuateDoneCnt = prop.getValue("Evacuate-done", int64_t(-1)); + mEvacuateDoneBytes = prop.getValue("Evacuate-done-bytes", int64_t(-1)); + mEvacuateInFlight = prop.getValue("Evacuate-in-flight", int64_t(-1)); + UpdateChunkWritesPerDrive( + max(0, prop.getValue("Num-writable-chunks", 0)), + prop.getValue("Num-wr-drives", mNumDrives) + ); + if (mEvacuateInFlight == 0) { + mChunksToEvacuate.Clear(); + } + const time_t now = TimeNow(); + if (mEvacuateCnt > 0 && mEvacuateLastRateUpdateTime + + sEvacuateRateUpdateInterval < now) { + const time_t delta = now - mEvacuateLastRateUpdateTime; + if (delta > 0 && delta <= + 2 * sEvacuateRateUpdateInterval + + 3 * sHeartbeatInterval) { + mEvacuateCntRate = max(int64_t(0), + mEvacuateDoneCnt - mPrevEvacuateDoneCnt + ) / double(delta); + mEvacuateByteRate = max(int64_t(0), + mEvacuateDoneBytes - mPrevEvacuateDoneBytes + ) / double(delta); + } else { + mEvacuateCntRate = 0.; + mEvacuateByteRate = 0.; + } + mEvacuateLastRateUpdateTime = now; + 
mPrevEvacuateDoneCnt = mEvacuateDoneCnt; + mPrevEvacuateDoneBytes = mEvacuateDoneBytes; + } + const int64_t srvLoad = + prop.getValue(sSrvLoadPropName, int64_t(0)); + int64_t loadAvg; + if (sSrvLoadSamplerSampleCount > 0) { + if (mSrvLoadSampler.GetMaxSamples() != + sSrvLoadSamplerSampleCount) { + mSrvLoadSampler.SetMaxSamples( + sSrvLoadSamplerSampleCount, srvLoad, + now); + } else { + mSrvLoadSampler.Put(srvLoad, now); + } + loadAvg = mSrvLoadSampler.GetLastFirstDiffByTime(); + } else { + loadAvg = srvLoad; + } + if (loadAvg < 0) { + KFS_LOG_STREAM_INFO << + " load average: " << loadAvg << + " resetting sampler" << + KFS_LOG_EOM; + if (sSrvLoadSamplerSampleCount > 0) { + mSrvLoadSampler.Reset(loadAvg, now); + } + loadAvg = 0; + } + mAllocSpace = mUsedSpace + mNumChunkWrites * CHUNKSIZE; + mHeartbeatSent = false; + mHeartbeatSkipped = + mLastHeartbeatSent + sHeartbeatInterval < now; + mHeartbeatProperties.swap(prop); + if (mTotalFsSpace < mTotalSpace) { + mTotalFsSpace = mTotalSpace; + } + const int64_t delta = loadAvg - mLoadAvg; + mLoadAvg = loadAvg; + gLayoutManager.UpdateSrvLoadAvg(*this, delta); + if (sHeartbeatLogInterval > 0 && + mLastHeartBeatLoggedTime + + sHeartbeatLogInterval <= mLastHeard) { + mLastHeartBeatLoggedTime = mLastHeard; + string hbp; + mHeartbeatProperties.getList(hbp, " ", ""); + KFS_LOG_STREAM_INFO << + "===chunk=server: " << mLocation.hostname << + ":" << mLocation.port << + " responsive=" << IsResponsiveServer() << + " retiring=" << IsRetiring() << + " restarting=" << IsRestartScheduled() << + hbp << + KFS_LOG_EOM; + } + } + op->resume(); + return 0; +} + +/// +/// The response sent by a chunkserver is of the form: +/// OK \r\n +/// Cseq: \r\n +/// Status: \r\n +/// {\r\n}*\r\n +/// +/// @param[in] buf Buffer containing the response +/// @param[in] bufLen length of buf +/// @param[out] prop Properties object with the response header/values +/// +bool +ChunkServer::ParseResponse(istream& is, Properties &prop) +{ + string token; + is >> 
token; + // Response better start with OK + if (token.compare("OK") != 0) { + int maxLines = 32; + do { + KFS_LOG_STREAM_ERROR << ServerID() << + " bad response header: " << token << + KFS_LOG_EOM; + } while (--maxLines > 0 && getline(is, token)); + return false; + } + const char separator = ':'; + prop.loadProperties(is, separator, false); + return true; +} + +/// +/// Request/responses are matched based on sequence #'s. +/// +MetaChunkRequest* +ChunkServer::FindMatchingRequest(seq_t cseq) +{ + DispatchedReqs::iterator const it = mDispatchedReqs.find(cseq); + if (it == mDispatchedReqs.end()) { + return 0; + } + MetaChunkRequest* const op = it->second.first->second; + mReqsTimeoutQueue.erase(it->second.first); + sChunkOpsInFlight.erase(it->second.second); + mDispatchedReqs.erase(it); + return op; +} + +int +ChunkServer::TimeSinceLastHeartbeat() const +{ + return (TimeNow() - mLastHeartbeatSent); +} + +/// +/// Queue an RPC request +/// +void +ChunkServer::Enqueue(MetaChunkRequest* r, int timeout /* = -1 */) +{ + if (r->submitCount++ == 0) { + r->submitTime = microseconds(); + } + if (mDown || ! mNetConnection || ! mNetConnection->IsGood()) { + r->status = -EIO; + r->resume(); + return; + } + if (! mDispatchedReqs.insert(make_pair( + r->opSeqno, + make_pair( + mReqsTimeoutQueue.insert(make_pair( + TimeNow() + (timeout < 0 ? 
+ sRequestTimeout : timeout), + r + )), + sChunkOpsInFlight.insert(make_pair( + r->chunkId, + r + )) + ))).second) { + panic("duplicate op sequence number", false); + } + if (r->op == META_CHUNK_REPLICATE) { + KFS_LOG_STREAM_INFO << r->Show() << KFS_LOG_EOM; + } + EnqueueSelf(r); +} + +void +ChunkServer::EnqueueSelf(MetaChunkRequest* r) +{ + IOBuffer& buf = mNetConnection->GetOutBuffer(); + ChunkServerRequest(*r, mOstream.Set(buf), buf); + mOstream.Reset(); + if (mRecursionCount <= 0) { + mNetConnection->StartFlush(); + } +} + +int +ChunkServer::AllocateChunk(MetaAllocate *r, int64_t leaseId) +{ + mAllocSpace += CHUNKSIZE; + UpdateChunkWritesPerDrive(mNumChunkWrites + 1, mNumWritableDrives); + Enqueue(new MetaChunkAllocate( + NextSeq(), r, shared_from_this(), leaseId), + r->initialChunkVersion >= 0 ? + sChunkReallocTimeout : sChunkAllocTimeout); + return 0; +} + +int +ChunkServer::DeleteChunk(chunkId_t chunkId) +{ + mAllocSpace = max((int64_t)0, mAllocSpace - (int64_t)CHUNKSIZE); + mChunksToEvacuate.Erase(chunkId); + Enqueue(new MetaChunkDelete(NextSeq(), shared_from_this(), chunkId)); + return 0; +} + +int +ChunkServer::GetChunkSize(fid_t fid, chunkId_t chunkId, seq_t chunkVersion, + const string &pathname, bool retryFlag) +{ + Enqueue(new MetaChunkSize(NextSeq(), shared_from_this(), + fid, chunkId, chunkVersion, pathname, retryFlag)); + return 0; +} + +int +ChunkServer::BeginMakeChunkStable(fid_t fid, chunkId_t chunkId, seq_t chunkVersion) +{ + Enqueue(new MetaBeginMakeChunkStable( + NextSeq(), shared_from_this(), + mLocation, fid, chunkId, chunkVersion + ), sMakeStableTimeout); + return 0; +} + +int +ChunkServer::MakeChunkStable(fid_t fid, chunkId_t chunkId, seq_t chunkVersion, + chunkOff_t chunkSize, bool hasChunkChecksum, uint32_t chunkChecksum, + bool addPending) +{ + Enqueue(new MetaChunkMakeStable( + NextSeq(), shared_from_this(), + fid, chunkId, chunkVersion, + chunkSize, hasChunkChecksum, chunkChecksum, addPending + ), sMakeStableTimeout); + return 0; +} 
+ +int +ChunkServer::ReplicateChunk(fid_t fid, chunkId_t chunkId, + const ChunkServerPtr& dataServer, const ChunkRecoveryInfo& recoveryInfo) +{ + MetaChunkReplicate* const r = new MetaChunkReplicate( + NextSeq(), shared_from_this(), fid, chunkId, + dataServer->GetServerLocation(), dataServer); + if (recoveryInfo.HasRecovery() && r->server == dataServer) { + r->chunkVersion = recoveryInfo.version; + r->chunkOffset = recoveryInfo.offset; + r->striperType = recoveryInfo.striperType; + r->numStripes = recoveryInfo.numStripes; + r->numRecoveryStripes = recoveryInfo.numRecoveryStripes; + r->stripeSize = recoveryInfo.stripeSize; + r->fileSize = recoveryInfo.fileSize; + r->dataServer.reset(); + r->srcLocation.hostname.clear(); + r->srcLocation.port = sMetaClientPort; + } + mNumChunkWriteReplications++; + UpdateChunkWritesPerDrive(mNumChunkWrites + 1, mNumWritableDrives); + mAllocSpace += CHUNKSIZE; + Enqueue(r, sReplicationTimeout); + return 0; +} + +void +ChunkServer::NotifyStaleChunks(ChunkIdQueue& staleChunkIds, + bool evacuatedFlag, bool clearStaleChunksFlag) +{ + mAllocSpace = max(int64_t(0), + mAllocSpace - (int64_t)(CHUNKSIZE * staleChunkIds.GetSize())); + MetaChunkStaleNotify * const r = new MetaChunkStaleNotify( + NextSeq(), shared_from_this(), evacuatedFlag, + mStaleChunksHexFormatFlag); + if (clearStaleChunksFlag) { + r->staleChunkIds.Swap(staleChunkIds); + } else { + r->staleChunkIds = staleChunkIds; + } + ChunkIdQueue::ConstIterator it(r->staleChunkIds); + const chunkId_t* id; + while ((id = it.Next())) { + mChunksToEvacuate.Erase(*id); + } + Enqueue(r); + +} + +void +ChunkServer::NotifyStaleChunk(chunkId_t staleChunkId, bool evacuatedFlag) +{ + mAllocSpace = max((int64_t)0, mAllocSpace - (int64_t)CHUNKSIZE); + MetaChunkStaleNotify * const r = new MetaChunkStaleNotify( + NextSeq(), shared_from_this(), evacuatedFlag, + mStaleChunksHexFormatFlag); + r->staleChunkIds.PushBack(staleChunkId); + mChunksToEvacuate.Erase(staleChunkId); + Enqueue(r); +} + +void 
+ChunkServer::NotifyChunkVersChange(fid_t fid, chunkId_t chunkId, seq_t chunkVers, + seq_t fromVersion, bool makeStableFlag, bool pendingAddFlag, + MetaChunkReplicate* replicate) +{ + Enqueue(new MetaChunkVersChange( + NextSeq(), shared_from_this(), fid, chunkId, chunkVers, + fromVersion, makeStableFlag, pendingAddFlag, replicate), + sMakeStableTimeout); +} + +void +ChunkServer::SetRetiring() +{ + mIsRetiring = true; + mRetireStartTime = TimeNow(); + mChunksToEvacuate.Clear(); + KFS_LOG_STREAM_INFO << ServerID() << + " initiation of retire for " << mNumChunks << " chunks" << + KFS_LOG_EOM; +} + +void +ChunkServer::Retire() +{ + Enqueue(new MetaChunkRetire(NextSeq(), shared_from_this())); +} + +void +ChunkServer::SetProperties(const Properties& props) +{ + Enqueue(new MetaChunkSetProperties(NextSeq(), shared_from_this(), props)); +} + +void +ChunkServer::Restart(bool justExitFlag) +{ + mRestartQueuedFlag = true; + if (justExitFlag) { + Enqueue(new MetaChunkRetire(NextSeq(), shared_from_this())); + return; + } + Enqueue(new MetaChunkServerRestart(NextSeq(), shared_from_this())); +} + +int +ChunkServer::Heartbeat() +{ + if (! mHelloDone || mDown) { + return -1; + } + assert(mNetConnection); + const time_t now = TimeNow(); + const int timeSinceSent = (int)(now - mLastHeartbeatSent); + if (mHeartbeatSent) { + if (sHeartbeatTimeout >= 0 && + timeSinceSent >= sHeartbeatTimeout) { + ostringstream os; + os << "heartbeat timed out, sent: " << + timeSinceSent << " sec. ago"; + const string str = os.str(); + Error(str.c_str()); + return -1; + } + // If a request is outstanding, don't send one more + if (! mHeartbeatSkipped && + mLastHeartbeatSent + sHeartbeatInterval < now) { + mHeartbeatSkipped = true; + KFS_LOG_STREAM_INFO << ServerID() << + " skipping heartbeat send," + " last sent " << timeSinceSent << " sec. ago" << + KFS_LOG_EOM; + } + return(sHeartbeatTimeout < 0 ? 
+ sHeartbeatTimeout : sHeartbeatTimeout - timeSinceSent); + } + if (timeSinceSent >= sHeartbeatInterval) { + KFS_LOG_STREAM_DEBUG << ServerID() << + " sending heartbeat," + " last sent " << timeSinceSent << " sec. ago" << + KFS_LOG_EOM; + mHeartbeatSent = true; + mLastHeartbeatSent = now; + Enqueue(new MetaChunkHeartbeat(NextSeq(), shared_from_this(), + IsRetiring() ? int64_t(1) : + (int64_t)mChunksToEvacuate.Size()), + 2 * sHeartbeatTimeout); + return ((sHeartbeatTimeout >= 0 && + sHeartbeatTimeout < sHeartbeatInterval) ? + sHeartbeatTimeout : sHeartbeatInterval + ); + } + return (sHeartbeatInterval - timeSinceSent); +} + +int +ChunkServer::TimeoutOps() +{ + if (! mHelloDone || mDown) { + return -1; + } + time_t const now = TimeNow(); + ReqsTimeoutQueue::iterator const end = + mReqsTimeoutQueue.lower_bound(now); + ReqsTimeoutQueue timedOut; + for (ReqsTimeoutQueue::iterator it = mReqsTimeoutQueue.begin(); + it != end; + ) { + assert(it->second); + DispatchedReqs::iterator const dri = + mDispatchedReqs.find(it->second->opSeqno); + if (dri == mDispatchedReqs.end()) { + panic("invalid timeout queue entry", false); + } + sChunkOpsInFlight.erase(dri->second.second); + mDispatchedReqs.erase(dri); + timedOut.insert(*it); + mReqsTimeoutQueue.erase(it++); + } + for (ReqsTimeoutQueue::iterator it = timedOut.begin(); + it != timedOut.end(); + ++it) { + KFS_LOG_STREAM_INFO << ServerID() << + " request timed out" + " expired: " << (now - it->first) << + " in flight: " << mDispatchedReqs.size() << + " total: " << sChunkOpsInFlight.size() << + " " << it->second->Show() << + KFS_LOG_EOM; + it->second->status = -EIO; + it->second->resume(); + } + return (mReqsTimeoutQueue.empty() ? + -1 : int(mReqsTimeoutQueue.begin()->first - now + 1)); +} + +void +ChunkServer::FailDispatchedOps() +{ + DispatchedReqs reqs; + ReqsTimeoutQueue reqTimeouts; + mReqsTimeoutQueue.swap(reqTimeouts); + mDispatchedReqs.swap(reqs); + // Get all ops out of the in flight global queue first. 
+ for (DispatchedReqs::iterator it = reqs.begin(); + it != reqs.end(); + ++it) { + sChunkOpsInFlight.erase(it->second.second); + } + // Fail in the same order as these were queued. + for (DispatchedReqs::iterator it = reqs.begin(); + it != reqs.end(); + ++it) { + MetaChunkRequest* const op = it->second.first->second; + op->status = -EIO; + op->resume(); + } +} + +void +ChunkServer::GetRetiringStatus(ostream &os) +{ + if (! mIsRetiring) { + return; + } + os << + "s=" << mLocation.hostname << + ", p=" << mLocation.port << + ", started=" << DisplayDateTime( + int64_t(mRetireStartTime) * 1000 * 1000) << + ", numLeft=" << GetChunkCount() << + ", numDone=" << (mNumChunks - GetChunkCount()) << + "\t"; +} + +void +ChunkServer::GetEvacuateStatus(ostream &os) +{ + if (mIsRetiring || mEvacuateCnt <= 0) { + return; + } + os << + "s=" << mLocation.hostname << + ", p=" << mLocation.port << + ", c=" << mEvacuateCnt << + ", b=" << mEvacuateBytes << + ", cDone=" << mEvacuateDoneCnt << + ", bDone=" << mEvacuateDoneBytes << + ", cFlight=" << mEvacuateInFlight << + ", cPend=" << mChunksToEvacuate.Size() << + ", cSec=" << mEvacuateCntRate << + ", bSec=" << mEvacuateByteRate << + ", eta=" << (mEvacuateByteRate > 0 ? 
+ mEvacuateBytes / mEvacuateByteRate : double(0)) << + "\t"; +} + +void +ChunkServer::Ping(ostream& os, bool useTotalFsSpaceFlag) const +{ + // for nodes taken out of write allocation, send the info back; this allows + // the UI to color these nodes differently + const double utilisation = GetSpaceUtilization(useTotalFsSpaceFlag); + const bool isOverloaded = utilisation > + gLayoutManager.GetMaxSpaceUtilizationThreshold(); + const time_t now = TimeNow(); + const int64_t freeSpace = max(int64_t(0), mTotalSpace - mUsedSpace); + os << "s=" << mLocation.hostname << ", p=" << mLocation.port + << ", rack=" << mRackId + << ", used=" << mUsedSpace + << ", free=" << freeSpace + << ", total=" << mTotalFsSpace + << ", util=" << utilisation * 100.0 + << ", nblocks=" << mNumChunks + << ", lastheard=" << (now - mLastHeard) + << ", ncorrupt=" << mNumCorruptChunks + << ", nchunksToMove=" << mChunksToMove.Size() + << ", numDrives=" << mNumDrives + << ", numWritableDrives=" << + ((mIsRetiring || isOverloaded) ? 0 : mNumWritableDrives) + << ", overloaded=" << (isOverloaded ? 1 : 0) + << ", numReplications=" << GetNumChunkReplications() + << ", numReadReplications=" << GetReplicationReadLoad() + << ", good=" << (GetCanBeCandidateServerFlag() ? 1 : 0) + << ", nevacuate=" << mEvacuateCnt + << ", bytesevacuate=" << mEvacuateBytes + << ", nlost=" << mLostChunks + << ", lostChunkDirs=" + ; + LostChunkDirs::const_iterator it = mLostChunkDirs.begin(); + if (it != mLostChunkDirs.end()) { + for (; ;) { + os << *it; + if (++it == mLostChunkDirs.end()) { + break; + } + os << ";"; + } + } + os << "\t"; +} + +void +ChunkServer::SendResponse(MetaRequest* op) +{ + if (! 
mNetConnection) { + return; + } + op->response(mOstream.Set(mNetConnection->GetOutBuffer())); + mOstream.Reset(); + if (mRecursionCount <= 0) { + mNetConnection->StartFlush(); + } +} + +bool +ChunkServer::ScheduleRestart( + int64_t gracefulRestartTimeout, int64_t gracefulRestartAppendWithWidTimeout) +{ + if (mDown) { + return true; + } + if (! mRestartScheduledFlag) { + mRestartScheduledTime = TimeNow(); + mRestartScheduledFlag = true; + } + if ((mNumChunkWrites <= 0 && + mNumAppendsWithWid <= 0 && + mDispatchedReqs.empty()) || + mRestartScheduledTime + + (mNumAppendsWithWid <= 0 ? + gracefulRestartTimeout : + max(gracefulRestartTimeout, + gracefulRestartAppendWithWidTimeout)) + < TimeNow()) { + mDownReason = "restarting"; + Error("reconnect before restart"); + return true; + } + return false; +} + +/* static */ string +ChunkServer::Escape(const string& str) +{ + const char* const kHexChars = "0123456789ABCDEF"; + string ret; + const char* p = str.c_str(); + for (; ;) { + const int c = *p++ & 0xFF; + if (c == 0) { + break; + } + // For now do not escape '/' to make file names more readable. + if (c <= ' ' || c >= 0xFF || strchr("!*'();:@&=+$,?#[]", c)) { + ret.push_back('%'); + ret.push_back(kHexChars[(c >> 4) & 0xF]); + ret.push_back(kHexChars[c & 0xF]); + } else { + ret.push_back(c); + } + } + return ret; +} + +} // namespace KFS diff --git a/src/cc/meta/ChunkServer.h b/src/cc/meta/ChunkServer.h new file mode 100644 index 000000000..494129036 --- /dev/null +++ b/src/cc/meta/ChunkServer.h @@ -0,0 +1,897 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/05 +// Author: Sriram Rao, Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+//
+// \file ChunkServer.h
+// \brief Object that handles the communication with an individual
+// chunk server. Model here is the following:
+//  - For write-allocation, layout manager asks the ChunkServer object
+//    to send an RPC to the chunk server.
+//  - The ChunkServer object sends the RPC and holds on to the request
+//    that triggered the RPC.
+//  - Eventually, when the RPC reply is received, the request is
+//    re-activated (alongwith the response) and is sent back down the pike.
+//
+//----------------------------------------------------------------------------
+
+#ifndef META_CHUNKSERVER_H
+#define META_CHUNKSERVER_H
+
+
+#include "common/LinearHash.h"
+#include "kfsio/KfsCallbackObj.h"
+#include "kfsio/NetConnection.h"
+#include "qcdio/QCDLList.h"
+#include "common/kfstypes.h"
+#include "common/Properties.h"
+#include "common/ValueSampler.h"
+#include "common/StdAllocator.h"
+#include "common/MsgLogger.h"
+#include "MetaRequest.h"
+
+#include <string>
+#include <ostream>
+#include <istream>
+#include <map>
+
+#include <boost/enable_shared_from_this.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include <time.h>
+
+namespace KFS
+{
+using std::string;
+using std::ostream;
+using std::istream;
+using std::map;
+using std::multimap;
+using std::pair;
+using std::less;
+using std::set;
+
+/// Chunk server connects to the meta server, sends a HELLO
+/// message to configure its state with the meta server, and
+/// from then onwards, the meta server then drives the RPCs.
+/// Types of messages:
+/// Meta server --> Chunk server: Allocate, Free, Heartbeat
+///
+struct ChunkRecoveryInfo;
+struct MetaHello;
+
+class CSMapServerInfo
+{
+public:
+    CSMapServerInfo()
+        : mIndex(-1),
+          mChunkCount(0),
+          mSet(0)
+        {}
+    ~CSMapServerInfo() {
+        delete mSet;
+    }
+    int GetIndex() const { return mIndex; }
+    size_t GetChunkCount() const { return mChunkCount; }
+private:
+    int    mIndex;
+    size_t mChunkCount;
+
+    void AddHosted() {
+        mChunkCount++;
+        assert(mChunkCount > 0);
+    }
+    void RemoveHosted() {
+        if (mChunkCount <= 0) {
+            panic("no hosted chunks", false);
+            return;
+        }
+        mChunkCount--;
+    }
+    void ClearHosted() {
+        mChunkCount = 0;
+        if (mSet) {
+            mSet->Clear();
+        }
+    }
+    void SetIndex(int idx, bool debugTrackChunkIdFlag) {
+        mIndex = idx;
+        if (debugTrackChunkIdFlag) {
+            if (! mSet) {
+                mSet = new Set();
+            }
+        } else {
+            delete mSet;
+            mSet = 0;
+        }
+    }
+
+private:
+    // The set here is for CSMap debugging only, see
+    // CSMap::SetDebugValidate()
+    typedef KeyOnly<chunkId_t> KeyVal;
+    typedef LinearHash<
+        KeyVal,
+        KeyCompare<chunkId_t>,
+        DynamicArray<
+            SingleLinkedList<KeyVal>*,
+            8 // 2^8 * sizeof(void*) => 2048
+        >,
+        StdFastAllocator<KeyVal>
+    > Set;
+    Set* mSet;
+
+    void AddHosted(chunkId_t chunkId, int index) {
+        bool newEntryFlag = false;
+        if (mIndex < 0 || index != mIndex) {
+            panic("invalid index", false);
+        }
+        if (mSet && (! mSet->Insert(chunkId, chunkId, newEntryFlag) ||
+                ! newEntryFlag)) {
+            panic("duplicate chunk id", false);
+        }
+        AddHosted();
+    }
+    void RemoveHosted(chunkId_t chunkId, int index) {
+        if (mIndex < 0 || index != mIndex) {
+            panic("invalid index", false);
+        }
+        if (mSet && mSet->Erase(chunkId) <= 0) {
+            panic("no such chunk", false);
+        }
+        RemoveHosted();
+    }
+    const int* HostedIdx(chunkId_t chunkId) const {
+        return ((mSet && mSet->Find(chunkId)) ?
&mIndex : 0);
+    }
+    friend class CSMap;
+private:
+    CSMapServerInfo(const CSMapServerInfo&);
+    CSMapServerInfo& operator=(const CSMapServerInfo&);
+};
+
+class ChunkServer :
+    public KfsCallbackObj,
+    public boost::enable_shared_from_this<ChunkServer>,
+    public CSMapServerInfo {
+public:
+    typedef int RackId;
+    class ChunkIdSet
+    {
+    public:
+        ChunkIdSet()
+            : mSet()
+            {}
+        ~ChunkIdSet()
+            {}
+        const chunkId_t* Find(chunkId_t chunkId) const {
+            return mSet.Find(chunkId);
+        }
+        bool Erase(chunkId_t chunkId) {
+            return (mSet.Erase(chunkId) != 0);
+        }
+        void First() {
+            mSet.First();
+        }
+        const chunkId_t* Next() {
+            const KeyVal* const ret = mSet.Next();
+            return (ret ? &ret->GetKey() : 0);
+        }
+        bool Insert(chunkId_t chunkId) {
+            bool inserted = false;
+            mSet.Insert(chunkId, chunkId, inserted);
+            return inserted;
+        }
+        void Clear() {
+            mSet.Clear();
+        }
+        size_t Size() const {
+            return mSet.GetSize();
+        }
+        bool IsEmpty() const {
+            return mSet.IsEmpty();
+        }
+        void Swap(ChunkIdSet& other) {
+            mSet.Swap(other.mSet);
+        }
+    private:
+        typedef KeyOnly<chunkId_t> KeyVal;
+        typedef LinearHash<
+            KeyVal,
+            KeyCompare<chunkId_t>,
+            DynamicArray<
+                SingleLinkedList<KeyVal>*,
+                5 // 2^5 * sizeof(void*) => 256
+            >,
+            StdFastAllocator<KeyVal>
+        > Set;
+
+        Set mSet;
+    private:
+        ChunkIdSet(const ChunkIdSet&);
+        ChunkIdSet& operator=(const ChunkIdSet&);
+    };
+
+    typedef multimap <
+        chunkId_t,
+        const MetaChunkRequest*,
+        less<chunkId_t>,
+        StdFastAllocator<
+            pair<const chunkId_t, const MetaChunkRequest*>
+        >
+    > ChunkOpsInFlight;
+
+    static KfsCallbackObj* Create(const NetConnectionPtr &conn) {
+        if (! conn || ! conn->IsGood()) {
+            return 0;
+        }
+        ChunkServer* const ret = new ChunkServer(conn);
+        ret->mSelfPtr.reset(ret);
+        return ret;
+    }
+    ///
+    /// Sequence:
+    ///  Chunk server connects.
+    ///   - A new chunkserver sm is born
+    ///   - chunkserver sends a HELLO with config info
+    ///   - send/recv messages with that chunkserver.
+ /// + ChunkServer(const NetConnectionPtr& conn, + const string& peerName = string()); + ~ChunkServer(); + + bool CanBeChunkMaster() const { + return mCanBeChunkMaster; + } + void SetCanBeChunkMaster(bool flag); + + /// Generic event handler to handle network + /// events. This method gets from the net manager when + /// it sees some data is available on the socket. + int HandleRequest(int code, void *data); + + /// Send an RPC to allocate a chunk on this server. + /// An RPC request is enqueued and the call returns. + /// When the server replies to the RPC, the request + /// processing resumes. + /// @param[in] r the request associated with the RPC call. + /// @param[in] leaseId the id associated with the write lease. + /// @retval 0 on success; -1 on failure + /// + int AllocateChunk(MetaAllocate *r, int64_t leaseId); + + /// Send an RPC to delete a chunk on this server. + /// An RPC request is enqueued and the call returns. + /// When the server replies to the RPC, the request + /// processing resumes. + /// @param[in] chunkId name of the chunk that is being + /// deleted. + /// @retval 0 on success; -1 on failure + /// + int DeleteChunk(chunkId_t chunkId); + + /// + /// Send a message to the server asking it to go down. + /// + void Retire(); + + void Restart(bool justExitFlag); + + /// Method to get the size of a chunk from a chunkserver. + int GetChunkSize(fid_t fid, chunkId_t chunkId, + seq_t chunkVersion, const string &pathname, bool retryFlag = true); + + /// Methods to handle (re) replication of a chunk. If there are + /// insufficient copies of a chunk, we replicate it. + int ReplicateChunk(fid_t fid, chunkId_t chunkId, + const ChunkServerPtr& dataServer, + const ChunkRecoveryInfo& recoveryInfo); + /// Start write append recovery when chunk master is non operational. 
+ int BeginMakeChunkStable(fid_t fid, chunkId_t chunkId, seq_t chunkVersion); + /// Notify a chunkserver that the writes to a chunk are done; + /// the chunkserver in turn should flush dirty data and make the + /// chunk "stable". + int MakeChunkStable(fid_t fid, chunkId_t chunkId, seq_t chunkVersion, + chunkOff_t chunkSize, bool hasChunkChecksum, uint32_t chunkChecksum, + bool addPending); + + /// Replication of a chunk finished. Update statistics + void ReplicateChunkDone(chunkId_t chunkId) { + mNumChunkWriteReplications--; + assert(mNumChunkWriteReplications >= 0); + if (mNumChunkWriteReplications < 0) + mNumChunkWriteReplications = 0; + MovingChunkDone(chunkId); + } + + + /// Accessor method to get # of replications that are being + /// handled by this server. + int GetNumChunkReplications() const { + return mNumChunkWriteReplications; + } + + /// During re-replication, we want to track how much b/w is + /// being spent read requests for replication by the server. This + /// is to prevent a server being overloaded and becoming + /// unresponsive as we try to increase the # of replicas. + int GetReplicationReadLoad() const { + return mNumChunkReadReplications; + } + + void UpdateReplicationReadLoad(int count) { + mNumChunkReadReplications += count; + if (mNumChunkReadReplications < 0) + mNumChunkReadReplications = 0; + } + + /// If a chunkserver isn't responding, don't send any + /// write load towards it. We detect loaded servers to be + /// those that don't respond to heartbeat messages. + bool IsResponsiveServer() const { + return (! mDown && ! mHeartbeatSkipped); + } + + /// To support scheduled down-time and allow maintenance to be + /// done on the server node, we could "retire" a server; when the + /// server is being retired, we evacuate the blocks on that server + /// and re-replicate them elsewhere (on non-retiring nodes). + /// During the stage where the server is being retired, we don't + /// want to send any new write traffic to the server. 
+ /// + void SetRetiring(); + + bool IsRetiring() const { + return mIsRetiring; + } + + void IncCorruptChunks() { + mNumCorruptChunks++; + } + + /// Provide some stats...useful for ops + void GetRetiringStatus(ostream &os); + void GetEvacuateStatus(ostream &os); + + /// When the plan is read in, the set of chunks that + /// need to be moved to this node is updated. + bool AddToChunksToMove(chunkId_t chunkId) { + return mChunksToMove.Insert(chunkId); + } + + const ChunkIdSet& GetChunksToMove() { + return mChunksToMove; + } + + void ClearChunksToMove() { + mChunksToMove.Clear(); + } + + /// Whenever this node re-replicates a chunk that was targeted + /// for rebalancing, update the set. + bool MovingChunkDone(chunkId_t chunkId) { + return (mChunksToMove.Erase(chunkId) > 0); + } + + /// Whenever the layout manager determines that this + /// server has stale chunks, it queues an RPC to + /// notify the chunk server of the stale data. + void NotifyStaleChunks(ChunkIdQueue& staleChunks, + bool evacuatedFlag = false, bool clearStaleChunksFlag = true); + void NotifyStaleChunk(chunkId_t staleChunk, bool evacuatedFlag = false); + + /// There is a difference between the version # as stored + /// at the chunkserver and what is on the metaserver. By sending + /// this message, the metaserver is asking the chunkserver to change + /// the version # to what is passed in. 
+ void NotifyChunkVersChange(fid_t fid, chunkId_t chunkId, seq_t chunkVers, + seq_t fromVersion, bool makeStableFlag, bool pendingAddFlag = false, + MetaChunkReplicate* replicate = 0); + + /// Accessor method to get the host name/port + const ServerLocation& GetServerLocation() const { + return mLocation; + } + + string ServerID() const + { + return mLocation.ToString(); + } + + /// Check if the hostname/port matches what is passed in + /// @param[in] name name to match + /// @param[in] port port # to match + /// @retval true if a match occurs; false otherwise + bool MatchingServer(const ServerLocation& loc) const { + return mLocation == loc; + } + + /// Setter method to set the host name/port + void SetServerLocation(const ServerLocation& loc) { + mLocation = loc; + } + + /// Setter method to set space + void SetSpace(int64_t total, int64_t used, int64_t alloc) { + mTotalSpace = total; + mUsedSpace = used; + mAllocSpace = alloc; + } + + const char* GetServerName() { + return mLocation.hostname.c_str(); + } + + void SetRack(RackId rackId) { + mRackId = rackId; + } + /// Return the unique identifier for the rack on which the + /// server is located. + RackId GetRack() const { + return mRackId; + } + + /// Available space is defined as the difference + /// between the total storage space available + /// on the server and the amount of space that + /// has been parceled out for outstanding writes + /// by the meta server. THat is, alloc space is tied + /// to the chunks that have been write-leased. This + /// has the effect of keeping alloc space tied closely + /// to used space. 
+ int64_t GetAvailSpace() const { + return max(int64_t(0), mTotalSpace - mAllocSpace); + } + + /// Accessor to that returns an estimate of the # of + /// concurrent writes that are being handled by this server + int GetNumChunkWrites() const { + return mNumChunkWrites; + + } + + int64_t GetNumAppendsWithWid() const { + return mNumAppendsWithWid; + } + + int64_t GetTotalSpace(bool useFsTotalSpaceFlag) const { + return (useFsTotalSpaceFlag ? mTotalFsSpace : mTotalSpace); + } + + int64_t GetTotalFsSpace() const { + return mTotalFsSpace; + } + + int64_t GetUsedSpace() const { + return mUsedSpace; + } + + int GetNumChunks() const { + return mNumChunks; + } + + int64_t GetFreeFsSpace() const { + return GetAvailSpace(); + } + + int64_t GetFsUsedSpace() const { + return max(mUsedSpace, mTotalFsSpace - GetFreeFsSpace()); + } + + /// Return an estimate of disk space utilization on this server. + /// The estimate is between [0..1] + double GetSpaceUtilization(bool useFsTotalSpaceFlag) const { + return (useFsTotalSpaceFlag ? + GetFsSpaceUtilization() : + GetTotalSpaceUtilization() + ); + } + double GetTotalSpaceUtilization() const { + if (mTotalSpace <= 0) { + return 1; + } + if (mPrevTotalSpace != mTotalSpace) { + Mutable(mPrevTotalSpace) = mTotalSpace; + Mutable(mOneOverTotalSpace) = double(1) / mTotalSpace; + } + return (mUsedSpace * mOneOverTotalSpace); + } + double GetFsSpaceUtilization() const { + if (mTotalFsSpace <= 0) { + return 1; + } + if (mPrevTotalFsSpace != mTotalFsSpace) { + Mutable(mPrevTotalFsSpace) = mTotalFsSpace; + Mutable(mOneOverTotalFsSpace) = + double(1) / mTotalFsSpace; + } + return (GetFsUsedSpace() * mOneOverTotalFsSpace); + } + bool IsDown() const { + return mDown; + } + + /// + /// The chunk server went down. So, fail all the + /// outstanding ops. + /// + void FailPendingOps(); + + /// For monitoring purposes, dump out state as a string. 
+ /// @param [out] result The state of this server + /// + void Ping(ostream& os, bool useTotalFsSpaceFlag) const; + + seq_t NextSeq() { return mSeqNo++; } + int TimeSinceLastHeartbeat() const; + void ForceDown(); + static void SetParameters(const Properties& prop, int clientPort); + void SetProperties(const Properties& props); + int64_t Uptime() const { return mUptime; } + bool ScheduleRestart(int64_t gracefulRestartTimeout, int64_t gracefulRestartAppendWithWidTimeout); + bool IsRestartScheduled() const { + return (mRestartScheduledFlag || mRestartQueuedFlag); + } + const string& DownReason() const { + return mDownReason; + } + const Properties& HeartBeatProperties() const { + return mHeartbeatProperties; + } + int64_t GetLoadAvg() const { + return mLoadAvg; + } + int Evacuate(chunkId_t chunkId) { + if (mIsRetiring) { + return -EEXIST; + } + if (mChunksToEvacuate.Size() >= sMaxChunksToEvacuate) { + return -EAGAIN; + } + return (mChunksToEvacuate.Insert(chunkId) ? 0 : -EEXIST); + } + bool IsEvacuationScheduled(chunkId_t chunkId) const { + return (mIsRetiring || mChunksToEvacuate.Find(chunkId)); + } + static const ChunkOpsInFlight& GetChunkOpsInFlight() { + return sChunkOpsInFlight; + } + static int GetChunkServerCount() { + return sChunkServerCount; + } + void UpdateSpace(MetaChunkEvacuate& op); + size_t GetChunksToEvacuateCount() const { + return mChunksToEvacuate.Size(); + } + bool GetCanBeCandidateServerFlag() const { + return mCanBeCandidateServerFlag; + } + void SetCanBeCandidateServerFlag(bool flag) { + mCanBeCandidateServerFlag = flag; + } + int64_t GetEvacuateCount() const { + return mEvacuateCnt; + } + int64_t GetEvacuateBytes() const { + return mEvacuateBytes; + } + int GetNumDrives() const { + return mNumDrives; + } + int GetNumWritableDrives() const { + return mNumWritableDrives; + } + void SetChunkDirStatus(const string& dir, bool dirOkFlag) { + if (dirOkFlag) { + mLostChunkDirs.erase(Escape(dir)); + } else { + mLostChunkDirs.insert(Escape(dir)); + } + 
} + static void SetMaxHelloBufferBytes(int64_t maxBytes) { + sMaxHelloBufferBytes = maxBytes; + } + static int64_t GetMaxHelloBufferBytes() { + return sMaxHelloBufferBytes; + } + static bool RunHelloBufferQueue(); + +protected: + /// Enqueue a request to be dispatched to this server + /// @param[in] r the request to be enqueued. + /// allow override in layout emulator. + virtual void EnqueueSelf(MetaChunkRequest* r); + void Enqueue(MetaChunkRequest* r, int timeout = -1); + + /// A sequence # associated with each RPC we send to + /// chunk server. This variable tracks the seq # that + /// we should use in the next RPC. + seq_t mSeqNo; + /// A handle to the network connection + NetConnectionPtr mNetConnection; + + /// Are we thru with processing HELLO message + bool mHelloDone; + + /// Boolean that tracks whether this server is down + bool mDown; + + /// Is there a heartbeat message for which we haven't + /// recieved a reply yet? If yes, dont' send one more + bool mHeartbeatSent; + + /// did we skip the sending of a heartbeat message? + bool mHeartbeatSkipped; + + time_t mLastHeartbeatSent; + + static int sHeartbeatTimeout; + static int sHeartbeatInterval; + static int sHeartbeatLogInterval; + static int sChunkAllocTimeout; + static int sChunkReallocTimeout; + static int sMakeStableTimeout; + static int sReplicationTimeout; + static int sRequestTimeout; + static int sMetaClientPort; + static bool sRestartCSOnInvalidClusterKeyFlag; + static int sSrvLoadSamplerSampleCount; + static string sSrvLoadPropName; + static size_t sMaxChunksToEvacuate; + + /// For record append's, can this node be a chunk master + bool mCanBeChunkMaster; + + /// is the server being retired + bool mIsRetiring; + /// when we did we get the retire request + time_t mRetireStartTime; + + /// when did we get the last heartbeat reply + time_t mLastHeard; + + /// Set of chunks that need to be moved to this server. + /// This set was previously computed by the rebalance planner. 
+ ChunkIdSet mChunksToMove; + + ChunkIdSet mChunksToEvacuate; + + /// Location of the server at which clients can + /// connect to + ServerLocation mLocation; + + /// A unique id to denote the rack on which the server is located. + /// -1 signifies that we don't what rack the server is on and by + /// implication, all servers are on same rack + RackId mRackId; + + /// Keep a count of how many corrupt chunks we are seeing on + /// this node; an indicator of the node in trouble? + int64_t mNumCorruptChunks; + + /// total space available on this server + int64_t mTotalSpace; + int64_t mPrevTotalSpace; + int64_t mTotalFsSpace; + int64_t mPrevTotalFsSpace; + double mOneOverTotalSpace; + double mOneOverTotalFsSpace; + /// space that has been used by chunks on this server + int64_t mUsedSpace; + + /// space that has been allocated for chunks: this + /// corresponds to the allocations that have been + /// made, but not all of the allocated space is used. + /// For instance, when we have partially filled + /// chunks, there is space is allocated for a chunk + /// but that space hasn't been fully used up. + int64_t mAllocSpace; + + /// # of chunks hosted on this server; useful for + /// reporting purposes + long mNumChunks; + + /// An estimate of the CPU load average as reported by the + /// chunkserver. When selecting nodes for block allocation, we + /// can use this info to weed out the most heavily loaded N% of + /// the nodes. + double mCpuLoadAvg; + + /// Chunkserver returns the # of drives on the node in a + /// heartbeat response; we can then show this value on the UI + int mNumDrives; + int mNumWritableDrives; + + /// An estimate of the # of writes that are being handled + /// by this server. 
We use this value to update mAllocSpace
+    /// The problem we have is that, we can end up with lots of
+    /// partial chunks and over time such drift can significantly
+    /// reduce the available space on the server (space is held
+    /// down for by the partial chunks that may never be written to).
+    /// Since writes can occur only when someone gets a valid write lease,
+    /// we track the # of write leases that are issued and where the
+    /// writes are occurring. So, whenever we get a heartbeat, we
+    /// can update alloc space as a sum of the used space and the # of
+    /// writes that are currently being handled by this server.
+    int mNumChunkWrites;
+    int64_t mNumAppendsWithWid;
+
+    /// Track the # of chunk replications (write/read) that are going on this server
+    int mNumChunkWriteReplications;
+    int mNumChunkReadReplications;
+
+    typedef multimap <
+        time_t,
+        MetaChunkRequest*,
+        less<time_t>,
+        StdFastAllocator<
+            pair<const time_t, MetaChunkRequest*>
+        >
+    > ReqsTimeoutQueue;
+    typedef map <
+        seq_t,
+        pair<
+            ReqsTimeoutQueue::iterator,
+            ChunkOpsInFlight::iterator
+        >,
+        less<seq_t>,
+        StdFastAllocator<
+            pair<const seq_t, pair<ReqsTimeoutQueue::iterator, ChunkOpsInFlight::iterator> >
+        >
+    > DispatchedReqs;
+    typedef set <
+        string,
+        less<string>,
+        StdFastAllocator<string>
+    > LostChunkDirs;
+
+    enum { kChunkSrvListsCount = 2 };
+    /// RPCs that we have sent to this chunk server.
+ DispatchedReqs mDispatchedReqs; + ReqsTimeoutQueue mReqsTimeoutQueue; + int64_t mLostChunks; + int64_t mUptime; + Properties mHeartbeatProperties; + bool mRestartScheduledFlag; + bool mRestartQueuedFlag; + time_t mRestartScheduledTime; + time_t mLastHeartBeatLoggedTime; + string mDownReason; + IOBuffer::WOStream mOstream; + int mRecursionCount; + MetaHello* mHelloOp; + ChunkServerPtr mSelfPtr; + ValueSampler mSrvLoadSampler; + int64_t mLoadAvg; + bool mCanBeCandidateServerFlag; + bool mStaleChunksHexFormatFlag; + IOBuffer::IStream mIStream; + int64_t mEvacuateCnt; + int64_t mEvacuateBytes; + int64_t mEvacuateDoneCnt; + int64_t mEvacuateDoneBytes; + int64_t mEvacuateInFlight; + int64_t mPrevEvacuateDoneCnt; + int64_t mPrevEvacuateDoneBytes; + time_t mEvacuateLastRateUpdateTime; + double mEvacuateCntRate; + double mEvacuateByteRate; + LostChunkDirs mLostChunkDirs; + const string mPeerName; + ChunkServer* mPrevPtr[kChunkSrvListsCount]; + ChunkServer* mNextPtr[kChunkSrvListsCount]; + + static ChunkOpsInFlight sChunkOpsInFlight; + static ChunkServer* sChunkServersPtr[kChunkSrvListsCount]; + static int sChunkServerCount; + static int sPendingHelloCount; + static int sMinHelloWaitingBytes; + static int64_t sMaxHelloBufferBytes; + static int64_t sHelloBytesCommitted; + static int64_t sHelloBytesInFlight; + static int sEvacuateRateUpdateInterval; + + friend class QCDLListOp; + friend class QCDLListOp; + typedef QCDLList ChunkServersList; + typedef QCDLList PendingHelloList; + + void AddToPendingHelloList(); + void RemoveFromPendingHelloList(); + static int64_t GetHelloBytes(MetaHello* req = 0); + static void PutHelloBytes(MetaHello* req); + + /// + /// We have received a message from the chunk + /// server. Do something with it. + /// @param[in] iobuf An IO buffer stream with message + /// received from the chunk server. + /// @param[in] msgLen Length in bytes of the message. 
+ /// @retval 0 if message was processed successfully; + /// -1 if there was an error + /// + int HandleMsg(IOBuffer *iobuf, int msgLen); + + /// Handlers for the 3 types of messages we could get: + /// 1. Hello message from a chunkserver + /// 2. An RPC from a chunkserver + /// 3. A reply to an RPC that we have sent previously. + + int HandleHelloMsg(IOBuffer *iobuf, int msgLen); + int HandleCmd(IOBuffer *iobuf, int msgLen); + int HandleReply(IOBuffer *iobuf, int msgLen); + + /// Send a response message to the MetaRequest we got. + void SendResponse(MetaRequest *op); + + /// + /// Given a response from a chunkserver, find the + /// associated request that we previously sent. + /// Request/responses are matched based on sequence + /// numbers in the messages. + /// + /// @param[in] cseq The sequence # of the op we are + /// looking for. + /// @retval The matching request if one exists; NULL + /// otherwise + /// + MetaChunkRequest *FindMatchingRequest(seq_t cseq); + + MetaRequest* GetOp(IOBuffer& iobuf, int msgLen, const char* errMsgPrefix); + + /// + /// The response sent by a chunkserver is of the form: + /// OK \r\n + /// Cseq: \r\n + /// Status: \r\n\r\n + /// Extract out Cseq, Status + /// + /// @param[in] buf Buffer containing the response + /// @param[in] bufLen length of buf + /// @param[out] prop Properties object with the response header/values + /// + bool ParseResponse(istream& is, Properties &prop); + /// + /// The chunk server went down. So, stop the network timer event; + /// also, fail all the dispatched ops. + /// + void Error(const char* errorMsg); + void FailDispatchedOps(); + /// Periodically, send a heartbeat message to the chunk server. + int Heartbeat(); + int TimeoutOps(); + inline void UpdateChunkWritesPerDrive( + int numChunkWrites, int numWritableDrives); + void ShowLines(MsgLogger::LogLevel logLevel, const string& prefix, + IOBuffer& iobuf, int len, int linesToShow = 64); + string GetPeerName() const { + return ((! mPeerName.empty() || ! 
mNetConnection) ? + mPeerName : mNetConnection->GetPeerName()); + } + static string Escape(const string& str); + template static T& Mutable(const T& v) { + return const_cast(v); + } +}; + +} // namespace KFS + +#endif // META_CHUNKSERVER_H diff --git a/src/cc/meta/ChunkServerFactory.h b/src/cc/meta/ChunkServerFactory.h new file mode 100644 index 000000000..38f021cc2 --- /dev/null +++ b/src/cc/meta/ChunkServerFactory.h @@ -0,0 +1,78 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/06 +// Author: Sriram Rao, Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file ChunkServerFactory.h +// \brief Create ChunkServer objects whenever a chunk server connects +// to us (namely, the meta server). +// +//---------------------------------------------------------------------------- + +#ifndef META_CHUNKSERVERFACTORY_H +#define META_CHUNKSERVERFACTORY_H + +#include "kfsio/Acceptor.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfstypes.h" +#include "ChunkServer.h" + +namespace KFS +{ + +/// +/// ChunkServerFactory creates a ChunkServer object whenever +/// a chunk server connects to us. The ChunkServer object is +/// responsible for all the communication with that chunk +/// server. 
+/// +class ChunkServerFactory : public IAcceptorOwner +{ +public: + ChunkServerFactory() + : mAcceptor(0) + {} + virtual ~ChunkServerFactory() + { delete mAcceptor; } + /// Start an acceptor to listen on the specified port. + bool StartAcceptor(int port) + { + delete mAcceptor; + mAcceptor = new Acceptor(port, this); + return mAcceptor->IsAcceptorStarted(); + } + /// Callback that gets invoked whenever a chunkserver + /// connects to the acceptor port. The accepted socket + /// connection is passed in. + /// @param[in] conn: The accepted connection + /// @retval The continuation object that was created as a + /// result of this call. + KfsCallbackObj *CreateKfsCallbackObj(NetConnectionPtr &conn) + { return ChunkServer::Create(conn); } +private: + // The socket object which is setup to accept connections from + /// chunkserver. + Acceptor* mAcceptor; +}; + +} + +#endif // META_CHUNKSERVERFACTORY_H diff --git a/src/cc/meta/ClientManager.h b/src/cc/meta/ClientManager.h new file mode 100644 index 000000000..955de726c --- /dev/null +++ b/src/cc/meta/ClientManager.h @@ -0,0 +1,93 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/02 +// Author: Sriram Rao +// Mike Ovsiannikov implement multiple outstanding request processing, +// and "client threads". +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. +// +// \file ClientManager.h +// \brief Create client state machines whenever clients connect to meta server. +// +//---------------------------------------------------------------------------- + +#ifndef META_CLIENTMANAGER_H +#define META_CLIENTMANAGER_H + +#include "MetaRequest.h" + +class QCMutex; + +namespace KFS +{ + +class ClientSM; + +class ClientManager +{ +public: + class ClientThread; + + ClientManager(); + virtual ~ClientManager(); + bool StartAcceptor(int port, int threadCount, int startCpuAffinity); + void Shutdown(); + void ChildAtFork(); + QCMutex& GetMutex(); + static bool Enqueue(ClientThread* thread, MetaRequest& op) + { + if (op.next == &op) { + return false; + } + if (! thread) { + op.next = &op; + return false; + } + return EnqueueSelf(thread, op); + } + static void SubmitRequest(ClientThread* thread, MetaRequest& op) + { + if (thread) { + SubmitRequestSelf(thread, op); + } else { + submit_request(&op); + } + } + static bool Flush(ClientThread* thread, ClientSM& /* cli */) + { + return (thread != 0); // Thread invokes flush. + } + void PrepareCurrentThreadToFork(); + inline void PrepareToFork(); +private: + class Impl; + Impl& mImpl; + + static bool EnqueueSelf(ClientThread* thread, MetaRequest& op); + static void SubmitRequestSelf(ClientThread* thread, MetaRequest& op); +private: + ClientManager(const ClientManager&); + ClientManager& operator=(const ClientManager&); +}; + + +} + +#endif // META_CLIENTMANAGER_H diff --git a/src/cc/meta/ClientSM.cc b/src/cc/meta/ClientSM.cc new file mode 100644 index 000000000..5e1c07cd0 --- /dev/null +++ b/src/cc/meta/ClientSM.cc @@ -0,0 +1,443 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/05 +// Author: Sriram Rao +// Mike Ovsiannikov implement multiple outstanding request processing, +// and "client threads". 
+// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// +// \file ClientSM.cc +// \brief Kfs client protocol state machine implementation. +// +//---------------------------------------------------------------------------- + +#include "ClientSM.h" +#include "ChunkServer.h" +#include "NetDispatch.h" +#include "util.h" +#include "common/kfstypes.h" +#include "kfsio/Globals.h" +#include "qcdio/qcstutils.h" +#include "kfsio/IOBuffer.h" +#include "common/MsgLogger.h" +#include "common/Properties.h" +#include "AuditLog.h" + +#include +#include +#include + +namespace KFS +{ + +using std::max; +using std::string; +using std::ostringstream; + + +inline string +PeerName(const NetConnectionPtr& conn) +{ + return (conn ? conn->GetPeerName() : string("unknown")); +} + +inline string +PeerIp(const NetConnectionPtr& conn) +{ + if (! 
conn) { + return string(); + } + const string peer = conn->GetPeerName(); + const size_t pos = peer.rfind(':'); + if (pos == string::npos) { + return peer; + } + return peer.substr(0, pos); +} + +int ClientSM::sMaxPendingOps = 1; +int ClientSM::sMaxPendingBytes = 3 << 10; +int ClientSM::sMaxReadAhead = 3 << 10; +int ClientSM::sInactivityTimeout = 8 * 60; +int ClientSM::sMaxWriteBehind = 3 << 10; +int ClientSM::sBufCompactionThreshold = 1 << 10; +int ClientSM::sOutBufCompactionThreshold = 8 << 10; +int ClientSM::sClientCount = 0; +bool ClientSM::sAuditLoggingFlag = false; +ClientSM* ClientSM::sClientSMPtr[1] = {0}; +IOBuffer::WOStream ClientSM::sWOStream; + +/* static */ void +ClientSM::SetParameters(const Properties& prop) +{ + const int maxPendingOps = prop.getValue( + "metaServer.clientSM.maxPendingOps", + -1); + if (maxPendingOps > 0) { + sMaxPendingOps = maxPendingOps; + } else if (! gNetDispatch.IsRunning() && + prop.getValue("metaServer.clientThreadCount", -1) > 0) { + sMaxPendingOps = 4; + } + sMaxPendingBytes = max(1, prop.getValue( + "metaServer.clientSM.maxPendingBytes", + sMaxPendingBytes)); + sMaxReadAhead = max(256, prop.getValue( + "metaServer.clientSM.maxReadAhead", + sMaxReadAhead)); + sInactivityTimeout = prop.getValue( + "metaServer.clientSM.inactivityTimeout", + sInactivityTimeout); + sMaxWriteBehind = max(1, prop.getValue( + "metaServer.clientSM.maxWriteBehind", + sMaxWriteBehind)); + sBufCompactionThreshold = prop.getValue( + "metaServer.clientSM.bufCompactionThreshold", + sBufCompactionThreshold); + sOutBufCompactionThreshold = prop.getValue( + "metaServer.clientSM.outBufCompactionThreshold", + sOutBufCompactionThreshold); + sAuditLoggingFlag = prop.getValue( + "metaServer.clientSM.auditLogging", + sAuditLoggingFlag ? 
1 : 0) != 0; + AuditLog::SetParameters(prop); +} + +ClientSM::ClientSM( + const NetConnectionPtr& conn, + ClientManager::ClientThread* thread, + IOBuffer::WOStream* wostr, + char* parseBuffer) + : mNetConnection(conn), + mClientIp(PeerIp(conn)), + mPendingOpsCount(0), + mOstream(wostr ? *wostr : sWOStream), + mParseBuffer(parseBuffer), + mRecursionCnt(0), + mClientProtoVers(KFS_CLIENT_PROTO_VERS), + mDisconnectFlag(false), + mLastReadLeft(0), + mClientThread(thread), + mNext(0) +{ + assert(mNetConnection && mNetConnection->IsGood()); + + ClientSMList::Init(*this); + { + QCStMutexLocker locker(gNetDispatch.GetClientManagerMutex()); + ClientSMList::PushBack(sClientSMPtr, *this); + sClientCount++; + } + mNetConnection->SetInactivityTimeout(sInactivityTimeout); + mNetConnection->SetMaxReadAhead(sMaxReadAhead); + SET_HANDLER(this, &ClientSM::HandleRequest); +} + +ClientSM::~ClientSM() +{ + QCStMutexLocker locker(gNetDispatch.GetClientManagerMutex()); + ClientSMList::Remove(sClientSMPtr, *this); + sClientCount--; +} + +/// +/// Send out the response to the client request. The response is +/// generated by MetaRequest as per the protocol. +/// @param[in] op The request for which we finished execution. +/// +void +ClientSM::SendResponse(MetaRequest *op) +{ + if ((op->op == META_ALLOCATE && (op->status < 0 || + static_cast(op)->logFlag)) || + MsgLogger::GetLogger()->IsLogLevelEnabled( + MsgLogger::kLogLevelDEBUG)) { + // for chunk allocations, for debugging purposes, need to know + // where the chunk was placed. + KFS_LOG_STREAM_INFO << PeerName(mNetConnection) << + " -seq: " << op->opSeqno << + " status: " << op->status << + (op->statusMsg.empty() ? + "" : " msg: ") << op->statusMsg << + " " << op->Show() << + KFS_LOG_EOM; + } + if (! 
mNetConnection) { + return; + } + if (op->op == META_DISCONNECT) { + mDisconnectFlag = true; + } + op->response( + mOstream.Set(mNetConnection->GetOutBuffer()), + mNetConnection->GetOutBuffer()); + mOstream.Reset(); + if (mRecursionCnt <= 0) { + mNetConnection->StartFlush(); + } +} + +/// +/// Generic event handler. Decode the event that occurred and +/// appropriately extract out the data and deal with the event. +/// @param[in] code: The type of event that occurred +/// @param[in] data: Data being passed in relative to the event that +/// occurred. +/// @retval 0 to indicate successful event handling; -1 otherwise. +/// +int +ClientSM::HandleRequest(int code, void *data) +{ + if (code == EVENT_CMD_DONE) { + assert(data && mPendingOpsCount > 0); + if (ClientManager::Enqueue(mClientThread, + *reinterpret_cast(data))) { + return 0; + } + } + + assert(mRecursionCnt >= 0 && (mNetConnection || + (code == EVENT_CMD_DONE && data && mPendingOpsCount > 0))); + mRecursionCnt++; + + switch (code) { + case EVENT_NET_READ: { + // We read something from the network. Run the RPC that + // came in. + mLastReadLeft = 0; + IOBuffer& iobuf = mNetConnection->GetInBuffer(); + if (mDisconnectFlag) { + iobuf.Clear(); // Discard + } + assert(data == &iobuf); + // Do not start new op if response does not get unloaded by + // the client to prevent out of buffers. + bool overWriteBehindFlag = false; + for (; ;) { + while ((overWriteBehindFlag = + mNetConnection->GetNumBytesToWrite() >= + sMaxWriteBehind) && + mRecursionCnt <= 1 && + mNetConnection->CanStartFlush()) { + mNetConnection->StartFlush(); + } + int cmdLen; + if (overWriteBehindFlag || + IsOverPendingOpsLimit() || + ! IsMsgAvail(&iobuf, &cmdLen)) { + break; + } + HandleClientCmd(iobuf, cmdLen); + } + if (overWriteBehindFlag) { + break; + } + if (! IsOverPendingOpsLimit() && ! 
mDisconnectFlag) { + mLastReadLeft = iobuf.BytesConsumable(); + if (mLastReadLeft <= MAX_RPC_HEADER_LEN) { + mNetConnection->SetMaxReadAhead(sMaxReadAhead); + break; + } + KFS_LOG_STREAM_ERROR << PeerName(mNetConnection) << + " exceeded max request header size: " << + mLastReadLeft << + " > " << MAX_RPC_HEADER_LEN << + " closing connection" << + KFS_LOG_EOM; + mLastReadLeft = 0; + iobuf.Clear(); + mNetConnection->Close(); + HandleRequest(EVENT_NET_ERROR, NULL); + } + break; + } + + case EVENT_CMD_DONE: { + assert(data && mPendingOpsCount > 0); + MetaRequest* const op = reinterpret_cast(data); + if (sAuditLoggingFlag && ! op->reqHeaders.IsEmpty()) { + AuditLog::Log(*op); + } + SendResponse(op); + delete op; + mPendingOpsCount--; + if (! mNetConnection) { + break; + } + if (mRecursionCnt <= 1 && + (mPendingOpsCount <= 0 || + ! ClientManager::Flush( + mClientThread, *this))) { + mNetConnection->StartFlush(); + } + } + // Fall through. + case EVENT_NET_WROTE: + // Something went out on the network. + // Process next command. + if (! IsOverPendingOpsLimit() && + mRecursionCnt <= 1 && + (code == EVENT_CMD_DONE || + ! mNetConnection->IsReadReady()) && + mNetConnection->GetNumBytesToWrite() < + sMaxWriteBehind) { + if (mNetConnection->GetNumBytesToRead() > + mLastReadLeft || + mDisconnectFlag) { + HandleRequest(EVENT_NET_READ, + &mNetConnection->GetInBuffer()); + } else if (! mNetConnection->IsReadReady()) { + mNetConnection->SetMaxReadAhead(sMaxReadAhead); + } + } + break; + + case EVENT_NET_ERROR: + if (mNetConnection->IsGood() && + (mPendingOpsCount > 0 || + mNetConnection->IsWriteReady())) { + // Fin from the other side, flush and close connection. + mDisconnectFlag = true; + break; + } + // Fall through. 
+ case EVENT_INACTIVITY_TIMEOUT: + KFS_LOG_STREAM_DEBUG << PeerName(mNetConnection) << + " closing connection" << + KFS_LOG_EOM; + mNetConnection->Close(); + mNetConnection->GetInBuffer().Clear(); + break; + + default: + assert(!"Unknown event"); + break; + } + + if (mRecursionCnt <= 1) { + bool goodFlag = mNetConnection && mNetConnection->IsGood(); + if (goodFlag && (mPendingOpsCount <= 0 || + ! ClientManager::Flush( + mClientThread, *this))) { + mNetConnection->StartFlush(); + goodFlag = mNetConnection && mNetConnection->IsGood(); + } + if (goodFlag && mDisconnectFlag) { + if (mPendingOpsCount <= 0 && + ! mNetConnection->IsWriteReady()) { + mNetConnection->Close(); + goodFlag = false; + } else { + mNetConnection->SetMaxReadAhead(0); + } + } + if (goodFlag) { + IOBuffer& inbuf = mNetConnection->GetInBuffer(); + int numBytes = inbuf.BytesConsumable(); + if (numBytes <= sBufCompactionThreshold && + numBytes > 0) { + inbuf.MakeBuffersFull(); + } + IOBuffer& outbuf = mNetConnection->GetOutBuffer(); + numBytes = outbuf.BytesConsumable(); + if (numBytes <= sOutBufCompactionThreshold && + numBytes > 0) { + outbuf.MakeBuffersFull(); + } + if (mNetConnection->IsReadReady() && + (IsOverPendingOpsLimit() || + mNetConnection->GetNumBytesToWrite() >= + sMaxWriteBehind || + mNetConnection->GetNumBytesToRead() >= + sMaxPendingBytes)) { + mLastReadLeft = 0; + mNetConnection->SetMaxReadAhead(0); + } + } else { + if (mPendingOpsCount > 0) { + mNetConnection.reset(); + } else { + delete this; + return 0; + } + } + } + assert( + mRecursionCnt > 0 && + (mRecursionCnt > 1 || mPendingOpsCount > 0 || + (mNetConnection && mNetConnection->IsGood())) + ); + mRecursionCnt--; + return 0; +} + +/// +/// We have a command in a buffer. So, parse out the command and +/// execute it if possible. +/// @param[in] iobuf: Buffer containing the command +/// @param[in] cmdLen: Length of the command in the buffer +/// +void +ClientSM::HandleClientCmd(IOBuffer& iobuf, int cmdLen) +{ + assert(! 
IsOverPendingOpsLimit() && mNetConnection); + MetaRequest* op = 0; + if (ParseCommand(iobuf, cmdLen, &op, mParseBuffer) != 0) { + IOBuffer::IStream is(iobuf, cmdLen); + char buf[128]; + int maxLines = 16; + while (maxLines-- > 0 && is.getline(buf, sizeof(buf))) { + KFS_LOG_STREAM_ERROR << PeerName(mNetConnection) << + " invalid request: " << buf << + KFS_LOG_EOM; + } + iobuf.Clear(); + mNetConnection->Close(); + HandleRequest(EVENT_NET_ERROR, NULL); + return; + } + if (op->clientProtoVers < mClientProtoVers) { + mClientProtoVers = op->clientProtoVers; + KFS_LOG_STREAM_WARN << PeerName(mNetConnection) << + " command with old protocol version: " << + op->clientProtoVers << ' ' << op->Show() << + KFS_LOG_EOM; + } + // Command is ready to be pushed down. So remove the cmd from the buffer. + if (sAuditLoggingFlag) { + op->reqHeaders.Move(&iobuf, cmdLen); + } else { + iobuf.Consume(cmdLen); + } + KFS_LOG_STREAM_DEBUG << PeerName(mNetConnection) << + " +seq: " << op->opSeqno << + " " << op->Show() << + " pending:" + " rd: " << mNetConnection->GetNumBytesToRead() << + " wr: " << mNetConnection->GetNumBytesToWrite() << + KFS_LOG_EOM; + op->clientIp = mClientIp; + op->clnt = this; + mPendingOpsCount++; + ClientManager::SubmitRequest(mClientThread, *op); +} + +} // namespace KFS diff --git a/src/cc/meta/ClientSM.h b/src/cc/meta/ClientSM.h new file mode 100644 index 000000000..a67c95a59 --- /dev/null +++ b/src/cc/meta/ClientSM.h @@ -0,0 +1,119 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/02 +// Author: Sriram Rao +// Mike Ovsiannikov implement multiple outstanding request processing, +// and "client threads". +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file ClientSM.h +// \brief Kfs client protocol state machine responsible for handling the +// corresponding network io: receiving, and parsing request, and creating and +// sending response. +// +//---------------------------------------------------------------------------- + +#ifndef META_CLIENTSM_H +#define META_CLIENTSM_H + +#include "ClientManager.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/NetConnection.h" +#include "kfsio/IOBuffer.h" +#include "qcdio/QCDLList.h" + +#include + +namespace KFS +{ +using std::string; + +class Properties; +struct MetaRequest; + +class ClientSM : public KfsCallbackObj +{ +public: + ClientSM(const NetConnectionPtr& conn, + ClientManager::ClientThread* thread = 0, + IOBuffer::WOStream* wostr = 0, + char* parseBuffer = 0); + ~ClientSM(); + + // + // Sequence: + // Client connects. + // - A new client sm is born + // - reads a request out of the connection + // - submit the request for execution + // - when the request is done, send a response back. 
+ // + int HandleRequest(int code, void *data); + ClientSM*& GetNext() + { return mNext; } + const NetConnectionPtr& GetConnection() const + { return mNetConnection; } + + static void SetParameters(const Properties& prop); + static int GetClientCount() { return sClientCount; } +private: + /// A handle to a network connection + NetConnectionPtr mNetConnection; + const string mClientIp; + int mPendingOpsCount; + IOBuffer::WOStream& mOstream; + char* const mParseBuffer; + int mRecursionCnt; + /// used to print message about old protocol version once + int mClientProtoVers; + bool mDisconnectFlag; + int mLastReadLeft; + ClientManager::ClientThread* const mClientThread; + ClientSM* mNext; + ClientSM* mPrevPtr[1]; + ClientSM* mNextPtr[1]; + + friend class QCDLListOp; + typedef QCDLList ClientSMList; + + /// Given a (possibly) complete op in a buffer, run it. + void HandleClientCmd(IOBuffer& iobuf, int cmdLen); + + /// Op has finished execution. Send a response to the client. + void SendResponse(MetaRequest *op); + bool IsOverPendingOpsLimit() const + { return (mPendingOpsCount >= sMaxPendingOps); } + + static int sMaxPendingOps; + static int sMaxPendingBytes; + static int sMaxReadAhead; + static int sInactivityTimeout; + static int sMaxWriteBehind; + static int sBufCompactionThreshold; + static int sOutBufCompactionThreshold; + static int sClientCount; + static bool sAuditLoggingFlag; + static ClientSM* sClientSMPtr[1]; + static IOBuffer::WOStream sWOStream; +}; + +} + +#endif // META_CLIENTSM_H diff --git a/src/cc/meta/DiskEntry.cc b/src/cc/meta/DiskEntry.cc new file mode 100644 index 000000000..3b1dd2150 --- /dev/null +++ b/src/cc/meta/DiskEntry.cc @@ -0,0 +1,296 @@ +/* + * $Id$ + * + * \file DiskEntry.cc + * \brief parse checkpoint and log entries + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov implement DETokenizer to speed up parsing / load by + * making it more cpu efficient. + * + * Copyright 2008-2012 Quantcast Corp. 
+ * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "DiskEntry.h" +#include "util.h" + +namespace KFS +{ +using std::istream; +using std::streamsize; + +bool +DiskEntry::parse(DETokenizer& tokenizer) +{ + if (tokenizer.empty()) + return true; + + parsetab::const_iterator c = table.find(tokenizer.front()); + return (c != table.end() && c->second(tokenizer)); +} + +bool +DETokenizer::next(ostream* os) +{ + Token* const tend = tokens + kMaxEntryTokens; + cur = tokens; + end = tokens; + if (os && prevStart < nextEnt) { + os->write(prevStart, nextEnt - prevStart); + prevStart = nextEnt; + } + for (; ;) { + while (*nextEnt == '\n') { + ++nextEnt; + } + char* s = nextEnt; + char* p = s; + while (*p != '\n') { + if (*p == '/') { + end->ptr = s; + end->len = p - s; + s = ++p; + if (++end >= tend) { + end = tokens; + return false; + } + } else { + ++p; + } + } + if (p < bend) { + end->ptr = s; + end->len = p - s; + ++end; + nextEnt = p + 1; + entryCount++; + break; + } + end = tokens; + const size_t size = nextEnt >= bend ? 0 : + (p < bend ? p : bend) - nextEnt; + if (kMaxEntrySize <= size) { + return false; + } + if (os && prevStart < nextEnt) { + const size_t sz = + (nextEnt < bend ? nextEnt : bend) - prevStart; + if (sz > 0) { + os->write(prevStart, sz); + } + } + memmove(buffer, nextEnt, size); + nextEnt = buffer; + prevStart = nextEnt; + if (! 
is.read(buffer + size, kMaxEntrySize - size)) { + bend = buffer + size; + if (! is.eof()) { + MarkEnd(); + return false; + } + streamsize const cnt = is.gcount(); + if (cnt <= 0) { + MarkEnd(); + return false; + } + bend += cnt; + MarkEnd(); + } + } + return true; +} + +const unsigned char* const DETokenizer::c2hex = char2HexTable(); + +/*! + * \brief remove a file name from the front of the deque + * \param[out] name the returned name + * \param[in] tag the keyword that precedes the name + * \param[in] c the deque of components from the entry + * \param[in] ok if false, do nothing and return false + * \return true if parse was successful + * + * The ok parameter short-circuits parsing if an error occurs. + * This lets us do a series of steps without checking until the + * end. + */ +bool +pop_name(string &name, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + name = c.front(); + c.pop_front(); + if (!name.empty()) + return true; + + /* + * Special hack: the initial entry for "/" shows up + * as two empty components ("///"); I should probably + * come up with a more elegant way to do this. + */ + if (c.empty() || !c.front().empty()) + return false; + + c.pop_front(); + name = "/"; + return true; +} + +/*! + * \brief remove a path name from the front of the deque + * \param[out] path the returned path + * \param[in] tag the keyword that precedes the path + * \param[in] c the deque of components from the entry + * \param[in] ok if false, do nothing and return false + * \return true if parse was successful + * + * The ok parameter short-circuits parsing if an error occurs. + * This lets us do a series of steps without checking until the + * end. 
+ */ +bool +pop_path(string &path, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + /* Collect everything else in path with components separated by '/' */ + c.popPath(path); + return ! path.empty(); +} + +/*! + * \brief remove a file ID from the component deque + */ +bool +pop_fid(fid_t &fid, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + fid = c.toNumber(); + c.pop_front(); + return (fid != -1); +} + +/*! + * \brief remove a size_t value from the component deque + */ +bool +pop_size(size_t &sz, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + sz = c.toNumber(); + c.pop_front(); + return (sz != -1u); +} + +/*! + * \brief remove a short value from the component deque + */ +bool +pop_short(int16_t &num, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + num = (int16_t) c.toNumber(); + c.pop_front(); + return (num != (int16_t) -1); +} + +/*! + * \brief remove a chunkOff_t value from the component deque + */ +bool +pop_offset(chunkOff_t &o, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + o = c.toNumber(); + c.pop_front(); + return (o != -1); +} + +/*! + * \brief remove a file type from the component deque + */ +bool +pop_type(FileType& t, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + string type = c.front(); + c.pop_front(); + if (type == "file") { + t = KFS_FILE; + } else if (type == "dir") { + t = KFS_DIR; + } else + t = KFS_NONE; + + return (t != KFS_NONE); +} + +/*! 
+ * \brief remove a time value from the component deque + */ +bool +pop_time(int64_t& tv, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 3 || c.front() != tag) + return false; + + c.pop_front(); + int64_t sec = c.toNumber(); + c.pop_front(); + int64_t usec = c.toNumber(); + c.pop_front(); + const int64_t kMicroSec = 1000 * 1000; + tv = sec * kMicroSec + usec; + return (sec != -1 && usec >= 0 && usec <= kMicroSec); +} + +/*! + * \brief remove a int64_t value from the component deque + */ +bool +pop_num(int64_t& n, const char* tag, DETokenizer& c, bool ok) +{ + if (!ok || c.size() < 2 || c.front() != tag) + return false; + + c.pop_front(); + n = c.toNumber(); + c.pop_front(); + return c.isLastOk(); +} + +} // Namespace KFS diff --git a/src/cc/meta/DiskEntry.h b/src/cc/meta/DiskEntry.h new file mode 100644 index 000000000..f3e29e9fb --- /dev/null +++ b/src/cc/meta/DiskEntry.h @@ -0,0 +1,293 @@ +/*! + * $Id$ + * + * \file DiskEntry.h + * \brief process entries from the checkpoint and log files + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov implement DETokenizer to speed up parsing / load by + * making it more cpu efficient. + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ +#if !defined(KFS_ENTRY_H) +#define KFS_ENTRY_H + +#include "kfstypes.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace KFS +{ +using std::vector; +using std::map; +using std::istream; +using std::ostream; +using std::string; + +class DETokenizer +{ +public: + struct Token + { + Token() + : ptr(0), + len(0) + {} + Token(const char* p) + : ptr(p), + len(strlen(p)) + {} + Token(const char* p, size_t l) + : ptr(p), + len(l) + {} + bool operator==(const Token& other) const { + return (len == other.len && + memcmp(ptr, other.ptr, len) == 0); + } + bool operator!=(const Token& other) const { + return ! (*this == other); + } + bool operator==(const char* str) const { + const char* c = ptr; + const char* const e = c + len; + const char* p = str; + // Token should have no 0 characters. + while (c < e) { + if (*p++ != *c++) { + return false; + } + } + return (*p == 0); + } + bool operator!=(const char* str) const { + return ! (*this == str); + } + // Not lexicographic order, only intended for std::map. 
+ bool operator<(const Token& other) const { + return (len < other.len || (len == other.len && + memcmp(ptr, other.ptr, len) < 0)); + } + operator string () const { + return string(ptr, len); + } + bool empty() const { + return len <= 0; + } + const char* ptr; + size_t len; + }; + DETokenizer(istream& in) + : tokens(new Token[kMaxEntryTokens]), + cur(tokens), + end(tokens), + is(in), + entryCount(0), + buffer(new char [kMaxEntrySize + 3]), + bend(buffer + kMaxEntrySize), + nextEnt(bend), + prevStart(nextEnt), + base(10), + lastOk(true) + { MarkEnd(); } + ~DETokenizer() { + delete [] tokens; + delete [] buffer; + } + void pop_front() { + assert(cur < end); + ++cur; + } + const Token& front() const { + assert(cur < end); + return *cur; + } + size_t size() const { + return (end - cur); + } + bool empty() const { + return (cur >= end); + } + bool next(ostream* os = 0); + size_t getEntryCount() const { + return entryCount; + } + string getEntry() const { + const char* const p = end == tokens ? nextEnt : tokens->ptr; + const char* const e = strchr(p, '\n'); + return (e ? string(p, e - p) : string(p)); + } + void popPath(string& path) { + assert(cur != end); + const char* const s = cur->ptr; + const char* p; + do { + p = cur->ptr + cur->len; + ++cur; + } while (cur != end && ! cur->empty()); + path.assign(s, p - s); + } + int64_t toNumber() { + assert(cur < end); + if (cur->len <= 0) { + return -1; + } + if (base == 16) { + return hexToNumber(); + } + char* end; + const int64_t ret = strtoll(cur->ptr, &end, base); + lastOk = end == cur->ptr + cur->len; + return (lastOk ? 
ret : (int64_t)-1); + } + void setIntBase(int b) { + base = b; + } + int getIntBase() const { + return base; + } + bool isLastOk() const { + return lastOk; + } +private: + enum { kMaxEntrySize = 512 << 10 }; + enum { kMaxEntryTokens = 1 << 10 }; + Token* tokens; + Token* cur; + Token* end; + istream& is; + size_t entryCount; + char* const buffer; + char* bend; + char* nextEnt; + const char* prevStart; + int base; + bool lastOk; + static const unsigned char* const c2hex; + + void MarkEnd() { + // sentinel for next() + assert(bend <= buffer + kMaxEntrySize); + bend[0] = '\n'; + bend[1] = 0; + bend[2] = '\n'; + } + int64_t hexToNumber() { + if (cur->len <= 0) { + return -1; + } + const unsigned char* p = + reinterpret_cast(cur->ptr); + const unsigned char* const e = p + cur->len; + const bool minus = *p == '-'; + if (minus || *p == '+') { + ++p; + } + int64_t ret = 0; + if (p + sizeof(ret) * 2 < e) { + lastOk = false; + return -1; + } + while (p < e) { + const unsigned char h = c2hex[*p++]; + if (h == (unsigned char)0xFF) { + lastOk = false; + return -1; + } + ret = (ret << 4) | h; + } + return (minus ? -ret : ret); + } +private: + DETokenizer(const DETokenizer&); + DETokenizer& operator=(const DETokenizer&); +}; + +inline static bool operator==(const char* str, const DETokenizer::Token& token) { + return (token == str); +} + +inline static bool operator!=(const char* str, const DETokenizer::Token& token) { + return ! (token == str); +} + +inline static ostream& operator<<(ostream& os, const DETokenizer::Token& token) { + return os.write(token.ptr, token.len); +} + +/*! + * \brief a checkpoint or log entry read back from disk + * + * This class represents lines that have been read back from either + * the checkpoint or log file during KFS startup. Each entry in + * these files is in the form + * + * ///... + * + * where represents a type of metatree node in the case + * of checkpoint, or an update request, in a log file. 
The basic + * processing is to split the line into ts component parts, then + * use the keyword to look up a function that validates the remaining + * data and performs whatever action is appropriate. In the case + * of checkpoints, this will be to insert the specified node into + * the tree, while for log entries, we redo the update (taking care + * to specify any new file ID's so that they remain the same as + * they were before the restart). + */ +class DiskEntry { +public: + typedef DETokenizer::Token Token; + typedef bool (*parser)(DETokenizer &c); //!< a parsing function +private: + typedef map parsetab; //!< map type to parser + parsetab table; +public: + void add_parser(const Token& k, parser f) { table[k] = f; } + bool parse(DETokenizer& tonenizer); //!< look up parser and call it +}; + + +/*! + * \brief parser helper routines + * These functions remove items of the specified kind from the deque + * of components. The item will be preceded by an identifying keyword, + * which is passed in as "tag". + */ +extern bool pop_name( + string &name, const char* tag, DETokenizer &c, bool ok); +extern bool pop_path( + string &path, const char* tag, DETokenizer &c, bool ok); +extern bool pop_fid(fid_t &fid, const char* tag, DETokenizer &c, bool ok); +extern bool pop_size(size_t &sz, const char* tag, DETokenizer &c, bool ok); +extern bool pop_offset(chunkOff_t &o, const char* tag, DETokenizer &c, bool ok); +extern bool pop_short(int16_t &n, const char* tag, DETokenizer &c, bool ok); +extern bool pop_type( + FileType &t, const char* tag, DETokenizer &c, bool ok); +extern bool pop_time( + int64_t &tv, const char* tag, DETokenizer &c, bool ok); +extern bool pop_num(int64_t &n, const char* tag, DETokenizer& c, bool ok); + +} +#endif // !defined(KFS_ENTRY_H) diff --git a/src/cc/meta/Key.h b/src/cc/meta/Key.h new file mode 100644 index 000000000..5942b605c --- /dev/null +++ b/src/cc/meta/Key.h @@ -0,0 +1,127 @@ +/*! 
+ * $Id$ + * + * \file Key.h + * \brief Base class for KFS metadata key operations. + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov pack keys into 128 bits by using the fact that + * that the chunk offset is 64MB aligned. + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ +#if !defined(META_KEY_H) +#define META_KEY_H + +#include "kfstypes.h" + +namespace KFS { + +typedef int64_t KeyData; //!< "opaque" key data + +/*! + * \brief search key + * + * Key values for tree operations. + */ +class Key +{ +public: + Key(MetaType k, KeyData d1, KeyData d2 = 0) { + hi = uint64_t(k & 0x30) << (62 - 4); + const uint64_t dh = (uint64_t)(d1 - (int64_t(1) << 63)); + hi |= (dh >> 2) & ~(uint64_t(3) << 62); + lo = dh << 62; + const uint64_t dl = (uint64_t)(d2 - (int64_t(1) << 63)); + lo |= (dl >> 2) & (~uint64_t(3)); + lo |= uint64_t(k & 0x3); + } + Key() { + *this = Key(KFS_UNINIT, 0, 0); + } + bool operator < (const Key &test) const + { return (hi < test.hi || (hi == test.hi && lo < test.lo)); } + bool operator == (const Key &test) const + { return (((hi ^ test.hi) | (lo ^ test.lo)) == 0); } + bool operator != (const Key &test) const + { return ! (*this == test); } + bool operator > (const Key &test) const + { return (test < *this); } + bool operator <= (const Key &test) const + { return ! 
(*this > test); } + bool operator >= (const Key &test) const + { return ! (*this < test); } +private: + uint64_t hi; + uint64_t lo; + friend class PartialMatch; +}; + +class PartialMatch +{ +private: + static const uint64_t mask = (uint64_t(3) << 62) | uint64_t(3); + Key key; +public: + PartialMatch(MetaType k, KeyData d1) + : key(k, d1) + {} + PartialMatch(const Key& k) + : key(k) + {} + bool operator < (const Key &test) const { + return (key.hi < test.hi || (key.hi == test.hi && + (key.lo & mask) < (test.lo & mask))); + } + bool operator > (const Key &test) const { + return (key.hi > test.hi || (key.hi == test.hi && + (key.lo & mask) > (test.lo & mask))); + } + bool operator == (const Key &test) const { + return (((key.hi ^ test.hi) | + ((key.lo ^ test.lo) & mask)) == 0); + } + bool operator != (const Key &test) const + { return ! (*this == test); } + bool operator <= (const Key &test) const + { return ! (*this > test); } + bool operator >= (const Key &test) const + { return ! (*this < test); } +}; + +inline bool operator < (const Key &l, const PartialMatch &r) { + return r > l; +} +inline bool operator > (const Key &l, const PartialMatch &r) { + return r < l; +} +inline bool operator == (const Key &l, const PartialMatch &r) { + return r == l; +} +inline bool operator != (const Key &l, const PartialMatch &r) { + return r != l; +} +inline bool operator <= (const Key &l, const PartialMatch &r) { + return r >= l; +} +inline bool operator >= (const Key &l, const PartialMatch &r) { + return r <= l; +} + +} + +#endif // !defined(META_KEY_H) diff --git a/src/cc/meta/LayoutManager.cc b/src/cc/meta/LayoutManager.cc new file mode 100644 index 000000000..790ef0025 --- /dev/null +++ b/src/cc/meta/LayoutManager.cc @@ -0,0 +1,9235 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/06 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. 
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file LayoutManager.cc +// \brief Layout manager implementation. +// +//---------------------------------------------------------------------------- + +#include "LayoutManager.h" +#include "kfstree.h" +#include "ClientSM.h" +#include "NetDispatch.h" + +#include "kfsio/Globals.h" +#include "kfsio/IOBuffer.h" +#include "qcdio/QCIoBufferPool.h" +#include "qcdio/QCUtils.h" +#include "common/MsgLogger.h" +#include "common/Properties.h" +#include "common/time.h" +#include "common/Version.h" +#include "common/RequestParser.h" +#include "common/StdAllocator.h" +#include "common/rusage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace KFS { + +using std::for_each; +using std::find; +using std::sort; +using std::unique; +using std::random_shuffle; +using std::vector; +using std::min; +using std::max; +using std::istringstream; +using std::ostream; +using std::ostringstream; +using std::make_pair; +using std::pair; +using std::make_heap; +using std::pop_heap; +using std::ostream_iterator; +using std::copy; +using std::string; +using std::ofstream; +using std::ifstream; +using std::numeric_limits; +using std::iter_swap; +using std::setw; +using std::setfill; +using boost::mem_fn; +using boost::bind; +using boost::ref; + +using libkfsio::globalNetManager; +using libkfsio::globals; + +const 
int64_t kSecs2MicroSecs = 1000 * 1000; + +//LayoutManager gLayoutManager; + +class LayoutManager::RandGen +{ +public: + RandGen(LayoutManager& m) + : mLm(m) + {} + size_t operator()(size_t interval) { + return (size_t)mLm.Rand((int64_t)interval); + } +private: + LayoutManager& mLm; +}; + +static inline time_t +TimeNow() +{ + return globalNetManager().Now(); +} + +static inline time_t +GetInitialWriteLeaseExpireTime() { + return (TimeNow() + 10 * 365 * 24 * 60 * 60); +} + +static inline seq_t +RandomSeqNo() +{ + seq_t ret = 0; + RAND_pseudo_bytes( + reinterpret_cast(&ret), int(sizeof(ret))); + return ((ret < 0 ? -ret : ret) >> 1); +} + +static inline void +UpdatePendingRecovery(CSMap& csmap, CSMap::Entry& ent) +{ + // Chunk wasn't previously available, check to see if recovery can + // proceed now. + // Schedule re-check of all pending recovery chunks that belong to the + // file, and let the CanReplicateChunkNow decide if recovery can start + // or not. + if (csmap.GetState(ent) != CSMap::Entry::kStatePendingRecovery) { + return; + } + const MetaFattr* const fa = ent.GetFattr(); + for (CSMap::Entry* prev = csmap.Prev(ent); + prev && prev->GetFattr() == fa; ) { + CSMap::Entry& entry = *prev; + prev = csmap.Prev(entry); + csmap.SetState(entry, CSMap::Entry::kStateCheckReplication); + } + for (CSMap::Entry* next = &ent; ;) { + CSMap::Entry& entry = *next; + next = csmap.Next(entry); + csmap.SetState(entry, CSMap::Entry::kStateCheckReplication); + if (! next || next->GetFattr() != fa) { + break; + } + } +} + +static inline void +UpdateReplicationState(CSMap& csmap, CSMap::Entry& entry) +{ + // Re-schedule replication check if needed. 
+ CSMap::Entry::State const curState = csmap.GetState(entry); + if (curState == CSMap::Entry::kStatePendingRecovery) { + UpdatePendingRecovery(csmap, entry); + } else if (curState == CSMap::Entry::kStatePendingReplication) { + csmap.SetState(entry, CSMap::Entry::kStateCheckReplication); + } +} + +class ChunkIdMatcher +{ + const chunkId_t myid; +public: + ChunkIdMatcher(chunkId_t c) : myid(c) { } + bool operator() (MetaChunkInfo *c) const { + return c->chunkId == myid; + } +}; + +inline bool +LayoutManager::InRecoveryPeriod() const +{ + return (TimeNow() < mRecoveryStartTime + mRecoveryIntervalSec); +} + +inline bool +LayoutManager::InRecovery() const +{ + return ( + mChunkServers.size() < mMinChunkserversToExitRecovery || + InRecoveryPeriod() + ); +} + +inline bool +LayoutManager::IsChunkServerRestartAllowed() const +{ + return ( + ! InRecovery() && + mChunkServers.size() > mMinChunkserversToExitRecovery && + mHibernatingServers.empty() + ); +} + +inline bool +ARAChunkCache::Invalidate(iterator it) +{ + assert(it != mMap.end() && ! mMap.empty()); + mMap.erase(it); + return true; +} + +inline bool +ARAChunkCache::Invalidate(fid_t fid) +{ + iterator const it = mMap.find(fid); + if (it == mMap.end()) { + return false; + } + mMap.erase(it); + return true; +} + +inline bool +ARAChunkCache::Invalidate(fid_t fid, chunkId_t chunkId) +{ + iterator const it = mMap.find(fid); + if (it == mMap.end() || it->second.chunkId != chunkId) { + return false; + } + mMap.erase(it); + return true; +} + +void +ARAChunkCache::RequestNew(MetaAllocate& req) +{ + if (req.offset < 0 || (req.offset % CHUNKSIZE) != 0 || + ! req.appendChunk) { + panic("ARAChunkCache::RequestNew: invalid parameters"); + return; + } + // Find the end of the list, normally list should have only one element. 
+ MetaAllocate* last = &req; + while (last->next) { + last = last->next; + } + mMap[req.fid] = Entry( + req.chunkId, + req.chunkVersion, + req.offset, + TimeNow(), + last, + req.permissions + ); +} + +bool +ARAChunkCache::Entry::AddPending(MetaAllocate& req) +{ + assert(req.appendChunk); + + if (! lastPendingRequest || ! req.appendChunk) { + if (req.appendChunk) { + req.responseStr = responseStr; + } + return false; + } + assert(lastPendingRequest->suspended); + MetaAllocate* last = &req; + last->suspended = true; + while (last->next) { + last = last->next; + last->suspended = true; + } + // Put request to the end of the queue. + // Make sure that the last pointer is correct. + while (lastPendingRequest->next) { + lastPendingRequest = lastPendingRequest->next; + } + lastPendingRequest->next = last; + lastPendingRequest = last; + return true; +} + +void +ARAChunkCache::RequestDone(const MetaAllocate& req) +{ + assert(req.appendChunk); + + iterator const it = mMap.find(req.fid); + if (it == mMap.end()) { + return; + } + Entry& entry = it->second; + if (entry.chunkId != req.chunkId) { + return; + } + if (req.status != 0) { + // Failure, invalidate the cache. + mMap.erase(it); + return; + } + entry.lastAccessedTime = TimeNow(); + entry.offset = req.offset; + if (entry.lastPendingRequest) { + // Transition from pending to complete. + // Cache the response. Restart decay timer. 
+ entry.responseStr = req.responseStr; + entry.lastDecayTime = entry.lastAccessedTime; + entry.lastPendingRequest = 0; + } +} + +void +ARAChunkCache::Timeout(time_t minTime) +{ + for (iterator it = mMap.begin(); it != mMap.end(); ) { + const Entry& entry = it->second; + if (entry.lastAccessedTime >= minTime || + entry.lastPendingRequest) { + ++it; // valid entry; keep going + } else { + mMap.erase(it++); + } + } +} + +ChunkLeases::ChunkLeases() + : mLeaseId(RandomSeqNo()), + mReadLeases(), + mWriteLeases(), + mCurWrIt(mWriteLeases.end()), + mTimerRunningFlag(false) +{} + +inline void +ChunkLeases::Erase( + WriteLeases::iterator it) +{ + assert(it != mWriteLeases.end()); + if (mTimerRunningFlag && it == mCurWrIt) { + ++mCurWrIt; + } + mWriteLeases.erase(it); +} + +inline void +ChunkLeases::Erase( + ReadLeases::iterator it) +{ + assert(it != mReadLeases.end()); + const bool updateFlag = it->second.mScheduleReplicationCheckFlag; + const chunkId_t chunkId = it->first; + mReadLeases.erase(it); + if (updateFlag) { + gLayoutManager.ChangeChunkReplication(chunkId); + } +} + +inline bool +ChunkLeases::IsReadLease( + ChunkLeases::LeaseId leaseId) +{ + return ((leaseId & 0x1) == 0); +} + +inline bool +ChunkLeases::IsWriteLease( + ChunkLeases::LeaseId leaseId) +{ + return (! IsReadLease(leaseId)); +} + +inline ChunkLeases::LeaseId +ChunkLeases::NewReadLeaseId() +{ + const LeaseId id = IsReadLease(mLeaseId) ? mLeaseId : (mLeaseId + 1); + assert(IsReadLease(id)); + return id; +} + +inline ChunkLeases::LeaseId +ChunkLeases::NewWriteLeaseId() +{ + const LeaseId id = IsWriteLease(mLeaseId) ? mLeaseId : (mLeaseId + 1); + assert(IsWriteLease(id)); + return id; +} + +inline const ChunkLeases::WriteLease* +ChunkLeases::GetWriteLease( + chunkId_t chunkId) const +{ + WriteLeases::const_iterator const wi = mWriteLeases.find(chunkId); + return (wi != mWriteLeases.end() ? 
&wi->second : 0); +} + +inline const ChunkLeases::WriteLease* +ChunkLeases::GetValidWriteLease( + chunkId_t chunkId) const +{ + WriteLeases::const_iterator const wi = mWriteLeases.find(chunkId); + return ((wi != mWriteLeases.end() && TimeNow() <= wi->second.expires) ? + &wi->second : 0); +} + +inline const ChunkLeases::WriteLease* +ChunkLeases::RenewValidWriteLease( + chunkId_t chunkId) +{ + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + if (wi == mWriteLeases.end()) { + return 0; + } + const time_t now = TimeNow(); + if (wi->second.expires < now) { + return 0; + } + if (! wi->second.allocInFlight) { + wi->second.expires = + max(wi->second.expires, now + LEASE_INTERVAL_SECS); + } + return (&wi->second); +} + +inline bool +ChunkLeases::HasValidWriteLease( + chunkId_t chunkId) const +{ + WriteLeases::const_iterator const wi = mWriteLeases.find(chunkId); + return (wi != mWriteLeases.end() && TimeNow() <= wi->second.expires); +} + +inline bool +ChunkLeases::HasWriteLease( + chunkId_t chunkId) const +{ + return (mWriteLeases.find(chunkId) != mWriteLeases.end()); +} + +inline bool +ChunkLeases::HasValidLease( + chunkId_t chunkId) const +{ + if (HasValidWriteLease(chunkId)) { + return true; + } + ReadLeases::const_iterator const ri = mReadLeases.find(chunkId); + if (ri == mReadLeases.end()) { + return false; + } + const time_t now = TimeNow(); + for (ChunkReadLeases::const_iterator it = ri->second.mLeases.begin(); + it != ri->second.mLeases.end(); ++it) { + if (now <= it->expires) { + return true; + } + } + return false; +} + +inline bool +ChunkLeases::HasLease( + chunkId_t chunkId) const +{ + ReadLeases::const_iterator const ri = mReadLeases.find(chunkId); + if (ri != mReadLeases.end() && ! 
ri->second.mLeases.empty()) { + return true; + } + return (mWriteLeases.find(chunkId) != mWriteLeases.end()); +} + +inline bool +ChunkLeases::UpdateReadLeaseReplicationCheck( + chunkId_t chunkId, + bool setScheduleReplicationCheckFlag) +{ + ReadLeases::iterator const ri = mReadLeases.find(chunkId); + if (ri != mReadLeases.end() && ! ri->second.mLeases.empty()) { + if (setScheduleReplicationCheckFlag) { + ri->second.mScheduleReplicationCheckFlag = true; + } + return true; + } + return false; +} + +inline int +ChunkLeases::ReplicaLost( + chunkId_t chunkId, + const ChunkServer* chunkServer) +{ + WriteLeases::iterator it = mWriteLeases.find(chunkId); + if (it == mWriteLeases.end()) { + return -EINVAL; + } + return ReplicaLost(it->second, chunkServer); +} + +inline int +ChunkLeases::ReplicaLost( + ChunkLeases::WriteLease& wl, + const ChunkServer* chunkServer) +{ + if (wl.chunkServer.get() == chunkServer && ! wl.relinquishedFlag && + ! wl.allocInFlight) { + const time_t now = TimeNow(); + if (wl.stripedFileFlag && now <= wl.expires) { + // Keep the valid lease for striped files, instead, to + // allow lease renewal when/if the next chunk allocation + // comes in. 
+ wl.expires = max(wl.expires, now + LEASE_INTERVAL_SECS); + } else { + wl.expires = now - 1; + } + wl.ownerWasDownFlag = wl.ownerWasDownFlag || + (chunkServer && chunkServer->IsDown()); + WriteLease::Mutable(wl.chunkServer).reset(); + } + return 0; +} + +inline void +ChunkLeases::ServerDown( + const ChunkServerPtr& chunkServer, + ARAChunkCache& arac, + CSMap& csmap) +{ + for (WriteLeases::iterator it = mWriteLeases.begin(); + it != mWriteLeases.end(); + ) { + chunkId_t const chunkId = it->first; + WriteLease& wl = it->second; + CSMap::Entry* ci = 0; + ++it; + if (wl.appendFlag && + (ci = csmap.Find(chunkId)) && + csmap.HasServer(chunkServer, *ci)) { + arac.Invalidate(ci->GetFileId(), chunkId); + } + ReplicaLost(wl, chunkServer.get()); + } +} + +inline bool +ChunkLeases::ExpiredCleanup( + ChunkLeases::ReadLeases::iterator ri, + time_t now) +{ + const time_t maxLeaseEndTime = now + LEASE_INTERVAL_SECS; + for (ChunkReadLeases::iterator it = ri->second.mLeases.begin(); + it != ri->second.mLeases.end(); ) { + if (it->expires < now) { + it = ri->second.mLeases.erase(it); + continue; + } + if (it->expires <= maxLeaseEndTime) { + // List is ordered by expiration time. + break; + } + // Reset to the max allowed. + it->expires = maxLeaseEndTime; + ++it; + } + if (ri->second.mLeases.empty()) { + Erase(ri); + return true; + } + return false; +} + +inline bool +ChunkLeases::ExpiredCleanup( + ChunkLeases::WriteLeases::iterator it, + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap) +{ + WriteLease& wl = it->second; + const chunkId_t chunkId = it->first; + CSMap::Entry* const ci = csmap.Find(chunkId); + if (! ci) { + Erase(it); + return true; + } + if (wl.allocInFlight || now <= wl.expires + + ((wl.ownerWasDownFlag && ownerDownExpireDelay > 0) ? 
+ ownerDownExpireDelay : 0)) { + return false; + } + const bool relinquishedFlag = wl.relinquishedFlag; + const seq_t chunkVersion = wl.chunkVersion; + const string pathname = wl.pathname; + const bool appendFlag = wl.appendFlag; + const bool stripedFileFlag = wl.stripedFileFlag; + Erase(it); + if (relinquishedFlag) { + UpdateReplicationState(csmap, *ci); + return true; + } + if (appendFlag) { + arac.Invalidate(ci->GetFileId(), chunkId); + } + const bool leaseRelinquishFlag = true; + gLayoutManager.MakeChunkStableInit( + *ci, + chunkVersion, + pathname, + appendFlag, + -1, + false, + 0, + stripedFileFlag, + appendFlag, + leaseRelinquishFlag + ); + return true; +} + +inline bool +ChunkLeases::ExpiredCleanup( + chunkId_t chunkId, + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap) +{ + ReadLeases::iterator const ri = mReadLeases.find(chunkId); + if (ri != mReadLeases.end()) { + assert(mWriteLeases.find(chunkId) == mWriteLeases.end()); + const bool ret = ExpiredCleanup(ri, now); + if (! ret && ! csmap.Find(chunkId)) { + Erase(ri); + return true; + } + return ret; + } + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + return (wi == mWriteLeases.end() || ExpiredCleanup( + wi, now, ownerDownExpireDelay, arac, csmap)); +} + +inline const char* +ChunkLeases::FlushWriteLease( + chunkId_t chunkId, + ARAChunkCache& arac, + CSMap& csmap) +{ + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + if (wi == mWriteLeases.end()) { + return "no write lease"; + } + WriteLease& wl = wi->second; + if (! 
wl.appendFlag) { + return "not append lease"; + } + if (wl.allocInFlight) { + return "allocation in flight"; + } + if (wl.relinquishedFlag || wl.ownerWasDownFlag) { + return "write lease expiration in flight"; + } + const time_t now = TimeNow(); + wi->second.expires = min(wi->second.expires, now - 1); + if (ExpiredCleanup(wi, now, 0, arac, csmap)) { + return 0; + } + return "write lease expiration delayed"; +} + +inline int +ChunkLeases::LeaseRelinquish( + const MetaLeaseRelinquish& req, + ARAChunkCache& arac, + CSMap& csmap) +{ + ReadLeases::iterator const ri = mReadLeases.find(req.chunkId); + if (ri != mReadLeases.end()) { + assert(mWriteLeases.find(req.chunkId) == mWriteLeases.end()); + for (ChunkReadLeases::iterator it = ri->second.mLeases.begin(); + it != ri->second.mLeases.end(); ++it) { + if (it->leaseId == req.leaseId) { + const time_t now = TimeNow(); + const int ret = it->expires < now ? + -ELEASEEXPIRED : 0; + ri->second.mLeases.erase(it); + if (ri->second.mLeases.empty()) { + Erase(ri); + } + return ret; + } + } + return -EINVAL; + } + + WriteLeases::iterator const wi = mWriteLeases.find(req.chunkId); + if (wi == mWriteLeases.end() || wi->second.leaseId != req.leaseId) { + return -EINVAL; + } + const CSMap::Entry* const ci = csmap.Find(req.chunkId); + if (! ci) { + return -ELEASEEXPIRED; + } + WriteLease& wl = wi->second; + if (wl.allocInFlight) { + // If relinquish comes in before alloc completes, then + // run completion when / if allocation finishes successfully. + if (! 
wl.allocInFlight->pendingLeaseRelinquish) { + const_cast(wl.allocInFlight + )->pendingLeaseRelinquish = + new MetaLeaseRelinquish(); + } + MetaLeaseRelinquish& lr = + *(wl.allocInFlight->pendingLeaseRelinquish); + lr.leaseType = req.leaseType; + lr.chunkId = req.chunkId; + lr.leaseId = req.leaseId; + lr.chunkSize = req.chunkSize; + lr.hasChunkChecksum = req.hasChunkChecksum; + lr.chunkChecksum = req.chunkChecksum; + return 0; + } + const time_t now = TimeNow(); + const int ret = wl.expires < now ? -ELEASEEXPIRED : 0; + const bool hadLeaseFlag = ! wl.relinquishedFlag; + WriteLease::Mutable(wl.chunkServer).reset(); + wl.relinquishedFlag = true; + // the owner of the lease is giving up the lease; update the expires so + // that the normal lease cleanup will work out. + wl.expires = min(time_t(0), now - 100 * LEASE_INTERVAL_SECS); + if (hadLeaseFlag) { + // For write append lease checksum and size always have to be + // specified for make chunk stable, otherwise run begin make + // chunk stable. + const CSMap::Entry& v = *ci; + const bool beginMakeChunkStableFlag = wl.appendFlag && + (! req.hasChunkChecksum || req.chunkSize < 0); + if (wl.appendFlag) { + arac.Invalidate(v.GetFileId(), req.chunkId); + } + const bool leaseRelinquishFlag = true; + gLayoutManager.MakeChunkStableInit( + v, + wl.chunkVersion, + wl.pathname, + beginMakeChunkStableFlag, + req.chunkSize, + req.hasChunkChecksum, + req.chunkChecksum, + wl.stripedFileFlag, + wl.appendFlag, + leaseRelinquishFlag + ); + } + return ret; +} + +inline bool +ChunkLeases::Timer( + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap) +{ + if (mTimerRunningFlag) { + return false; // Do not allow recursion. + } + mTimerRunningFlag = true; + // Properly handling clock jump is a bit more involved, for now check + // for long leases, and cleanup stale ones. 
+ const time_t checkChunkPresentExpireTime = + now + 2 * LEASE_INTERVAL_SECS; + bool cleanedFlag = false; + for (ReadLeases::iterator ri = mReadLeases.begin(); + ri != mReadLeases.end(); ) { + ReadLeases::iterator const it = ri++; + if (ExpiredCleanup(it, now)) { + cleanedFlag = true; + } + } + for (mCurWrIt = mWriteLeases.begin(); + mCurWrIt != mWriteLeases.end(); ) { + WriteLeases::iterator const it = mCurWrIt++; + if (ExpiredCleanup( + it, + now, + ownerDownExpireDelay, + arac, + csmap)) { + cleanedFlag = true; + } else if (! it->second.allocInFlight && + checkChunkPresentExpireTime < + it->second.expires) { + if (! csmap.Find(it->first)) { + Erase(it); + cleanedFlag = true; + } + } + } + mTimerRunningFlag = false; + return cleanedFlag; +} + +inline bool +ChunkLeases::NewReadLease( + chunkId_t chunkId, + time_t expires, + ChunkLeases::LeaseId& leaseId) +{ + if (mWriteLeases.find(chunkId) != mWriteLeases.end()) { + assert(mReadLeases.find(chunkId) == mReadLeases.end()); + return false; + } + // Keep list sorted by expiration time. 
+ const LeaseId id = NewReadLeaseId(); + ChunkReadLeases& rl = mReadLeases[chunkId].mLeases; + ChunkReadLeases::iterator it = rl.end(); + while (it != rl.begin()) { + --it; + if (it->expires <= expires) { + ++it; + break; + } + } + rl.insert(it, ReadLease(id, expires)); + leaseId = id; + mLeaseId = id + 1; + return true; +} + +inline bool +ChunkLeases::NewWriteLease( + chunkId_t chunkId, + seq_t chunkVersion, + time_t expires, + const ChunkServerPtr& server, + const string& path, + bool append, + bool stripedFileFlag, + const MetaAllocate* allocInFlight, + ChunkLeases::LeaseId& leaseId) +{ + if (mReadLeases.find(chunkId) != mReadLeases.end()) { + assert(mWriteLeases.find(chunkId) == mWriteLeases.end()); + return false; + } + const LeaseId id = NewWriteLeaseId(); + WriteLease const wl( + id, + chunkVersion, + server, + path, + append, + stripedFileFlag, + allocInFlight, + expires + ); + pair const res = + mWriteLeases.insert(make_pair(chunkId, wl)); + leaseId = res.first->second.leaseId; + if (res.second) { + mLeaseId = id + 1; + } + return res.second; +} + +inline int +ChunkLeases::Renew( + chunkId_t chunkId, + ChunkLeases::LeaseId leaseId, + bool allocDoneFlag /* = false */) +{ + if (IsReadLease(leaseId)) { + ReadLeases::iterator const ri = mReadLeases.find(chunkId); + if (ri == mReadLeases.end()) { + return -EINVAL; + } + assert(mWriteLeases.find(chunkId) == mWriteLeases.end()); + for (ChunkReadLeases::iterator it = ri->second.mLeases.begin(); + it != ri->second.mLeases.end(); ++it) { + if (it->leaseId == leaseId) { + const time_t now = TimeNow(); + if (it->expires < now) { + // Don't renew expired leases. + ri->second.mLeases.erase(it); + if (ri->second.mLeases.empty()) { + Erase(ri); + } + return -ELEASEEXPIRED; + } + it->expires = now + LEASE_INTERVAL_SECS; + // Keep the list sorted by expiration time. 
+ // Max expiration time is + // now + LEASE_INTERVAL_SECS + ri->second.mLeases.splice( + ri->second.mLeases.end(), + ri->second.mLeases, it); + return 0; + } + } + return -EINVAL; + } + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + if (wi == mWriteLeases.end() || wi->second.leaseId != leaseId) { + return -EINVAL; + } + assert(mReadLeases.find(chunkId) == mReadLeases.end()); + const time_t now = TimeNow(); + if (wi->second.expires < now && ! wi->second.allocInFlight) { + // Don't renew expired leases, and let the timer to clean it up + // to avoid posible recursion. + return -ELEASEEXPIRED; + } + if (allocDoneFlag) { + wi->second.allocInFlight = 0; + } + if (! wi->second.allocInFlight) { + wi->second.expires = now + LEASE_INTERVAL_SECS; + } + return 0; +} + +inline bool +ChunkLeases::DeleteWriteLease( + chunkId_t chunkId, + ChunkLeases::LeaseId leaseId) +{ + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + if (wi == mWriteLeases.end() || wi->second.leaseId != leaseId) { + return false; + } + Erase(wi); + return true; +} + +inline void +ChunkLeases::SetMaxLeaseId( + ChunkLeases::LeaseId id) +{ + if (id > mLeaseId) { + mLeaseId = id; + } +} + +inline bool +ChunkLeases::Delete( + chunkId_t chunkId) +{ + WriteLeases::iterator const wi = mWriteLeases.find(chunkId); + const bool hadWr = wi != mWriteLeases.end(); + if (hadWr) { + Erase(wi); + } + ReadLeases::iterator ri = mReadLeases.find(chunkId); + const bool hadRd = ri != mReadLeases.end(); + if (hadRd) { + Erase(ri); + } + assert(! hadWr || ! hadRd); + return (hadWr || hadRd); +} + +inline void +ChunkLeases::GetOpenFiles( + MetaOpenFiles::ReadInfo& openForRead, + MetaOpenFiles::WriteInfo& openForWrite, + const CSMap& csmap) const +{ + const time_t now = TimeNow(); + for (ReadLeases::const_iterator ri = mReadLeases.begin(); + ri != mReadLeases.end(); ++ri) { + const CSMap::Entry* const ci = csmap.Find(ri->first); + if (! 
ci) { + continue; + } + size_t count = 0; + for (ChunkReadLeases::const_iterator + it = ri->second.mLeases.begin(); + it != ri->second.mLeases.end(); + ++it) { + if (now <= it->expires) { + count++; + } + } + if (count > 0) { + openForRead[ci->GetFileId()] + .push_back(make_pair(ri->first, count)); + } + } + for (WriteLeases::const_iterator wi = mWriteLeases.begin(); + wi != mWriteLeases.end(); ++wi) { + if (now <= wi->second.expires) { + const CSMap::Entry* const ci = csmap.Find(wi->first); + if (ci) { + openForWrite[ci->GetFileId()] + .push_back(wi->first); + } + } + } +} + +class MatchingServer +{ + const ServerLocation loc; +public: + MatchingServer(const ServerLocation& l) : loc(l) {} + bool operator() (const ChunkServerPtr &s) const { + return s->MatchingServer(loc); + } +}; + +inline CSMap::Entry& +LayoutManager::GetCsEntry(MetaChunkInfo& chunkInfo) +{ + return CSMap::Entry::GetCsEntry(chunkInfo); +} + +inline CSMap::Entry* +LayoutManager::GetCsEntry(MetaChunkInfo* chunkInfo) +{ + return CSMap::Entry::GetCsEntry(chunkInfo); +} + +inline void +LayoutManager::UpdatePendingRecovery(CSMap::Entry& ent) +{ + KFS::UpdatePendingRecovery(mChunkToServerMap, ent); +} + +inline bool +LayoutManager::AddHosted(CSMap::Entry& entry, const ChunkServerPtr& c) +{ + // Schedule replication even if the server went down, let recovery + // logic decide what to do. + UpdatePendingRecovery(entry); + if (! 
c || c->IsDown()) { + return false; + } + if (c->IsEvacuationScheduled(entry.GetChunkId())) { + CheckReplication(entry); + } + return mChunkToServerMap.AddServer(c, entry); +} + +inline bool +LayoutManager::AddHosted(chunkId_t chunkId, CSMap::Entry& entry, const ChunkServerPtr& c) +{ + if (chunkId != entry.GetChunkId()) { + panic("add hosted chunk id mismatch"); + return false; + } + return AddHosted(entry, c); +} + +inline void +LayoutManager::UpdateReplicationState(CSMap::Entry& entry) +{ + KFS::UpdateReplicationState(mChunkToServerMap, entry); +} + +inline void +LayoutManager::SetReplicationState(CSMap::Entry& entry, + CSMap::Entry::State state) +{ + CSMap::Entry::State const curState = mChunkToServerMap.GetState(entry); + if (curState == state) { + return; + } + if (curState == CSMap::Entry::kStatePendingRecovery) { + // Re-schedule replication check if needed. + UpdatePendingRecovery(entry); + } + mChunkToServerMap.SetState(entry, state); +} + +inline void +LayoutManager::CheckReplication(CSMap::Entry& entry) +{ + SetReplicationState(entry, CSMap::Entry::kStateCheckReplication); +} + +inline seq_t +LayoutManager::GetChunkVersionRollBack(chunkId_t chunkId) +{ + ChunkVersionRollBack::iterator const it = + mChunkVersionRollBack.find(chunkId); + if (it != mChunkVersionRollBack.end()) { + if (it->second <= 0) { + ostringstream os; + os << + "invalid chunk roll back entry:" + " chunk: " << it->first << + " version increment: " << it->second; + const string msg = os.str(); + panic(msg.c_str()); + mChunkVersionRollBack.erase(it); + } else { + return it->second; + } + } + return 0; +} + +inline seq_t +LayoutManager::IncrementChunkVersionRollBack(chunkId_t chunkId) +{ + pair const res = + mChunkVersionRollBack.insert(make_pair(chunkId, 0)); + if (! 
res.second && res.first->second <= 0) { + ostringstream os; + os << + "invalid chunk roll back entry:" + " chunk: " << res.first->first << + " version increment: " << res.first->second; + const string msg = os.str(); + panic(msg.c_str()); + res.first->second = 0; + } + ++(res.first->second); + return res.first->second; +} + +LayoutManager::Random::result_type +LayoutManager::RandSeed() +{ + Random::result_type theRet = 1; + RAND_pseudo_bytes( + reinterpret_cast(&theRet), + int(sizeof(theRet)) + ); + return theRet; +} + +int64_t +LayoutManager::Rand(int64_t interval) +{ + // Use this simpler and hopefully faster version instead of + // variate_generator > + // Scaling up and down 32 bit random number should be more than + // adequate for now, even though the result will at most 32 + // random bits. + + // Don't use modulo, low order bits might be "less random". + // Though this shouldn't be a problem with Mersenne twister. + return min((int64_t)((uint64_t)(mRandom() - mRandMin) * interval / + mRandInterval), interval - 1); +} + +LayoutManager::ChunkPlacement::ChunkPlacement() + : Super(gLayoutManager) +{ + Reserve(512); +} + +LayoutManager::LayoutManager() : + mNumOngoingReplications(0), + mIsRebalancingEnabled(false), + mMaxRebalanceSpaceUtilThreshold(0.8), + mMinRebalanceSpaceUtilThreshold(0.65), + mIsExecutingRebalancePlan(false), + mRecoveryStartTime(0), + mStartTime(time(0)), + mRecoveryIntervalSec(LEASE_INTERVAL_SECS), + mLeaseCleaner(60 * 1000), + mChunkReplicator(5 * 1000), + mCheckpoint(5 * 1000), + mMinChunkserversToExitRecovery(1), + mMastersCount(0), + mSlavesCount(0), + mAssignMasterByIpFlag(false), + mLeaseOwnerDownExpireDelay(30), + mMaxReservationSize(4 << 20), + mReservationDecayStep(4), // decrease by factor of 2 every 4 sec + mChunkReservationThreshold(CHUNKSIZE), + mAllocAppendReuseInFlightTimeoutSec(25), + mMinAppendersPerChunk(96), + mMaxAppendersPerChunk(4 << 10), + mReservationOvercommitFactor(.95), + mServerDownReplicationDelay(2 * 60), + 
mMaxDownServersHistorySize(4 << 10), + mChunkServersProps(), + mCSToRestartCount(0), + mMastersToRestartCount(0), + mMaxCSRestarting(0), + mRetireOnCSRestartFlag(true), + mMaxCSUptime(int64_t(24) * 60 * 60 * 36500), + mCSRestartTime(TimeNow() + int64_t(24) * 60 * 60 * 36500), + mCSGracefulRestartTimeout(15 * 60), + mCSGracefulRestartAppendWithWidTimeout(40 * 60), + mLastReplicationCheckTime(numeric_limits::min()), // check all + mLastRecomputeDirsizeTime(TimeNow()), + mRecomputeDirSizesIntervalSec(60 * 60 * 24 * 3650), + mMaxConcurrentWriteReplicationsPerNode(5), + mMaxConcurrentReadReplicationsPerNode(10), + mUseEvacuationRecoveryFlag(true), + mReplicationFindWorkTimeouts(0), + // Replication check 30ms/.20-30ms = 120 -- 20% cpu when idle + mMaxTimeForChunkReplicationCheck(30 * 1000), + mMinChunkReplicationCheckInterval(120 * 1000), + mLastReplicationCheckRunEndTime(microseconds()), + mReplicationCheckTimeouts(0), + mNoServersAvailableForReplicationCount(0), + mFullReplicationCheckInterval( + int64_t(7) * 24 * 60 * 60 * kSecs2MicroSecs), + mCheckAllChunksInProgressFlag(false), + mConcurrentWritesPerNodeWatermark(10), + mMaxSpaceUtilizationThreshold(0.95), + mUseFsTotalSpaceFlag(true), + mChunkAllocMinAvailSpace(2 * (int64_t)CHUNKSIZE), + mCompleteReplicationCheckInterval(30 * kSecs2MicroSecs), + mCompleteReplicationCheckTime( + microseconds() - mCompleteReplicationCheckInterval), + mPastEofRecoveryDelay(int64_t(60) * 60 * kSecs2MicroSecs), + mMaxServerCleanupScan(2 << 10), + mMaxRebalanceScan(1024), + mRebalanceReplicationsThreshold(0.5), + mRebalanceReplicationsThresholdCount(0), + mMaxRebalanceRunTime(int64_t(30) * 1000), + mLastRebalanceRunTime(microseconds()), + mRebalanceRunInterval(int64_t(512) * 1024), + mMaxRebalancePlanRead(2048), + mRebalancePlanFileName(), + mRebalanceCtrs(), + mRebalancePlan(), + mCleanupScheduledFlag(false), + mCSCountersUpdateInterval(2), + mCSCountersUpdateTime(0), + mCSCounters(), + mCSCountersResponse(), + mPingUpdateInterval(2), 
+ mPingUpdateTime(0), + mPingResponse(), + mStringStream(), + mWOstream(), + mBufferPool(0), + mMightHaveRetiringServersFlag(false), + mRackPrefixUsePortFlag(false), + mRackPrefixes(), + mRackWeights(), + mChunkServerMd5sums(), + mClusterKey(), + mDelayedRecoveryUpdateMaxScanCount(32), + mForceDelayedRecoveryUpdateFlag(false), + mSortCandidatesBySpaceUtilizationFlag(false), + mSortCandidatesByLoadAvgFlag(false), + mMaxFsckFiles(128 << 10), + mFsckAbandonedFileTimeout(int64_t(1000) * kSecs2MicroSecs), + mMaxFsckTime(int64_t(19) * 60 * kSecs2MicroSecs), + mFullFsckFlag(true), + mMTimeUpdateResolution(kSecs2MicroSecs), + mMaxPendingRecoveryMsgLogInfo(1 << 10), + mAllowLocalPlacementFlag(true), + mAllowLocalPlacementForAppendFlag(false), + mInRackPlacementForAppendFlag(false), + mInRackPlacementFlag(false), + mAllocateDebugVerifyFlag(false), + mChunkEntryToChange(0), + mFattrToChangeTo(0), + mCSLoadAvgSum(0), + mCSMasterLoadAvgSum(0), + mCSSlaveLoadAvgSum(0), + mCSTotalPossibleCandidateCount(0), + mCSMasterPossibleCandidateCount(0), + mCSSlavePossibleCandidateCount(0), + mUpdateCSLoadAvgFlag(false), + mUpdatePlacementScaleFlag(false), + mCSMaxGoodCandidateLoadAvg(0), + mCSMaxGoodMasterCandidateLoadAvg(0), + mCSMaxGoodSlaveCandidateLoadAvg(0), + mCSMaxGoodCandidateLoadRatio(4), + mCSMaxGoodMasterLoadRatio(4), + mCSMaxGoodSlaveLoadRatio(4), + mSlavePlacementScale(int64_t(1) << kSlaveScaleFracBits), + mMaxSlavePlacementRange( + (int64_t)(1.8 * (int64_t(1) << kSlaveScaleFracBits))), + mMaxReplicasPerFile(MAX_REPLICAS_PER_FILE), + mMaxReplicasPerRSFile(MAX_REPLICAS_PER_FILE), + mGetAllocOrderServersByLoadFlag(true), + mMinChunkAllocClientProtoVersion(-1), + mMaxResponseSize(256 << 20), + mMinIoBufferBytesToProcessRequest(mMaxResponseSize + (10 << 20)), + mReadDirLimit(8 << 10), + mAllowChunkServerRetireFlag(false), + mPanicOnInvalidChunkFlag(false), + mAppendCacheCleanupInterval(-1), + mTotalChunkWrites(0), + mTotalWritableDrives(0), + mMinWritesPerDrive(10), + 
mMaxWritesPerDriveThreshold(mMinWritesPerDrive), + mMaxWritesPerDriveRatio(1.5), + mMaxLocalPlacementWeight(1.0), + mTotalWritableDrivesMult(0.), + mConfig(), + mDefaultUser(kKfsUserNone), // Request defaults + mDefaultGroup(kKfsGroupNone), + mDefaultFileMode(0644), + mDefaultDirMode(0755), + mDefaultLoadUser(kKfsUserRoot), // Checkpoint load and replay defaults + mDefaultLoadGroup(kKfsGroupRoot), + mDefaultLoadFileMode(0666), + mDefaultLoadDirMode(0777), + mForceEUserToRootFlag(false), + mVerifyAllOpsPermissionsFlag(false), + mRootHosts(), + mHostUserGroupRemap(), + mLastUidGidRemap(), + mIoBufPending(0), + mChunkInfosTmp(), + mChunkInfos2Tmp(), + mServersTmp(), + mServers2Tmp(), + mServers3Tmp(), + mServers4Tmp(), + mChunkPlacementTmp(), + mRandom(RandSeed()), + mRandMin(mRandom.min()), + mRandInterval(mRandom.max() - mRandMin) +{ + globals(); + mReplicationTodoStats = new Counter("Num Replications Todo"); + mOngoingReplicationStats = new Counter("Num Ongoing Replications"); + mTotalReplicationStats = new Counter("Total Num Replications"); + mFailedReplicationStats = new Counter("Num Failed Replications"); + mStaleChunkCount = new Counter("Num Stale Chunks"); + // how much to be done before we are done + globals().counterManager.AddCounter(mReplicationTodoStats); + // how many chunks are "endangered" + // how much are we doing right now + globals().counterManager.AddCounter(mOngoingReplicationStats); + globals().counterManager.AddCounter(mTotalReplicationStats); + globals().counterManager.AddCounter(mFailedReplicationStats); + globals().counterManager.AddCounter(mStaleChunkCount); +} + +LayoutManager::~LayoutManager() +{ + globals().counterManager.RemoveCounter(mOngoingReplicationStats); + globals().counterManager.RemoveCounter(mTotalReplicationStats); + globals().counterManager.RemoveCounter(mFailedReplicationStats); + globals().counterManager.RemoveCounter(mStaleChunkCount); + delete mReplicationTodoStats; + delete mOngoingReplicationStats; + delete 
mTotalReplicationStats; + delete mFailedReplicationStats; + delete mStaleChunkCount; + if (mCleanupScheduledFlag) { + globalNetManager().UnRegisterTimeoutHandler(this); + } +} + +template void +LayoutManager::LoadIdRemap(istream& fs, T OT::* map) +{ + string line; + string prefix; + istringstream is; + HostPrefix hp; + line.reserve(4 << 10); + prefix.reserve(256); + while(getline(fs, line)) { + is.str(line); + is.clear(); + if (! (is >> prefix)) { + continue; + } + if (hp.Parse(prefix) <= 0) { + continue; + } + HostUserGroupRemap::iterator it = find_if( + mHostUserGroupRemap.begin(), + mHostUserGroupRemap.end(), + bind(&HostUserGroupRemap::value_type::mHostPrefix, _1) + == hp + ); + if (it == mHostUserGroupRemap.end()) { + it = mHostUserGroupRemap.insert( + mHostUserGroupRemap.end(), + HostUserGroupRemap::value_type() + ); + it->mHostPrefix = hp; + } + typename T::key_type key; + typename T::mapped_type val; + T& m = (*it).*map; + while ((is >> key >> val)) { + m.insert(make_pair(key, val)); + } + // de-reference "line", to avoid re-allocation [hopefully] + is.str(string()); + } +} + +void +LayoutManager::SetParameters(const Properties& props, int clientPort) +{ + if (MsgLogger::GetLogger()) { + MsgLogger::GetLogger()->SetParameters( + props, "metaServer.msgLogWriter."); + } + ChunkServer::SetParameters(props, clientPort); + MetaRequest::SetParameters(props); + + mMaxConcurrentReadReplicationsPerNode = props.getValue( + "metaServer.maxConcurrentReadReplicationsPerNode", + mMaxConcurrentReadReplicationsPerNode); + mMaxConcurrentWriteReplicationsPerNode = props.getValue( + "metaServer.maxConcurrentWriteReplicationsPerNode", + mMaxConcurrentWriteReplicationsPerNode); + mUseEvacuationRecoveryFlag = props.getValue( + "metaServer.useEvacuationRecoveryFlag", + mUseEvacuationRecoveryFlag ? 
1 : 0) != 0; + mFullReplicationCheckInterval = (int64_t)(props.getValue( + "metaServer.fullReplicationCheckInterval", + mFullReplicationCheckInterval * 1e-6) * 1e6); + mMaxTimeForChunkReplicationCheck = (int64_t)(props.getValue( + "metaServer.maxTimeForChunkReplicationCheck", + mMaxTimeForChunkReplicationCheck * 1e-6) * 1e6); + mMinChunkReplicationCheckInterval = (int64_t)(props.getValue( + "metaServer.minChunkReplicationCheckInterval", + mMinChunkReplicationCheckInterval * 1e-6) * 1e6); + mConcurrentWritesPerNodeWatermark = props.getValue( + "metaServer.concurrentWritesPerNodeWatermark", + mConcurrentWritesPerNodeWatermark); + mMaxSpaceUtilizationThreshold = props.getValue( + "metaServer.maxSpaceUtilizationThreshold", + mMaxSpaceUtilizationThreshold); + mUseFsTotalSpaceFlag = props.getValue( + "metaServer.useFsTotalSpace", + mUseFsTotalSpaceFlag ? 1 : 0) != 0; + mChunkAllocMinAvailSpace = props.getValue( + "metaServer.chunkAllocMinAvailSpace", + mChunkAllocMinAvailSpace); + mCompleteReplicationCheckInterval = (int64_t)(props.getValue( + "metaServer.completeReplicationCheckInterval", + mCompleteReplicationCheckInterval * 1e-6) * 1e6); + mPastEofRecoveryDelay = (int64_t)(props.getValue( + "metaServer.pastEofRecoveryDelay", + mPastEofRecoveryDelay * 1e-6) * 1e6); + mMaxServerCleanupScan = max(0, props.getValue( + "metaServer.maxServerCleanupScan", + (int)mMaxServerCleanupScan)); + + mMaxRebalanceScan = max(0, props.getValue( + "metaServer.maxRebalanceScan", + mMaxRebalanceScan)); + mRebalanceReplicationsThreshold = props.getValue( + "metaServer.rebalanceReplicationsThreshold", + mRebalanceReplicationsThreshold); + mRebalanceReplicationsThreshold = (int64_t)(props.getValue( + "metaServer.maxRebalanceRunTime", + double(mMaxRebalanceRunTime) * 1e-6) * 1e6); + mRebalanceRunInterval = (int64_t)(props.getValue( + "metaServer.rebalanceRunInterval", + double(mRebalanceRunInterval) * 1e-6) * 1e6); + mIsRebalancingEnabled = props.getValue( + "metaServer.rebalancingEnabled", + 
mIsRebalancingEnabled ? 1 : 0) != 0; + mMaxRebalanceSpaceUtilThreshold = props.getValue( + "metaServer.maxRebalanceSpaceUtilThreshold", + mMaxRebalanceSpaceUtilThreshold); + mMinRebalanceSpaceUtilThreshold = props.getValue( + "metaServer.minRebalanceSpaceUtilThreshold", + mMinRebalanceSpaceUtilThreshold); + mMaxRebalancePlanRead = props.getValue( + "metaServer.maxRebalancePlanRead", + mMaxRebalancePlanRead); + LoadRebalancePlan(props.getValue( + "metaServer.rebalancePlanFileName", + mRebalancePlanFileName)); + + mAssignMasterByIpFlag = props.getValue( + "metaServer.assignMasterByIp", + mAssignMasterByIpFlag ? 1 : 0) != 0; + mLeaseOwnerDownExpireDelay = max(0, props.getValue( + "metaServer.leaseOwnerDownExpireDelay", + mLeaseOwnerDownExpireDelay)); + mMaxReservationSize = max(0, props.getValue( + "metaServer.wappend.maxReservationSize", + mMaxReservationSize)); + mReservationDecayStep = props.getValue( + "metaServer.reservationDecayStep", + mReservationDecayStep); + mChunkReservationThreshold = props.getValue( + "metaServer.reservationThreshold", + mChunkReservationThreshold); + mAllocAppendReuseInFlightTimeoutSec = props.getValue( + "metaServer.wappend.reuseInFlightTimeoutSec", + mAllocAppendReuseInFlightTimeoutSec); + mMinAppendersPerChunk = props.getValue( + "metaserver.wappend.minAppendersPerChunk", + mMinAppendersPerChunk); + mMaxAppendersPerChunk = props.getValue( + "metaserver.wappend.maxAppendersPerChunk", + mMaxAppendersPerChunk); + mReservationOvercommitFactor = max(0., props.getValue( + "metaServer.wappend.reservationOvercommitFactor", + mReservationOvercommitFactor)); + + mLeaseCleaner.SetTimeoutInterval((int)(props.getValue( + "metaServer.leaseCleanupInterval", + mLeaseCleaner.GetTimeoutInterval() * 1e-3) * 1e3)); + mChunkReplicator.SetTimeoutInterval((int)(props.getValue( + "metaServer.replicationCheckInterval", + mChunkReplicator.GetTimeoutInterval() * 1e-3) * 1e3)); + + mCheckpoint.GetOp().SetParameters(props); + + mCSCountersUpdateInterval = 
props.getValue( + "metaServer.CSCountersUpdateInterval", + mCSCountersUpdateInterval); + mPingUpdateInterval = props.getValue( + "metaServer.pingUpdateInterval", + mPingUpdateInterval); + + /// On startup, the # of secs to wait before we are open for reads/writes + mRecoveryIntervalSec = props.getValue( + "metaServer.recoveryInterval", mRecoveryIntervalSec); + mServerDownReplicationDelay = (int)props.getValue( + "metaServer.serverDownReplicationDelay", + double(mServerDownReplicationDelay)); + mMaxDownServersHistorySize = props.getValue( + "metaServer.maxDownServersHistorySize", + mMaxDownServersHistorySize); + + mMaxCSRestarting = props.getValue( + "metaServer.maxCSRestarting", + mMaxCSRestarting); + mRetireOnCSRestartFlag = props.getValue( + "metaServer.retireOnCSRestart", + mRetireOnCSRestartFlag ? 1 : 0) != 0; + mMaxCSUptime = props.getValue( + "metaServer.maxCSUptime", + mMaxCSUptime); + mCSGracefulRestartTimeout = max((int64_t)0, props.getValue( + "metaServer.CSGracefulRestartTimeout", + mCSGracefulRestartTimeout)); + mCSGracefulRestartAppendWithWidTimeout = max((int64_t)0, props.getValue( + "metaServer.CSGracefulRestartAppendWithWidTimeout", + mCSGracefulRestartAppendWithWidTimeout)); + + mRecomputeDirSizesIntervalSec = max(0, props.getValue( + "metaServer.recomputeDirSizesIntervalSec", + mRecomputeDirSizesIntervalSec)); + + mDelayedRecoveryUpdateMaxScanCount = props.getValue( + "metaServer.delayedRecoveryUpdateMaxScanCount", + mDelayedRecoveryUpdateMaxScanCount); + mForceDelayedRecoveryUpdateFlag = props.getValue( + "metaServer.forceDelayedRecoveryUpdate", + mForceDelayedRecoveryUpdateFlag ? 1 : 0) != 0; + + mSortCandidatesBySpaceUtilizationFlag = props.getValue( + "metaServer.sortCandidatesBySpaceUtilization", + mSortCandidatesBySpaceUtilizationFlag ? 1 : 0) != 0; + mSortCandidatesByLoadAvgFlag = props.getValue( + "metaServer.sortCandidatesByLoadAvg", + mSortCandidatesByLoadAvgFlag ? 
1 : 0) != 0; + + mMaxFsckFiles = props.getValue( + "metaServer.maxFsckChunks", + mMaxFsckFiles); + mFsckAbandonedFileTimeout = (int64_t)(props.getValue( + "metaServer.fsckAbandonedFileTimeout", + mFsckAbandonedFileTimeout * 1e-6) * 1e6); + mMaxFsckTime = (int64_t)(props.getValue( + "metaServer.mMaxFsckTime", + mMaxFsckTime * 1e-6) * 1e6); + mFullFsckFlag = props.getValue( + "metaServer.fullFsck", + mFullFsckFlag ? 1 : 0) != 0; + + mMTimeUpdateResolution = (int64_t)(props.getValue( + "metaServer.MTimeUpdateResolution", + mMTimeUpdateResolution * 1e-6) * 1e6); + + mMaxPendingRecoveryMsgLogInfo = props.getValue( + "metaServer.maxPendingRecoveryMsgLogInfo", + mMaxPendingRecoveryMsgLogInfo); + + mAllowLocalPlacementFlag = props.getValue( + "metaServer.allowLocalPlacement", + mAllowLocalPlacementFlag ? 1 : 0) != 0; + mAllowLocalPlacementForAppendFlag = props.getValue( + "metaServer.allowLocalPlacementForAppend", + mAllowLocalPlacementForAppendFlag ? 1 : 0) != 0; + mInRackPlacementForAppendFlag = props.getValue( + "metaServer.inRackPlacementForAppend", + mInRackPlacementForAppendFlag ? 1 : 0) != 0; + mInRackPlacementFlag = props.getValue( + "metaServer.inRackPlacement", + mInRackPlacementFlag ? 1 : 0) != 0; + mAllocateDebugVerifyFlag = props.getValue( + "metaServer.allocateDebugVerify", + mAllocateDebugVerifyFlag ? 1 : 0) != 0; + mGetAllocOrderServersByLoadFlag = props.getValue( + "metaServer.getAllocOrderServersByLoad", + mGetAllocOrderServersByLoadFlag ? 1 : 0) != 0; + mMinChunkAllocClientProtoVersion = props.getValue( + "metaServer.minChunkAllocClientProtoVersion", + mMinChunkAllocClientProtoVersion); + mRackPrefixUsePortFlag = props.getValue( + "metaServer.rackPrefixUsePort", + mRackPrefixUsePortFlag ? 
1 : 0) != 0; + + mRackPrefixes.clear(); + { + istringstream is(props.getValue("metaServer.rackPrefixes", "")); + string pref; + RackId id = -1; + HostPrefix hp; + pref.reserve(256); + while ((is >> pref >> id)) { + if (id < 0) { + id = -1; + } else if (id >= ChunkPlacement::kMaxRackId) { + KFS_LOG_STREAM_ERROR << + "invalid rack id: " << + pref << " " << id << + KFS_LOG_EOM; + id = -1; + } + if (hp.Parse(pref) > 0) { + mRackPrefixes.push_back(make_pair(hp, id)); + KFS_LOG_STREAM_INFO << + "rack:" + " prefix: " << pref << + " id: " << id << + KFS_LOG_EOM; + } + pref.clear(); + id = -1; + } + } + mRackWeights.clear(); + { + istringstream is(props.getValue("metaServer.rackWeights", "")); + RackId id = -1; + double weight = -1; + while ((is >> id >> weight)) { + if (id >= 0 && weight >= 0) { + mRackWeights[id] = weight; + KFS_LOG_STREAM_INFO << + "rack: " << id << + " weight: " << weight << + KFS_LOG_EOM; + } + id = -1; + weight = -1; + } + for (RackInfos::iterator it = mRacks.begin(); + it != mRacks.end(); + ++it) { + RackWeights::const_iterator const wi = + mRackWeights.find(it->id()); + it->setWeight(wi == mRackWeights.end() ? 
+ double(1) : wi->second); + } + } + + mCSMaxGoodCandidateLoadRatio = props.getValue( + "metaServer.maxGoodCandidateLoadRatio", + mCSMaxGoodCandidateLoadRatio); + mCSMaxGoodMasterLoadRatio = props.getValue( + "metaServer.maxGoodMasterLoadRatio", + mCSMaxGoodMasterLoadRatio); + mCSMaxGoodSlaveLoadRatio = props.getValue( + "metaServer.maxGoodSlaveLoadRatio", + mCSMaxGoodSlaveLoadRatio); + const double k1Frac = (double)(int64_t(1) << kSlaveScaleFracBits); + mMaxSlavePlacementRange = (int64_t)(props.getValue( + "metaServer.maxSlavePlacementRange", + (double)mMaxSlavePlacementRange / k1Frac) * k1Frac); + + const int kMaxReplication = (1 << 14) - 1; // 14 bit field in file attribute + mMaxReplicasPerFile = (int16_t)min(kMaxReplication, props.getValue( + "metaServer.maxReplicasPerFile", int(mMaxReplicasPerFile))); + mMaxReplicasPerRSFile = (int16_t)min(kMaxReplication, props.getValue( + "metaServer.maxReplicasPerRSFile", int(mMaxReplicasPerRSFile))); + + mChunkServerMd5sums.clear(); + { + istringstream is(props.getValue( + "metaServer.chunkServerMd5sums", "")); + string md5sum; + while ((is >> md5sum)) { + mChunkServerMd5sums.push_back(md5sum); + md5sum.clear(); + } + } + mClusterKey = props.getValue("metaServer.clusterKey", string()); + + mMaxResponseSize = props.getValue( + "metaServer.maxResponseSize", + mMaxResponseSize); + mMinIoBufferBytesToProcessRequest = props.getValue( + "metaServer.minIoBufferBytesToProcessRequest", + mMinIoBufferBytesToProcessRequest); + + int64_t totalIoBytes = mBufferPool ? 
+ (int64_t)mBufferPool->GetTotalBufferCount() * + mBufferPool->GetBufferSize() : int64_t(-1); + if (totalIoBytes > 0) { + const int64_t minReserve = + (16 << 10) * mBufferPool->GetBufferSize(); + if (totalIoBytes > minReserve * 3) { + totalIoBytes -= minReserve; + } else { + totalIoBytes = totalIoBytes * 2 / 3; + } + if (mMaxResponseSize > totalIoBytes) { + mMaxResponseSize = (int)min(totalIoBytes, + (int64_t)numeric_limits::max()); + } + ChunkServer::SetMaxHelloBufferBytes( + min(totalIoBytes, props.getValue( + "chunkServer.maxHelloBufferBytes", + ChunkServer::GetMaxHelloBufferBytes())) + ); + } else { + ChunkServer::SetMaxHelloBufferBytes(props.getValue( + "chunkServer.maxHelloBufferBytes", + ChunkServer::GetMaxHelloBufferBytes()) + ); + } + if (mMinIoBufferBytesToProcessRequest > totalIoBytes) { + mMinIoBufferBytesToProcessRequest = totalIoBytes; + } + KFS_LOG_STREAM_INFO << + "max. response size: " << + mMaxResponseSize << + " minIoBufferBytesToProcessRequest: " << + mMinIoBufferBytesToProcessRequest << + KFS_LOG_EOM; + mReadDirLimit = props.getValue( + "metaServer.readDirLimit", mReadDirLimit); + + SetChunkServersProperties(props); + ClientSM::SetParameters(props); + gNetDispatch.SetParameters(props); + if (mDownServers.size() > mMaxDownServersHistorySize) { + mDownServers.erase(mDownServers.begin(), mDownServers.begin() + + mDownServers.size() - mMaxDownServersHistorySize); + } + MetaFsck::SetParameters(props); + SetRequestParameters(props); + CSMapUnitTest(props); + mChunkToServerMap.SetDebugValidate(props.getValue( + "metaServer.chunkToServerMap.debugValidate", 0) != 0); + mAllowChunkServerRetireFlag = props.getValue( + "metaServer.allowChunkServerRetire", + mAllowChunkServerRetireFlag ? 1 : 0) != 0; + mPanicOnInvalidChunkFlag = props.getValue( + "metaServer.panicOnInvalidChunk", + mPanicOnInvalidChunkFlag ? 
1 : 0) != 0; + mAppendCacheCleanupInterval = (int)props.getValue( + "metaServer.appendCacheCleanupInterval", + double(mAppendCacheCleanupInterval)); + UpdateReplicationsThreshold(); + mMaxWritesPerDriveRatio = props.getValue( + "metaServer.maxWritesPerDriveRatio", + mMaxWritesPerDriveRatio); + mMaxLocalPlacementWeight = props.getValue( + "metaServer.maxLocalPlacementWeight", + mMaxLocalPlacementWeight); + mMinWritesPerDrive = max(1, props.getValue( + "metaServer.minWritesPerDrive", + mMinWritesPerDrive)); + mMaxWritesPerDriveThreshold = + max(mMinWritesPerDrive, mMaxWritesPerDriveThreshold); + mDefaultUser = props.getValue( + "metaServer.defaultUser", + mDefaultUser); + mDefaultGroup = props.getValue( + "metaServer.defaultGroup", + mDefaultGroup); + mDefaultFileMode = props.getValue( + "metaServer.defaultFileMode", + mDefaultFileMode); + mDefaultDirMode = props.getValue( + "metaServer.defaultDirMode", + mDefaultDirMode); + mDefaultLoadUser = props.getValue( + "metaServer.defaultLoadUser", + mDefaultLoadUser); + mDefaultLoadGroup = props.getValue( + "metaServer.defaultLoadGroup", + mDefaultLoadGroup); + mDefaultLoadFileMode = props.getValue( + "metaServer.defaultLoadFileMode", + mDefaultLoadFileMode); + mDefaultLoadDirMode = props.getValue( + "metaServer.defaultLoadDirMode", + mDefaultLoadDirMode); + mForceEUserToRootFlag = props.getValue( + "metaServer.forceEUserToRoot", + mForceEUserToRootFlag ? 1 : 0) != 0; + mVerifyAllOpsPermissionsFlag = props.getValue( + "metaServer.verifyAllOpsPermissions", + mVerifyAllOpsPermissionsFlag ? 1 : 0) != 0; + mRootHosts.clear(); + { + istringstream is(props.getValue("metaServer.rootHosts", "")); + string host; + while ((is >> host)) { + mRootHosts.insert(host); + } + } + mHostUserGroupRemap.clear(); + mLastUidGidRemap.mIp.clear(); + for (int i = 0; i < 2; i++) { + const string idRemapFileName = props.getValue( + i == 0 ? 
+ "metaServer.hostUserRemap" : + "metaServer.hostGroupRemap", + string() + ); + if (idRemapFileName.empty()) { + continue; + } + ifstream fs(idRemapFileName.c_str()); + if (! fs) { + KFS_LOG_STREAM_ERROR << "failed to open: " << + idRemapFileName << + KFS_LOG_EOM; + continue; + } + if (i == 0) { + LoadIdRemap(fs, + &HostUserGroupRemap::value_type::mUserMap); + } else { + LoadIdRemap(fs, + &HostUserGroupRemap::value_type::mGroupMap); + } + } + mConfig.clear(); + mConfig.reserve(10 << 10); + props.getList(mConfig, string(), string(";")); +} + +void +LayoutManager::UpdateReplicationsThreshold() +{ + const int64_t srvCnt = (int64_t)mChunkServers.size(); + mRebalanceReplicationsThresholdCount = max(min(srvCnt, int64_t(1)), + (int64_t)( + mRebalanceReplicationsThreshold * + mMaxConcurrentWriteReplicationsPerNode * + srvCnt + )); +} + +/*! + * \brief Validate the cluster key, and that md5 sent by a chunk server + * matches one of the acceptable md5's. + */ +bool +LayoutManager::Validate(MetaHello& r) const +{ + if (r.clusterKey != mClusterKey) { + r.statusMsg = "cluster key mismatch:" + " expect: " + mClusterKey + + " recieved: " + r.clusterKey; + r.status = -EBADCLUSTERKEY; + return false; + } + if (mChunkServerMd5sums.empty() || find( + mChunkServerMd5sums.begin(), + mChunkServerMd5sums.end(), + r.md5sum) != mChunkServerMd5sums.end()) { + return true; + } + r.statusMsg = "MD5sum mismatch: recieved: " + r.md5sum; + r.status = -EBADCLUSTERKEY; + return false; +} + +LayoutManager::RackId +LayoutManager::GetRackId(const ServerLocation& loc) +{ + if (mRackPrefixUsePortFlag) { + ostringstream os; + os << loc.hostname << ":" << loc.port; + const string name = os.str(); + return GetRackId(name); + } else { + return GetRackId(loc.hostname); + } +} + +LayoutManager::RackId +LayoutManager::GetRackId(const string& name) +{ + RackPrefixes::const_iterator it = mRackPrefixes.begin(); + while (it != mRackPrefixes.end()) { + if (it->first.Match(name)) { + return it->second; + } + ++it; 
+ } + return -1; +} + +void +LayoutManager::SetChunkServersProperties(const Properties& props) +{ + if (props.empty()) { + return; + } + props.copyWithPrefix("chunkServer.", mChunkServersProps); + if (mChunkServersProps.empty()) { + return; + } + string display; + mChunkServersProps.getList(display, "", ";"); + KFS_LOG_STREAM_INFO << "setting properties for " << + mChunkServers.size() << " chunk servers: " << display << + KFS_LOG_EOM; + Servers const chunkServers(mChunkServers); + for (Servers::const_iterator i = chunkServers.begin(); + i != chunkServers.end(); + ++i) { + (*i)->SetProperties(mChunkServersProps); + } +} + +void +LayoutManager::Shutdown() +{ + // Return io buffers back into the pool. + mCSCountersResponse.Clear(); + mPingResponse.Clear(); +} + +inline ostream& +LayoutManager::ClearStringStream() +{ + mStringStream.flush(); + mStringStream.str(string()); + return mStringStream; +} + +inline const string& +LayoutManager::BoolToString(bool flag) +{ + static const string falseStr("0"); + static const string trueStr ("1"); + return (flag ? 
trueStr : falseStr); +} + +inline LayoutManager::CSCounters::mapped_type& +LayoutManager::CSCountersMakeRow( + const string& name, size_t width, CSCounters::iterator& it) +{ + if (it == mCSCounters.end() || + ++it == mCSCounters.end() || + it->first != name) { + it = mCSCounters.insert( + make_pair(name, CSCounters::mapped_type())).first; + } + it->second.resize(width); + return it->second; +} + +void +LayoutManager::UpdateChunkServerCounters() +{ + static const string locationStr ("XMeta-location"); + static const string retiringStr ("XMeta-retiring"); + static const string restartingStr ("XMeta-restarting"); + static const string responsiveStr ("XMeta-responsive"); + static const string spaceAvailStr ("XMeta-space-avail"); + static const string heartbeatTimeStr ("XMeta-heartbeat-time"); + static const string replicationReadStr ("XMeta-replication-read"); + static const string replicationWriteStr("XMeta-replication-write"); + static const string rackIdStr ("XMeta-rack"); + static const string rackWeightStr ("XMeta-rack-placement-weight"); + static const string loadAvgStr ("XMeta-load-avg"); + static const string toEvacuateCntStr ("XMeta-to-evacuate-cnt"); + + const size_t srvCount = mChunkServers.size(); + CSCounters::iterator csi = mCSCounters.end(); + CSCounters::mapped_type& location = + CSCountersMakeRow(locationStr, srvCount, csi); + CSCounters::mapped_type& retiring = + CSCountersMakeRow(retiringStr, srvCount, csi); + CSCounters::mapped_type& restarting = + CSCountersMakeRow(restartingStr, srvCount, csi); + CSCounters::mapped_type& responsive = + CSCountersMakeRow(responsiveStr, srvCount, csi); + CSCounters::mapped_type& spaceAvail = + CSCountersMakeRow(spaceAvailStr, srvCount, csi); + CSCounters::mapped_type& heartbeatTime = + CSCountersMakeRow(heartbeatTimeStr, srvCount, csi); + CSCounters::mapped_type& replicationRead = + CSCountersMakeRow(replicationReadStr, srvCount, csi); + CSCounters::mapped_type& replicationWrite = + 
CSCountersMakeRow(replicationWriteStr, srvCount, csi); + CSCounters::mapped_type& rackId = + CSCountersMakeRow(rackIdStr, srvCount, csi); + CSCounters::mapped_type& rackWeight = + CSCountersMakeRow(rackWeightStr, srvCount, csi); + CSCounters::mapped_type& loadAvg = + CSCountersMakeRow(loadAvgStr, srvCount, csi); + CSCounters::mapped_type& toEvacuateCnt = + CSCountersMakeRow(toEvacuateCntStr, srvCount, csi); + const time_t now = TimeNow(); + int i = 0; + for (Servers::const_iterator it = mChunkServers.begin(); + it != mChunkServers.end(); + ++it) { + const Properties& props = (*it)->HeartBeatProperties(); + csi = mCSCounters.end(); + for (Properties::iterator pi = props.begin(); + pi != props.end(); + ++pi) { + if (pi->first == "Cseq") { + continue; + } + CSCountersMakeRow(pi->first, srvCount, csi)[i] = pi->second; + } + ClearStringStream() << + (*it)->GetServerLocation().hostname << ":" << + (*it)->GetServerLocation().port; + location[i] = mStringStream.str(); + retiring[i] = BoolToString((*it)->IsRetiring()); + restarting[i] = BoolToString((*it)->IsRestartScheduled()); + responsive[i] = BoolToString((*it)->IsResponsiveServer()); + ClearStringStream() << (*it)->GetAvailSpace(); + spaceAvail[i] = mStringStream.str(); + ClearStringStream() << (now - (*it)->TimeSinceLastHeartbeat()); + heartbeatTime[i] = mStringStream.str(); + ClearStringStream() << (*it)->GetReplicationReadLoad(); + replicationRead[i] = mStringStream.str(); + ClearStringStream() << (*it)->GetNumChunkReplications(); + replicationWrite[i] = mStringStream.str(); + const RackId rid = (*it)->GetRack(); + ClearStringStream() << rid; + rackId[i] = mStringStream.str(); + RackInfos::iterator const rackIter = rid >= 0 ? find_if( + mRacks.begin(), mRacks.end(), + bind(&RackInfo::id, _1) == rid + ) : mRacks.end(); + ClearStringStream() << (rackIter != mRacks.end() ? 
+ rackIter->getWeightedPossibleCandidatesCount() : -1); + rackWeight[i] = mStringStream.str(); + ClearStringStream() << (*it)->GetLoadAvg(); + loadAvg[i] = mStringStream.str(); + ClearStringStream() << (*it)->GetChunksToEvacuateCount(); + toEvacuateCnt[i] = mStringStream.str(); + i++; + } + ClearStringStream(); +} + +void +LayoutManager::GetChunkServerCounters(IOBuffer& buf) +{ + if (! mCSCountersResponse.IsEmpty() && + TimeNow() < mCSCountersUpdateTime + + mCSCountersUpdateInterval) { + buf.Copy(&mCSCountersResponse, + mCSCountersResponse.BytesConsumable()); + return; + } + mCSCountersResponse.Clear(); + UpdateChunkServerCounters(); + if (mCSCounters.empty() || mCSCounters.begin()->second.empty()) { + return; + } + static const string columnDelim(","); + static const string rowDelim("\n"); + bool next = false; + size_t size = mCSCounters.begin()->second.size(); + for (CSCounters::const_iterator it = mCSCounters.begin(); + it != mCSCounters.end(); + ++it) { + if (next) { + mCSCountersResponse.CopyIn( + columnDelim.data(), columnDelim.size()); + } + next = true; + mCSCountersResponse.CopyIn(it->first.data(), it->first.size()); + size = min(size, it->second.size()); + } + if (size > 0) { + mCSCountersResponse.CopyIn(rowDelim.data(), rowDelim.size()); + } + for (size_t i = 0; i < size; i++) { + next = false; + for (CSCounters::const_iterator it = mCSCounters.begin(); + it != mCSCounters.end(); + ++it) { + if (next) { + mCSCountersResponse.CopyIn( + columnDelim.data(), columnDelim.size()); + } + next = true; + const string& str = it->second[i]; + mCSCountersResponse.CopyIn(str.data(), str.length()); + } + mCSCountersResponse.CopyIn(rowDelim.data(), rowDelim.size()); + } + mCSCountersUpdateTime = TimeNow(); + buf.Copy(&mCSCountersResponse, mCSCountersResponse.BytesConsumable()); +} + +// +// Try to match servers by hostname: for write allocation, we'd like to place +// one copy of the block on the same host on which the client is running. 
+// +template +class HostNameEqualsTo +{ + const T& host; +public: + HostNameEqualsTo(const T &h) : host(h) {} + bool operator()(const ChunkServerPtr &s) const { + return s->GetServerLocation().hostname == host; + } +}; +template HostNameEqualsTo +MatchServerByHost(const T& host) { return HostNameEqualsTo(host); } + +void +LayoutManager::UpdateDelayedRecovery(const MetaFattr& fa, + bool forceUpdateFlag /* = false */) +{ + // See comment in CanReplicateChunkNow() + size_t const count = mChunkToServerMap.GetCount( + CSMap::Entry::kStateDelayedRecovery); + if (count <= 0) { + return; + } + if (! fa.HasRecovery() || fa.chunkcount() <= 0 || fa.filesize <= 0) { + return; + } + const bool forceFlag = + forceUpdateFlag || mForceDelayedRecoveryUpdateFlag; + if ((int64_t)count <= max(forceFlag ? 2 * fa.chunkcount() : 0, + mDelayedRecoveryUpdateMaxScanCount)) { + mChunkToServerMap.First(CSMap::Entry::kStateDelayedRecovery); + CSMap::Entry* p; + while ((p = mChunkToServerMap.Next( + CSMap::Entry::kStateDelayedRecovery))) { + if (p->GetFattr() == &fa) { + mChunkToServerMap.SetState(*p, + CSMap::Entry::kStateCheckReplication); + } + } + return; + } + if (! forceFlag) { + // Rely on low priority replication check to eventurally + // update the state. + return; + } + vector chunks; + if (metatree.getalloc(fa.id(), chunks) != 0) { + return; + } + for (vector::const_iterator it = chunks.begin(); + it != chunks.end(); + ++it) { + CSMap::Entry& entry = GetCsEntry(**it); + if (mChunkToServerMap.GetState(entry) != + CSMap::Entry::kStateDelayedRecovery) { + continue; + } + mChunkToServerMap.SetState( + entry, CSMap::Entry::kStateCheckReplication); + } +} + +bool +LayoutManager::HasWriteAppendLease(chunkId_t chunkId) const +{ + const ChunkLeases::WriteLease* const wl = + mChunkLeases.GetWriteLease(chunkId); + return (wl && wl->appendFlag); +} + +/// Add the newly joined server to the list of servers we have. Also, +/// update our state to include the chunks hosted on this server. 
+void +LayoutManager::AddNewServer(MetaHello *r) +{ + if (r->server->IsDown()) { + return; + } + ChunkServer& srv = *r->server.get(); + srv.SetServerLocation(r->location); + + const string srvId = r->location.ToString(); + Servers::iterator const existing = find_if( + mChunkServers.begin(), mChunkServers.end(), + MatchingServer(r->location)); + if (existing != mChunkServers.end()) { + KFS_LOG_STREAM_DEBUG << "duplicate server: " << srvId << + " possible reconnect, taking: " << + (const void*)existing->get() << " down " << + " replacing with: " << (const void*)&srv << + KFS_LOG_EOM; + ServerDown(*existing); + if (srv.IsDown()) { + return; + } + } + + // Add server first, then add chunks, otherwise if/when the server goes + // down in the process of adding chunks, taking out server from chunk + // info will not work in ServerDown(). + if ( ! mChunkToServerMap.AddServer(r->server)) { + KFS_LOG_STREAM_WARN << + "failed to add server: " << srvId << + " no slots available " + " servers: " << mChunkToServerMap.GetServerCount() << + " / " << mChunkServers.size() << + KFS_LOG_EOM; + srv.ForceDown(); + return; + } + mChunkServers.push_back(r->server); + + const uint64_t allocSpace = r->chunks.size() * CHUNKSIZE; + srv.SetSpace(r->totalSpace, r->usedSpace, allocSpace); + RackId rackId = GetRackId(r->location); + if (rackId < 0 && r->rackId >= 0) { + rackId = r->rackId; + } + srv.SetRack(rackId); + // Ensure that rack exists before invoking UpdateSrvLoadAvg(), as it + // can update rack possible allocation candidates count. + if (rackId >= 0) { + RackInfos::iterator const rackIter = find_if( + mRacks.begin(), mRacks.end(), + bind(&RackInfo::id, _1) == rackId); + if (rackIter != mRacks.end()) { + rackIter->addServer(r->server); + } else { + RackWeights::const_iterator const + it = mRackWeights.find(rackId); + mRacks.push_back(RackInfo( + rackId, + it != mRackWeights.end() ? 
+ it->second : double(1), + r->server + )); + } + } else { + KFS_LOG_STREAM_INFO << srvId << + ": no rack specified: " << rackId << + KFS_LOG_EOM; + } + UpdateSrvLoadAvg(srv, 0); + + if (mAssignMasterByIpFlag) { + // if the server node # is odd, it is master; else slave + string ipaddr = r->peerName; + string::size_type delimPos = ipaddr.rfind(':'); + if (delimPos != string::npos) { + ipaddr.erase(delimPos); + } + delimPos = ipaddr.rfind('.'); + if (delimPos == string::npos) { + srv.SetCanBeChunkMaster(Rand(2) != 0); + } else { + string nodeNumStr = ipaddr.substr(delimPos + 1); + srv.SetCanBeChunkMaster((toNumber(nodeNumStr) % 2) != 0); + } + } else { + srv.SetCanBeChunkMaster(mSlavesCount >= mMastersCount); + } + if (srv.CanBeChunkMaster()) { + mMastersCount++; + } else { + mSlavesCount++; + } + + int maxLogInfoCnt = 32; + ChunkIdQueue staleChunkIds; + for (MetaHello::ChunkInfos::const_iterator it = r->chunks.begin(); + it != r->chunks.end() && ! srv.IsDown(); + ++it) { + const chunkId_t chunkId = it->chunkId; + const char* staleReason = 0; + CSMap::Entry* const cmi = mChunkToServerMap.Find(chunkId); + seq_t chunkVersion = -1; + if (cmi) { + CSMap::Entry& c = *cmi; + const fid_t fileId = c.GetFileId(); + const ChunkServerPtr cs = c.GetServer( + mChunkToServerMap, srv.GetServerLocation()); + if (cs) { + KFS_LOG_STREAM_ERROR << srvId << + " stable chunk: <" << + fileId << "/" << + it->allocFileId << "," << + chunkId << ">" << + " already hosted on: " << + (const void*)cs.get() << + " new server: " << + (const void*)&srv << + " has the same location: " << + srv.GetServerLocation() << + (cs.get() == &srv ? + " duplicate chunk entry" : + " possible stale chunk to" + " server mapping entry" + ) << + KFS_LOG_EOM; + if (cs.get() == &srv) { + // Ignore duplicate chunk inventory entries. 
+ continue; + } + } + const MetaFattr& fa = *(cmi->GetFattr()); + const MetaChunkInfo& ci = *(cmi->GetChunkInfo()); + chunkVersion = ci.chunkVersion; + if (chunkVersion > it->chunkVersion) { + staleReason = "lower chunk version"; + } else if (chunkVersion + + GetChunkVersionRollBack(chunkId) < + it->chunkVersion) { + staleReason = "higher chunk version"; + } else { + if (chunkVersion != it->chunkVersion) { + bool kMakeStableFlag = false; + bool kPendingAddFlag = true; + srv.NotifyChunkVersChange( + fileId, + chunkId, + chunkVersion, + it->chunkVersion, + kMakeStableFlag, + kPendingAddFlag + ); + continue; + } + const ChunkLeases::WriteLease* const wl = + mChunkLeases.GetWriteLease(chunkId); + if (wl && wl->allocInFlight && + wl->allocInFlight->status == 0) { + staleReason = "chunk allocation in flight"; + } else { + // This chunk is non-stale. Check replication, + // and update file size if this is the last + // chunk and update required. + const int res = AddHosted(c, r->server); + assert(res >= 0); + if (! fa.IsStriped() && fa.filesize < 0 && + ci.offset + + (chunkOff_t)CHUNKSIZE >= + fa.nextChunkOffset()) { + KFS_LOG_STREAM_DEBUG << srvId << + " chunk size: <" << fileId << + "," << chunkId << ">" << + KFS_LOG_EOM; + srv.GetChunkSize(fileId, chunkId, chunkVersion, ""); + } + if (! srv.IsDown()) { + const int srvCount = + (int)mChunkToServerMap.ServerCount(c); + if (fa.numReplicas <= srvCount) { + CancelPendingMakeStable(fileId, chunkId); + } + if (fa.numReplicas != srvCount) { + CheckReplication(c); + } + } + } + } + } else { + staleReason = "no chunk mapping exists"; + } + if (staleReason) { + maxLogInfoCnt--; + KFS_LOG_STREAM((maxLogInfoCnt > 0) ? 
+ MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelDEBUG) << + srvId << + " stable chunk: <" << + it->allocFileId << "," << chunkId << ">" + " version: " << it->chunkVersion << + "/" << chunkVersion << + " " << staleReason << + " => stale" << + KFS_LOG_EOM; + staleChunkIds.PushBack(it->chunkId); + mStaleChunkCount->Update(1); + } + } + + for (int i = 0; i < 2; i++) { + const MetaHello::ChunkInfos& chunks = i == 0 ? + r->notStableAppendChunks : r->notStableChunks; + int maxLogInfoCnt = 64; + for (MetaHello::ChunkInfos::const_iterator it = chunks.begin(); + it != chunks.end() && ! srv.IsDown(); + ++it) { + const char* const staleReason = AddNotStableChunk( + r->server, + it->allocFileId, + it->chunkId, + it->chunkVersion, + i == 0, + srvId + ); + maxLogInfoCnt--; + KFS_LOG_STREAM((maxLogInfoCnt > 0) ? + MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelDEBUG) << + srvId << + " not stable chunk:" << + (i == 0 ? " append" : "") << + " <" << + it->allocFileId << "," << it->chunkId << ">" + " version: " << it->chunkVersion << + " " << (staleReason ? staleReason : "") << + (staleReason ? " => stale" : "added back") << + KFS_LOG_EOM; + if (staleReason) { + staleChunkIds.PushBack(it->chunkId); + mStaleChunkCount->Update(1); + } + // MakeChunkStableDone will process pending recovery. + } + } + const size_t staleCnt = staleChunkIds.GetSize(); + if (! staleChunkIds.IsEmpty() && ! srv.IsDown()) { + srv.NotifyStaleChunks(staleChunkIds); + } + if (! mChunkServersProps.empty() && ! srv.IsDown()) { + srv.SetProperties(mChunkServersProps); + } + // All ops are queued at this point, make sure that the server is still up. 
+ if (srv.IsDown()) { + KFS_LOG_STREAM_ERROR << srvId << + ": went down in the process of adding it" << + KFS_LOG_EOM; + return; + } + + // Update the list since a new server is in + CheckHibernatingServersStatus(); + + const char* msg = "added"; + if (IsChunkServerRestartAllowed() && + mCSToRestartCount < mMaxCSRestarting) { + if (srv.Uptime() >= GetMaxCSUptime() && + ! srv.IsDown() && + ! srv.IsRestartScheduled()) { + mCSToRestartCount++; + if (srv.CanBeChunkMaster()) { + mMastersToRestartCount++; + } + if (srv.GetNumChunkWrites() <= 0 && + srv.GetNumAppendsWithWid() <= 0) { + srv.Restart(mRetireOnCSRestartFlag); + msg = "restarted"; + } else { + srv.ScheduleRestart( + mCSGracefulRestartTimeout, + mCSGracefulRestartAppendWithWidTimeout); + } + } else { + ScheduleChunkServersRestart(); + } + } + UpdateReplicationsThreshold(); + KFS_LOG_STREAM_INFO << + msg << " chunk server: " << r->peerName << "/" << srv.ServerID() << + (srv.CanBeChunkMaster() ? " master" : " slave") << + " rack: " << r->rackId << " => " << rackId << + " chunks: stable: " << r->chunks.size() << + " not stable: " << r->notStableChunks.size() << + " append: " << r->notStableAppendChunks.size() << + " +wid: " << r->numAppendsWithWid << + " writes: " << srv.GetNumChunkWrites() << + " +wid: " << srv.GetNumAppendsWithWid() << + " stale: " << staleCnt << + " masters: " << mMastersCount << + " slaves: " << mSlavesCount << + " total: " << mChunkServers.size() << + " uptime: " << srv.Uptime() << + " restart: " << srv.IsRestartScheduled() << + " 2restart: " << mCSToRestartCount << + " 2restartmasters: " << mMastersToRestartCount << + KFS_LOG_EOM; +} + +const char* +LayoutManager::AddNotStableChunk( + const ChunkServerPtr& server, + fid_t allocFileId, + chunkId_t chunkId, + seq_t chunkVersion, + bool appendFlag, + const string& logPrefix) +{ + CSMap::Entry* const cmi = mChunkToServerMap.Find(chunkId); + if (! 
cmi) { + return "no chunk mapping exists"; + } + CSMap::Entry& pinfo = *cmi; + const fid_t fileId = pinfo.GetFileId(); + const ChunkServerPtr cs = pinfo.GetServer( + mChunkToServerMap, server->GetServerLocation()); + if (cs) { + KFS_LOG_STREAM_ERROR << logPrefix << + " not stable chunk:" << + (appendFlag ? " append " : "") << + " <" << fileId << + "/" << allocFileId << + "," << chunkId << ">" << + " already hosted on: " << (const void*)cs.get() << + " new server: " << (const void*)server.get() << + (cs.get() == server.get() ? + " duplicate chunk entry" : + " possible stale chunk to server mapping entry") << + KFS_LOG_EOM; + return 0; + } + const char* staleReason = 0; + if (AddServerToMakeStable(pinfo, server, + chunkId, chunkVersion, staleReason) || staleReason) { + return staleReason; + } + // At this point it is known that no make chunk stable is in progress: + // AddServerToMakeStable() invoked already. + // Delete the replica if sufficient number of replicas already exists. + const MetaFattr * const fa = pinfo.GetFattr(); + if (fa && fa->numReplicas <= (int)mChunkToServerMap.ServerCount(pinfo)) { + CancelPendingMakeStable(fileId, chunkId); + return "sufficient number of replicas exists"; + } + // See if it is possible to add the chunk back before "[Begin] Make + // Chunk Stable" ([B]MCS) starts. Expired lease cleanup lags behind. If + // expired lease exists, and [B]MCS is not in progress, then add the + // chunk back. + const ChunkLeases::WriteLease* const wl = + mChunkLeases.GetWriteLease(chunkId); + if (wl && appendFlag != wl->appendFlag) { + return (appendFlag ? "not append lease" : "append lease"); + } + if ((! 
wl || wl->relinquishedFlag) && appendFlag) { + PendingMakeStableMap::iterator const msi = + mPendingMakeStable.find(chunkId); + if (msi == mPendingMakeStable.end()) { + return "no make stable info"; + } + if (chunkVersion != msi->second.mChunkVersion) { + return "pending make stable chunk version mismatch"; + } + const bool beginMakeStableFlag = msi->second.mSize < 0; + if (beginMakeStableFlag) { + AddHosted(chunkId, pinfo, server); + if (InRecoveryPeriod() || + ! mPendingBeginMakeStable.empty()) { + // Allow chunk servers to connect back. + mPendingBeginMakeStable.insert( + make_pair(chunkId, chunkVersion)); + return 0; + } + assert(! wl || ! wl->stripedFileFlag); + const bool kStripedFileFlag = false; + const bool leaseRelinquishFlag = false; + MakeChunkStableInit( + pinfo, + chunkVersion, + string(), + beginMakeStableFlag, + -1, + false, + 0, + kStripedFileFlag, + appendFlag, + leaseRelinquishFlag + ); + return 0; + } + const bool kPendingAddFlag = true; + server->MakeChunkStable( + fileId, chunkId, chunkVersion, + msi->second.mSize, + msi->second.mHasChecksum, + msi->second.mChecksum, + kPendingAddFlag + ); + return 0; + } + if (! wl && ! appendFlag && + mPendingMakeStable.find(chunkId) != + mPendingMakeStable.end()) { + return "chunk was open for append"; + } + const seq_t curChunkVersion = pinfo.GetChunkInfo()->chunkVersion; + if (chunkVersion < curChunkVersion) { + return "lower chunk version"; + } + if (chunkVersion > curChunkVersion + GetChunkVersionRollBack(chunkId)) { + // This indicates that part of meta server log or checkpoint + // was lost, or rolled back to the previous state. + return "higher chunk version"; + } + if (curChunkVersion != chunkVersion && + (appendFlag || ! wl || ! 
wl->allocInFlight)) { + // Make stable below will be invoked to make the chunk stable + const bool kMakeStableFlag = false; + server->NotifyChunkVersChange( + fileId, chunkId, curChunkVersion, chunkVersion, + kMakeStableFlag); + if (server->IsDown()) { + // Went down while sending notification. + // Op completion invokes required error handling. + return 0; + } + // AddHosted below completes before the the notification op + // completion. + } + // Adding server back can change replication chain (order) -- invalidate + // record appender cache to prevent futher appends to this chunk. + if (wl) { + if (appendFlag) { + mARAChunkCache.Invalidate(fileId, chunkId); + } else if (wl->allocInFlight) { + if (wl->allocInFlight->status == 0) { + return "re-allocation in flight"; + } + if (wl->allocInFlight->chunkVersion != chunkVersion) { + // Allocation / version change has already + // failed, but the allocation op still pending + // waiting remaining replies. + // Set allocation version to ensure that the + // version roll back won't fail when the final + // reply comes in. + const bool kMakeStableFlag = false; + server->NotifyChunkVersChange( + fileId, + chunkId, + wl->allocInFlight->chunkVersion, // to + chunkVersion, // from + kMakeStableFlag); + if (server->IsDown()) { + return 0; // Went down do not add chunk. + } + } + } + AddHosted(chunkId, pinfo, server); + } else if (! appendFlag) { + const bool kPendingAddFlag = true; + server->MakeChunkStable( + fileId, chunkId, curChunkVersion, + -1, false, 0, kPendingAddFlag + ); + } + return 0; +} + + +void +LayoutManager::Done(MetaChunkVersChange& req) +{ + if (req.replicate) { + assert(req.replicate->versChange = &req); + ChunkReplicationDone(req.replicate); + MetaChunkReplicate* const repl = req.replicate; + req.replicate = 0; + if (repl && ! repl->suspended) { + submit_request(repl); + } + return; + } + + CSMap::Entry* const cmi = mChunkToServerMap.Find(req.chunkId); + if (! 
cmi) { + KFS_LOG_STREAM_INFO << req.Show() << + " chunk no longer esists," + " declaring stale replica" << + KFS_LOG_EOM; + req.server->NotifyStaleChunk(req.chunkId); + return; + } + UpdateReplicationState(*cmi); + + if (req.status != 0) { + KFS_LOG_STREAM_ERROR << req.Show() << + " status: " << req.status << + " msg: " << req.statusMsg << + " pendingAdd: " << req.pendingAddFlag << + KFS_LOG_EOM; + if (! req.pendingAddFlag) { + ChunkCorrupt(req.chunkId, req.server); + } + return; + } + if (! req.pendingAddFlag || req.server->IsDown()) { + return; + } + // For now do not start recursive version change. + // If this was the last replica, then another round of version change + // won't start. If another replica existed, and version change has + // failed, then such sequence might result in loosing all replicas. + // + // This replica is assumed to be stable. + // Non stable replica add path, uses another code path, which ends + // with make chunk stable. + // Ensure that no write lease exists. + if (GetChunkVersionRollBack(req.chunkId) <= 0 || + mChunkLeases.GetWriteLease(req.chunkId)) { + KFS_LOG_STREAM_INFO << req.Show() << + " no version roll back or write lese exists," + " declaring stale replica" << + KFS_LOG_EOM; + req.server->NotifyStaleChunk(req.chunkId); + return; + } + // Coalesce block can change file id while version change + // was in flight. Use fid from the chunk mappings. + const MetaChunkInfo* const ci = cmi->GetChunkInfo(); + if (ci->chunkVersion != req.chunkVersion) { + KFS_LOG_STREAM_INFO << req.Show() << + " chunk version mismatch," + " declaring replica stale" << + KFS_LOG_EOM; + req.server->NotifyStaleChunk(req.chunkId); + return; + } + if (! 
AddHosted(*cmi, req.server)) { + KFS_LOG_STREAM_ERROR << req.Show() << + " no such server, or mappings update failed" << + KFS_LOG_EOM; + return; + } + KFS_LOG_STREAM_INFO << req.Show() << + " replica added: " << req.server->GetServerLocation() << + KFS_LOG_EOM; + const MetaFattr* const fa = cmi->GetFattr(); + const fid_t fileId = fa->id(); + if (! fa->IsStriped() && + ci->offset + (chunkOff_t)CHUNKSIZE >= + fa->nextChunkOffset() && + fa->filesize < 0) { + KFS_LOG_STREAM_DEBUG << + " get chunk size: <" << fileId << + "," << req.chunkId << ">" << + KFS_LOG_EOM; + req.server->GetChunkSize( + fileId, req.chunkId, req.chunkVersion, string()); + } + if (req.server->IsDown()) { + // Went down in GetChunkSize(). + return; + } + const int srvCount = (int)mChunkToServerMap.ServerCount(*cmi); + if (fa->numReplicas <= srvCount) { + CancelPendingMakeStable(fileId, req.chunkId); + } + if (fa->numReplicas != srvCount) { + CheckReplication(*cmi); + } +} + +void +LayoutManager::ProcessPendingBeginMakeStable() +{ + if (mPendingBeginMakeStable.empty()) { + return; + } + PendingBeginMakeStable pendingBeginMakeStable; + pendingBeginMakeStable.swap(mPendingBeginMakeStable); + const bool kBeginMakeStableFlag = true; + const bool kStripedFileFlag = false; + for (PendingBeginMakeStable::const_iterator + it = pendingBeginMakeStable.begin(); + it != pendingBeginMakeStable.end(); + ++it) { + CSMap::Entry* const cmi = mChunkToServerMap.Find(it->first); + if (! 
cmi) { + continue; + } + const bool appendFlag = true; + const bool leaseRelinquishFlag = false; + MakeChunkStableInit( + *cmi, + it->second, + string(), + kBeginMakeStableFlag, + -1, + false, + 0, + kStripedFileFlag, + appendFlag, + leaseRelinquishFlag + ); + } +} + +class PrintChunkServerInfo +{ + ostream& ofs; + const bool useFsTotalSpaceFlag; +public: + PrintChunkServerInfo(ostream& o, bool f) + : ofs(o), + useFsTotalSpaceFlag(f) + {} + void operator()(const ChunkServerPtr& c) const { + ofs << + c->GetServerLocation() << + ' ' << c->GetRack() << + ' ' << c->GetTotalSpace(useFsTotalSpaceFlag) << + ' ' << (useFsTotalSpaceFlag ? + c->GetFsUsedSpace() : c->GetUsedSpace()) << + "\n"; + } +}; + +// +// Dump out the chunk block map to a file. The output can be used in emulation +// modes where we setup the block map and experiment. +// +void +LayoutManager::DumpChunkToServerMap(const string& dirToUse) +{ + // + // to make offline rebalancing/re-replication easier, dump out where the + // servers are and how much space each has. + // + string fn = dirToUse + "/network.def"; + ofstream ofs(fn.c_str(), ofstream::out | ofstream::trunc); + if (ofs) { + for_each(mChunkServers.begin(), mChunkServers.end(), + PrintChunkServerInfo(ofs, mUseFsTotalSpaceFlag)); + ofs.close(); + } + if (! ofs) { + unlink(fn.c_str()); + return; + } + + fn = dirToUse + "/chunkmap.txt"; + ofs.open(fn.c_str(), ofstream::out | ofstream::trunc); + if (ofs) { + DumpChunkToServerMap(ofs); + ofs.close(); + } + if (! 
ofs) { + unlink(fn.c_str()); + return; + } +} + +void +LayoutManager::DumpChunkReplicationCandidates(MetaDumpChunkReplicationCandidates* op) +{ + int64_t total = 0; + int64_t outLeft = mMaxFsckFiles; + op->resp.Clear(); + ostream& os = mWOstream.Set(op->resp, mMaxResponseSize); + for (int state = CSMap::Entry::kStateCheckReplication; + state < CSMap::Entry::kStateCount; + ++state) { + for (const CSMap::Entry* entry = mChunkToServerMap.Front( + CSMap::Entry::State(state)); + entry != 0 && outLeft-- >= 0; + entry = mChunkToServerMap.Next(*entry)) { + if (! (os << entry->GetChunkId() << ' ')) { + break; + } + } + if ( !(os << '\n')) { + break; + } + total += mChunkToServerMap.GetCount(CSMap::Entry::State(state)); + + } + os.flush(); + if (! os) { + op->status = -ENOMEM; + op->statusMsg = "response exceeds max. size"; + } + mWOstream.Reset(); + if (op->status == 0) { + op->numReplication = total; + op->numPendingRecovery = mChunkToServerMap.GetCount( + CSMap::Entry::kStatePendingRecovery); + } else { + op->numReplication = 0; + op->numPendingRecovery = 0; + } +} + +bool +LayoutManager::CanBeRecovered(const CSMap::Entry& entry, + bool& incompleteChunkBlockFlag, + bool* incompleteChunkBlockWriteHasLeaseFlag, + vector& cblk) const +{ + cblk.clear(); + incompleteChunkBlockFlag = false; + if (incompleteChunkBlockWriteHasLeaseFlag) { + *incompleteChunkBlockWriteHasLeaseFlag = false; + } + const MetaFattr* const fa = entry.GetFattr(); + if (! fa) { + panic("chunk mapping null file attribute"); + return false; + } + if (mChunkToServerMap.HasServers(entry)) { + return true; + } + if (! 
fa->HasRecovery()) { + return false; + } + MetaFattr* mfa = 0; + MetaChunkInfo* mci = 0; + chunkOff_t start = -1; + chunkOff_t offset = entry.GetChunkInfo()->offset; + cblk.reserve(fa->numStripes + fa->numRecoveryStripes); + if (metatree.getalloc(fa->id(), + offset, mfa, mci, &cblk, + &start) != 0 || + mfa != fa || + mci != entry.GetChunkInfo()) { + panic("chunk mapping / getalloc mismatch"); + return false; + } + vector::const_iterator it = cblk.begin(); + int stripeIdx = 0; + int goodCnt = 0; + chunkOff_t const end = start + fa->ChunkBlkSize(); + for (chunkOff_t pos = start; + pos < end; + pos += (chunkOff_t)CHUNKSIZE, stripeIdx++) { + if (it == cblk.end()) { + // Incomplete chunk block. + incompleteChunkBlockFlag = true; + break; + } + assert((*it)->offset % CHUNKSIZE == 0); + if (pos < (*it)->offset) { + if (fa->numStripes <= stripeIdx) { + // No recovery: incomplete chunk block. + incompleteChunkBlockFlag = true; + break; + } + goodCnt++; + continue; // no chunk -- hole. + } + if (mChunkToServerMap.HasServers(GetCsEntry(**it))) { + goodCnt++; + } + ++it; + } + if (incompleteChunkBlockFlag && incompleteChunkBlockWriteHasLeaseFlag) { + for (it = cblk.begin(); it != cblk.end(); ++it) { + if (mChunkLeases.GetWriteLease((*it)->chunkId)) { + *incompleteChunkBlockWriteHasLeaseFlag = true; + break; + } + } + } + return (fa->numStripes <= goodCnt); +} + +typedef KeyOnly KeyOnlyFattrPtr; +typedef LinearHash< + KeyOnlyFattrPtr, + KeyCompare, + DynamicArray*> + // Use straight new to reduce the chances to encounter locked by + // the parent process mutex. Glibs malloc can properly deal with + // forking multi threaded processes. + // StdFastAllocator +> MetaFattrSet; + +void +LayoutManager::Fsck(ostream& os, bool reportAbandonedFilesFlag) +{ + // Do full scan, for added safety: the replication lists don't have to + // be correct / up to date. 
+ const int kLostSet = 0; + const int kEndangeredSet = 1; + const int kAbandonedSet = 2; + const int kSetCount = 3; + const char* const setNames[kSetCount] = { + "Lost", + "Endangered", + "Abandoned" + }; + MetaFattrSet files[kSetCount]; + int64_t maxFilesToReport = mMaxFsckFiles; + const int64_t startTime = microseconds(); + const int64_t pastEofRecoveryEndTime = startTime - + mPastEofRecoveryDelay; + const int64_t abandonedFileEndTime = startTime - + mFsckAbandonedFileTimeout; + const int64_t maxEndTime = startTime + mMaxFsckTime; + unsigned int timeCheckCnt = 0; + bool timedOutFlag = false; + vector cblk; + mChunkToServerMap.First(); + for (const CSMap::Entry* p; + (p = mChunkToServerMap.Next()) && + maxFilesToReport > 0; ) { + const size_t serverCnt = mChunkToServerMap.ServerCount(*p); + if (serverCnt <= 0 || + (reportAbandonedFilesFlag && + p->GetFattr()->IsStriped())) { + const MetaFattr* const fa = p->GetFattr(); + // For now treat all striped files as bein written into, + // regardless if the file has recovery stripes or not, + // if chunk logical position is past logical EOF. + if (fa->IsStriped()) { + if (files[kLostSet].Find(fa)) { + continue; // Already has missing chunks. 
+ } + if (fa->mtime < pastEofRecoveryEndTime && + fa->filesize <= + fa->ChunkPosToChunkBlkFileStartPos( + p->GetChunkInfo()->offset)) { + bool insertedFlag = false; + if (fa->mtime < abandonedFileEndTime && + files[kAbandonedSet].Insert( + fa, fa, + insertedFlag) && + insertedFlag && + files[kEndangeredSet].Erase(fa) == 0) { + maxFilesToReport--; + } + continue; + } + if (serverCnt > 0) { + continue; + } + bool incompleteCBFlag = false; + bool incompleteCBWriteHasLeaseFlag = false; + if (CanBeRecovered(*p, incompleteCBFlag, + &incompleteCBWriteHasLeaseFlag, + cblk)) { + continue; + } + if (incompleteCBFlag) { + if (incompleteCBWriteHasLeaseFlag || + fa->mtime >= + abandonedFileEndTime) { + continue; + } + bool insertedFlag = false; + if (files[kAbandonedSet].Insert( + fa, fa, insertedFlag) && + insertedFlag && + files[kEndangeredSet].Erase(fa) == 0) { + maxFilesToReport--; + } + continue; + } + } + bool insertedFlag = false; + if (files[kLostSet].Insert(fa, fa, insertedFlag) && + insertedFlag && + files[kEndangeredSet].Erase(fa) == 0 && + files[kAbandonedSet].Erase(fa) == 0) { + maxFilesToReport--; + } + } else if (serverCnt == 1 && p->GetFattr()->numReplicas > 1) { + const MetaFattr* const fa = p->GetFattr(); + bool insertedFlag = false; + if (! files[kLostSet].Find(fa) && + ! 
files[kAbandonedSet].Find(fa) && + files[kEndangeredSet].Insert( + fa, fa, insertedFlag) && + insertedFlag) { + maxFilesToReport--; + } + } + if ((++timeCheckCnt & 0x3FFF) == 0 && + maxEndTime < microseconds()) { + timedOutFlag = true; + break; + } + } + for (int i = 0; i < kSetCount; i++) { + os << setNames[i] << " files total: " << + files[i].GetSize() << "\n"; + } + if (maxFilesToReport <= 0) { + os << "Warning: report limited to first: " << + mMaxFsckFiles << " files\n"; + } + if (timedOutFlag) { + os << "Warning: report limited due to reaching time" + " limit of: " << (mMaxFsckTime * 1e-6) << " seconds\n"; + } + os << "Fsck run time: " << + ((microseconds() - startTime) * 1e-6) << " sec.\n"; + for (int i = 0; i < kSetCount; i++) { + if (files[i].GetSize() <= 0) { + continue; + } + os << setNames[i] << " files [path size mtime]:\n"; + files[i].First(); + for (const KeyOnlyFattrPtr* p; (p = files[i].Next()); ) { + const MetaFattr* const fa = p->GetKey(); + const string path = metatree.getPathname(fa); + if (path.empty()) { + continue; + } + os << path << "\t" << fa->filesize << "\t" << + DisplayIsoDateTime(fa->mtime) << + "\n"; + } + } +} + +class LayoutManager::FilesChecker +{ +public: + typedef LayoutManager::ChunkPlacement ChunkPlacement; + + enum Status + { + kStateNone = 0, + kLost, + kLostIfServerDown, + kLostIfRackDown, + kAbandoned, + kOk, + kStateCount + }; + static int GetStreamsCount(bool reportAbandonedFilesFlag) + { + return (kStateCount - (reportAbandonedFilesFlag ? 
1 : 2)); + } + + FilesChecker( + LayoutManager& layoutManager, + int64_t maxFilesToReport, + ostream** os) + : mLayoutManager(layoutManager), + mPlacement(), + mStartTime(microseconds()), + mMaxToReportFileCount(maxFilesToReport), + mPath(), + mDepth(0), + mStopFlag(false), + mStopReason(), + mDirCount(0), + mFileCount(0), + mMaxDirDepth(0), + mOverReplicatedCount(0), + mUnderReplicatedCount(0), + mChunkLostCount(0), + mNoRackCount(0), + mRecoveryBlock(0), + mPartialRecoveryBlock(0), + mReplicaCount(0), + mMaxReplicaCount(0), + mToReportFileCount(0), + mMaxChunkCount(0), + mTotalChunkCount(0), + mMaxFileSize(0), + mTotalFilesSize(0), + mStripedFilesCount(0), + mFilesWithRecoveryCount(0), + mMaxReplication(0) + { + mPath.reserve(8 << 10); + for (int i = 0, k = 0; i < kStateCount; i++) { + if ((mOs[i] = os ? os[k] : 0)) { + k++; + } + mFileCounts[i] = 0; + } + } + bool operator()(const MetaDentry& de, + const MetaFattr& fa, + size_t depth) + { + if (mStopFlag) { + return false; + } + if (depth < mDepth) { + mDepth = depth; + mPath.resize(mDepth); + } else if (depth > mDepth) { + assert(depth == mDepth + 1); + mDepth = depth; + } + if (fa.type == KFS_DIR) { + mPath.resize(mDepth); + mPath.push_back(&de); + mMaxDirDepth = max(mMaxDirDepth, mDepth); + mDirCount++; + return true; + } + const chunkOff_t fsize = metatree.getFileSize(fa); + mMaxFileSize = max(mMaxFileSize, fsize); + mTotalFilesSize += fsize; + mMaxChunkCount = max(mMaxChunkCount, fa.chunkcount()); + mFileCount++; + if (fa.HasRecovery()) { + mFilesWithRecoveryCount++; + } else if (fa.IsStriped()) { + mStripedFilesCount++; + } + mMaxReplication = max(mMaxReplication, fa.numReplicas); + mLayoutManager.CheckFile(*this, de, fa); + return (! 
mStopFlag); + } + void Report( + Status status, + const MetaDentry& de, + const MetaFattr& fa) + { + mFileCounts[status]++; + if (mOs[status] == 0 || + mToReportFileCount++ > + mMaxToReportFileCount) { + return; + } + ostream& os = *(mOs[status]); + os << + status << + " " << metatree.getFileSize(fa) << + " " << fa.numReplicas << + " " << fa.striperType << + " " << fa.numStripes << + " " << fa.numRecoveryStripes << + " " << fa.stripeSize << + " " << fa.chunkcount() << + " " << DisplayIsoDateTime(fa.mtime) << + " "; + DisplayPath(os) << "/" << de.getName() << "\n"; + } + void OverReplicated() { mOverReplicatedCount++; } + void UnderReplicated() { mUnderReplicatedCount++; } + void NoRack() { mNoRackCount++; } + void RecoveryBlock() { mRecoveryBlock++; } + void PartialRecoveryBlock() { mPartialRecoveryBlock++; } + void Chunk() { mTotalChunkCount++; } + void ChunkLost() { mChunkLostCount++; } + void ChunkReplicas(size_t cnt) + { + mReplicaCount += cnt; + mMaxReplicaCount = max(mMaxReplicaCount, cnt); + } + ChunkPlacement& GetPlacement() { return mPlacement; } + int64_t StartTime() const { return mStartTime; } + int64_t GetFileCount() const { return mFileCount; } + int64_t ItemsCount() const + { return (mTotalChunkCount + (int64_t)mFileCount); } + void Stop(const string& reason = string()) + { + mStopReason = reason; + mStopFlag = true; + } + const string& GetStopReason() const { return mStopReason; } + void Report(size_t chunkCount) + { + if (! mOs[kStateNone]) { + return; + } + const char* const kStateNames[kStateCount] = { + "none", + "lost", + "lost if server down", + "lost if rack down", + "abandoned", + "ok" + }; + ostream& os = *(mOs[kStateNone]); + if (! mStopFlag) { + // For backward compatibility with kfsfsck client the + // first line must be the following: + os << "Lost files total: " << + mFileCounts[kLost] << "\n"; + } + const char* const suff = mStopFlag ? 
"checked" : "reachable"; + if (mStopFlag) { + os << + "WARNING: fsck report is incomplete: " << + mStopReason << "\n"; + } else if (mMaxToReportFileCount < mToReportFileCount) { + os << + "WARNING: fsck report is incomplete: " << + " exceeded max number files to report: " << + mMaxToReportFileCount << + " total: " << mToReportFileCount << "\n"; + } + const int64_t dirCnt = GetNumDirs(); + const int64_t fileCnt = GetNumFiles(); + os << + "Directories: " << dirCnt << "\n" + "Directories " << suff << ": " << (mDirCount + 1) << + " " << ((mDirCount + 1) * 1e2 / + max(dirCnt, int64_t(1))) << "%\n" + "Directory " << suff << " max depth: " << + (mMaxDirDepth + 1) << "\n" + "Files: " << fileCnt << "\n" + "Files " << suff << ": " << mFileCount << + " " << (mFileCount * 1e2 / + max(fileCnt, int64_t(1))) << "%\n"; + const double fpct = 1e2 / max(mFileCount, size_t(1)); + os << + "Files " << suff << " with recovery: " << + mFilesWithRecoveryCount << + " " << (mFilesWithRecoveryCount * fpct) << "%\n" + "Files " << suff << " striped: " << + mStripedFilesCount << + " " << (mStripedFilesCount * fpct) << "%\n" + "Files " << suff << " sum of logical sizes: " << + mTotalFilesSize << "\n"; + for (int i = kStateNone + 1; i < kStateCount; i++) { + os << + i << " Files " << + suff << " " << kStateNames[i] << ": " << + mFileCounts[i] << " " << + (mFileCounts[i] * fpct) << "%\n"; + } + const double cpct = 1e2 / max(mTotalChunkCount, int64_t(1)); + os << + "File " << suff << " max size: " << mMaxFileSize << "\n" + "File " << suff << " max chunks: " << mMaxChunkCount << "\n" + "File " << suff << " max replication: " << + mMaxReplication << "\n" + "Chunks: " << chunkCount << "\n" + "Chunks " << suff << ": " << mTotalChunkCount << + " " << (mTotalChunkCount * 1e2 / + max(chunkCount, size_t(1))) << "%\n" + "Chunks " << suff << " lost: " << + mChunkLostCount << + " " << (mChunkLostCount * cpct) << "%\n" + "Chunks " << suff << " no rack assigned: " << + mNoRackCount << + " " << (mNoRackCount * 
cpct) << "%\n" + "Chunks " << suff << " over replicated: " << + mOverReplicatedCount << + " " << (mOverReplicatedCount * cpct) << "%\n" + "Chunks " << suff << " under replicated: " << + mUnderReplicatedCount << + " " << (mUnderReplicatedCount * cpct) << "%\n" + "Chunks " << suff << " replicas: " << mReplicaCount << + " " << (mReplicaCount * cpct) << "%\n" + "Chunk " << suff << " max replicas: " << + mMaxReplicaCount << "\n" + "Recovery blocks " << suff << ": " << mRecoveryBlock << "\n" + "Recovery blocks " << suff << " partial: " << + mPartialRecoveryBlock << + " " << (mPartialRecoveryBlock * 1e2 / + max(mRecoveryBlock, size_t(1))) << "%\n" + "Fsck run time: " << + (microseconds() - mStartTime) * 1e-6 << " sec.\n" + "Files: [fsck_state size replication type stripes" + " recovery_stripes stripe_size chunk_count mtime" + " path]\n" + ; + } +private: + typedef vector Path; + + LayoutManager& mLayoutManager; + ChunkPlacement mPlacement; + ostream* mOs[kStateCount]; + size_t mFileCounts[kStateCount]; + const int64_t mStartTime; + const int64_t mMaxToReportFileCount; + Path mPath; + size_t mDepth; + bool mStopFlag; + string mStopReason; + size_t mDirCount; + size_t mFileCount; + size_t mMaxDirDepth; + size_t mOverReplicatedCount; + size_t mUnderReplicatedCount; + size_t mChunkLostCount; + size_t mNoRackCount; + size_t mRecoveryBlock; + size_t mPartialRecoveryBlock; + size_t mReplicaCount; + size_t mMaxReplicaCount; + int64_t mToReportFileCount; + int64_t mMaxChunkCount; + int64_t mTotalChunkCount; + chunkOff_t mMaxFileSize; + int64_t mTotalFilesSize; + size_t mStripedFilesCount; + size_t mFilesWithRecoveryCount; + int mMaxReplication; + + ostream& DisplayPath(ostream& os) const + { + for (Path::const_iterator it = mPath.begin(); + it != mPath.end(); + ++it) { + os << "/" << (*it)->getName(); + } + return os; + } +private: + FilesChecker(const FilesChecker&); + FilesChecker& operator=(const FilesChecker&); +}; + +void +LayoutManager::CheckFile( + FilesChecker& fsck, + 
const MetaDentry& de,
	const MetaFattr& fa)
{
	// Fsck a single file: walk its chunks one chunk-block (stripe +
	// recovery group) at a time, classify the file's redundancy state,
	// and feed per-chunk statistics into the FilesChecker accumulator.
	// (The function signature begins at the end of the previous line.)
	// Re-check the run time limit only every 2^16 scanned items.
	const int64_t kScanCheckMask = ((int64_t(1) << 16) - 1);
	bool stopFlag = false;
	FilesChecker::Status status = FilesChecker::kOk;
	const size_t recoveryStripeCnt = (size_t)(fa.HasRecovery() ?
		fa.numRecoveryStripes : 0);
	// Offset within a chunk block where the recovery stripes start.
	const chunkOff_t recoveryPos = (chunkOff_t)CHUNKSIZE *
		(recoveryStripeCnt > 0 ? fa.numStripes : 1);
	// Logical size of one full chunk block (data + recovery stripes);
	// a single chunk for non-striped files.
	const chunkOff_t chunkBlockSize = (chunkOff_t)CHUNKSIZE *
		(recoveryStripeCnt > 0 ?
			(fa.numStripes + fa.numRecoveryStripes) : 1);
	ChunkIterator it = metatree.getAlloc(fa.id());
	// NOTE(review): template arguments appear to have been dropped by
	// extraction on the StTmp declarations below (StTmp<...>); verify
	// against the upstream source.
	StTmp serversTmp(mServersTmp);
	StTmp placementTmp(mChunkPlacementTmp);
	ChunkPlacement& placement = placementTmp.Get();
	ChunkPlacement& chunkPlacement = fsck.GetPlacement();
	chunkOff_t chunkBlockEnd = -1;
	chunkOff_t chunkBlockCount = 0;
	bool invalidBlkFlag = false;
	// Outer loop: one iteration per chunk block of the file.
	for (const MetaChunkInfo* p = it.next(); p; ) {
		if (p->offset != chunkBlockEnd) {
			chunkBlockEnd = p->offset;
			if (recoveryStripeCnt > 0 && chunkBlockEnd != 0) {
				// Align the block boundary down; a non-zero
				// remainder means the block's head chunks are
				// missing, i.e. a partial recovery block.
				const chunkOff_t blockHead =
					chunkBlockEnd % chunkBlockSize;
				chunkBlockEnd -= blockHead;
				if (blockHead != 0) {
					fsck.PartialRecoveryBlock();
					invalidBlkFlag = true;
				}
			}
		}
		const chunkOff_t recoveryStartPos = chunkBlockEnd + recoveryPos;
		chunkBlockEnd += chunkBlockSize;
		placement.clear();
		size_t missingCnt = 0;
		size_t blockRecoveryStripeCnt = 0;
		if (recoveryStripeCnt > 0) {
			fsck.RecoveryBlock();
		}
		// Inner loop: every chunk belonging to the current block.
		for ( ; p; p = it.next()) {
			const MetaChunkInfo& ci = *p;
			if (chunkBlockEnd <= ci.offset) {
				break;
			}
			fsck.Chunk();
			// Periodically enforce the overall fsck time budget.
			if ((fsck.ItemsCount() & kScanCheckMask) == 0 &&
					fsck.StartTime() + mMaxFsckTime <
					microseconds()) {
				stopFlag = true;
				break;
			}
			if (recoveryStartPos <= ci.offset) {
				blockRecoveryStripeCnt++;
			}
			const CSMap::Entry& entry =
				CSMap::Entry::GetCsEntry(ci);
			Servers& srvs = serversTmp.Get();
			mChunkToServerMap.GetServers(entry, srvs);
			chunkPlacement.clear();
			// Count here chunks that are being evacuated, and the
			// servers that are being retired.
			chunkPlacement.ExcludeServerAndRack(srvs);
			const size_t srvsCnt =
				chunkPlacement.GetExcludedServersCount();
			if (srvsCnt == 0) {
				// No replica reachable: the chunk is lost.
				fsck.ChunkLost();
				missingCnt++;
				continue;
			}
			fsck.ChunkReplicas(srvsCnt);
			const size_t rackCnt =
				chunkPlacement.GetExcludedRacksCount();
			if (rackCnt <= 1) {
				// Single server and/or single rack hosts this
				// chunk: track it for the "lost if server/rack
				// down" classification below.
				if (chunkPlacement.GetExcludedServersCount() <=
						1) {
					placement.ExcludeServer(srvs);
				}
				if (rackCnt > 0) {
					placement.ExcludeRack(srvs);
				} else {
					fsck.NoRack();
				}
			}
			if (srvsCnt < (size_t)fa.numReplicas) {
				fsck.UnderReplicated();
			} else if (srvsCnt > (size_t)fa.numReplicas) {
				fsck.OverReplicated();
			}
		}
		if (stopFlag) {
			break;
		}
		chunkBlockCount++;
		if (blockRecoveryStripeCnt < recoveryStripeCnt) {
			fsck.PartialRecoveryBlock();
			invalidBlkFlag = true;
		}
		// More chunks missing than recovery can rebuild => lost.
		if (recoveryStripeCnt < missingCnt) {
			status = FilesChecker::kLost;
		}
		if (status == FilesChecker::kLost) {
			continue;
		}
		if (recoveryStripeCnt < missingCnt +
				placement.GetExcludedServersMaxCount()) {
			status = FilesChecker::kLostIfServerDown;
		} else if (status == FilesChecker::kOk &&
				recoveryStripeCnt < missingCnt +
				placement.GetExcludedRacksMaxCount()) {
			status = FilesChecker::kLostIfRackDown;
		}
	}
	if (!
stopFlag) {
	// End of LayoutManager::CheckFile() (the function begins in the
	// preceding lines): classify abandoned files, then report.
	if (recoveryStripeCnt > 0 && chunkBlockCount > 0 &&
			(status != FilesChecker::kLost ||
			fa.filesize <= 0 ||
			invalidBlkFlag) &&
			fa.mtime + mPastEofRecoveryDelay <
				fsck.StartTime() &&
			fa.filesize <= (chunkBlockCount - 1) *
				fa.numStripes * (chunkOff_t)CHUNKSIZE &&
			fa.mtime + mFsckAbandonedFileTimeout <
				fsck.StartTime()) {
		// File with recovery whose logical size ends before its last
		// chunk block and that has been idle past the abandoned-file
		// timeout: classify as abandoned.
		status = FilesChecker::kAbandoned;
	}
	fsck.Report(status, de, fa);
} else if (chunkBlockCount <= 0) {
	// Nothing was scanned for this file; the time-budget check in the
	// chunk loop never ran, so perform it here.
	stopFlag = (fsck.ItemsCount() & kScanCheckMask) == 0 &&
		fsck.StartTime() + mMaxFsckTime < microseconds();
}
if (stopFlag) {
	ostringstream os;
	os << "exceeded fsck run time limit of " <<
		(mMaxFsckTime * 1e-6) << " sec.";
	fsck.Stop(os.str());
}
}

// Number of output streams the fsck writer needs: one per file state
// category for a full fsck, otherwise a single (legacy) stream.
int
LayoutManager::FsckStreamCount(bool reportAbandonedFilesFlag) const
{
	return (mFullFsckFlag ?
		FilesChecker::GetStreamsCount(reportAbandonedFilesFlag) : 1);
}

// Run fsck: with full fsck enabled walk every directory entry through
// FilesChecker; otherwise fall back to the single stream report overload.
void
LayoutManager::Fsck(ostream** os, bool reportAbandonedFilesFlag)
{
	if (mFullFsckFlag) {
		FilesChecker fsck(*this, mMaxFsckFiles, os);
		metatree.iterateDentries(fsck);
		fsck.Report(mChunkToServerMap.Size());
	} else if (os && os[0]) {
		Fsck(*(os[0]), reportAbandonedFilesFlag);
	}
}

// Dump every chunk with its file id, replica count, and each replica's
// server id and rack -- one chunk per output line.
void
LayoutManager::DumpChunkToServerMap(ostream& os)
{
	mChunkToServerMap.First();
	// NOTE(review): template arguments appear to have been dropped by
	// extraction here (StTmp<...>); verify against the upstream source.
	StTmp serversTmp(mServersTmp);
	for (const CSMap::Entry* p; (p = mChunkToServerMap.Next()); ) {
		Servers& cs = serversTmp.Get();
		mChunkToServerMap.GetServers(*p, cs);
		os << p->GetChunkId() <<
			" " << p->GetFileId() <<
			" " << cs.size();
		for (Servers::const_iterator it = cs.begin();
				it != cs.end();
				++it) {
			os <<
				" " << (*it)->ServerID() <<
				" " << (*it)->GetRack();
		}
		os << "\n";
	}
}

// Handle a chunk server going down: update rack membership, leases,
// hibernation bookkeeping, and master/slave counters (continues on the
// following lines).
void
LayoutManager::ServerDown(const ChunkServerPtr& server)
{
	if (!
server->IsDown()) {
		server->ForceDown();
	}
	// The chunk-to-server map and mChunkServers must agree on whether
	// this server is known; a mismatch indicates a stale server entry.
	const bool validFlag = mChunkToServerMap.Validate(server);
	Servers::iterator const i = find(
		mChunkServers.begin(), mChunkServers.end(), server);
	if (validFlag != (i != mChunkServers.end())) {
		panic("stale server");
		return;
	}
	if (! validFlag) {
		return;
	}
	RackInfos::iterator const rackIter = find_if(
		mRacks.begin(), mRacks.end(),
		bind(&RackInfo::id, _1) == server->GetRack());
	if (rackIter != mRacks.end()) {
		rackIter->removeServer(server);
		if (rackIter->getServers().empty()) {
			// the entire rack of servers is gone
			// so, take the rack out
			KFS_LOG_STREAM_INFO << "All servers in rack " <<
				server->GetRack() << " are down; taking out the rack" <<
			KFS_LOG_EOM;
			mRacks.erase(rackIter);
		}
	}

	// Schedule to expire write leases, and invalidate record append cache.
	mChunkLeases.ServerDown(server, mARAChunkCache, mChunkToServerMap);

	const bool canBeMaster = server->CanBeChunkMaster();
	const time_t now = TimeNow();
	const ServerLocation loc = server->GetServerLocation();
	const size_t blockCount = server->GetChunkCount();
	string reason = server->DownReason();

	KFS_LOG_STREAM_INFO <<
		"server down: " << loc <<
		" block count: " << blockCount <<
		" master: " << canBeMaster <<
		(reason.empty() ? "" : " reason: " ) << reason <<
	KFS_LOG_EOM;

	// check if this server was sent to hibernation
	HibernatingServerInfo_t* const hs = FindHibernatingServer(loc);
	bool isHibernating = hs != 0;
	if (isHibernating) {
		HibernatingServerInfo_t& hsi = *hs;
		const bool wasHibernatedFlag = hsi.IsHibernated();
		const size_t prevIdx = hsi.csmapIdx;
		if (mChunkToServerMap.SetHibernated(server, hsi.csmapIdx)) {
			if (! wasHibernatedFlag) {
				reason = "Hibernated";
			} else {
				// The server was already hibernated; drop the
				// previous hibernated-map slot before keeping
				// the new one.
				if (! mChunkToServerMap.RemoveHibernatedServer(
						prevIdx)) {
					panic("failed to update hibernated"
						" server index");
				}
				KFS_LOG_STREAM_ERROR <<
					"hibernated server reconnect "
					" failure " <<
					" location: " << loc <<
					" index: " << prevIdx <<
					" -> " << hsi.csmapIdx <<
					" blocks: " << blockCount <<
				KFS_LOG_EOM;
				reason = "Reconnect failed";
			}
		} else {
			reason = "Hibernated";
		}
	}

	if (! isHibernating && server->IsRetiring()) {
		reason = "Retired";
	} else if (reason.empty()) {
		reason = "Unreachable";
	}
	// for reporting purposes, record when it went down
	ostringstream os;
	os <<
		"s=" << loc.hostname <<
		", p=" << loc.port <<
		", down=" <<
			DisplayDateTime(int64_t(now) * kSecs2MicroSecs) <<
		", reason=" << reason <<
	"\t";
	mDownServers.push_back(os.str());
	if (mDownServers.size() > mMaxDownServersHistorySize) {
		mDownServers.erase(mDownServers.begin(), mDownServers.begin() +
			mDownServers.size() - mMaxDownServersHistorySize);
	}

	if (! isHibernating && server->GetChunkCount() > 0) {
		const int kMinReplicationDelay = 15;
		const int replicationDelay = mServerDownReplicationDelay -
			server->TimeSinceLastHeartbeat();
		if (replicationDelay > kMinReplicationDelay) {
			// Delay replication by marking server as hibernated,
			// to allow the server to reconnect back.
			mHibernatingServers.push_back(
				HibernatingServerInfo_t());
			HibernatingServerInfo_t& hsi =
				mHibernatingServers.back();
			hsi.location = loc;
			hsi.sleepEndTime = TimeNow() + replicationDelay;
			if (!
mChunkToServerMap.SetHibernated(
					server, hsi.csmapIdx)) {
				panic("failed to initiate hibernation");
			}
			isHibernating = true;
		}
	}

	// Tail of LayoutManager::ServerDown(): adjust master/slave and
	// restart counters, then drop the server from the active set.
	if (canBeMaster) {
		if (mMastersCount > 0) {
			mMastersCount--;
		}
	} else if (mSlavesCount > 0) {
		mSlavesCount--;
	}
	if (server->IsRestartScheduled()) {
		if (mCSToRestartCount > 0) {
			mCSToRestartCount--;
		}
		if (mMastersToRestartCount > 0 && server->CanBeChunkMaster()) {
			mMastersToRestartCount--;
		}
	}
	if (! isHibernating) {
		if (! mChunkToServerMap.RemoveServer(server)) {
			panic("remove server failure");
		}
	}
	mChunkServers.erase(i);
	// If the last master just left, promote a slave so allocations can
	// still find a write master (unless masters are assigned by IP).
	if (! mAssignMasterByIpFlag &&
			mMastersCount == 0 && ! mChunkServers.empty()) {
		assert(mSlavesCount > 0 &&
			! mChunkServers.front()->CanBeChunkMaster());
		mSlavesCount--;
		mMastersCount++;
		mChunkServers.front()->SetCanBeChunkMaster(true);
	}
	UpdateReplicationsThreshold();
	ScheduleCleanup();
}

// Look up the hibernating-server record for the given location; returns
// 0 when the server is not hibernating.
HibernatingServerInfo_t*
LayoutManager::FindHibernatingServer(const ServerLocation& loc)
{
	HibernatedServerInfos::iterator const it = find_if(
		mHibernatingServers.begin(), mHibernatingServers.end(),
		bind(&HibernatingServerInfo_t::location, _1) == loc
	);
	return (it != mHibernatingServers.end() ? &(*it) : 0);
}

// Retire a chunk server, optionally hibernating it for "downtime"
// seconds. Returns 0 on success, -EPERM when retire is disabled, or
// -ENOENT when the server is unknown (continues on the following lines).
int
LayoutManager::RetireServer(const ServerLocation &loc, int downtime)
{
	if (! mAllowChunkServerRetireFlag && downtime <= 0) {
		KFS_LOG_STREAM_INFO << "chunk server retire is not enabled" <<
		KFS_LOG_EOM;
		return -EPERM;
	}
	Servers::iterator const si = find_if(
		mChunkServers.begin(), mChunkServers.end(),
		MatchingServer(loc)
	);
	if (si == mChunkServers.end() || (*si)->IsDown()) {
		// Update the down time, and let the hibernation status check
		// take the appropriate action.
HibernatingServerInfo_t* const hs = FindHibernatingServer(loc);
		if (hs) {
			hs->sleepEndTime = TimeNow() + max(0, downtime);
			return 0;
		}
		return -ENOENT;
	}

	mMightHaveRetiringServersFlag = true;
	ChunkServerPtr const server(*si);
	if (server->IsRetiring()) {
		KFS_LOG_STREAM_INFO << "server: " << loc <<
			" has already retiring status" <<
			" down time: " << downtime <<
		KFS_LOG_EOM;
		if (downtime <= 0) {
			// The server is already retiring.
			return 0;
		}
		// Change from retiring to hibernating state.
	}

	server->SetRetiring();
	if (downtime > 0) {
		// Hibernate: record (or refresh) the expected wake-up time so
		// replication is deferred until the server is overdue.
		HibernatingServerInfo_t* const hs = FindHibernatingServer(loc);
		if (hs) {
			hs->sleepEndTime = TimeNow() + downtime;
		} else {
			mHibernatingServers.push_back(HibernatingServerInfo_t());
			HibernatingServerInfo_t& hsi = mHibernatingServers.back();
			hsi.location = loc;
			hsi.sleepEndTime = TimeNow() + downtime;
		}
		KFS_LOG_STREAM_INFO << "hibernating server: " << loc <<
			" down time: " << downtime <<
		KFS_LOG_EOM;
		server->Retire(); // Remove when connection will go down.
		return 0;
	}

	if (server->GetChunkCount() <= 0) {
		// Nothing to evacuate; retire immediately.
		server->Retire();
		return 0;
	}

	// Kick off re-replication of this server's chunks.
	InitCheckAllChunks();
	return 0;
}

// Scaling factor that makes slave load comparable to master load for
// placement decisions; recomputed lazily (continues on the next lines).
int64_t
LayoutManager::GetSlavePlacementScale()
{
	if (! mUpdatePlacementScaleFlag) {
		return mSlavePlacementScale;
	}
	mUpdatePlacementScaleFlag = false;
	// Make slaves comparable to masters for the purpose of RS placement
	// replication 1 or non append placement.
mSlavePlacementScale = max(int64_t(1),
		// Fixed-point ratio of (master load * slave candidates) to
		// (slave load * master candidates), split across the shift
		// to reduce overflow risk.
		((mCSMasterLoadAvgSum * mCSSlavePossibleCandidateCount) <<
			kSlaveScaleFracBits / 2) /
		max(int64_t(1), (mCSSlaveLoadAvgSum *
			mCSMasterPossibleCandidateCount) >>
			(kSlaveScaleFracBits - kSlaveScaleFracBits / 2)));
	if (mSlavePlacementScale > mMaxSlavePlacementRange) {
		mSlavePlacementScale -= mMaxSlavePlacementRange;
	} else {
		mSlavePlacementScale = int64_t(1) << kSlaveScaleFracBits;
	}
	return mSlavePlacementScale;
}

// Lazily recompute the maximum "good candidate" load averages (overall,
// master, and slave) from the current load sums and candidate counts.
void
LayoutManager::UpdateGoodCandidateLoadAvg()
{
	if (! mUpdateCSLoadAvgFlag) {
		return;
	}
	mUpdateCSLoadAvgFlag = false;
	mCSMaxGoodCandidateLoadAvg = (int64_t)(mCSLoadAvgSum *
		mCSMaxGoodCandidateLoadRatio /
		max(mCSTotalPossibleCandidateCount, 1));
	mCSMaxGoodMasterCandidateLoadAvg = (int64_t)(mCSMasterLoadAvgSum *
		mCSMaxGoodMasterLoadRatio /
		max(mCSMasterPossibleCandidateCount, 1));
	mCSMaxGoodSlaveCandidateLoadAvg = (int64_t)(mCSSlaveLoadAvgSum *
		mCSMaxGoodSlaveLoadRatio /
		max(mCSSlavePossibleCandidateCount, 1));
}

// Static eligibility test for chunk placement: enough space, responsive,
// not retiring or scheduled for restart, and under the utilization cap.
bool
LayoutManager::CanBeCandidateServer(const ChunkServer& c) const
{
	return (
		c.GetAvailSpace() >= mChunkAllocMinAvailSpace &&
		c.IsResponsiveServer() &&
		! c.IsRetiring() &&
		! c.IsRestartScheduled() &&
		c.GetSpaceUtilization(mUseFsTotalSpaceFlag) <=
			mMaxSpaceUtilizationThreshold
	);
}

// Dynamic eligibility test: candidate flag set, concurrent writes below
// the per-drive threshold, and load average within the good-candidate
// bound for the server's role (continues on the next lines).
bool
LayoutManager::IsCandidateServer(const ChunkServer& c,
	double writableChunksThresholdRatio /* = 1 */)
{
	UpdateGoodCandidateLoadAvg();
	return (
		c.GetCanBeCandidateServerFlag() &&
		c.GetNumChunkWrites() < c.GetNumWritableDrives() *
			mMaxWritesPerDriveThreshold *
			writableChunksThresholdRatio &&
		(c.GetLoadAvg() <= (c.CanBeChunkMaster() ?
mCSMaxGoodMasterCandidateLoadAvg :
			mCSMaxGoodSlaveCandidateLoadAvg))
	);
}

// Apply a load-average delta for one chunk server and maintain the
// aggregate load sums and possible-candidate counters (global, per rack,
// and per master/slave role).
void
LayoutManager::UpdateSrvLoadAvg(ChunkServer& srv, int64_t delta,
	bool canBeCandidateFlag /* = true */)
{
	mUpdateCSLoadAvgFlag = true;
	mUpdatePlacementScaleFlag = true;
	const bool wasPossibleCandidate = srv.GetCanBeCandidateServerFlag();
	if (wasPossibleCandidate && delta != 0) {
		// Server already counted: just adjust the load sums.
		mCSLoadAvgSum += delta;
		if (srv.CanBeChunkMaster()) {
			mCSMasterLoadAvgSum += delta;
		} else {
			mCSSlaveLoadAvgSum += delta;
		}
		assert(mCSLoadAvgSum >= 0 &&
			mCSMasterLoadAvgSum >= 0 && mCSSlaveLoadAvgSum >= 0);
	}
	const bool isPossibleCandidate =
		canBeCandidateFlag && CanBeCandidateServer(srv);
	if (wasPossibleCandidate == isPossibleCandidate) {
		return;
	}
	// Candidate status flipped: move the server in or out of the
	// candidate counters and load sums.
	const int inc = isPossibleCandidate ? 1 : -1;
	RackInfos::iterator const rackIter = find_if(
		mRacks.begin(), mRacks.end(),
		bind(&RackInfo::id, _1) == srv.GetRack());
	if (rackIter != mRacks.end()) {
		rackIter->updatePossibleCandidatesCount(inc);
	}
	mCSTotalPossibleCandidateCount += inc;
	if (srv.CanBeChunkMaster()) {
		mCSMasterPossibleCandidateCount += inc;
	} else {
		mCSSlavePossibleCandidateCount += inc;
	}
	assert(
		mCSTotalPossibleCandidateCount >= 0 &&
		mCSMasterPossibleCandidateCount >= 0 &&
		mCSSlavePossibleCandidateCount >= 0
	);
	const int64_t davg = isPossibleCandidate ?
srv.GetLoadAvg() : -srv.GetLoadAvg();
	// Add (or remove) the server's entire load average now that its
	// candidate status changed.
	mCSLoadAvgSum += davg;
	if (srv.CanBeChunkMaster()) {
		mCSMasterLoadAvgSum += davg;
	} else {
		mCSSlaveLoadAvgSum += davg;
	}
	assert(mCSLoadAvgSum >= 0 &&
		mCSMasterLoadAvgSum >= 0 && mCSSlaveLoadAvgSum >= 0);
	srv.SetCanBeCandidateServerFlag(isPossibleCandidate);
}

// Maintain the global chunk-write and writable-drive counters, and
// derive the per-drive concurrent-write threshold used by
// IsCandidateServer(). The first argument is intentionally unused.
void
LayoutManager::UpdateChunkWritesPerDrive(ChunkServer& /* srv */,
	int deltaNumChunkWrites, int deltaNumWritableDrives)
{
	mTotalChunkWrites += deltaNumChunkWrites;
	mTotalWritableDrives += deltaNumWritableDrives;
	if (deltaNumWritableDrives != 0) {
		mTotalWritableDrivesMult = mTotalWritableDrives > 0 ?
			mMaxWritesPerDriveRatio / mTotalWritableDrives : 0.;
	}
	// Clamp counters at zero; deltas may transiently drive them negative.
	if (mTotalChunkWrites <= 0) {
		mTotalChunkWrites = 0;
	}
	if (mTotalWritableDrives <= 0) {
		mTotalWritableDrives = 0;
		mMaxWritesPerDriveThreshold = mMinWritesPerDrive;
	} else if (mTotalChunkWrites <= mTotalWritableDrives) {
		mMaxWritesPerDriveThreshold = mMinWritesPerDrive;
	} else {
		mMaxWritesPerDriveThreshold = max(mMinWritesPerDrive,
			(int)(mTotalChunkWrites * mTotalWritableDrivesMult));
	}
}

// Weight of the given rack for local placement, capped at maxWeight;
// unknown or negative rack ids get the maximum weight.
inline double
GetRackWeight(
	const LayoutManager::RackInfos& racks,
	LayoutManager::RackId rack,
	double maxWeight)
{
	if (rack < 0) {
		return maxWeight;
	}
	LayoutManager::RackInfos::const_iterator const it = find_if(
		racks.begin(), racks.end(), bind(&RackInfo::id, _1) == rack);
	return (it == racks.end() ?
		maxWeight : min(maxWeight, it->getWeight()));
}
///
/// The algorithm for picking a set of servers to hold a chunk is: (1) pick
/// the server with the most amount of free space, and (2) to break
/// ties, pick the one with the least amount of used space. This
/// policy has the effect of doing round-robin allocations. The
/// allocated space is something that we track. Note: We rely on the
/// chunk servers to tell us how much space is used up on the server.
+/// Since servers can respond at different rates, doing allocations +/// based on allocated space ensures equitable distribution; +/// otherwise, if we were to do allocations based on the amount of +/// used space, then a slow responding server will get pummelled with +/// lots of chunks (i.e., used space will be updated on the meta +/// server at a slow rate, causing the meta server to think that the +/// chunk server has lot of space available). +/// +int +LayoutManager::AllocateChunk( + MetaAllocate* r, const vector& chunkBlock) +{ + // r->offset is a multiple of CHUNKSIZE + assert(r->offset >= 0 && (r->offset % CHUNKSIZE) == 0); + + r->servers.clear(); + if (r->numReplicas <= 0) { + // huh? allocate a chunk with 0 replicas??? + KFS_LOG_STREAM_DEBUG << + "allocate chunk reaplicas: " << r->numReplicas << + " request: " << r->Show() << + KFS_LOG_EOM; + r->statusMsg = "0 replicas"; + return -EINVAL; + } + StTmp placementTmp(mChunkPlacementTmp); + ChunkPlacement& placement = placementTmp.Get(); + if (r->stripedFileFlag) { + // For replication greater than one do the same placement, but + // only take into the account write masters, or the chunk server + // hosting the first replica. + for (StripedFilesAllocationsInFlight::const_iterator it = + mStripedFilesAllocationsInFlight.lower_bound( + make_pair(make_pair( + r->fid, r->chunkBlockStart), 0)); + it != mStripedFilesAllocationsInFlight.end() && + it->first.first == r->fid; + ++it) { + if (it->first.second == r->chunkBlockStart) { + const ChunkLeases::WriteLease* const lease = + mChunkLeases.GetWriteLease(it->second); + if (! lease || ! 
lease->chunkServer) { + continue; + } + if (lease->allocInFlight && + lease->allocInFlight->status == 0) { + placement.ExcludeServerAndRack( + lease->allocInFlight->servers, + it->second); + } else { + placement.ExcludeServerAndRack( + lease->chunkServer, it->second); + } + } + } + StTmp serversTmp(mServersTmp); + for (vector::const_iterator it = + chunkBlock.begin(); + it != chunkBlock.end(); + ++it) { + Servers& srvs = serversTmp.Get(); + mChunkToServerMap.GetServers(GetCsEntry(**it), srvs); + placement.ExcludeServerAndRack(srvs, (*it)->chunkId); + } + } + r->servers.reserve(r->numReplicas); + + // for non-record append case, take the server local to the machine on + // which the client is on make that the master; this avoids a network transfer. + // For the record append case, to avoid deadlocks when writing out large + // records, we are doing hierarchical allocation: a chunkserver that is + // a chunk master is never made a slave. + ChunkServerPtr localserver; + int replicaCnt = 0; + Servers::iterator const li = (! (r->appendChunk ? + mAllowLocalPlacementForAppendFlag && + ! mInRackPlacementForAppendFlag : + mAllowLocalPlacementFlag) || + r->clientIp.empty()) ? + mChunkServers.end() : + find_if(mChunkServers.begin(), mChunkServers.end(), + MatchServerByHost(r->clientIp)); + if (li != mChunkServers.end() && + (! r->appendChunk || (*li)->CanBeChunkMaster()) && + IsCandidateServer( + **li, + GetRackWeight(mRacks, (*li)->GetRack(), + mMaxLocalPlacementWeight) + ) && + placement.CanBeUsed(*li)) { + replicaCnt++; + localserver = *li; + placement.ExcludeServer(localserver); + } + RackId rackIdToUse = -1; + if ((r->appendChunk ? + mInRackPlacementForAppendFlag : + mInRackPlacementFlag) && + ! mRacks.empty() && ! 
r->clientIp.empty()) { + if (li != mChunkServers.end()) { + rackIdToUse = (*li)->GetRack(); + } + if (rackIdToUse < 0) { + rackIdToUse = GetRackId(r->clientIp); + } + if (rackIdToUse < 0 && li == mChunkServers.end()) { + Servers::iterator const it = find_if( + mChunkServers.begin(), + mChunkServers.end(), + MatchServerByHost(r->clientIp)); + if (it != mChunkServers.end()) { + rackIdToUse = (*it)->GetRack(); + } + } + } + const bool kForReplicationFlag = false; + placement.FindCandidates(kForReplicationFlag, rackIdToUse); + size_t numServersPerRack(1); + if (r->numReplicas > 1) { + numServersPerRack = placement.GetCandidateRackCount(); + if (r->appendChunk ? + mInRackPlacementForAppendFlag : + mInRackPlacementFlag) { + numServersPerRack = r->numReplicas; + } else if (numServersPerRack <= 1 || + (numServersPerRack < (size_t)r->numReplicas && + placement.GetExcludedRacksCount() > 0)) { + // Place first replica, then re-calculate. + numServersPerRack = 1; + } else { + numServersPerRack = ((size_t)r->numReplicas + + numServersPerRack - 1) / numServersPerRack; + } + } + // For append always reserve the first slot -- write master. + if (r->appendChunk || localserver) { + r->servers.push_back(localserver); + } + int mastersSkipped = 0; + int slavesSkipped = 0; + size_t numCandidates = 0; + for (; ;) { + // take as many as we can from this rack + const size_t psz = r->servers.size(); + const RackId rackId = placement.GetRackId(); + for (size_t n = (localserver && + rackId == localserver->GetRack()) ? 1 : 0; + (n < numServersPerRack || rackId < 0) && + replicaCnt < r->numReplicas; + ) { + const ChunkServerPtr cs = + placement.GetNext(r->stripedFileFlag); + if (! 
cs) { + break; + } + if (placement.IsUsingServerExcludes() && + find(r->servers.begin(), + r->servers.end(), cs) != + r->servers.end()) { + continue; + } + numCandidates++; + if (r->appendChunk) { + // for record appends, to avoid deadlocks for + // buffer allocation during atomic record + // appends, use hierarchical chunkserver + // selection + if (cs->CanBeChunkMaster()) { + if (r->servers.front()) { + mastersSkipped++; + continue; + } + r->servers.front() = cs; + } else { + if (r->servers.size() >= + (size_t)r->numReplicas) { + slavesSkipped++; + continue; + } + if (mAllocateDebugVerifyFlag && + find(r->servers.begin(), + r->servers.end(), cs) != + r->servers.end()) { + panic("allocate: duplicate slave"); + continue; + } + r->servers.push_back(cs); + } + } else { + if (mAllocateDebugVerifyFlag && + find(r->servers.begin(), + r->servers.end(), cs) != + r->servers.end()) { + panic("allocate: duplicate server"); + continue; + } + r->servers.push_back(cs); + } + n++; + replicaCnt++; + } + if (r->numReplicas <= replicaCnt || placement.IsLastRack()) { + break; + } + if (r->appendChunk && mInRackPlacementForAppendFlag && + rackId >= 0 && + (r->numReplicas + 1) * + placement.GetCandidateRackCount() < + mChunkServers.size()) { + // Reset, try to find another rack where both replicas + // can be placed. + // This assumes that the racks are reasonably + // "balanced". + replicaCnt = 0; + r->servers.clear(); + r->servers.push_back(ChunkServerPtr()); + localserver.reset(); + } else if (r->stripedFileFlag && r->numReplicas > 1 && + numServersPerRack == 1 && + psz == 0 && r->servers.size() == size_t(1)) { + // Striped file placement: attempt to place the first + // chunk replica on a different rack / server than other + // chunks in the stripe. + // Attempt to place all subsequent replicas on different + // racks. 
+ placement.clear(); + placement.ExcludeServerAndRack(r->servers); + placement.FindCandidates(kForReplicationFlag); + numServersPerRack = placement.GetCandidateRackCount(); + numServersPerRack = numServersPerRack <= 1 ? + (size_t)(r->numReplicas - replicaCnt) : + ((size_t)(r->numReplicas - replicaCnt) + + numServersPerRack - 1) / numServersPerRack; + } else { + placement.ExcludeServer( + r->servers.begin() + psz, r->servers.end()); + } + if (! placement.NextRack()) { + break; + } + } + bool noMaster = false; + if (r->servers.empty() || (noMaster = ! r->servers.front())) { + int dontLikeCount[2] = { 0, 0 }; + int outOfSpaceCount[2] = { 0, 0 }; + int notResponsiveCount[2] = { 0, 0 }; + int retiringCount[2] = { 0, 0 }; + int restartingCount[2] = { 0, 0 }; + for (Servers::const_iterator it = + mChunkServers.begin(); + it != mChunkServers.end(); + ++it) { + const ChunkServer& cs = **it; + const int i = cs.CanBeChunkMaster() ? 0 : 1; + if (! IsCandidateServer(cs)) { + dontLikeCount[i]++; + } + if (cs.GetAvailSpace() < mChunkAllocMinAvailSpace || + cs.GetSpaceUtilization( + mUseFsTotalSpaceFlag) > + mMaxSpaceUtilizationThreshold) { + outOfSpaceCount[i]++; + } + if (! cs.IsResponsiveServer()) { + notResponsiveCount[i]++; + } + if (cs.IsRetiring()) { + retiringCount[i]++; + } + if (cs.IsRestartScheduled()) { + restartingCount[i]++; + } + } + const size_t numFound = r->servers.size(); + r->servers.clear(); + KFS_LOG_STREAM_INFO << "allocate chunk no " << + (noMaster ? 
"master" : "servers") << + " repl: " << r->numReplicas << + "/" << replicaCnt << + " servers: " << numFound << + "/" << mChunkServers.size() << + " dont like: " << dontLikeCount[0] << + "/" << dontLikeCount[1] << + " no space: " << outOfSpaceCount[0] << + "/" << outOfSpaceCount[1] << + " slow: " << notResponsiveCount[0] << + "/" << notResponsiveCount[1] << + " retire: " << retiringCount[0] << + "/" << retiringCount[1] << + " restart: " << restartingCount[0] << + "/" << restartingCount[1] << + " racks: " << placement.GetCandidateRackCount() << + " candidates: " << numCandidates << + " masters: " << mastersSkipped << + "/" << mMastersCount << + " slaves: " << slavesSkipped << + "/" << mSlavesCount << + " to restart: " << mCSToRestartCount << + "/" << mMastersToRestartCount << + " request: " << r->Show() << + KFS_LOG_EOM; + r->statusMsg = noMaster ? "no master" : "no servers"; + return -ENOSPC; + } + assert(r->servers.size() <= (size_t)r->numReplicas); + r->master = r->servers[0]; + + if (! mChunkLeases.NewWriteLease( + r->chunkId, + r->chunkVersion, + GetInitialWriteLeaseExpireTime(), + r->servers[0], + r->pathname.GetStr(), + r->appendChunk, + r->stripedFileFlag, + r, + r->leaseId)) { + panic("failed to get write lease for a new chunk"); + } + + if (r->stripedFileFlag) { + if (! mStripedFilesAllocationsInFlight.insert(make_pair(make_pair( + r->fid, r->chunkBlockStart), r->chunkId)).second) { + panic("duplicate in striped file allocation entry"); + } + } + for (size_t i = r->servers.size(); i-- > 0; ) { + r->servers[i]->AllocateChunk(r, i == 0 ? r->leaseId : -1); + } + // Handle possible recursion ensure that request still valid. + if (! 
r->servers.empty() && r->appendChunk && r->status >= 0) {
		// Tail of LayoutManager::AllocateChunk(): the allocation is
		// still valid after the (possibly recursive) dispatch above,
		// so make the new chunk available for further appends.
		mARAChunkCache.RequestNew(*r);
	}
	return 0;
}

// Request object that logs a "beginchunkversionchange" record for an
// allocation; when the log write completes it dispatches the chunk
// allocation to every selected server (lease goes to the master, index
// 0) and deletes itself.
struct MetaLogChunkVersionChange : public MetaRequest, public KfsCallbackObj
{
	MetaAllocate& alloc;
	MetaLogChunkVersionChange(MetaAllocate& alloc)
		: MetaRequest(
			META_LOG_CHUNK_VERSION_CHANGE, true, alloc.opSeqno),
		KfsCallbackObj(),
		alloc(alloc)
	{
		SET_HANDLER(this, &MetaLogChunkVersionChange::logDone);
		clnt = this;
	}
	virtual void handle()
		{ status = 0; }
	virtual string Show() const
	{
		return string("log-chunk-version-change: ") + alloc.Show();
	}
	// Serialize the transaction-log record for this version change.
	virtual int log(ostream &file) const
	{
		file << "beginchunkversionchange"
			"/file/" << alloc.fid <<
			"/chunkId/" << alloc.chunkId <<
			"/chunkVersion/" << alloc.chunkVersion <<
		"\n";
		return file.fail() ? -EIO : 0;
	}
	// Log-completion callback: fan the allocation out to the chunk
	// servers, then self-destruct. Note: "this" is deleted before the
	// dispatch loop runs; only the local copy of alloc is used after.
	int logDone(int code, void* data)
	{
		assert(code == EVENT_CMD_DONE && data == this);
		MetaAllocate& r = alloc;
		delete this;
		for (size_t i = r.servers.size(); i-- > 0; ) {
			r.servers[i]->AllocateChunk(&r, i == 0 ? r.leaseId : -1);
		}
		return 0;
	}
};

// Replay a logged "beginchunkversionchange": validate the transition and
// record the version roll-back delta. Returns false on an invalid record
// (continues on the following lines).
bool
LayoutManager::ReplayBeginChangeChunkVersion(
	fid_t fid,
	chunkId_t chunkId,
	seq_t chunkVersion)
{
	const char* err = 0;
	const CSMap::Entry* const cs = mChunkToServerMap.Find(chunkId);
	if (! cs) {
		err = "no such chunk";
	}
	const seq_t vers = err ? -1 : cs->GetChunkInfo()->chunkVersion;
	if (! err && vers >= chunkVersion) {
		err = "invalid version transition";
	}
	if (! err) {
		mChunkVersionRollBack[chunkId] = chunkVersion - vers;
	}
	KFS_LOG_STREAM(err ?
		MsgLogger::kLogLevelWARN :
		MsgLogger::kLogLevelDEBUG) <<
		"replay beginchunkversionchange"
		" fid: " << fid <<
		" chunkId: " << chunkId <<
		" version: " << vers << "=>" << chunkVersion <<
		" " << (err ? err : "OK") <<
	KFS_LOG_EOM;
	return (!
err);
}

// Checkpoint helper: write a "beginchunkversionchange" record for every
// pending chunk version roll back, skipping invalid or stale entries.
// Returns 0 on success, -EIO on stream failure.
int
LayoutManager::WritePendingChunkVersionChange(ostream& os) const
{
	for (ChunkVersionRollBack::const_iterator
			it = mChunkVersionRollBack.begin();
			it != mChunkVersionRollBack.end() && os;
			++it) {
		if (it->second <= 0) {
			// Roll-back increments must be positive; log and skip.
			KFS_LOG_STREAM_ERROR <<
				"version change invalid chunk roll back entry:"
				" chunk: " << it->first <<
				" version increment: " << it->second <<
			KFS_LOG_EOM;
			continue;
		}
		const CSMap::Entry* const ci = mChunkToServerMap.Find(it->first);
		if (! ci) {
			// Stale mapping.
			KFS_LOG_STREAM_ERROR <<
				"version change failed to get chunk mapping:"
				" chunk: " << it->first <<
				" version increment: " << it->second <<
			KFS_LOG_EOM;
			continue;
		}
		const seq_t vers = ci->GetChunkInfo()->chunkVersion;
		os << "beginchunkversionchange"
			"/file/" << ci->GetFileId() <<
			"/chunkId/" << it->first <<
			"/chunkVersion/" << (vers + it->second) <<
		"\n";
	}
	return (os ? 0 : -EIO);
}

// Convenience overload: count in-flight operations of a single type for
// the given chunk.
int
LayoutManager::GetInFlightChunkOpsCount(chunkId_t chunkId, MetaOp opType) const
{
	const MetaOp types[] = { opType, META_NUM_OPS_COUNT };
	return GetInFlightChunkOpsCount(chunkId, types);
}

// Count in-flight operations that can modify the given chunk, optionally
// collecting the servers involved (continues on the following lines).
int
LayoutManager::GetInFlightChunkModificationOpCount(
	chunkId_t chunkId,
	LayoutManager::Servers* srvs /* = 0 */) const
{
	MetaOp const types[] = {
		META_CHUNK_REPLICATE, // Recovery or replication.
		META_CHUNK_VERSCHANGE, // Always runs after recovery.
+ META_CHUNK_MAKE_STABLE, + META_NUM_OPS_COUNT // Sentinel + }; + return GetInFlightChunkOpsCount(chunkId, types, srvs); +} + +int +LayoutManager::GetInFlightChunkOpsCount( + chunkId_t chunkId, + const MetaOp* opTypes, + LayoutManager::Servers* srvs /* = 0 */) const +{ + int ret = 0; + const ChunkServer::ChunkOpsInFlight& ops = + ChunkServer::GetChunkOpsInFlight(); + pair< + ChunkServer::ChunkOpsInFlight::const_iterator, + ChunkServer::ChunkOpsInFlight::const_iterator + > const range = ops.equal_range(chunkId); + for (ChunkServer::ChunkOpsInFlight::const_iterator it = range.first; + it != range.second; + ++it) { + for (const MetaOp* op = opTypes; + *op != META_NUM_OPS_COUNT; + op++) { + if (it->second->op == *op) { + ret++; + } + if (srvs && find(srvs->begin(), srvs->end(), + it->second->server) == srvs->end()) { + srvs->push_back(it->second->server); + } + } + } + return ret; +} + +int +LayoutManager::GetChunkWriteLease(MetaAllocate *r, bool &isNewLease) +{ + if (InRecovery()) { + KFS_LOG_STREAM_INFO << + "GetChunkWriteLease: InRecovery() => EBUSY" << + KFS_LOG_EOM; + r->statusMsg = "meta server in recovery mode"; + return -EBUSY; + } + if (GetInFlightChunkModificationOpCount(r->chunkId) > 0) { + // Wait for re-replication to finish. + KFS_LOG_STREAM_INFO << "Write lease: " << r->chunkId << + " is being re-replicated => EBUSY" << + KFS_LOG_EOM; + r->statusMsg = "replication is in progress"; + return -EBUSY; + } + const CSMap::Entry* const ci = mChunkToServerMap.Find(r->chunkId); + if (! ci) { + r->statusMsg = "no such chunk"; + return -EINVAL; + } + int ret = 0; + if (! mChunkToServerMap.HasServers(*ci)) { + r->statusMsg = "no replicas available"; + ret = -EDATAUNAVAIL; + if (! r->stripedFileFlag) { + return ret; + } + // Renew write lease with striped files, even if no + // replica available to ensure that the chunk block can not + // change, and recovery can not be started. 
+ // Chunk invalidation and normal chunk close (in the case when + // replica re-appears) will expire the lease. + } + + const ChunkLeases::WriteLease* const l = + mChunkLeases.RenewValidWriteLease(r->chunkId); + if (l) { + if (l->allocInFlight) { + r->statusMsg = + "allocation or version change is in progress"; + KFS_LOG_STREAM_INFO << "write lease denied" + " chunk " << r->chunkId << " " << r->statusMsg << + KFS_LOG_EOM; + return -EBUSY; + } + if (l->appendFlag) { + r->statusMsg = "valid write append lease exists"; + KFS_LOG_STREAM_INFO << "write lease denied" + " chunk " << r->chunkId << " " << r->statusMsg << + KFS_LOG_EOM; + return -EBUSY; + } + // valid write lease; so, tell the client where to go + KFS_LOG_STREAM_INFO << + "valid write lease:" + " chunk: " << r->chunkId << + " expires in: " << (l->expires - TimeNow()) << " sec." + " replicas: " << mChunkToServerMap.ServerCount(*ci) << + " status: " << ret << + KFS_LOG_EOM; + if (ret < 0) { + isNewLease = false; + r->servers.clear(); + mChunkToServerMap.GetServers(*ci, r->servers); + r->master = l->chunkServer; + return ret; + } + // Delete the lease to force version number bump. + // Assume that the client encountered a write error. + mChunkLeases.Delete(r->chunkId); + } + if (ret < 0) { + return ret; + } + // there is no valid write lease; to issue a new write lease, we + // need to do a version # bump. do that only if we haven't yet + // handed out valid read leases + if (! ExpiredLeaseCleanup(r->chunkId)) { + r->statusMsg = "valid read lease"; + KFS_LOG_STREAM_DEBUG << "write lease denied" + " chunk " << r->chunkId << " " << r->statusMsg << + KFS_LOG_EOM; + return -EBUSY; + } + // Check if make stable is in progress. + // It is crucial to check the after invoking ExpiredLeaseCleanup() + // Expired lease cleanup the above can start make chunk stable. + if (! 
IsChunkStable(r->chunkId)) { + r->statusMsg = "chunk is not stable"; + KFS_LOG_STREAM_DEBUG << "write lease denied" + " chunk " << r->chunkId << " " << r->statusMsg << + KFS_LOG_EOM; + return -EBUSY; + } + // Check if servers vector has changed: + // chunk servers can go down in ExpiredLeaseCleanup() + r->servers.clear(); + mChunkToServerMap.GetServers(*ci, r->servers); + if (r->servers.empty()) { + // all the associated servers are dead...so, fail + // the allocation request. + r->statusMsg = "no replicas available"; + return -EDATAUNAVAIL; + } + // Need space on the servers..otherwise, fail it + Servers::size_type i; + for (i = 0; i < r->servers.size(); i++) { + if (r->servers[i]->GetAvailSpace() < mChunkAllocMinAvailSpace) { + return -ENOSPC; + } + } + isNewLease = true; + assert(r->chunkVersion == r->initialChunkVersion); + // When issuing a new lease, increment the version, skipping over + // the failed version increment attemtps. + r->chunkVersion += IncrementChunkVersionRollBack(r->chunkId); + if (! mChunkLeases.NewWriteLease( + r->chunkId, + r->chunkVersion, + GetInitialWriteLeaseExpireTime(), + r->servers[0], + r->pathname.GetStr(), + r->appendChunk, + r->stripedFileFlag, + r, + r->leaseId)) { + panic("failed to get write lease for a new chunk"); + } + + r->master = r->servers[0]; + KFS_LOG_STREAM_INFO << + "new write" + " lease:" << r->leaseId << + " chunk: " << r->chunkId << + " version: " << r->chunkVersion << + KFS_LOG_EOM; + submit_request(new MetaLogChunkVersionChange(*r)); + return 0; +} + +bool +LayoutManager::IsAllocationAllowed(MetaAllocate* req) +{ + if (req->clientProtoVers < mMinChunkAllocClientProtoVersion) { + req->status = -EPERM; + req->statusMsg = "client upgrade required"; + return false; + } + return true; +} +/* + * \brief During atomic record appends, a client tries to allocate a block. + * Since the client doesn't know the file size, the client notifies the + * metaserver it is trying to append. 
Simply allocating a new chunk for each + * such request will cause too many chunks. Instead, the metaserver picks one + * of the existing chunks of the file which has a valid write lease (presumably, + * that chunk is not full), and returns that info. When the client gets the + * info, it is possible that the chunk became full. In such a scenario, the + * client may have to try multiple times until it finds a chunk that it can + * write to. + */ +int +LayoutManager::AllocateChunkForAppend(MetaAllocate* req) +{ + ARAChunkCache::iterator const it = mARAChunkCache.Find(req->fid); + ARAChunkCache::Entry* const entry = mARAChunkCache.Get(it); + if (! entry) { + return -1; + } + + KFS_LOG_STREAM_DEBUG << "Append on file " << req->fid << + " with offset " << req->offset << + " max offset " << entry->offset << + (entry->IsAllocationPending() ? + " allocation in progress" : "") << + " appenders: " << entry->numAppendersInChunk << + KFS_LOG_EOM; + + if (entry->offset < 0 || (entry->offset % CHUNKSIZE) != 0 || + ! entry->master) { + panic("invalid write append cache entry"); + mARAChunkCache.Invalidate(req->fid); + return -1; + } + if (mVerifyAllOpsPermissionsFlag && ! entry->permissions.CanWrite( + req->euser, req->egroup)) { + return -EACCES; + } + // The client is providing an offset hint in the case when it needs a + // new chunk: space allocation failed because chunk is full, or it can + // not talk to the chunk server. + // + // If allocation has already finished, then cache entry offset is valid, + // otherwise the offset is equal to EOF at the time the initial request + // has started. The client specifies offset just to indicate that it + // wants a new chunk, and when the allocation finishes it will get the + // new chunk. + if (entry->offset < req->offset && ! entry->IsAllocationPending()) { + mARAChunkCache.Invalidate(it); + return -1; + } + // Ensure that master is still good. 
+ if (entry->numAppendersInChunk > mMinAppendersPerChunk) { + UpdateGoodCandidateLoadAvg(); + if (entry->master->GetLoadAvg() > + mCSMaxGoodMasterCandidateLoadAvg) { + KFS_LOG_STREAM_INFO << + "invalidating append cache entry: " << + req->fid << + " " << entry->master->GetServerLocation() << + " load: " << entry->master->GetLoadAvg() << + " exceeds: " << + mCSMaxGoodMasterCandidateLoadAvg << + KFS_LOG_EOM; + mARAChunkCache.Invalidate(it); + return -1; + } + } + // Since there is no un-reservation mechanism, decay reservation by + // factor of 2 every mReservationDecayStep sec. + // The goal is primarily to decrease # or rtt and meta server cpu + // consumption due to chunk space reservation contention between + // multiple concurrent appenders, while keeping chunk size as large as + // possible. + // Start decay only after allocation completes. + // Enforce timeout on pending allocation, in order not to re-queue the + // timed out client back to the same allocation group. + const time_t now = TimeNow(); + if (entry->IsAllocationPending()) { + if (entry->lastDecayTime + mAllocAppendReuseInFlightTimeoutSec < now) { + mARAChunkCache.Invalidate(it); + return -1; + } + } else if (mReservationDecayStep > 0 && + entry->lastDecayTime + + mReservationDecayStep <= now) { + const size_t exp = (now - entry->lastDecayTime) / + mReservationDecayStep; + if (exp >= sizeof(entry->spaceReservationSize) * 8) { + entry->spaceReservationSize = 0; + } else { + entry->spaceReservationSize >>= exp; + } + entry->lastDecayTime = now; + } + const int reservationSize = (int)(min(double(mMaxReservationSize), + mReservationOvercommitFactor * + max(1, req->spaceReservationSize))); + if (entry->spaceReservationSize + reservationSize > + mChunkReservationThreshold) { + return -1; + } + const ChunkLeases::WriteLease* const wl = + mChunkLeases.RenewValidWriteLease(entry->chunkId); + if (! 
wl) { + mARAChunkCache.Invalidate(it); + return -1; + } + // valid write lease; so, tell the client where to go + req->chunkId = entry->chunkId; + req->offset = entry->offset; + req->chunkVersion = entry->chunkVersion; + entry->numAppendersInChunk++; + entry->lastAccessedTime = now; + entry->spaceReservationSize += reservationSize; + const bool pending = entry->AddPending(*req); + if (! pending && req->responseStr.empty()) { + // The cached response will have or already has all the info. + // Presently it should never get here. + KFS_LOG_STREAM_WARN << + "invalid write append cache entry:" + " no cached response" << + " file: " << req->fid << + " chunk: " << entry->chunkId << + " offset: " << entry->offset << + KFS_LOG_EOM; + mARAChunkCache.Invalidate(it); + return -1; + } + KFS_LOG_STREAM_DEBUG << + "Valid write lease exists for " << req->chunkId << + " expires in " << (wl->expires - TimeNow()) << " sec" << + " space: " << entry->spaceReservationSize << + " (+" << reservationSize << + "," << req->spaceReservationSize << ")" << + " num appenders: " << entry->numAppendersInChunk << + (pending ? " allocation in progress" : "") << + KFS_LOG_EOM; + if (entry->numAppendersInChunk >= mMaxAppendersPerChunk) { + mARAChunkCache.Invalidate(it); + } + return 0; +} + +/* + * The chunk files are named . The fid is now ignored by + * the meta server. +*/ +void +LayoutManager::ChangeChunkFid(MetaFattr* srcFattr, MetaFattr* dstFattr, + MetaChunkInfo* chunk) +{ + if (mChunkEntryToChange || mFattrToChangeTo) { + panic("coalesce blocks: invalid invocation:" + " previous change pending"); + return; + } + if (! chunk) { + if (! srcFattr) { + if (dstFattr) { + panic("coalesce blocks: invalid invocation:" + " src fattr is not null"); + } + return; + } + // Invalidate fid cache. + mARAChunkCache.Invalidate(srcFattr->id()); + return; + } + if (! 
dstFattr) { + panic("coalesce blocks: invalid invocation:" + " null destination fattr"); + return; + } + + CSMap::Entry& entry = GetCsEntry(*chunk); + if (entry.GetFattr() != srcFattr) { + ostringstream os; + os << + "coalesce blocks: chunk: " << chunk->chunkId << + " undexpected file attr: " << (void*)entry.GetFattr() << + " id: " << entry.GetFileId() << + " expect: " << (void*)srcFattr << + " id: " << srcFattr->id() + ; + const string msg = os.str(); + panic(msg.c_str()); + return; + } + mChunkEntryToChange = &entry; + mFattrToChangeTo = dstFattr; +} + +int +LayoutManager::GetChunkReadLeases(MetaLeaseAcquire& req) +{ + if (req.chunkIds.empty()) { + req.leaseIds.clear(); + return 0; + } + const bool recoveryFlag = InRecovery(); + const char* p = req.chunkIds.GetPtr(); + const char* e = p + req.chunkIds.GetSize(); + ostream& os = ClearStringStream(); + int ret = 0; + while (p < e) { + chunkId_t chunkId; + if (! ValueParser::ParseInt(p, e - p, chunkId)) { + while (p < e && *p <= ' ') { + p++; + } + if (p != e) { + req.status = -EINVAL; + req.statusMsg = "chunk id list parse error"; + ClearStringStream(); + ret = req.status; + } + break; + } + ChunkLeases::LeaseId leaseId = 0; + const CSMap::Entry* cs = 0; + if ((recoveryFlag && ! req.fromChunkServerFlag) || + ! IsChunkStable(chunkId)) { + leaseId = -EBUSY; + } else if ((req.leaseTimeout <= 0 ? + mChunkLeases.HasWriteLease(chunkId) : + ! ((cs = mChunkToServerMap.Find(chunkId)) && + mChunkToServerMap.HasServers(*cs) && + mChunkLeases.NewReadLease( + chunkId, + TimeNow() + min(req.leaseTimeout, + LEASE_INTERVAL_SECS), + leaseId)))) { + leaseId = -EBUSY; + if (req.flushFlag) { + mChunkLeases.FlushWriteLease( + chunkId, mARAChunkCache, + mChunkToServerMap); + } + } else if (! ((cs = mChunkToServerMap.Find(chunkId)) && + mChunkToServerMap.HasServers(*cs))) { + // Cannot obtain lease if no replicas exist. + leaseId = cs ? 
-EAGAIN : -EINVAL; + } + os << " " << leaseId; + } + req.leaseIds = mStringStream.str(); + ClearStringStream(); + return ret; +} + +/* + * \brief Process a reqeuest for a READ lease. +*/ +int +LayoutManager::GetChunkReadLease(MetaLeaseAcquire* req) +{ + const int ret = GetChunkReadLeases(*req); + if (ret != 0 || req->chunkId < 0) { + return ret; + } + if (InRecovery() && ! req->fromChunkServerFlag) { + req->statusMsg = "recovery is in progress"; + KFS_LOG_STREAM_INFO << "chunk " << req->chunkId << + " " << req->statusMsg << " => EBUSY" << + KFS_LOG_EOM; + return -EBUSY; + } + const CSMap::Entry* const cs = mChunkToServerMap.Find(req->chunkId); + if (! cs || ! mChunkToServerMap.HasServers(*cs)) { + req->statusMsg = cs ? "no replica available" : "no such chunk"; + return (cs ? -EAGAIN : -EINVAL); + } + if (! req->fromChunkServerFlag && mVerifyAllOpsPermissionsFlag && + ! cs->GetFattr()->CanRead(req->euser, req->egroup)) { + return -EACCES; + } + // + // Even if there is no write lease, wait until the chunk is stable + // before the client can read the data. We could optimize by letting + // the client read from servers where the data is stable, but that + // requires more book-keeping; so, we'll defer for now. + // + if (! IsChunkStable(req->chunkId)) { + req->statusMsg = "is not yet stable"; + KFS_LOG_STREAM_INFO << "Chunk " << req->chunkId << + " " << req->statusMsg << " => EBUSY" << + KFS_LOG_EOM; + return -EBUSY; + } + if ((req->leaseTimeout <= 0 ? + ! mChunkLeases.HasWriteLease(req->chunkId) : + mChunkLeases.NewReadLease( + req->chunkId, + TimeNow() + min(req->leaseTimeout, + LEASE_INTERVAL_SECS), + req->leaseId))) { + return 0; + } + req->statusMsg = "has write lease"; + if (req->flushFlag) { + const char* errMsg = mChunkLeases.FlushWriteLease( + req->chunkId, mARAChunkCache, mChunkToServerMap); + req->statusMsg += "; "; + req->statusMsg += errMsg ? 
errMsg : + "initiated write lease relinquish"; + } + KFS_LOG_STREAM_INFO << "Chunk " << req->chunkId << + " " << req->statusMsg << " => EBUSY" << + KFS_LOG_EOM; + return -EBUSY; +} + +class ValidLeaseIssued +{ + const ChunkLeases& leases; +public: + ValidLeaseIssued(const ChunkLeases& cl) + : leases(cl) {} + bool operator() (MetaChunkInfo *c) const { + return leases.HasValidLease(c->chunkId); + } +}; + +bool +LayoutManager::IsValidLeaseIssued(const vector& c) +{ + vector::const_iterator const i = find_if( + c.begin(), c.end(), + ValidLeaseIssued(mChunkLeases) + ); + if (i == c.end()) { + return false; + } + KFS_LOG_STREAM_DEBUG << "Valid lease issued on chunk: " << + (*i)->chunkId << KFS_LOG_EOM; + return true; +} + +int +LayoutManager::LeaseRenew(MetaLeaseRenew *req) +{ + if (! mChunkToServerMap.Find(req->chunkId)) { + if (InRecovery()) { + mChunkLeases.SetMaxLeaseId(req->leaseId + 1); + } + return -EINVAL; + } + return mChunkLeases.Renew(req->chunkId, req->leaseId); +} + +/// +/// Handling a corrupted chunk involves removing the mapping +/// from chunk id->chunkserver that we know has it. +/// +void +LayoutManager::ChunkCorrupt(MetaChunkCorrupt *r) +{ + if (! r->isChunkLost) { + r->server->IncCorruptChunks(); + } + KFS_LOG_STREAM_INFO << + "server " << r->server->ServerID() << + " claims chunk: <" << + r->fid << "," << r->chunkId << + "> to be " << (r->isChunkLost ? "lost" : "corrupt") << + KFS_LOG_EOM; + ChunkCorrupt(r->chunkId, r->server, false); +} + +void +LayoutManager::ChunkCorrupt(chunkId_t chunkId, const ChunkServerPtr& server, + bool notifyStale) +{ + CSMap::Entry* const ci = mChunkToServerMap.Find(chunkId); + if (! ci) { + return; + } + const bool removedFlag = ci->Remove(mChunkToServerMap, server); + mChunkLeases.ReplicaLost(chunkId, server.get()); + // Invalidate cache. 
+ mARAChunkCache.Invalidate(ci->GetFileId(), chunkId); + if (removedFlag) { + // check the replication state when the replicaiton checker gets to it + CheckReplication(*ci); + } + KFS_LOG_STREAM_INFO << "server " << server->ServerID() << + " declaring: <" << + ci->GetFileId() << "," << chunkId << + "> lost" << + " servers: " << mChunkToServerMap.ServerCount(*ci) << + (removedFlag ? " -1" : " -0") << + KFS_LOG_EOM; + if (! notifyStale || server->IsDown()) { + return; + } + server->NotifyStaleChunk(chunkId); +} + +void +LayoutManager::ChunkEvacuate(MetaChunkEvacuate* r) +{ + if (r->server->IsDown() || r->server->IsRetiring()) { + return; + } + r->server->UpdateSpace(*r); + ChunkIdQueue deletedChunks; + ChunkIdQueue evacuatedChunks; + const MetaAllocate* alloc = 0; + const char* p = r->chunkIds.GetPtr(); + const char* e = p + r->chunkIds.GetSize(); + while (p < e) { + chunkId_t chunkId; + if (! ValueParser::ParseInt(p, e - p, chunkId)) { + while (p < e && *p <= ' ') { + p++; + } + if (p != e) { + r->status = -EINVAL; + r->statusMsg = "chunk id list parse error"; + } + break; + } + CSMap::Entry* const ci = mChunkToServerMap.Find(chunkId); + if (! ci) { + const ChunkLeases::WriteLease* const lease = + mChunkLeases.GetWriteLease(chunkId); + if (! lease || ! (alloc = lease->allocInFlight) || + find(alloc->servers.begin(), + alloc->servers.end(), + r->server) == + alloc->servers.end()) { + deletedChunks.PushBack(chunkId); + alloc = 0; + continue; + } + } else if (! ci->HasServer(mChunkToServerMap, r->server)) { + evacuatedChunks.PushBack(chunkId); + continue; + } + const int status = r->server->Evacuate(chunkId); + if (status == -EEXIST) { + continue; // Already scheduled. + } + if (status != 0) { + r->status = status; + if (status == -EAGAIN) { + r->statusMsg = "exceeded evacuate queue limit"; + } + break; + } + if (! ci) { + assert(alloc); + alloc = 0; + continue; + } + CheckReplication(*ci); + } + if (! 
deletedChunks.IsEmpty()) { + r->server->NotifyStaleChunks(deletedChunks); + } + if (! evacuatedChunks.IsEmpty() && ! r->server->IsDown()) { + const bool kEvacuatedFlag = true; + r->server->NotifyStaleChunks(evacuatedChunks, kEvacuatedFlag); + } +} + +void +CSMap::Entry::destroy() +{ + gLayoutManager.DeleteChunk(*this); +} + +void +LayoutManager::DeleteChunk(CSMap::Entry& entry) +{ + if (mChunkEntryToChange == &entry) { + // The entry is deleted from the b+tree, it should be inserted + // back shortly with different file attribute. + MetaFattr* const fa = mFattrToChangeTo; + const bool checkReplicationFlag = ! fa || ! entry.GetFattr() || + fa->numReplicas != entry.GetFattr()->numReplicas; + mChunkEntryToChange = 0; + mFattrToChangeTo = 0; + entry.SetFattr(fa); + if (checkReplicationFlag) { + CheckReplication(entry); + } + return; + } + + const fid_t fid = entry.GetFileId(); + const chunkId_t chunkId = entry.GetChunkInfo()->chunkId; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(entry, servers); + // remove the mapping + mChunkToServerMap.Erase(chunkId); + DeleteChunk(fid, chunkId, servers); +} + +void +LayoutManager::DeleteChunk(fid_t fid, chunkId_t chunkId, + const LayoutManager::Servers& servers) +{ + // Make a copy to deal with possible recursion. 
+ Servers const cs(servers); + + for (StripedFilesAllocationsInFlight::iterator it = + mStripedFilesAllocationsInFlight.lower_bound( + make_pair(make_pair(fid, 0), 0)); + it != mStripedFilesAllocationsInFlight.end() && + it->first.first == fid; + ++it) { + if (it->second == chunkId) { + mStripedFilesAllocationsInFlight.erase(it); + break; + } + } + mARAChunkCache.Invalidate(fid, chunkId); + mPendingBeginMakeStable.erase(chunkId); + mPendingMakeStable.erase(chunkId); + mChunkLeases.Delete(chunkId); + mChunkVersionRollBack.erase(chunkId); + + // submit an RPC request + for_each(cs.begin(), cs.end(), + bind(&ChunkServer::DeleteChunk, _1, chunkId)); +} + +void +LayoutManager::DeleteChunk(MetaAllocate *req) +{ + if (mChunkToServerMap.Find(req->chunkId)) { + panic("allocation attempts to delete existing chunk mapping"); + return; + } + DeleteChunk(req->fid, req->chunkId, req->servers); +} + +bool +LayoutManager::InvalidateAllChunkReplicas( + fid_t fid, chunkOff_t offset, chunkId_t chunkId, seq_t& chunkVersion) +{ + CSMap::Entry* const ci = mChunkToServerMap.Find(chunkId); + if (! 
ci || ci->GetFileId() != fid) { + return false; + } + MetaChunkInfo* const mci = ci->GetChunkInfo(); + if (mci->offset != offset) { + return false; + } + mci->chunkVersion += IncrementChunkVersionRollBack(chunkId); + chunkVersion = mci->chunkVersion; + StTmp serversTmp(mServers3Tmp); + Servers& c = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, c); + ci->RemoveAllServers(mChunkToServerMap); + mARAChunkCache.Invalidate(ci->GetFileId(), chunkId); + mPendingBeginMakeStable.erase(chunkId); + mPendingMakeStable.erase(chunkId); + mChunkLeases.Delete(chunkId); + mChunkVersionRollBack.erase(chunkId); + const bool kEvacuateChunkFlag = false; + for_each(c.begin(), c.end(), bind(&ChunkServer::NotifyStaleChunk, + _1, chunkId, kEvacuateChunkFlag)); + return true; +} + +MetaChunkInfo* +LayoutManager::AddChunkToServerMapping(MetaFattr* fattr, + chunkOff_t offset, chunkId_t chunkId, seq_t chunkVersion, + bool& newEntryFlag) +{ + if (! fattr) { + panic("AddChunkToServerMapping: fattr == null"); + return 0; + } + CSMap::Entry* const ret = mChunkToServerMap.Insert(fattr, + offset, chunkId, chunkVersion, newEntryFlag); + if (! ret) { + panic("failed to create chunk map entry"); + return 0; + } + // Chunk allocation log or checkpoint entry resets chunk version roll + // back. + mChunkVersionRollBack.erase(chunkId); + return ret->GetChunkInfo(); +} + +int +LayoutManager::UpdateChunkToServerMapping(chunkId_t chunkId, const ChunkServerPtr& s) +{ + // If the chunkid isn't present in the mapping table, it could be a + // stale chunk + CSMap::Entry* const ci = mChunkToServerMap.Find(chunkId); + if (! ci) { + return -1; + } + AddHosted(chunkId, *ci, s); + return 0; +} + +bool +LayoutManager::GetChunkFileId(chunkId_t chunkId, fid_t& fileId, + const MetaChunkInfo** chunkInfo, const MetaFattr** fa, + LayoutManager::Servers* srvs) +{ + const CSMap::Entry* const entry = mChunkToServerMap.Find(chunkId); + if (! 
entry) { + return false; + } + fileId = entry->GetFileId(); + if (fa) { + *fa = entry->GetFattr(); + } + if (chunkInfo) { + *chunkInfo = entry->GetChunkInfo(); + } + if (srvs) { + mChunkToServerMap.GetServers(*entry, *srvs); + } + return true; +} + +int +LayoutManager::GetChunkToServerMapping(MetaChunkInfo& chunkInfo, + LayoutManager::Servers& c, MetaFattr*& fa, bool* orderReplicasFlag /* = 0 */) +{ + const CSMap::Entry& entry = GetCsEntry(chunkInfo); + fa = entry.GetFattr(); + c.clear(); + const size_t cnt = mChunkToServerMap.GetServers(entry, c); + if (cnt <= 0) { + return -1; + } + if (cnt <= 1 || ! orderReplicasFlag || + ! mGetAllocOrderServersByLoadFlag) { + return 0; + } + // Random shuffle hosting servers, such that the servers with + // smaller load go before the servers with larger load. + int64_t loadAvgSum = 0; + const int64_t kLoadAvgFloor = 1; + for (Servers::const_iterator it = c.begin(); + it != c.end(); + ++it) { + loadAvgSum += (*it)->GetLoadAvg() + kLoadAvgFloor; + } + *orderReplicasFlag = true; + for (size_t i = c.size(); i >= 2; ) { + assert(loadAvgSum > 0); + int64_t rnd = Rand(loadAvgSum); + size_t ri = i--; + int64_t load; + do { + --ri; + load = c[ri]->GetLoadAvg() + kLoadAvgFloor; + rnd -= load; + } while (rnd >= 0 && ri > 0); + iter_swap(c.begin() + i, c.begin() + ri); + loadAvgSum -= load; + } + return 0; +} + +int64_t +LayoutManager::GetFreeIoBufferByteCount() const +{ + // This has to be re-entrant. Racy check is OK though. + return ( + mBufferPool ? 
+ (int64_t)mBufferPool->GetFreeBufferCount() * + mBufferPool->GetBufferSize() : int64_t(-1) + ); +} + +class Pinger +{ + ostream& os; + const bool useFsTotalSpaceFlag; +public: + uint64_t totalSpace; + uint64_t usedSpace; + uint64_t freeFsSpace; + uint64_t goodMasters; + uint64_t goodSlaves; + uint64_t writableDrives; + uint64_t totalDrives; + LayoutManager::Servers retiring; + LayoutManager::Servers evacuating; + + Pinger(ostream& s, bool f) + : os(s), + useFsTotalSpaceFlag(f), + totalSpace(0), + usedSpace(0), + freeFsSpace(0), + goodMasters(0), + goodSlaves(0), + writableDrives(0), + totalDrives(0), + retiring(), + evacuating() + {} + void Process(const ChunkServerPtr& c) + { + ChunkServer& cs = *c; + cs.Ping(os, useFsTotalSpaceFlag); + totalSpace += cs.GetTotalSpace(useFsTotalSpaceFlag); + usedSpace += cs.GetUsedSpace(); + freeFsSpace += cs.GetFreeFsSpace(); + if (gLayoutManager.IsCandidateServer(cs)) { + if (cs.CanBeChunkMaster()) { + goodMasters++; + } else { + goodSlaves++; + } + writableDrives += max(0, cs.GetNumWritableDrives()); + } + totalDrives += max(0, cs.GetNumDrives()); + if (cs.IsRetiring()) { + retiring.push_back(c); + } else if (cs.GetEvacuateCount() > 0) { + evacuating.push_back(c); + } + } +}; + +void +LayoutManager::Ping(IOBuffer& buf, bool wormModeFlag) +{ + if (! 
mPingResponse.IsEmpty() && + TimeNow() < mPingUpdateTime + mPingUpdateInterval) { + buf.Copy(&mPingResponse, mPingResponse.BytesConsumable()); + return; + } + UpdateGoodCandidateLoadAvg(); + mPingResponse.Clear(); + IOBuffer tmpbuf; + mWOstream.Set(tmpbuf); + mWOstream << + "\r\n" + "Servers: "; + Pinger pinger(mWOstream, mUseFsTotalSpaceFlag); + for_each(mChunkServers.begin(), mChunkServers.end(), + bind(&Pinger::Process, ref(pinger), _1)); + mWOstream << + "\r\n" + "Retiring Servers: "; + for_each(pinger.retiring.begin(), pinger.retiring.end(), + bind(&ChunkServer::GetRetiringStatus, _1, ref(mWOstream))); + mWOstream << + "\r\n" + "Evacuating Servers: "; + for_each(pinger.evacuating.begin(), pinger.evacuating.end(), + bind(&ChunkServer::GetEvacuateStatus, _1, ref(mWOstream))); + mWOstream << + "\r\n" + "Down Servers: "; + copy(mDownServers.begin(), mDownServers.end(), + ostream_iterator(mWOstream)); + mWOstream << + "\r\n" + "Rebalance status: "; + mRebalanceCtrs.Show(mWOstream, "= ", "\t"); + mWOstream << + "\r\n" + "Config: " << mConfig; + const bool kRusageSelfFlag = true; + mWOstream << + "\r\n" + "Rusage self: "; + showrusage(mWOstream, "= ", "\t", kRusageSelfFlag); + mWOstream << + "\r\n" + "Rusage children: "; + showrusage(mWOstream, "= ", "\t", ! kRusageSelfFlag); + mWOstream << "\r\n\r\n"; // End of headers. + mWOstream.flush(); + // Initial headers. + mWOstream.Set(mPingResponse); + mPingUpdateTime = TimeNow(); + mWOstream << + "Build-version: " << KFS_BUILD_VERSION_STRING << "\r\n" + "Source-version: " << KFS_SOURCE_REVISION_STRING << "\r\n" + "WORM: " << (wormModeFlag ? 
"1" : "0") << "\r\n" + "System Info: " + "Up since= " << DisplayDateTime(kSecs2MicroSecs * mStartTime) << "\t" + "Total space= " << pinger.totalSpace << "\t" + "Used space= " << pinger.usedSpace << "\t" + "Replications= " << mNumOngoingReplications << "\t" + "Replications check= " << mChunkToServerMap.GetCount( + CSMap::Entry::kStateCheckReplication) << "\t" + "Pending recovery= " << mChunkToServerMap.GetCount( + CSMap::Entry::kStatePendingRecovery) << "\t" + "Repl check timeouts= " << mReplicationCheckTimeouts << "\t" + "Find repl timemoust= " << mReplicationFindWorkTimeouts << "\t" + "Update time= " << DisplayDateTime(kSecs2MicroSecs * mPingUpdateTime) << "\t" + "Uptime= " << (mPingUpdateTime - mStartTime) << "\t" + "Buffers= " << + (mBufferPool ? mBufferPool->GetUsedBufferCount() : 0) << "\t" + "Clients= " << ClientSM::GetClientCount() << "\t" + "Chunk srvs= " << ChunkServer::GetChunkServerCount() << "\t" + "Requests= " << MetaRequest::GetRequestCount() << "\t" + "Sockets= " << globals().ctrOpenNetFds.GetValue() << "\t" + "Chunks= " << mChunkToServerMap.Size() << "\t" + "Pending replication= " << mChunkToServerMap.GetCount( + CSMap::Entry::kStatePendingReplication) << "\t" + "Internal nodes= " << + MetaNode::getPoolAllocator().GetInUseCount() << "\t" + "Internal node size= " << + MetaNode::getPoolAllocator().GetItemSize() << "\t" + "Internal nodes storage= " << + MetaNode::getPoolAllocator().GetStorageSize() << "\t" + "Dentry nodes= " << + MetaNode::getPoolAllocator().GetInUseCount() << "\t" + "Dentry node size= " << + MetaNode::getPoolAllocator().GetItemSize() << "\t" + "Dentry nodes storage= " << + MetaNode::getPoolAllocator().GetStorageSize() << "\t" + "Fattr nodes= " << + MetaNode::getPoolAllocator().GetInUseCount() << "\t" + "Fattr node size= " << + MetaNode::getPoolAllocator().GetItemSize() << "\t" + "Fattr nodes storage= " << + MetaNode::getPoolAllocator().GetStorageSize() << "\t" + "ChunkInfo nodes= " << + CSMap::Entry::GetAllocBlockCount() << "\t" + 
"ChunkInfo node size= " << + sizeof(MetaChunkInfo) << "\t" + "ChunkInfo nodes storage= " << + 0 << "\t" + "CSmap nodes= " << + mChunkToServerMap.GetAllocator().GetInUseCount() << "\t" + "CSmap node size= " << + mChunkToServerMap.GetAllocator().GetItemSize() << "\t" + "CSmap nodes storage= " << + mChunkToServerMap.GetAllocator().GetStorageSize() << "\t" + "CSmap entry nodes= " << + CSMap::Entry::GetAllocBlockCount() << "\t" + "CSmap entry bytes= " << + CSMap::Entry::GetAllocByteCount() << "\t" + "Delayed recovery= " << mChunkToServerMap.GetCount( + CSMap::Entry::kStateDelayedRecovery) << "\t" + "Replication backlog= " << mChunkToServerMap.GetCount( + CSMap::Entry::kStateNoDestination) << "\t" + "In recovery= " << (InRecovery() ? 1 : 0) << "\t" + "To restart= " << mCSToRestartCount << "\t" + "To restart masters= " << mMastersToRestartCount << "\t" << + "CS Max Good Load Avg= " << + mCSMaxGoodCandidateLoadAvg << "\t" << + "CS Max Good Master Load Avg= " << + mCSMaxGoodMasterCandidateLoadAvg << "\t" << + "CS Max Good Slave Load Avg= " << + mCSMaxGoodSlaveCandidateLoadAvg << "\t" << + "Hibernated servers= " << + mChunkToServerMap.GetHibernatedCount() << "\t" + "Free space= " << pinger.freeFsSpace << "\t" + "Good masters= " << pinger.goodMasters << "\t" + "Good slaves= " << pinger.goodSlaves << "\t" + "Total drives= " << pinger.totalDrives << "\t" + "Writable drives= " << pinger.writableDrives << "\t" + "Append cache size= " << mARAChunkCache.GetSize() + ; + mWOstream.flush(); + mWOstream.Reset(); + mPingResponse.Move(&tmpbuf); + buf.Copy(&mPingResponse, mPingResponse.BytesConsumable()); +} + +class UpServersList +{ + ostream& os; +public: + UpServersList(ostream& s) : os(s) {} + void operator () (const ChunkServerPtr& c) { + os << c->GetServerLocation() << "\n"; + } +}; + +void +LayoutManager::UpServers(ostream &os) +{ + for_each(mChunkServers.begin(), mChunkServers.end(), UpServersList(os)); +} + +// Periodically, check the replication level of ALL chunks in the 
system. +void +LayoutManager::InitCheckAllChunks() +{ + // HandoutChunkReplicationWork() iterates trough this list when + // replication check is exhausted. + mChunkToServerMap.First(CSMap::Entry::kStateNone); + mCheckAllChunksInProgressFlag = true; + mChunkReplicator.ScheduleNext(); +} + +bool +LayoutManager::ExpiredLeaseCleanup(chunkId_t chunkId) +{ + const int ownerDownExpireDelay = 0; + return mChunkLeases.ExpiredCleanup( + chunkId, TimeNow(), ownerDownExpireDelay, + mARAChunkCache, mChunkToServerMap + ); +} + +void +LayoutManager::LeaseCleanup() +{ + const time_t now = TimeNow(); + + mChunkLeases.Timer(now, mLeaseOwnerDownExpireDelay, + mARAChunkCache, mChunkToServerMap); + if (mAppendCacheCleanupInterval >= 0) { + // Timing out the cache entries should now be redundant, + // and is disabled by default, as the cache should not have + // any stale entries. The lease cleanup, allocation + // completion in the case of failure, and chunk deletion + // should cleanup the cache. + mARAChunkCache.Timeout(now - mAppendCacheCleanupInterval); + } + if (metatree.getUpdatePathSpaceUsageFlag() && + mLastRecomputeDirsizeTime + + mRecomputeDirSizesIntervalSec < now) { + KFS_LOG_STREAM_INFO << "Doing a recompute dir size..." << + KFS_LOG_EOM; + metatree.recomputeDirSize(); + mLastRecomputeDirsizeTime = now; + KFS_LOG_STREAM_INFO << "Recompute dir size is done..." << + KFS_LOG_EOM; + } + ScheduleChunkServersRestart(); +} + +void +LayoutManager::ScheduleRestartChunkServers() +{ + mCSRestartTime = TimeNow(); + if (mMaxCSRestarting <= 0) { + mMaxCSRestarting = 2; + } + KFS_LOG_STREAM_INFO << + "scheduling chunk servers restart:" + " servers: " << mChunkServers.size() << + " masters: " << mMastersCount << + " restarting: " << mCSToRestartCount << + " masters restarting: " << mMastersToRestartCount << + " max restarting: " << mMaxCSRestarting << + KFS_LOG_EOM; +} + +int64_t +LayoutManager::GetMaxCSUptime() const +{ + const time_t now = TimeNow(); + return (mCSRestartTime <= now ? 
+ min(mMaxCSUptime, now - mCSRestartTime) : mMaxCSUptime); +} + +void +LayoutManager::ScheduleChunkServersRestart() +{ + if (mMaxCSRestarting <= 0 || ! IsChunkServerRestartAllowed()) { + return; + } + Servers servers(mChunkServers); + make_heap(servers.begin(), servers.end(), + bind(&ChunkServer::Uptime, _1) < + bind(&ChunkServer::Uptime, _2)); + const int64_t maxCSUptime = GetMaxCSUptime(); + const size_t minMastersUp = max(size_t(1), mSlavesCount / 3 * 2); + while (! servers.empty()) { + ChunkServer& srv = *servers.front().get(); + if (srv.Uptime() < maxCSUptime) { + break; + } + bool restartFlag = srv.IsRestartScheduled(); + if (! restartFlag && mCSToRestartCount < mMaxCSRestarting) { + // Make sure that there are enough masters. + restartFlag = ! srv.CanBeChunkMaster() || + mMastersCount > + mMastersToRestartCount + minMastersUp; + if (! restartFlag && ! mAssignMasterByIpFlag) { + for (Servers::iterator + it = servers.begin(); + it != servers.end(); + ++it) { + ChunkServer& cs = **it; + if (! cs.CanBeChunkMaster() && + ! 
cs.IsRestartScheduled() && + IsCandidateServer(cs)) { + cs.SetCanBeChunkMaster(true); + srv.SetCanBeChunkMaster(false); + restartFlag = true; + break; + } + } + } + if (restartFlag) { + mCSToRestartCount++; + if (srv.CanBeChunkMaster()) { + mMastersToRestartCount++; + } + } + } + if (restartFlag && + srv.ScheduleRestart( + mCSGracefulRestartTimeout, + mCSGracefulRestartAppendWithWidTimeout)) { + KFS_LOG_STREAM_INFO << + "initiated restart sequence for: " << + servers.front()->ServerID() << + KFS_LOG_EOM; + break; + } + pop_heap(servers.begin(), servers.end(), + bind(&ChunkServer::Uptime, _1) < + bind(&ChunkServer::Uptime, _2)); + servers.pop_back(); + } +} + +bool +LayoutManager::Validate(MetaAllocate* r) +{ + const ChunkLeases::WriteLease* const lease = + mChunkLeases.GetWriteLease(r->chunkId); + if (lease && lease->allocInFlight && + lease->leaseId == r->leaseId && + lease->chunkVersion == r->chunkVersion) { + return true; + } + if (r->status >= 0) { + r->status = -EALLOCFAILED; + } + return false; +} + +void +LayoutManager::CommitOrRollBackChunkVersion(MetaAllocate* r) +{ + if (r->stripedFileFlag && r->initialChunkVersion < 0) { + if (mStripedFilesAllocationsInFlight.erase(make_pair(make_pair( + r->fid, r->chunkBlockStart), r->chunkId)) != 1 && + r->status >= 0) { + panic("no striped file allocation entry"); + } + } + if (r->status >= 0) { + // Tree::assignChunkId() succeeded. + // File and chunk ids are valid and in sync with meta tree. + const int ret = mChunkLeases.Renew(r->chunkId, r->leaseId, true); + if (ret < 0) { + panic("failed to renew allocation write lease"); + r->status = ret; + return; + } + // AddChunkToServerMapping() should delete version roll back for + // new chunks. 
+ if (mChunkVersionRollBack.erase(r->chunkId) > 0 && + r->initialChunkVersion < 0) { + panic("chunk version roll back still exists"); + r->statusMsg = "internal error:" + " chunk version roll back still exists"; + r->status = -EINVAL; + return; + } + CSMap::Entry* const ci = mChunkToServerMap.Find(r->chunkId); + if (! ci) { + panic("missing chunk mapping"); + r->statusMsg = "internal error:" + " missing chunk mapping"; + r->status = -EINVAL; + return; + } + if (r->initialChunkVersion < 0) { + // New valid chunk -- set servers. + // r->offset is a multiple of CHUNKSIZE + assert(r->offset >= 0 && (r->offset % CHUNKSIZE) == 0); + if (r->fid != ci->GetFileId() || + mChunkToServerMap.HasServers(*ci)) { + panic("invalid chunk mapping"); + r->statusMsg = "internal error:" + " invalid chunk mapping"; + r->status = -EINVAL; + return; + } + for (Servers::const_iterator + it = r->servers.begin(); + it != r->servers.end(); + ++it) { + AddHosted(*ci, *it); + } + // Schedule replication check if needed. + if (r->servers.size() != (size_t)r->numReplicas) { + CheckReplication(*ci); + } + } + if (r->appendChunk) { + // Insert pending make stable entry here, to ensure that + // it gets into the checkpoint. + // With checkpoints from forked copy enabled checkpoint + // can start *before* the corresponding make stable + // starts + pair const res = + mPendingMakeStable.insert(make_pair( + r->chunkId, PendingMakeStableEntry())); + if (res.second) { + res.first->second.mChunkVersion = + r->chunkVersion; + } + } + return; + } + // Delete write lease, it wasn't ever handed to the client, and + // version change will make chunk stable, thus there is no need to + // go trough the normal lease cleanup procedure. + if (! mChunkLeases.DeleteWriteLease(r->chunkId, r->leaseId)) { + if (! mChunkToServerMap.Find(r->chunkId)) { + // Chunk does not exist, deleted. 
+ mChunkVersionRollBack.erase(r->chunkId); + return; + } + panic("chunk version roll back failed to delete write lease"); + } + if (r->initialChunkVersion < 0) { + return; + } + if (r->initialChunkVersion >= r->chunkVersion) { + panic("invalid chunk version transition"); + } + CSMap::Entry* const ci = mChunkToServerMap.Find(r->chunkId); + if (! ci) { + mChunkVersionRollBack.erase(r->chunkId); + return; + } + if (r->initialChunkVersion + GetChunkVersionRollBack(r->chunkId) != + r->chunkVersion) { + ostringstream os; + os << + "invalid chunk version transition:" << + " " << r->initialChunkVersion << + "+" << GetChunkVersionRollBack(r->chunkId) << + " => " << r->chunkVersion; + const string msg = os.str(); + panic(msg.c_str()); + return; + } + // Roll back to the initial chunk version, and make chunk stable. + StTmp serversTmp(mServers3Tmp); + Servers& srvs = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, srvs); + const bool kMakeStableFlag = true; + for (Servers::const_iterator + it = srvs.begin(); it != srvs.end(); ++it) { + (*it)->NotifyChunkVersChange( + r->fid, + r->chunkId, + r->initialChunkVersion, // to + r->chunkVersion, // from + kMakeStableFlag + ); + } +} + +int +LayoutManager::LeaseRelinquish(MetaLeaseRelinquish *req) +{ + return mChunkLeases.LeaseRelinquish( + *req, mARAChunkCache, mChunkToServerMap); +} + +// Periodically, check the status of all the leases +// This is an expensive call...use sparingly.... +void +LayoutManager::CheckAllLeases() +{ + mChunkLeases.Timer(TimeNow(), mLeaseOwnerDownExpireDelay, + mARAChunkCache, mChunkToServerMap); +} + +/* + +Make chunk stable protocol description. + +The protocol is mainly designed for write append, though it is also partially +used for random write. + +The protocol is needed to solve consensus problem, i.e. make all chunk replicas +identical. 
This also allows replication participants (chunk servers) to
+determine the status of a particular write append operation, and, if requested,
+to convey this status to the write append client(s).
+
+The fundamental idea is that the meta server always makes the final irrevocable
+decision what stable chunk replicas should be: chunk size and chunk checksum.
+The meta server selects exactly one variant of the replica, and broadcasts this
+information to all replication participants (the hosting chunk servers).
+Moreover, the meta server maintains this information until a sufficient number
+of replicas become "stable" as a result of "make chunk stable" operation, or as
+a result of re-replication from already "stable" replica(s).
+
+The meta server receives chunk size and chunk checksum from the chunk servers.
+There are two ways the meta server can get this information:
+1. Normally chunk size, and checksum are conveyed by the write master in the
+write lease release request.
+2. The meta server declares chunk master nonoperational, and broadcasts "begin
+make chunk stable" request to all remaining operational replication participants
+(slaves). The slaves reply with chunk size and chunk checksum. The meta server
+always selects one reply that has the smallest chunk size, in the hope that
+other participants can converge their replicas to this chunk size, and checksum
+by simple truncation. Begin make chunk stable is repeated until the meta server
+gets at least one valid reply.
+
+The meta server writes this decision: chunk version, size, and checksum into the
+log before broadcasting make chunk stable request with these parameters to all
+operational replication participants. This guarantees that the decision is
+final: it can never change as long as the log write is persistent. Once log
+write completes successfully the meta server broadcasts make chunk stable
+request to all operational
+replication participants. 
+
+The chunk server maintains the information about chunk state: stable -- read
+only, or not stable -- writable, and if the chunk was open for write append or
+for random write. This information is conveyed (back) to the meta server in the
+chunk server hello message. The hello message contains 3 chunk lists: stable,
+not stable write append, and not stable random write. This is needed to make
+the appropriate decision when the chunk server establishes communication with
+the meta server.
+
+The chunk server can never transition a not stable chunk replica into a stable
+replica, unless it receives a make chunk stable request from the meta server.
+The chunk server discards all non stable replicas on startup (restart).
+
+For stable chunks the server is added to the list of the servers hosting the
+chunk replica, as long as the corresponding chunk meta data exists, and version
+of the replica matches the meta data version.
+
+In case of a failure, the meta server declares chunk replica stale and conveys
+this decision to the chunk server, then the chunk server discards the stale
+replica.
+
+For not stable random write chunk replicas the same checks are performed, plus
+an additional step: a make chunk stable request is issued. The request in this
+case does not specify the chunk size, and checksum. When make chunk stable
+completes successfully the server is added to the list of servers hosting the chunk replica.
+
+With random writes the version number is used to detect missing writes, and the
+task of making chunk replicas consistent is left entirely up to the writer
+(client). Write lease mechanism is used to control write concurrency: for
+random write only one concurrent writer per chunk is allowed.
+
+Not stable write append chunk handling is more involved, because multiple
+concurrent write appenders are allowed to append to the same chunk.
+
+First, the same checks for chunk meta data existence, and the version match are
+applied. 
If successful, then the check for an existing write lease is performed.
+If the write (possibly expired) lease exists the server is added to the list of
+servers hosting the replica. If the write lease exists, and a begin make chunk
+stable or make chunk stable operation for the corresponding chunk is in
+progress, the chunk server is added to the operation.
+
+If no lease exists, and begin make chunk stable was never successfully completed
+(no valid pending make chunk stable info exists), then the meta server issues a
+begin make chunk stable request.
+
+Once begin make chunk stable successfully completes the meta server writes a
+"mkstable" log record with the chunk version, size, and checksum into the log,
+and adds this information to the in-memory pending make chunk stable table. The
+make chunk stable request is issued after the log write successfully completes.
+
+The make chunk stable info is kept in memory, and in the checkpoint file until a
+sufficient number of stable replicas is created, or the chunk ceases to exist.
+Once a sufficient number of replicas is created, the make chunk stable info is
+purged from memory, and the "mkstabledone" record is written to the log. If the
+chunk ceases to exist then only the in-memory information is purged, but no log
+write is performed. "Mkstabledone" log records effectively cancel "mkstable" records.
+
+Chunk allocation log records have an additional "append" attribute set to 1. The
+log replay process creates an in-memory make chunk stable entry with chunk size
+attribute set to -1 for every chunk allocation record with the append attribute
+set to 1. In-memory entries with size set to -1 mark not stable chunks for which
+chunk size and chunk checksum are not known. For such chunks begin make stable
+has to be issued first. The "mkstable" records are used to update the in-memory
+pending make stable info with the corresponding chunk size and checksum. The
+"mkstabledone" records are used to delete the corresponding in-memory pending
+make stable info. 
Chunk delete log records also purge the corresponding +in-memory pending make stable info. + +In memory pending delete info is written into the checkpoint file, after the +meta (tree) information. One "mkstable" entry for every chunk that is not +stable, or does not have sufficient number of replicas. + +During "recovery" period, begin make chunk stable is not issued, instead these +are delayed until recovery period ends, in the hope that begin make stable with +more servers has higher chances of succeeding, and can potentially produce more +stable replicas. + +*/ + +void +LayoutManager::MakeChunkStableInit( + const CSMap::Entry& entry, + seq_t chunkVersion, + string pathname, + bool beginMakeStableFlag, + chunkOff_t chunkSize, + bool hasChunkChecksum, + uint32_t chunkChecksum, + bool stripedFileFlag, + bool appendFlag, + bool leaseRelinquishFlag) +{ + const char* const logPrefix = beginMakeStableFlag ? "BMCS:" : "MCS:"; + const chunkId_t chunkId = entry.GetChunkId(); + const fid_t fid = entry.GetFileId(); + StTmp serversTmp(mServers3Tmp); + Servers& srvs = serversTmp.Get(); + const int serversCnt = + (int)mChunkToServerMap.GetServers(entry, srvs); + const Servers& servers = srvs; + if (serversCnt <= 0) { + if (leaseRelinquishFlag) { + // Update file modification time. + MetaFattr* const fa = entry.GetFattr(); + const int64_t now = microseconds(); + if (fa->mtime + mMTimeUpdateResolution < now) { + fa->mtime = now; + submit_request(new MetaSetMtime(fid, fa->mtime)); + } + } + if (beginMakeStableFlag) { + // Ensure that there is at least pending begin make + // stable. + // Append allocations are marked as such, and log replay + // adds begin make stable entries if necessary. 
+ pair const res = + mPendingMakeStable.insert(make_pair( + chunkId, PendingMakeStableEntry())); + if (res.second) { + res.first->second.mChunkVersion = chunkVersion; + } + } + // If no servers, MCS with append (checksum and size) still + // needs to be logged and the corresponding pending make stable + // entry has to be created. + if (beginMakeStableFlag || ! appendFlag) { + KFS_LOG_STREAM_INFO << logPrefix << + " <" << fid << "," << chunkId << ">" + " name: " << pathname << + " no servers" << + KFS_LOG_EOM; + // Update replication state. + ChangeChunkReplication(chunkId); + return; + } + } + pair const ret = + mNonStableChunks.insert(make_pair(chunkId, + MakeChunkStableInfo( + serversCnt, + beginMakeStableFlag, + pathname, + chunkVersion, + stripedFileFlag, + leaseRelinquishFlag && serversCnt > 0 + ))); + if (! ret.second) { + KFS_LOG_STREAM_INFO << logPrefix << + " <" << fid << "," << chunkId << ">" + " name: " << pathname << + " already in progress" << + KFS_LOG_EOM; + return; + } + KFS_LOG_STREAM_INFO << logPrefix << + " <" << fid << "," << chunkId << ">" + " name: " << pathname << + " version: " << chunkVersion << + " servers: " << serversCnt << + " size: " << chunkSize << + " checksum: " << (hasChunkChecksum ? + (int64_t)chunkChecksum : (int64_t)-1) << + " append: " << appendFlag << + KFS_LOG_EOM; + if (beginMakeStableFlag) { + for_each(servers.begin(), servers.end(), + bind(&ChunkServer::BeginMakeChunkStable, _1, + fid, chunkId, chunkVersion)); + } else if (appendFlag) { + // Remember chunk check sum and size. + PendingMakeStableEntry const pmse( + chunkSize, + hasChunkChecksum, + chunkChecksum, + chunkVersion + ); + pair const res = + mPendingMakeStable.insert(make_pair(chunkId, pmse)); + if (! res.second) { + KFS_LOG_STREAM((res.first->second.mSize >= 0 || + res.first->second.mHasChecksum) ? 
+ MsgLogger::kLogLevelWARN : + MsgLogger::kLogLevelDEBUG) << + logPrefix << + " <" << fid << "," << chunkId << ">" + " updating existing pending MCS: " << + " chunkId: " << chunkId << + " version: " << + res.first->second.mChunkVersion << + "=>" << pmse.mChunkVersion << + " size: " << res.first->second.mSize << + "=>" << pmse.mSize << + " checksum: " << + (res.first->second.mHasChecksum ? + int64_t(res.first->second.mChecksum) : + int64_t(-1)) << + "=>" << (pmse.mHasChecksum ? + int64_t(pmse.mChecksum) : + int64_t(-1)) << + KFS_LOG_EOM; + res.first->second = pmse; + } + ret.first->second.logMakeChunkStableFlag = true; + submit_request(new MetaLogMakeChunkStable( + fid, chunkId, chunkVersion, + chunkSize, hasChunkChecksum, chunkChecksum, chunkId + )); + } else { + const bool kPendingAddFlag = false; + for_each(servers.begin(), servers.end(), bind( + &ChunkServer::MakeChunkStable, _1, + fid, chunkId, chunkVersion, + chunkSize, hasChunkChecksum, chunkChecksum, + kPendingAddFlag + )); + } +} + +bool +LayoutManager::AddServerToMakeStable( + CSMap::Entry& placementInfo, + ChunkServerPtr server, + chunkId_t chunkId, + seq_t chunkVersion, + const char*& errMsg) +{ + errMsg = 0; + NonStableChunksMap::iterator const it = mNonStableChunks.find(chunkId); + if (it == mNonStableChunks.end()) { + return false; // Not in progress + } + MakeChunkStableInfo& info = it->second; + if (info.chunkVersion != chunkVersion) { + errMsg = "version mismatch"; + return false; + } + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(placementInfo, servers); + if (find_if(servers.begin(), servers.end(), + MatchingServer(server->GetServerLocation()) + ) != servers.end()) { + // Already there, duplicate chunk? Same as in progress. + return true; + } + KFS_LOG_STREAM_DEBUG << + (info.beginMakeStableFlag ? "B" : + info.logMakeChunkStableFlag ? 
"L" : "") << + "MCS:" + " <" << placementInfo.GetFileId() << "," << chunkId << ">" + " adding server: " << server->ServerID() << + " name: " << info.pathname << + " servers: " << info.numAckMsg << + "/" << info.numServers << + "/" << servers.size() << + " size: " << info.chunkSize << + " checksum: " << info.chunkChecksum << + " added: " << info.serverAddedFlag << + KFS_LOG_EOM; + AddHosted(chunkId, placementInfo, server); + info.numServers++; + info.serverAddedFlag = true; + if (info.beginMakeStableFlag) { + server->BeginMakeChunkStable( + placementInfo.GetFileId(), chunkId, info.chunkVersion); + } else if (! info.logMakeChunkStableFlag) { + const bool kPendingAddFlag = false; + server->MakeChunkStable( + placementInfo.GetFileId(), + chunkId, + info.chunkVersion, + info.chunkSize, + info.chunkSize >= 0, + info.chunkChecksum, + kPendingAddFlag + ); + } + // If log make stable is in progress, then make stable or begin make + // stable will be started when logging is done. + return true; +} + +void +LayoutManager::BeginMakeChunkStableDone(const MetaBeginMakeChunkStable* req) +{ + const char* const logPrefix = "BMCS: done"; + NonStableChunksMap::iterator const it = + mNonStableChunks.find(req->chunkId); + if (it == mNonStableChunks.end() || ! it->second.beginMakeStableFlag) { + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " " << req->Show() << + " ignored: " << + (it == mNonStableChunks.end() ? 
+ "not in progress" : "MCS in progress") << + KFS_LOG_EOM; + return; + } + MakeChunkStableInfo& info = it->second; + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " name: " << info.pathname << + " servers: " << info.numAckMsg << "/" << info.numServers << + " size: " << info.chunkSize << + " checksum: " << info.chunkChecksum << + " " << req->Show() << + KFS_LOG_EOM; + CSMap::Entry* ci = 0; + bool noSuchChunkFlag = false; + if (req->status != 0 || req->chunkSize < 0) { + if (req->status == 0 && req->chunkSize < 0) { + KFS_LOG_STREAM_ERROR << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " invalid chunk size: " << req->chunkSize << + " declaring chunk replica corrupt" << + " " << req->Show() << + KFS_LOG_EOM; + } + ci = mChunkToServerMap.Find(req->chunkId); + if (ci) { + const ChunkServerPtr server = ci->GetServer( + mChunkToServerMap, req->serverLoc); + if (server && ! server->IsDown()) { + ChunkCorrupt(req->chunkId, server); + } + } else { + noSuchChunkFlag = true; + } + } else if (req->chunkSize < info.chunkSize || info.chunkSize < 0) { + // Pick the smallest good chunk. + info.chunkSize = req->chunkSize; + info.chunkChecksum = req->chunkChecksum; + } + if (++info.numAckMsg < info.numServers) { + return; + } + if (! noSuchChunkFlag && ! ci) { + ci = mChunkToServerMap.Find(req->chunkId); + noSuchChunkFlag = ! ci; + } + if (noSuchChunkFlag) { + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " no such chunk, cleaning up" << + KFS_LOG_EOM; + mNonStableChunks.erase(it); + mPendingMakeStable.erase(req->chunkId); + return; + } + info.beginMakeStableFlag = false; + info.logMakeChunkStableFlag = true; + info.serverAddedFlag = false; + // Remember chunk check sum and size. 
+ PendingMakeStableEntry const pmse( + info.chunkSize, + info.chunkSize >= 0, + info.chunkChecksum, + req->chunkVersion + ); + pair const res = + mPendingMakeStable.insert(make_pair(req->chunkId, pmse)); + assert( + res.second || + (res.first->second.mSize < 0 && + res.first->second.mChunkVersion == pmse.mChunkVersion) + ); + if (! res.second && pmse.mSize >= 0) { + res.first->second = pmse; + } + if (res.first->second.mSize < 0) { + int numUpServers = 0; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, servers); + for (Servers::const_iterator + si = servers.begin(); + si != servers.end(); + ++si) { + if (! (*si)->IsDown()) { + numUpServers++; + } + } + if (numUpServers <= 0) { + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " no servers up, retry later" << + KFS_LOG_EOM; + } else { + // Shouldn't get here. + KFS_LOG_STREAM_WARN << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " internal error:" + " up servers: " << numUpServers << + " invalid chunk size: " << + res.first->second.mSize << + KFS_LOG_EOM; + } + // Try again later. + mNonStableChunks.erase(it); + UpdateReplicationState(*ci); + return; + } + submit_request(new MetaLogMakeChunkStable( + req->fid, req->chunkId, req->chunkVersion, + info.chunkSize, info.chunkSize >= 0, info.chunkChecksum, + req->opSeqno + )); +} + +void +LayoutManager::LogMakeChunkStableDone(const MetaLogMakeChunkStable* req) +{ + const char* const logPrefix = "LMCS: done"; + NonStableChunksMap::iterator const it = + mNonStableChunks.find(req->chunkId); + if (it == mNonStableChunks.end()) { + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " " << req->Show() << + " ignored: not in progress" << + KFS_LOG_EOM; + // Update replication state. + ChangeChunkReplication(req->chunkId); + return; + } + if (! 
it->second.logMakeChunkStableFlag) { + KFS_LOG_STREAM_ERROR << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " " << req->Show() << + " ignored: " << + (it->second.beginMakeStableFlag ? "B" : "") << + "MCS in progress" << + KFS_LOG_EOM; + return; + } + MakeChunkStableInfo& info = it->second; + CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! ci || ! mChunkToServerMap.HasServers(*ci)) { + KFS_LOG_STREAM_INFO << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" << + " name: " << info.pathname << + (! ci ? + " does not exist, cleaning up" : + " no servers, run MCS later") << + KFS_LOG_EOM; + if (ci) { + UpdateReplicationState(*ci); + } else { + // If chunk was deleted, do not emit mkstabledone log + // entry. Only ensure that no stale pending make stable + // entry exists. + mPendingMakeStable.erase(req->chunkId); + } + mNonStableChunks.erase(it); + return; + } + const bool serverWasAddedFlag = info.serverAddedFlag; + const int prevNumServer = info.numServers; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + info.numServers = + (int)mChunkToServerMap.GetServers(*ci, servers); + info.numAckMsg = 0; + info.beginMakeStableFlag = false; + info.logMakeChunkStableFlag = false; + info.serverAddedFlag = false; + info.chunkSize = req->chunkSize; + info.chunkChecksum = req->chunkChecksum; + KFS_LOG_STREAM_INFO << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " starting MCS" + " version: " << req->chunkVersion << + " name: " << info.pathname << + " size: " << info.chunkSize << + " checksum: " << info.chunkChecksum << + " servers: " << prevNumServer << "->" << info.numServers << + " " << (serverWasAddedFlag ? "new servers" : "") << + KFS_LOG_EOM; + if (serverWasAddedFlag && info.chunkSize < 0) { + // Retry make chunk stable with newly added servers. 
+ info.beginMakeStableFlag = true; + for_each(servers.begin(), servers.end(), bind( + &ChunkServer::BeginMakeChunkStable, _1, + ci->GetFileId(), req->chunkId, info.chunkVersion + )); + return; + } + const bool kPendingAddFlag = false; + for_each(servers.begin(), servers.end(), bind( + &ChunkServer::MakeChunkStable, _1, + req->fid, req->chunkId, req->chunkVersion, + req->chunkSize, req->hasChunkChecksum, req->chunkChecksum, + kPendingAddFlag + )); +} + +void +LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable* req) +{ + const char* const logPrefix = "MCS: done"; + string pathname; + CSMap::Entry* pinfo = 0; + bool updateSizeFlag = false; + bool updateMTimeFlag = false; + NonStableChunksMap::iterator const it = + mNonStableChunks.find(req->chunkId); + if (req->addPending) { + // Make chunk stable started in AddNotStableChunk() is now + // complete. Sever can be added if nothing has changed since + // the op was started. + // It is also crucial to ensure to the server with the + // identical location is not already present in the list of + // servers hosting the chunk before declaring chunk stale. + bool notifyStaleFlag = true; + const char* res = 0; + ChunkLeases::WriteLease const* li = 0; + PendingMakeStableMap::iterator msi; + if (it != mNonStableChunks.end()) { + res = "not stable again"; + } else { + msi = mPendingMakeStable.find(req->chunkId); + } + if (res) { + // Has already failed. + } else if (req->chunkSize >= 0 || req->hasChunkChecksum) { + if (msi == mPendingMakeStable.end()) { + // Chunk went away, or already sufficiently + // replicated. + res = "no pending make stable info"; + } else if (msi->second.mChunkVersion != + req->chunkVersion || + msi->second.mSize != req->chunkSize || + msi->second.mHasChecksum != + req->hasChunkChecksum || + msi->second.mChecksum != + req->chunkChecksum) { + // Stale request. 
+ res = "pending make stable info has changed"; + } + } else if (msi != mPendingMakeStable.end()) { + res = "pending make stable info now exists"; + } + if (req->server->IsDown()) { + res = "server down"; + notifyStaleFlag = false; + } else if (req->status != 0) { + res = "request failed"; + notifyStaleFlag = false; + } else { + CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! ci) { + res = "no such chunk"; + } else if (ci->HasServer(mChunkToServerMap, + req->server->GetServerLocation())) { + res = "already added"; + notifyStaleFlag = false; + } else if ((li = mChunkLeases.GetWriteLease(req->chunkId)) && + (((! li->relinquishedFlag && + li->expires >= TimeNow()) || + li->chunkVersion != + req->chunkVersion))) { + // No write lease existed when this was started. + res = "new write lease exists"; + } else if (req->chunkVersion != + ci->GetChunkInfo()->chunkVersion) { + res = "chunk version has changed"; + } else { + pinfo = ci; + updateSizeFlag = + ! mChunkToServerMap.HasServers(*pinfo); + AddHosted(*ci, req->server); + notifyStaleFlag = false; + } + } + if (res) { + KFS_LOG_STREAM_INFO << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " " << req->server->ServerID() << + " not added: " << res << + (notifyStaleFlag ? " => stale" : "") << + "; " << req->Show() << + KFS_LOG_EOM; + if (notifyStaleFlag) { + req->server->NotifyStaleChunk(req->chunkId); + } + // List of servers hosting the chunk remains unchanged. 
+ return; + } + } else { + if (it == mNonStableChunks.end() || + it->second.beginMakeStableFlag || + it->second.logMakeChunkStableFlag) { + KFS_LOG_STREAM_ERROR << "MCS" + " " << req->Show() << + " ignored: BMCS in progress" << + KFS_LOG_EOM; + return; + } + MakeChunkStableInfo& info = it->second; + KFS_LOG_STREAM_DEBUG << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " name: " << info.pathname << + " servers: " << info.numAckMsg << + "/" << info.numServers << + " size: " << req->chunkSize << + "/" << info.chunkSize << + " checksum: " << req->chunkChecksum << + "/" << info.chunkChecksum << + " " << req->Show() << + KFS_LOG_EOM; + if (req->status != 0 && ! req->server->IsDown()) { + ChunkCorrupt(req->chunkId, req->server); + } + if (++info.numAckMsg < info.numServers) { + return; + } + // Cleanup mNonStableChunks, after the lease cleanup, for extra + // safety: this will prevent make chunk stable from restarting + // recursively, in the case if there are double or stale + // write lease. + ExpiredLeaseCleanup(req->chunkId); + pathname = info.pathname; + updateSizeFlag = ! info.stripedFileFlag; + updateMTimeFlag = info.updateMTimeFlag; + mNonStableChunks.erase(it); + // "&info" is invalid at this point. + } + if (! pinfo) { + CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! ci) { + KFS_LOG_STREAM_INFO << logPrefix << + " <" << req->fid << + "," << req->chunkId << ">" << + " name: " << pathname << + " does not exist, skipping size update" << + KFS_LOG_EOM; + return; + } + pinfo = ci; + } + UpdateReplicationState(*pinfo); + int numServers = 0; + int numDownServers = 0; + ChunkServerPtr goodServer; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(*pinfo, servers); + for (Servers::const_iterator csi = servers.begin(); + csi != servers.end(); + ++csi) { + if ((*csi)->IsDown()) { + numDownServers++; + } else { + numServers++; + if (! 
goodServer) { + goodServer = *csi; + } + } + } + MetaFattr* const fa = pinfo->GetFattr(); + const fid_t fileId = pinfo->GetFileId(); + if (updateMTimeFlag || mChunkToServerMap.GetState(*pinfo) != + CSMap::Entry::kStateCheckReplication) { + if (fa->IsStriped()) { + updateSizeFlag = false; + } + if (updateMTimeFlag) { + const int64_t now = microseconds(); + if (fa->mtime + mMTimeUpdateResolution < now) { + fa->mtime = now; + submit_request( + new MetaSetMtime(fileId, fa->mtime)); + } + } + if (fa->numReplicas != numServers) { + CheckReplication(*pinfo); + } else { + CancelPendingMakeStable(fileId, req->chunkId); + } + } + KFS_LOG_STREAM_INFO << logPrefix << + " <" << req->fid << "," << req->chunkId << ">" + " fid: " << fileId << + " version: " << req->chunkVersion << + " name: " << pathname << + " size: " << req->chunkSize << + " checksum: " << req->chunkChecksum << + " replicas: " << fa->numReplicas << + " is now stable on: " << numServers << + " down: " << numDownServers << + " server(s)" << + KFS_LOG_EOM; + if (! updateSizeFlag || + numServers <= 0 || + fa->filesize >= 0 || + fa->IsStriped() || + pinfo->GetChunkInfo()->offset + + (chunkOff_t)CHUNKSIZE < fa->nextChunkOffset()) { + // if no servers, or not the last chunk can not update size. + return; + } + if (req->chunkSize >= 0) { + // Already know the size, update it. + // The following will invoke GetChunkSizeDone(), + // and update the log. + MetaChunkSize* const op = new MetaChunkSize( + 0, // seq # + req->server, // chunk server + fileId, req->chunkId, req->chunkVersion, pathname, + false + ); + op->chunkSize = req->chunkSize; + submit_request(op); + } else { + // Get the chunk's size from one of the servers. 
+ goodServer->GetChunkSize( + fileId, req->chunkId, req->chunkVersion, pathname); + } +} + +void +LayoutManager::ReplayPendingMakeStable( + chunkId_t chunkId, + seq_t chunkVersion, + chunkOff_t chunkSize, + bool hasChunkChecksum, + uint32_t chunkChecksum, + bool addFlag) +{ + const char* res = 0; + seq_t curChunkVersion = -1; + const CSMap::Entry* const ci = mChunkToServerMap.Find(chunkId); + MsgLogger::LogLevel logLevel = MsgLogger::kLogLevelDEBUG; + if (! ci) { + res = "no such chunk"; + } else if ((curChunkVersion = ci->GetChunkInfo()->chunkVersion) != + chunkVersion) { + res = "chunk version mismatch"; + logLevel = MsgLogger::kLogLevelERROR; + } + if (res) { + // Failure. + } else if (addFlag) { + const PendingMakeStableEntry entry( + chunkSize, + hasChunkChecksum, + chunkChecksum, + chunkVersion + ); + pair const res = + mPendingMakeStable.insert(make_pair(chunkId, entry)); + if (! res.second) { + KFS_LOG_STREAM((res.first->second.mHasChecksum || + res.first->second.mSize >= 0) ? + MsgLogger::kLogLevelWARN : + MsgLogger::kLogLevelDEBUG) << + "replay MCS add:" << + " update:" + " chunkId: " << chunkId << + " version: " << + res.first->second.mChunkVersion << + "=>" << entry.mChunkVersion << + " size: " << res.first->second.mSize << + "=>" << entry.mSize << + " checksum: " << + (res.first->second.mHasChecksum ? + int64_t(res.first->second.mChecksum) : + int64_t(-1)) << + "=>" << (entry.mHasChecksum ? 
+ int64_t(entry.mChecksum) : + int64_t(-1)) << + KFS_LOG_EOM; + res.first->second = entry; + } + } else { + PendingMakeStableMap::iterator const it = + mPendingMakeStable.find(chunkId); + if (it == mPendingMakeStable.end()) { + res = "no such entry"; + logLevel = MsgLogger::kLogLevelERROR; + } else { + const bool warn = + it->second.mChunkVersion != chunkVersion || + (it->second.mSize >= 0 && ( + it->second.mSize != chunkSize || + it->second.mHasChecksum != + hasChunkChecksum || + (hasChunkChecksum && + it->second.mChecksum != chunkChecksum + ))); + KFS_LOG_STREAM(warn ? + MsgLogger::kLogLevelWARN : + MsgLogger::kLogLevelDEBUG) << + "replay MCS remove:" + " chunkId: " << chunkId << + " version: " << it->second.mChunkVersion << + "=>" << chunkVersion << + " size: " << it->second.mSize << + "=>" << chunkSize << + " checksum: " << (it->second.mHasChecksum ? + int64_t(it->second.mChecksum) : + int64_t(-1)) << + "=>" << (hasChunkChecksum ? + int64_t(chunkChecksum) : int64_t(-1)) << + KFS_LOG_EOM; + mPendingMakeStable.erase(it); + } + } + KFS_LOG_STREAM(logLevel) << + "replay MCS: " << + (addFlag ? "add" : "remove") << + " " << (res ? res : "ok") << + " total: " << mPendingMakeStable.size() << + " chunkId: " << chunkId << + " version: " << chunkVersion << + " cur vers: " << curChunkVersion << + " size: " << chunkSize << + " checksum: " << (hasChunkChecksum ? + int64_t(chunkChecksum) : int64_t(-1)) << + KFS_LOG_EOM; +} + +int +LayoutManager::WritePendingMakeStable(ostream& os) const +{ + // Write all entries in restore_makestable() format. + for (PendingMakeStableMap::const_iterator it = + mPendingMakeStable.begin(); + it != mPendingMakeStable.end() && os; + ++it) { + os << + "mkstable" + "/chunkId/" << it->first << + "/chunkVersion/" << it->second.mChunkVersion << + "/size/" << it->second.mSize << + "/checksum/" << it->second.mChecksum << + "/hasChecksum/" << (it->second.mHasChecksum ? 1 : 0) << + "\n"; + } + return (os ? 
0 : -EIO); +} + +void +LayoutManager::CancelPendingMakeStable(fid_t fid, chunkId_t chunkId) +{ + PendingMakeStableMap::iterator const it = + mPendingMakeStable.find(chunkId); + if (it == mPendingMakeStable.end()) { + return; + } + NonStableChunksMap::iterator const nsi = mNonStableChunks.find(chunkId); + if (nsi != mNonStableChunks.end()) { + KFS_LOG_STREAM_ERROR << + "delete pending MCS:" + " <" << fid << "," << chunkId << ">" << + " attempt to delete while " << + (nsi->second.beginMakeStableFlag ? "B" : + (nsi->second.logMakeChunkStableFlag ? "L" : "")) << + "MCS is in progress denied" << + KFS_LOG_EOM; + return; + } + // Emit done log record -- this "cancels" "mkstable" log record. + // Do not write if begin make stable wasn't started before the + // chunk got deleted. + MetaLogMakeChunkStableDone* const op = + (it->second.mSize < 0 || it->second.mChunkVersion < 0) ? 0 : + new MetaLogMakeChunkStableDone( + fid, chunkId, it->second.mChunkVersion, + it->second.mSize, it->second.mHasChecksum, + it->second.mChecksum, chunkId + ); + mPendingMakeStable.erase(it); + mPendingBeginMakeStable.erase(chunkId); + KFS_LOG_STREAM_DEBUG << + "delete pending MCS:" + " <" << fid << "," << chunkId << ">" << + " total: " << mPendingMakeStable.size() << + " " << (op ? op->Show() : string("size < 0")) << + KFS_LOG_EOM; + if (op) { + submit_request(op); + } +} + +int +LayoutManager::GetChunkSizeDone(MetaChunkSize* req) +{ + if (! req->retryFlag && (req->chunkSize < 0 || req->status < 0)) { + return -1; + } + if (! IsChunkStable(req->chunkId) || + mChunkLeases.HasWriteLease(req->chunkId)) { + return -1; // Chunk isn't stable yet, or being written again. + } + const CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! ci) { + return -1; // No such chunk, do not log. + } + MetaFattr* const fa = ci->GetFattr(); + const MetaChunkInfo* const chunk = ci->GetChunkInfo(); + // Coalesce can change file id while request is in flight. 
+ if (req->fid != fa->id()) { + req->fid = fa->id(); + req->pathname.clear(); // Path name is no longer valid. + } + if (fa->IsStriped() || fa->filesize >= 0 || fa->type != KFS_FILE || + chunk->offset + (chunkOff_t)CHUNKSIZE < + fa->nextChunkOffset()) { + return -1; // No update needed, do not write log entry. + } + if (req->chunkVersion != chunk->chunkVersion) { + KFS_LOG_STREAM_DEBUG << + " last chunk: " << chunk->chunkId << + " version: " << chunk->chunkVersion << + " ignoring: " << req->Show() << + " status: " << req->status << + " msg: " << req->statusMsg << + KFS_LOG_EOM; + return -1; + } + if (req->chunkSize < 0 || req->status < 0) { + KFS_LOG_STREAM_ERROR << + req->Show() << + " status: " << req->status << + " msg: " << req->statusMsg << + KFS_LOG_EOM; + if (! req->retryFlag) { + return -1; + } + // Retry the size request with all servers. + StTmp serversTmp(mServers3Tmp); + Servers& srvs = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, srvs); + for (Servers::const_iterator it = srvs.begin(); + it != srvs.end(); + ++it) { + if ((*it)->IsDown()) { + continue; + } + const bool retryFlag = false; + (*it)->GetChunkSize( + req->fid, req->chunkId, req->chunkVersion, + req->pathname, retryFlag); + } + return -1; + } + metatree.setFileSize(fa, chunk->offset + req->chunkSize); + KFS_LOG_STREAM_INFO << + "file: " << req->fid << + " chunk: " << req->chunkId << + " size: " << req->chunkSize << + " filesize: " << fa->filesize << + KFS_LOG_EOM; + return 0; +} + +bool +LayoutManager::IsChunkStable(chunkId_t chunkId) +{ + return (mNonStableChunks.find(chunkId) == mNonStableChunks.end()); +} + +int +LayoutManager::ReplicateChunk( + CSMap::Entry& clli, + int extraReplicas, + LayoutManager::ChunkPlacement& placement, + const ChunkRecoveryInfo& recoveryInfo) +{ + if (extraReplicas <= 0) { + return 0; + } + bool useServerExcludesFlag = clli.GetFattr()->IsStriped(); + if (! 
recoveryInfo.HasRecovery()) { + GetPlacementExcludes(clli, placement); + if (useServerExcludesFlag && + clli.GetFattr()->numReplicas > 1 && + placement.GetExcludedRacksCount() + + extraReplicas > mRacks.size()) { + // Do not pay attention to other stripes, with + // replication higher than 1 and insufficient number of + // racks. + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(clli, servers); + placement.clear(); + placement.ExcludeServerAndRack( + servers, clli.GetChunkId()); + useServerExcludesFlag = false; + } + } + placement.FindCandidatesForReplication(); + const size_t numRacks = placement.GetCandidateRackCount(); + const size_t numServersPerRack = numRacks <= 1 ? + (size_t)extraReplicas : + ((size_t)extraReplicas + numRacks - 1) / numRacks; + // Find candidates other than those that are already hosting the chunk. + StTmp serversTmp(mServersTmp); + Servers& candidates = serversTmp.Get(); + for (int rem = extraReplicas; ;) { + const size_t psz = candidates.size(); + for (size_t i = 0; ; ) { + const ChunkServerPtr cs = + placement.GetNext(useServerExcludesFlag); + if (! cs) { + break; + } + if (placement.IsUsingServerExcludes() && ( + find(candidates.begin(), + candidates.end(), cs) != + candidates.end() || + mChunkToServerMap.HasServer(cs, clli))) { + continue; + } + candidates.push_back(cs); + if (--rem <= 0 || ++i >= numServersPerRack) { + break; + } + } + if (rem <= 0 || placement.IsLastRack()) { + break; + } + placement.ExcludeServer( + candidates.begin() + psz, candidates.end()); + if (! 
placement.NextRack()) { + break; + } + } + if (candidates.empty()) { + KFS_LOG_STREAM_WARN << + "can not find replication destination for: <" << + clli.GetFileId() << "," << clli.GetChunkId() << + "> replicas: " << mChunkToServerMap.ServerCount(clli) << + " extra: " << extraReplicas << + KFS_LOG_EOM; + return 0; + } + return ReplicateChunk(clli, extraReplicas, candidates, recoveryInfo); +} + +int +LayoutManager::ReplicateChunk( + CSMap::Entry& clli, + int extraReplicas, + const LayoutManager::Servers& candidates, + const ChunkRecoveryInfo& recoveryInfo, + const char* reasonMsg) +{ + // prefer a server that is being retired to the other nodes as + // the source of the chunk replication + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(clli, servers); + Servers::const_iterator const iter = find_if( + servers.begin(), servers.end(), + bind(&ChunkServer::IsEvacuationScheduled, _1, clli.GetChunkId()) + ); + int numDone = 0; + for (Servers::const_iterator it = candidates.begin(); + numDone < extraReplicas && it != candidates.end(); + ++it) { + const ChunkServerPtr& c = *it; + ChunkServer& cs = *c; + // verify that we got good candidates + if (find(servers.begin(), servers.end(), c) != servers.end()) { + panic("invalid replication candidate"); + } + if (cs.IsDown()) { + continue; + } + const char* reason = "none"; + ChunkServerPtr dataServer; + if (iter != servers.end()) { + ChunkServer& ds = **iter; + reason = "evacuation"; + if (recoveryInfo.HasRecovery()) { + reason = "evacuation recovery"; + dataServer = c; + } else if (ds.GetReplicationReadLoad() < + mMaxConcurrentReadReplicationsPerNode && + (ds.IsResponsiveServer() || + servers.size() <= 1)) { + dataServer = *iter; + } + } else if (recoveryInfo.HasRecovery()) { + reason = "recovery"; + dataServer = c; + } else { + reason = "re-replication"; + } + // if we can't find a retiring server, pick a server that + // has read b/w available + for (Servers::const_iterator 
si = servers.begin(); + ! dataServer && si != servers.end(); + ++si) { + ChunkServer& ss = **si; + if (ss.GetReplicationReadLoad() >= + mMaxConcurrentReadReplicationsPerNode || + ! ss.IsResponsiveServer()) { + continue; + } + dataServer = *si; + } + if (! dataServer) { + continue; + } + KFS_LOG_STREAM_INFO << + "starting re-replication:" + " chunk: " << clli.GetChunkId() << + " from: " << + dataServer->GetServerLocation() << + " to: " << + cs.GetServerLocation() << + " reason: " << reason << + ((reasonMsg && reasonMsg[0]) ? " " : "") << + (reasonMsg ? reasonMsg : "") << + KFS_LOG_EOM; + // Do not increment replication read load when starting + // chunk recovery. + // Recovery decides from where to read. + // With recovery dataServer == &cs here, and the source + // location in the request will only have meta server + // port, and empty host name. + if (! recoveryInfo.HasRecovery() || dataServer != c) { + dataServer->UpdateReplicationReadLoad(1); + } + assert(mNumOngoingReplications >= 0); + // Bump counters here, completion can be invoked + // immediately, for example when send fails. + mNumOngoingReplications++; + mOngoingReplicationStats->Update(1); + mTotalReplicationStats->Update(1); + const CSMap::Entry::State replicationState = + mChunkToServerMap.GetState(clli); + if (replicationState == CSMap::Entry::kStateNone || + replicationState == + CSMap::Entry::kStateCheckReplication) { + SetReplicationState(clli, + CSMap::Entry::kStatePendingReplication); + } + cs.ReplicateChunk(clli.GetFileId(), clli.GetChunkId(), + dataServer, recoveryInfo); + // Do not count synchronous failures. + if (! cs.IsDown()) { + numDone++; + } + } + return numDone; +} + +bool +LayoutManager::GetPlacementExcludes( + const CSMap::Entry& entry, + LayoutManager::ChunkPlacement& placement, + bool includeThisChunkFlag /* = true */, + bool stopIfHasAnyReplicationsInFlight /* = false */, + vector* chunkBlock /* = 0 */) +{ + const MetaFattr* const fa = entry.GetFattr(); + if (! 
fa->IsStriped()) { + if (includeThisChunkFlag) { + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(entry, servers); + placement.ExcludeServerAndRack( + servers, entry.GetChunkId()); + } + return true; + } + StTmp > cinfoTmp(mChunkInfosTmp); + const MetaChunkInfo* const chunk = entry.GetChunkInfo(); + chunkOff_t start = -1; + MetaFattr* mfa = 0; + MetaChunkInfo* mci = 0; + chunkOff_t offset = chunk->offset; + vector& cblk = + chunkBlock ? *chunkBlock : cinfoTmp.Get(); + cblk.reserve(fa->numStripes + fa->numRecoveryStripes); + if (cblk.empty() && + (metatree.getalloc(fa->id(), offset, + mfa, mci, &cblk, &start) != 0 || + mfa != fa || mci != chunk)) { + panic("chunk mapping / getalloc mismatch"); + return false; + } + StTmp serversTmp(mServers3Tmp); + for (vector::const_iterator it = cblk.begin(); + it != cblk.end(); + ++it) { + if (! includeThisChunkFlag && chunk == *it) { + continue; + } + Servers& servers = serversTmp.Get(); + const CSMap::Entry& ce = GetCsEntry(**it); + mChunkToServerMap.GetServers(ce, servers); + if (chunk != *it) { + if (stopIfHasAnyReplicationsInFlight && + mChunkToServerMap.GetState(ce) != + CSMap::Entry::kStateNone) { + return false; + } + if (GetInFlightChunkModificationOpCount( + (*it)->chunkId, + stopIfHasAnyReplicationsInFlight ? 
+ 0 : &servers + ) > 0 && + stopIfHasAnyReplicationsInFlight) { + return false; // Early termination -- ignore the rest + } + } + placement.ExcludeServerAndRack(servers, ce.GetChunkId()); + } + return true; +} + +bool +LayoutManager::CanReplicateChunkNow( + CSMap::Entry& c, + int& extraReplicas, + LayoutManager::ChunkPlacement& placement, + int* hibernatedReplicaCount /* = 0 */, + ChunkRecoveryInfo* recoveryInfo /* = 0 */, + bool forceRecoveryFlag /* = false */) +{ + extraReplicas = 0; + if (hibernatedReplicaCount) { + *hibernatedReplicaCount = 0; + } + if (recoveryInfo) { + recoveryInfo->Clear(); + } + + const MetaFattr* const fa = c.GetFattr(); + const chunkId_t chunkId = c.GetChunkId(); + // Don't replicate chunks for which a write lease has been + // issued. + const ChunkLeases::WriteLease* const wl = + mChunkLeases.GetWriteLease(chunkId); + if (wl) { + KFS_LOG_STREAM_DEBUG << + "re-replication delayed chunk:" + " <" << c.GetFileId() << "," << chunkId << ">" + " " << (TimeNow() <= wl->expires ? + "valid" : "expired") << + " write lease exists" << + KFS_LOG_EOM; + if (recoveryInfo) { + SetReplicationState(c, + CSMap::Entry::kStatePendingReplication); + } + return false; + } + if (! IsChunkStable(chunkId)) { + KFS_LOG_STREAM_DEBUG << + "re-replication delayed chunk:" + " <" << c.GetFileId() << "," << chunkId << ">" + " is not stable yet" << + KFS_LOG_EOM; + if (recoveryInfo) { + SetReplicationState(c, + CSMap::Entry::kStatePendingReplication); + } + return false; + } + const MetaChunkInfo* const chunk = c.GetChunkInfo(); + size_t hibernatedCount = 0; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(c, servers, hibernatedCount); + if (hibernatedReplicaCount) { + *hibernatedReplicaCount = (int)hibernatedCount; + } + if (forceRecoveryFlag || + (servers.empty() && + (! 
recoveryInfo || hibernatedCount <= 0)) || + (mUseEvacuationRecoveryFlag && + recoveryInfo && + servers.size() == 1 && + servers.front()->GetReplicationReadLoad() >= + mMaxConcurrentReadReplicationsPerNode && + servers.front()->IsEvacuationScheduled(chunkId) && + fa->numReplicas == 1 && + fa->HasRecovery())) { + if (! recoveryInfo || ! fa->HasRecovery()) { + if (recoveryInfo) { + KFS_LOG_STREAM_DEBUG << + "can not re-replicate chunk:" + " <" << c.GetFileId() << "," << chunkId << ">" + " no copies left," + " canceling re-replication" << + KFS_LOG_EOM; + SetReplicationState(c, + CSMap::Entry::kStatePendingReplication); + } + return false; + } + StTmp > cinfoTmp(mChunkInfosTmp); + vector& cblk = cinfoTmp.Get(); + cblk.reserve(fa->numStripes + fa->numRecoveryStripes); + chunkOff_t start = -1; + MetaFattr* mfa = 0; + MetaChunkInfo* mci = 0; + chunkOff_t offset = chunk->offset; + if (metatree.getalloc(fa->id(), offset, + mfa, mci, &cblk, &start) != 0 || + mfa != fa || mci != chunk) { + panic("chunk mapping / getalloc mismatch"); + return false; + } + const chunkOff_t end = start + fa->ChunkBlkSize(); + int good = 0; + int notStable = 0; + int stripeIdx = 0; + bool holeFlag = false; + vector::const_iterator it = cblk.begin(); + StTmp serversTmp(mServers4Tmp); + for (chunkOff_t pos = start; + pos < end; + pos += (chunkOff_t)CHUNKSIZE, stripeIdx++) { + if (it == cblk.end()) { + notStable = -1; + break; // incomplete chunk block. + } + assert((*it)->offset % CHUNKSIZE == 0); + if (pos < (*it)->offset) { + if (fa->numStripes <= stripeIdx) { + // No recovery: incomplete chunk block. + notStable = -1; + break; + } + good++; + holeFlag = true; + continue; // no chunk -- hole. + } + if (holeFlag && stripeIdx < fa->numStripes) { + // No prior stripes, incomplete chunk block. + notStable = -1; + break; + } + const chunkId_t curChunkId = (*it)->chunkId; + if (mChunkLeases.GetWriteLease(curChunkId) || + ! 
IsChunkStable(curChunkId)) { + notStable++; + break; + // MakeChunkStableDone will restart + // re-replication. + } + Servers& srvs = serversTmp.Get(); + const CSMap::Entry& ce = GetCsEntry(**it); + if (mChunkToServerMap.GetServers(ce, srvs) > 0) { + good++; + } + if (chunkId != curChunkId) { + GetInFlightChunkModificationOpCount( + curChunkId, &srvs); + } + placement.ExcludeServerAndRack(srvs, curChunkId); + ++it; + } + if (notStable > 0 || + (notStable == 0 && good < fa->numStripes)) { + if (! servers.empty()) { + // Can not use recovery instead of replication. + SetReplicationState(c, + CSMap::Entry::kStateNoDestination); + return false; + } + // Ensure that all pending recovery chunks in this block + // are adjacent in the the recovery list. + // UpdatePendingRecovery() depends on this. + int pendingCnt = 0; + chunkId_t firstPending = -1; + for (it = cblk.begin(); it != cblk.end(); ++it) { + chunkId_t const chunkId = (*it)->chunkId; + CSMap::Entry& ci = GetCsEntry(**it); + if (! mChunkToServerMap.HasServers(ci) || + mChunkLeases.GetWriteLease( + chunkId) || + ! IsChunkStable(chunkId)) { + mChunkToServerMap.SetState(ci, + CSMap::Entry::kStatePendingRecovery); + pendingCnt++; + if (&ci != &c && firstPending <= 0) { + firstPending = chunkId; + } + } + } + const int64_t totalCnt = mChunkToServerMap.GetCount( + CSMap::Entry::kStatePendingRecovery); + KFS_LOG_STREAM( + totalCnt < mMaxPendingRecoveryMsgLogInfo ? 
+ MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelDEBUG) << + "recovery pending:" + " <" << fa->id() << "," << chunkId << ">" + " chunks: available: " << good << + " stripe: " << stripeIdx << + " required: " << fa->numStripes << + " added: " << pendingCnt << + " total: " << totalCnt << + " not stable: " << notStable << + " other chunk: " << firstPending << + " block:" + " chunks: " << cblk.size() << + " pos: " << start << + " size: " << (end - start) << + KFS_LOG_EOM; + return false; + } + // Temporary paper over: presently the client lib doesn't + // re-invalidate chunks when all leases expire withing the chunk + // group it writes into. + // For example: no write activity for more than 5 min, + // or client doesn't get scheduled on cpu because os thrashing, + // or there is connectivity problem. + // For now delay recovery of the chunk in the blocks past of + // logical EOF, until the client updates EOF. This is needed to + // prevent recovery from starting "too soon" and creating a + // potentially bogus chunk. + // Obviously this has no effect with re-write, when chunk block + // position is less than logical EOF, but re-write isn't fully + // supported with striped files. + const int64_t timeMicrosec = + (int64_t)TimeNow() * kSecs2MicroSecs; + if (notStable != 0 || (fa->filesize <= + fa->ChunkPosToChunkBlkFileStartPos(start) && + timeMicrosec < fa->mtime + + mPastEofRecoveryDelay)) { + if (! servers.empty()) { + // Cannot use recovery instead of replication. + SetReplicationState(c, + CSMap::Entry::kStateNoDestination); + return false; + } + KFS_LOG_STREAM_INFO << + "recovery:" + " <" << fa->id() << "," << chunkId << ">" + " file size: " << fa->filesize << + " fblk pos: " << + fa->ChunkPosToChunkBlkFileStartPos(start) << + " chunk off: " << start << + " not stable: " << notStable << + " mtime: " << + ((timeMicrosec - fa->mtime) * 1e-6) << + " sec. 
ago" + " pending recovery: " << + mChunkToServerMap.GetCount( + CSMap::Entry::kStateDelayedRecovery) << + " delaying recovery by: " << + ((fa->mtime + mPastEofRecoveryDelay - + timeMicrosec) * 1e-6) << + " sec." << + KFS_LOG_EOM; + SetReplicationState(c, + CSMap::Entry::kStateDelayedRecovery); + return false; + } + recoveryInfo->offset = chunk->offset; + recoveryInfo->version = chunk->chunkVersion; + recoveryInfo->striperType = fa->striperType; + recoveryInfo->numStripes = fa->numStripes; + recoveryInfo->numRecoveryStripes = fa->numRecoveryStripes; + recoveryInfo->stripeSize = fa->stripeSize; + recoveryInfo->fileSize = fa->filesize; + } + // if any of the chunkservers are retiring, we need to make copies + // so, first determine how many copies we need because one of the + // servers hosting the chunk is going down + // May need to re-replicate this chunk: + // - extraReplicas > 0 means make extra copies; + // - extraReplicas == 0, take out this chunkid from the candidate set + // - extraReplicas < 0, means we got too many copies; delete some + const int numRetiringServers = (int)count_if( + servers.begin(), servers.end(), + bind(&ChunkServer::IsEvacuationScheduled, _1, chunkId)); + // now, determine if we have sufficient copies + // we need to make this many copies: # of servers that are + // retiring plus the # this chunk is under-replicated + extraReplicas = fa->numReplicas + numRetiringServers - + (int)servers.size(); + // Do not delete evacuated / retired replicas until there is sufficient + // number of replicas, then delete all extra copies at once. + // Take into the account hibernated servers, delay the re-replication / + // recovery until the hibernated servers are removed. 
+ // For now count hibernated server only in the case if + if (extraReplicas <= 0) { + extraReplicas -= numRetiringServers; + } else if (recoveryInfo && (int)hibernatedCount <= extraReplicas) { + extraReplicas -= hibernatedCount; + } + // + // If additional copies need to be deleted, check if there is a valid + // (read) lease issued on the chunk. In case if lease exists leave the + // chunk alone for now; we'll look at deleting it when the lease has + // expired. This is for safety: if a client was reading from the copy + // of the chunk that we are trying to delete, the client will see the + // deletion and will have to failover; avoid unnecessary failovers + // + const bool readLeaseWaitFlag = recoveryInfo && extraReplicas < 0 && + mChunkLeases.UpdateReadLeaseReplicationCheck(chunkId, true); + KFS_LOG_STREAM_DEBUG << + "re-replicate: chunk:" + " <" << c.GetFileId() << "," << chunkId << ">" + " version: " << chunk->chunkVersion << + " offset: " << chunk->offset << + " eof: " << fa->filesize << + " replicas: " << servers.size() << + " retiring: " << numRetiringServers << + " target: " << fa->numReplicas << + " rlease: " << readLeaseWaitFlag << + " hibernated: " << hibernatedCount << + " needed: " << extraReplicas << + KFS_LOG_EOM; + if (readLeaseWaitFlag) { + SetReplicationState(c, CSMap::Entry::kStatePendingReplication); + return false; + } + return true; +} + +void +LayoutManager::CheckHibernatingServersStatus() +{ + const time_t now = TimeNow(); + + for (HibernatedServerInfos::iterator + iter = mHibernatingServers.begin(); + iter != mHibernatingServers.end(); + ) { + Servers::const_iterator const i = find_if( + mChunkServers.begin(), mChunkServers.end(), + MatchingServer(iter->location)); + if (i == mChunkServers.end() && now < iter->sleepEndTime) { + // within the time window where the server is sleeping + // so, move on + iter++; + continue; + } + if (i != mChunkServers.end()) { + if (! 
iter->IsHibernated()) { + if (iter->sleepEndTime + 10 * 60 < now) { + KFS_LOG_STREAM_INFO << + "hibernated server: " << + iter->location << + " still connected, canceling" + " hibernation" << + KFS_LOG_EOM; + iter = mHibernatingServers.erase(iter); + } + continue; + } + KFS_LOG_STREAM_INFO << + "hibernated server: " << iter->location << + " is back as promised" << + KFS_LOG_EOM; + } else { + // server hasn't come back as promised...so, check + // re-replication for the blocks that were on that node + KFS_LOG_STREAM_INFO << + "hibernated server: " << iter->location << + " is NOT back as promised" << + KFS_LOG_EOM; + } + if (! mChunkToServerMap.RemoveHibernatedServer(iter->csmapIdx)) { + panic("failed to remove hibernated server"); + } + iter = mHibernatingServers.erase(iter); + } +} + +int +LayoutManager::CountServersAvailForReReplication() const +{ + int anyAvail = 0; + for (uint32_t i = 0; i < mChunkServers.size(); i++) { + const ChunkServer& cs = *mChunkServers[i].get(); + if (cs.GetSpaceUtilization(mUseFsTotalSpaceFlag) > + mMaxSpaceUtilizationThreshold) { + continue; + } + if (cs.GetNumChunkReplications() >= + mMaxConcurrentWriteReplicationsPerNode) { + continue; + } + anyAvail++; + } + return anyAvail; +} + +bool +LayoutManager::HandoutChunkReplicationWork() +{ + // There is a set of chunks that are affected: their server went down + // or there is a change in their degree of replication. in either + // case, walk this set of chunkid's and work on their replication amount. + + // List of in flight ops to transition chunk to pending list. + // Completion of any op in this list must transition chunk from this + // list (usually by invoking UpdateReplicationState()) as the list is + // not scanned by the timer. 
+ MetaOp const makePendingOpTypes[] = { + META_CHUNK_REPLICATE, + META_CHUNK_VERSCHANGE, + META_CHUNK_MAKE_STABLE, + META_NUM_OPS_COUNT // Sentinel + }; + + int64_t now = microseconds(); + int64_t endTime = now; + const int kCheckTime = 32; + int pass = kCheckTime; + int64_t start = now; + int64_t count = 0; + int64_t doneCount = 0; + int64_t loopCount = 0; + int avail = 0; + bool timedOutFlag = false; + if (now <= mLastReplicationCheckRunEndTime + + mMinChunkReplicationCheckInterval) { + int64_t kMinInterval = 500; + if (mLastReplicationCheckRunEndTime + kMinInterval < now) { + endTime += kMinInterval; + } else { + pass = 4; + } + } else { + endTime += mMaxTimeForChunkReplicationCheck; + } + ChunkRecoveryInfo recoveryInfo; + StTmp placementTmp(mChunkPlacementTmp); + bool nextRunLowPriorityFlag = false; + mChunkToServerMap.First(CSMap::Entry::kStateCheckReplication); + for (; ; loopCount++) { + if (--pass <= 0) { + now = microseconds(); + pass = kCheckTime; + if (endTime <= now) { + mReplicationCheckTimeouts++; + if (nextRunLowPriorityFlag) { + break; + } + timedOutFlag = true; + mChunkReplicator.ScheduleNext(); + const int64_t kMsgInterval = + 2 * kSecs2MicroSecs; + if (now < mLastReplicationCheckRunEndTime + + kMsgInterval) { + break; + } + KFS_LOG_STREAM_INFO << + "exiting replication check:" + " time spent: " << (now - start) << + " microsec" << + " timeouts: " << + mReplicationCheckTimeouts << + " candidates: " << + mChunkToServerMap.GetCount( + CSMap::Entry::kStateCheckReplication) << + " initiated: " << count << + " done: " << doneCount << + " loop: " << loopCount << + KFS_LOG_EOM; + break; + } + } + if (avail <= 0 && (avail = + CountServersAvailForReReplication()) <= 0) { + if (count <= 0) { + mNoServersAvailableForReplicationCount++; + KFS_LOG_STREAM_INFO << + "exiting replication check:" + " no servers available for" + " replication: " << + mNoServersAvailableForReplicationCount << + KFS_LOG_EOM; + } + break; + } + CSMap::Entry* cur = 
mChunkToServerMap.Next( + CSMap::Entry::kStateCheckReplication); + if (! cur) { + // See if all chunks check was requested. + if (! (cur = mChunkToServerMap.Next( + CSMap::Entry::kStateNone))) { + mCheckAllChunksInProgressFlag = false; + nextRunLowPriorityFlag = true; + if (! (cur = mChunkToServerMap.Next( + CSMap::Entry::kStateNoDestination)) && + ! (cur = mChunkToServerMap.Next( + CSMap::Entry::kStateDelayedRecovery))) { + mChunkToServerMap.First( + CSMap::Entry::kStateNoDestination); + mChunkToServerMap.First( + CSMap::Entry::kStateDelayedRecovery); + break; // Done. + } + } + // Move to the replication list. + mChunkToServerMap.SetState( + *cur, CSMap::Entry::kStateCheckReplication); + } + CSMap::Entry& entry = *cur; + + if (GetInFlightChunkOpsCount(entry.GetChunkId(), + makePendingOpTypes) > 0) { + // This chunk is being re-replicated, or in transition. + // Replication check will get scheduled again when the + // corresponding op completes. + SetReplicationState(entry, + CSMap::Entry::kStatePendingReplication); + continue; + } + int extraReplicas = 0; + int hibernatedReplicaCount = 0; + recoveryInfo.Clear(); + ChunkPlacement& placement = placementTmp.Get(); + if (! CanReplicateChunkNow( + entry, + extraReplicas, + placement, + &hibernatedReplicaCount, + &recoveryInfo)) { + continue; + } + if (extraReplicas > 0) { + const int numStarted = ReplicateChunk( + entry, + extraReplicas, + placement, + recoveryInfo); + if (numStarted <= 0) { + SetReplicationState(entry, + CSMap::Entry::kStateNoDestination); + } + count += numStarted; + avail -= numStarted; + } else { + if (extraReplicas < 0) { + DeleteAddlChunkReplicas( + entry, -extraReplicas, placement); + } + if (hibernatedReplicaCount <= 0) { + // Sufficient replicas, now no need to make + // chunk stable. 
+ CancelPendingMakeStable( + entry.GetFileId(), entry.GetChunkId()); + } + SetReplicationState(entry, CSMap::Entry::kStateNone); + doneCount++; + } + if (mNumOngoingReplications > (int64_t)mChunkServers.size() * + mMaxConcurrentWriteReplicationsPerNode) { + // throttle...we are handing out + break; + } + } + mLastReplicationCheckRunEndTime = + pass == kCheckTime ? now : microseconds(); + return timedOutFlag; +} + +void LayoutManager::Timeout() +{ + ScheduleCleanup(mMaxServerCleanupScan); +} + +void LayoutManager::ScheduleCleanup(size_t maxScanCount /* = 1 */) +{ + if (mChunkToServerMap.RemoveServerCleanup(maxScanCount)) { + if (! mCleanupScheduledFlag) { + mCleanupScheduledFlag = true; + globalNetManager().RegisterTimeoutHandler(this); + } + globalNetManager().Wakeup(); + } else { + if (mCleanupScheduledFlag) { + mCleanupScheduledFlag = false; + globalNetManager().UnRegisterTimeoutHandler(this); + } + } +} + +struct EvacuateChunkChecker +{ + bool& mRetiringServersFlag; + EvacuateChunkChecker(bool& flag) + : mRetiringServersFlag(flag) { + mRetiringServersFlag = false; + } + void operator()(const ChunkServerPtr& c) const { + if (! c->IsRetiring()) { + return; + } + // Until the server disconnects, even if it has no chunks, + // set the flag. + mRetiringServersFlag = true; + if (c->GetChunkCount() > 0) { + return; + } + c->Retire(); + } +}; + +void +LayoutManager::ChunkReplicationChecker() +{ + if (! mPendingBeginMakeStable.empty() && ! 
InRecoveryPeriod()) { + ProcessPendingBeginMakeStable(); + } + const bool recoveryFlag = InRecovery(); + const int64_t now = microseconds(); + const bool fullCheckFlag = mCompleteReplicationCheckTime + + mCompleteReplicationCheckInterval <= now; + if (fullCheckFlag) { + mCompleteReplicationCheckTime = now; + CheckHibernatingServersStatus(); + } + if (mLastReplicationCheckTime + mFullReplicationCheckInterval <= now) { + KFS_LOG_STREAM_INFO << + "Initiating a replication check of all chunks" << + KFS_LOG_EOM; + InitCheckAllChunks(); + mLastReplicationCheckTime = now; + } + const bool runRebalanceFlag = + ! recoveryFlag && + ! HandoutChunkReplicationWork() && + ! mCheckAllChunksInProgressFlag; + if (fullCheckFlag) { + if (mMightHaveRetiringServersFlag) { + // Chunk deletion does not initiate retiring, and retire + // isn't completely reliable -- notification only. + // Tell servers to retire if they are still here. + for_each(mChunkServers.begin(), mChunkServers.end(), + EvacuateChunkChecker( + mMightHaveRetiringServersFlag)); + } + } + if (runRebalanceFlag && + (mIsRebalancingEnabled || mIsExecutingRebalancePlan) && + mLastRebalanceRunTime + mRebalanceRunInterval <= now) { + mLastRebalanceRunTime = now; + RebalanceServers(); + } + mReplicationTodoStats->Set(mChunkToServerMap.GetCount( + CSMap::Entry::kStateCheckReplication)); + ScheduleCleanup(mMaxServerCleanupScan); +} + +void +LayoutManager::ChunkReplicationDone(MetaChunkReplicate* req) +{ + const bool versChangeDoneFlag = req->versChange != 0; + assert(! req->suspended || versChangeDoneFlag); + if (versChangeDoneFlag) { + if (! req->suspended) { + req->versChange = 0; + return; + } + assert(! req->versChange->clnt); + req->suspended = false; + req->status = req->versChange->status; + req->statusMsg = req->versChange->statusMsg; + } + + // In the recovery case the source location's host name is empty. + const bool replicationFlag = req->srcLocation.IsValid(); + KFS_LOG_STREAM_INFO << + (versChangeDoneFlag ? 
"version change" : + (replicationFlag ? "replication" : "recovery")) << + " done:" + " chunk: " << req->chunkId << + " version: " << req->chunkVersion << + " status: " << req->status << + (req->statusMsg.empty() ? "" : " ") << req->statusMsg << + " server: " << req->server->ServerID() << + " " << (req->server->IsDown() ? "down" : "OK") << + " replications in flight: " << mNumOngoingReplications << + KFS_LOG_EOM; + + if (! versChangeDoneFlag) { + mOngoingReplicationStats->Update(-1); + assert(mNumOngoingReplications > 0); + mNumOngoingReplications--; + req->server->ReplicateChunkDone(req->chunkId); + if (replicationFlag && req->dataServer) { + req->dataServer->UpdateReplicationReadLoad(-1); + } + req->dataServer.reset(); + } + + // Since this server is now free, + // schedule chunk replication scheduler to run. + if ((((int64_t)mChunkToServerMap.GetCount( + CSMap::Entry::kStateCheckReplication) > 0 || + (int64_t)mChunkToServerMap.GetCount( + CSMap::Entry::kStateNoDestination) > + (int64_t)mChunkServers.size() * + mMaxConcurrentWriteReplicationsPerNode) && + (int64_t)mNumOngoingReplications * 5 / 4 < + (int64_t)mChunkServers.size() * + mMaxConcurrentWriteReplicationsPerNode) || + (req->server->GetNumChunkReplications() * 5 / 4 < + mMaxConcurrentWriteReplicationsPerNode && + ! req->server->IsRetiring() && + ! req->server->IsDown())) { + mChunkReplicator.ScheduleNext(); + } + + CSMap::Entry* const ci = mChunkToServerMap.Find(req->chunkId); + if (! 
ci) { + KFS_LOG_STREAM_INFO << + "chunk " << req->chunkId << + " mapping no longer exists" << + KFS_LOG_EOM; + req->server->NotifyStaleChunk(req->chunkId); + return; + } + if (req->status != 0 || req->server->IsDown()) { + // Replication failed...we will try again later + const fid_t fid = ci->GetFileId(); + KFS_LOG_STREAM_INFO << + req->server->GetServerLocation() << + ": re-replication failed" + " chunk: " << req->chunkId << + " fid: " << req->fid << "/" << fid << + " status: " << req->status << + " in flight: " << + GetInFlightChunkOpsCount( + req->chunkId, META_CHUNK_REPLICATE) << + " invalid stripes: " << req->invalidStripes.size() << + KFS_LOG_EOM; + mFailedReplicationStats->Update(1); + UpdateReplicationState(*ci); + // Aways send stale chunk notification properly handle op time + // outs by the meta server. Theoretically this could be + // conditional on the op status code, if it is guaranteed that + // the chunk server never sends the op timed out status. + if (req->server->IsDown() || + ci->HasServer(mChunkToServerMap, req->server)) { + return; + } + if (! versChangeDoneFlag && fid == req->fid) { + ProcessInvalidStripes(*req); + } + req->server->NotifyStaleChunk(req->chunkId); + if (! 
replicationFlag || req->server->IsDown() || + versChangeDoneFlag) { + return; + } + const MetaFattr* const fa = ci->GetFattr(); + if (fa->HasRecovery() && + mChunkToServerMap.ServerCount(*ci) == 1) { + KFS_LOG_STREAM_INFO << + "chunk: " << req->chunkId << + " fid: " << req->fid << "/" << fid << + " attempting to use recovery" + " instead of replication" << + KFS_LOG_EOM; + const bool kForceRecoveryFlag = true; + int extraReplicas = 0; + int hibernatedReplicaCount = 0; + ChunkRecoveryInfo recoveryInfo; + recoveryInfo.Clear(); + StTmp placementTmp(mChunkPlacementTmp); + ChunkPlacement& placement = placementTmp.Get(); + if (GetInFlightChunkModificationOpCount( + req->chunkId) <= 0 && + CanReplicateChunkNow( + *ci, + extraReplicas, + placement, + &hibernatedReplicaCount, + &recoveryInfo, + kForceRecoveryFlag) && + extraReplicas > 0 && + ReplicateChunk( + *ci, + extraReplicas, + placement, + recoveryInfo) <= 0) { + SetReplicationState(*ci, + CSMap::Entry::kStateNoDestination); + } + } + return; + } + // replication succeeded: book-keeping + // validate that the server got the latest copy of the chunk + const MetaChunkInfo* const chunk = ci->GetChunkInfo(); + if (chunk->chunkVersion != req->chunkVersion) { + // Version that we replicated has changed...so, stale + KFS_LOG_STREAM_INFO << + req->server->GetServerLocation() << + " re-replicate: chunk " << req->chunkId << + " version changed was: " << req->chunkVersion << + " now " << chunk->chunkVersion << " => stale" << + KFS_LOG_EOM; + mFailedReplicationStats->Update(1); + UpdateReplicationState(*ci); + req->server->NotifyStaleChunk(req->chunkId); + return; + } + if (! replicationFlag && ! 
versChangeDoneFlag) { + const fid_t fid = ci->GetFileId(); + if (fid != req->fid) { + KFS_LOG_STREAM_INFO << + req->server->GetServerLocation() << + " recover: chunk " << req->chunkId << + " file id changed:" + " was: " << req->fid << + " now: " << fid << " => stale" << + KFS_LOG_EOM; + UpdateReplicationState(*ci); + req->server->NotifyStaleChunk(req->chunkId); + return; + } + req->suspended = true; + const bool kMakeStableFlag = true; + const bool kPendingAddFlag = false; + req->server->NotifyChunkVersChange( + req->fid, + req->chunkId, + req->chunkVersion, // to + 0, // from + kMakeStableFlag, + kPendingAddFlag, + req + ); + return; + } + UpdateReplicationState(*ci); + // Yaeee...all good... + KFS_LOG_STREAM_DEBUG << + req->server->GetServerLocation() << + " chunk: " << req->chunkId << + (replicationFlag ? " re-replication" : " recovery") << + " done" << + KFS_LOG_EOM; + AddHosted(*ci, req->server); + req->server->MovingChunkDone(req->chunkId); + StTmp serversTmp(mServersTmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, servers); + // if any of the hosting servers were being "retired", notify them that + // re-replication of any chunks hosted on them is finished + // Replication check is already scheduled by UpdateReplicationState the + // above. Let the normal path figure out if the any further actions are + // needed. + RemoveRetiring(*ci, servers, ci->GetFattr()->numReplicas); +} + +void +LayoutManager::RemoveRetiring( + CSMap::Entry& ci, + LayoutManager::Servers& servers, + int numReplicas, + bool deleteRetiringFlag /* = false */) +{ + const chunkId_t chunkId = ci.GetChunkId(); + int cnt = (int)servers.size(); + for (int i = 0; numReplicas < cnt && i < cnt; ) { + const ChunkServerPtr& server = servers[i]; + if (! server->IsEvacuationScheduled(chunkId)) { + i++; + continue; + } + if (! mChunkToServerMap.RemoveServer(server, ci)) { + panic("failed to remove server"); + } + if (! 
server->IsDown()) { + if (server->IsRetiring()) { + if (server->GetChunkCount() <= 0) { + server->Retire(); + } else if (deleteRetiringFlag) { + server->DeleteChunk(chunkId); + } + } else { + const bool kEvacuateChunkFlag = true; + server->NotifyStaleChunk( + chunkId, kEvacuateChunkFlag); + } + } + servers.erase(servers.begin() + i); + cnt--; + } +} + +struct InvalidChunkInfo +{ + InvalidChunkInfo(const MetaChunkInfo& ci) + : offset(ci.offset), + chunkId(ci.chunkId), + chunkVersion(ci.chunkVersion) + {} + chunkOff_t offset; + chunkId_t chunkId; + seq_t chunkVersion; +}; +typedef vector > InvalidChunks; + +void +LayoutManager::ProcessInvalidStripes(MetaChunkReplicate& req) +{ + if (req.invalidStripes.empty()) { + return; + } + CSMap::Entry* const ci = mChunkToServerMap.Find(req.chunkId); + if (! ci) { + return; + } + const MetaFattr* const fa = ci->GetFattr(); + if (! fa->HasRecovery() || + fa->striperType != req.striperType || + fa->stripeSize != req.stripeSize || + fa->numStripes != req.numStripes || + fa->numRecoveryStripes != req.numRecoveryStripes) { + return; + } + const MetaChunkInfo* const chunk = ci->GetChunkInfo(); + chunkOff_t start = -1; + vector cblk; + cblk.reserve(fa->numStripes + fa->numRecoveryStripes); + MetaFattr* mfa = 0; + MetaChunkInfo* mci = 0; + chunkOff_t offset = chunk->offset; + if (metatree.getalloc(fa->id(), offset, + mfa, mci, &cblk, &start) != 0 || + mfa != fa || mci != chunk) { + panic("chunk mapping / getalloc mismatch"); + return; + } + const chunkOff_t end = start + fa->ChunkBlkSize(); + int idx = 0; + InvalidChunks invalidChunks; + vector::const_iterator it = cblk.begin(); + for (chunkOff_t pos = start; + pos < end; + pos += (chunkOff_t)CHUNKSIZE, idx++) { + if (it == cblk.end() || pos < (*it)->offset) { + if (req.invalidStripes.find(idx) != + req.invalidStripes.end()) { + KFS_LOG_STREAM_ERROR << "invalid stripes:" + " index: " << idx << + " chunk: " << + (it == cblk.end() ? 
+ (*it)->chunkId : + chunkId_t(-1)) << + " chunk offset: " << + (it == cblk.end() ? + (*it)->offset : + chunkOff_t(-1)) << + " offset: " << pos << + " error: no chunk" << + KFS_LOG_EOM; + invalidChunks.clear(); + break; + } + continue; // no chunk -- hole. + } + assert(pos == (*it)->offset); + if (mChunkLeases.GetWriteLease((*it)->chunkId) || + ! IsChunkStable((*it)->chunkId)) { + KFS_LOG_STREAM_ERROR << "invalid stripes:" + " index: " << idx << + " chunk: " << (*it)->chunkId << + " offset: " << pos << + " error: chunk is not readable" << + KFS_LOG_EOM; + invalidChunks.clear(); + break; + } + MetaChunkReplicate::InvalidStripes::const_iterator const isi = + req.invalidStripes.find(idx); + if (isi != req.invalidStripes.end()) { + if (isi->second.first != (*it)->chunkId || + isi->second.second != (*it)->chunkVersion) { + KFS_LOG_STREAM_ERROR << "invalid stripes:" + " index: " << idx << + " chunk: " << (*it)->chunkId << + " expected: " << isi->second.first << + " version: " << (*it)->chunkVersion << + " expected: " << isi->second.second << + " offset: " << pos << + " error: chunk id or version mismatch" << + KFS_LOG_EOM; + invalidChunks.clear(); + break; + } + assert(mChunkToServerMap.Find((*it)->chunkId)); + // It is likely that more than one recovery was + // scheduled at the same time for this chunk group, and + // at least one has finished and reported this chunk + // as invalid. + // Do not invalidate chunks, wait for in-flight recovery + // to finish. 
+ MetaOp const opTypes[] = { + META_CHUNK_ALLOCATE, + META_CHUNK_REPLICATE, + META_CHUNK_MAKE_STABLE, + META_CHUNK_VERSCHANGE, + META_NUM_OPS_COUNT + }; + if (GetInFlightChunkOpsCount( + (*it)->chunkId, opTypes) > 0) { + KFS_LOG_STREAM_ERROR << "invalid stripes:" + " index: " << idx << + " chunk: " << (*it)->chunkId << + " offset: " << pos << + " error: chunk is being replicated" << + KFS_LOG_EOM; + invalidChunks.clear(); + break; + } + invalidChunks.push_back(InvalidChunkInfo(**it)); + } + ++it; + } + if (invalidChunks.empty()) { + return; + } + if (req.invalidStripes.size() != invalidChunks.size()) { + KFS_LOG_STREAM_ERROR << "invalid stripes:" + " failed to find all chunks:" + " expected: " << req.invalidStripes.size() << + " found: " << invalidChunks.size() << + KFS_LOG_EOM; + return; + } + MetaChunkReplicate::InvalidStripes::const_iterator sit = + req.invalidStripes.begin(); + for (InvalidChunks::const_iterator cit = invalidChunks.begin(); + cit != invalidChunks.end(); + ++cit, ++sit) { + KFS_LOG_STREAM_INFO << "invalidating:" + " <" << req.fid << + "," << cit->chunkId << ">" + " version: " << cit->chunkVersion << + " offset: " << cit->offset << + " stripe: " << sit->first << + KFS_LOG_EOM; + if (mPanicOnInvalidChunkFlag) { + ostringstream os; + os << + "invalid chunk detected:" + " <" << req.fid << + "," << cit->chunkId << ">" + " version: " << cit->chunkVersion << + " offset: " << cit->offset << + " stripe: " << sit->first; + panic(os.str()); + } + MetaAllocate& alloc = *(new MetaAllocate( + sit->first, req.fid, cit->offset)); + alloc.invalidateAllFlag = true; + // To pass worm mode check assign name with tmp suffix. + const char* const kWormFakeName = "InvalidateChunk.tmp"; + alloc.pathname.Copy(kWormFakeName, strlen(kWormFakeName)); + submit_request(&alloc); + } +} + +// +// To delete additional copies of a chunk, find the servers that have the least +// amount of space and delete the chunk from there. 
In addition, also pay +// attention to rack-awareness: if two copies are on the same rack, then we pick +// the server that is the most loaded and delete it there +// +void +LayoutManager::DeleteAddlChunkReplicas( + CSMap::Entry& entry, + int extraReplicas, + LayoutManager::ChunkPlacement& placement) +{ + if (extraReplicas <= 0) { + return; + } + StTmp serversTmp(mServersTmp); + Servers& servers = serversTmp.Get(); + mChunkToServerMap.GetServers(entry, servers); + size_t cnt = servers.size(); + if (cnt <= (size_t)extraReplicas) { + return; + } + // Remove retiring / evacuating first, regardless of the placement + // constraints to make retirement / evacuation work in the case where + // not enough racks or disk space is available. + const size_t numReplicas = cnt - extraReplicas; + const bool kDeleteRetiringFlag = true; + RemoveRetiring(entry, servers, (int)numReplicas, kDeleteRetiringFlag); + if (servers.size() <= numReplicas) { + return; + } + placement.clear(); + const MetaFattr* const fa = entry.GetFattr(); + const int fileNumReplicas = fa->numReplicas; + bool useOtherSrvsRacksFlag = fa->IsStriped(); + if (useOtherSrvsRacksFlag && fileNumReplicas > 1) { + // If more than one replicas are on the same rack, then do not + // take into the account placement of other chunks in the stripe + // block. + placement.ExcludeServerAndRack(servers); + useOtherSrvsRacksFlag = placement.GetExcludedRacksCount() >= + servers.size(); + placement.clear(); + } + size_t otherRacksEx = 0; + if (useOtherSrvsRacksFlag) { + const bool kIncludeThisChunkFlag = false; + GetPlacementExcludes(entry, placement, kIncludeThisChunkFlag); + otherRacksEx = placement.GetExcludedRacksCount(); + if (fileNumReplicas > 1 && + otherRacksEx + fileNumReplicas > + mRacks.size()) { + // Do not pay attention to other stripes, with + // replication higher than 1 and insufficient number of + // racks. 
+ placement.clear(); + otherRacksEx = 0; + } + } + + StTmp copiesToDiscardTmp(mServers2Tmp); + Servers& copiesToDiscard = copiesToDiscardTmp.Get(); + // Sort server by space utilization in ascending order: the delete + // candidates with the least free space will be at the end. + sort(servers.begin(), servers.end(), + bind(&ChunkServer::GetSpaceUtilization, _1, + mUseFsTotalSpaceFlag) < + bind(&ChunkServer::GetSpaceUtilization, _2, + mUseFsTotalSpaceFlag) + ); + const size_t otherSrvEx = placement.GetExcludedServersCount(); + if (otherSrvEx > 0) { + for (Servers::iterator it = servers.end(); + numReplicas < cnt && + it != servers.begin(); ) { + ChunkServerPtr& server = *--it; + if (! placement.IsServerExcluded(server)) { + continue; + } + // Delete redundant replica on the server with + // other chunks / replicas from the same + // stripe block. + copiesToDiscard.insert(copiesToDiscard.end(), + ChunkServerPtr())->swap(server); + cnt--; + } + } + const chunkId_t chunkId = entry.GetChunkId(); + if (numReplicas < cnt) { + // Try to keep as many copies as racks. + // For striped files placement keep the copies that are on + // different racks than the chunks in stripe / rs block. + StBufferT canDiscardIdx; + for (Servers::iterator it = servers.begin(); + it != servers.end(); + ++it) { + const ChunkServerPtr& server = *it; + if (! server || placement.ExcludeServerAndRack( + server, chunkId)) { + continue; + } + // Delete redundant replica on this rack. + canDiscardIdx.Append(it - servers.begin()); + } + for (const size_t* first = canDiscardIdx.GetPtr(), + * cur = first + canDiscardIdx.GetSize(); + first < cur && numReplicas < cnt; + ) { + --cur; + ChunkServerPtr& server = servers[*cur]; + copiesToDiscard.insert(copiesToDiscard.end(), + ChunkServerPtr())->swap(server); + cnt--; + } + // Drop the tail if needed. + for (Servers::iterator it = servers.end(); + numReplicas < cnt && + it != servers.begin(); ) { + ChunkServerPtr& server = *--it; + if (! 
server) { + continue; + } + copiesToDiscard.insert(copiesToDiscard.end(), + ChunkServerPtr())->swap(server); + cnt--; + } + } + + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << + "<" << entry.GetFileId() << "," << chunkId << ">" + " excludes:" + " srv: " + " other: " << otherSrvEx << + " all: " << placement.GetExcludedServersCount() << + " rack:" + " other: " << otherRacksEx << + " all: " << placement.GetExcludedRacksCount() << + " keeping:"; + const char* prefix = " "; + for (Servers::const_iterator it = servers.begin(); + it != servers.end(); + ++it) { + const ChunkServerPtr& server = *it; + if (! server) { + continue; + } + const ChunkServer& srv = *server; + os << prefix << + srv.GetServerLocation() << + " " << srv.GetRack() << + " " << srv.GetSpaceUtilization( + mUseFsTotalSpaceFlag); + prefix = ","; + } + os << " discarding:"; + prefix = " "; + for (Servers::const_iterator it = copiesToDiscard.begin(); + it != copiesToDiscard.end(); + ++it) { + const ChunkServer& srv = **it; + os << prefix << + srv.GetServerLocation() << + " " << srv.GetRack() << + " " << srv.GetSpaceUtilization( + mUseFsTotalSpaceFlag); + prefix = ","; + } + KFS_LOG_STREAM_END; + + for (Servers::const_iterator it = copiesToDiscard.begin(); + it != copiesToDiscard.end(); + ++it) { + const ChunkServerPtr& server = *it; + server->DeleteChunk(chunkId); + entry.Remove(mChunkToServerMap, server); + } +} + +void +LayoutManager::ChangeChunkReplication(chunkId_t chunkId) +{ + CSMap::Entry* const entry = mChunkToServerMap.Find(chunkId); + if (entry) { + CheckReplication(*entry); + } +} + +static inline void +MoveChunkBlockBack( + vector& cblk, + CSMap& csmap) +{ + if (cblk.size() <= 1) { + return; // No point of moving it. + } + // Move all chunks in the block to the back of the list. + // The order in "state none" list presently only has effect on + // re-balance, and nothing else. 
The logic depend on the relative order + // in the other chunk lists, on particular delayed recovery. + for (vector::const_iterator + it = cblk.begin(); it != cblk.end(); ++it) { + CSMap::Entry& ce = CSMap::Entry::GetCsEntry(**it); + if (csmap.GetState(ce) == CSMap::Entry::kStateNone) { + csmap.SetState(ce, CSMap::Entry::kStateNone); + } + } +} + +// +// Periodically, if we find that some chunkservers have LOT (> 80% free) of space +// and if others are loaded (i.e., < 30% free space), move chunks around. This +// helps with keeping better disk space utilization (and maybe load). +// +void +LayoutManager::RebalanceServers() +{ + if (InRecovery() || + mChunkServers.empty() || + mChunkToServerMap.Size() <= 0) { + return; + } + if (mRebalanceReplicationsThresholdCount <= mNumOngoingReplications) { + return; + } + // if we are doing rebalancing based on a plan, execute as + // much of the plan as there is room. + ExecuteRebalancePlan(); + + if (! mIsRebalancingEnabled || mIsExecutingRebalancePlan) { + return; + } + + // Use backward cursor, check all chunk replication uses forward cursor, + // see InitCheckAllChunks() + const ChunkRecoveryInfo recoveryInfo; + StTmp serversTmp(mServersTmp); + StTmp placementTmp(mChunkPlacementTmp); + StTmp > cblkTmp(mChunkInfos2Tmp); + vector& cblk = cblkTmp.Get(); + bool rescheduleFlag = true; + int64_t maxTime = + microseconds() + mMaxRebalanceRunTime; + const size_t maxScan = min(mChunkToServerMap.Size(), + (size_t)max(mMaxRebalanceScan, 0)); + for (size_t i = 0; i < maxScan; i++) { + if (((i + 1) & 0x1F) == 0) { + const int64_t now = microseconds(); + if (maxTime < now) { + mRebalanceCtrs.ScanTimeout(); + break; + } + } + CSMap::Entry* p = mChunkToServerMap.Prev( + CSMap::Entry::kStateNone); + if (! p) { + rescheduleFlag = false; + mRebalanceCtrs.NextRound(); + // Restart backward scan. + mChunkToServerMap.Last(CSMap::Entry::kStateNone); + if (! 
(p = mChunkToServerMap.Prev( + CSMap::Entry::kStateNone))) { + break; + } + } + mRebalanceCtrs.Scanned(); + CSMap::Entry& entry = *p; + const chunkId_t cid = entry.GetChunkId(); + int extraReplicas = 0; + ChunkPlacement& placement = placementTmp.Get(); + if (GetInFlightChunkModificationOpCount(cid) > 0 || + ! CanReplicateChunkNow( + entry, extraReplicas, placement) || + extraReplicas != 0) { + mRebalanceCtrs.Busy(); + continue; + } + // Cache chunk block. + if (find(cblk.begin(), cblk.end(), entry.GetChunkInfo()) == + cblk.end()) { + cblk.clear(); + } + placement.clear(); + const bool kIncludeThisChunkFlag = false; + const bool kStopIfHasAnyReplicationsInFlight = true; + const bool busyFlag = ! GetPlacementExcludes( + entry, + placement, + kIncludeThisChunkFlag, + kStopIfHasAnyReplicationsInFlight, + &cblk + ); + const int numReplicas = entry.GetFattr()->numReplicas; + if (numReplicas > 1 && + placement.GetExcludedRacksCount() + + numReplicas >= mRacks.size()) { + // Do not pay attention to other stripes, with + // replication higher than 1 and insufficient number of + // racks. + placement.clear(); + } else if (busyFlag) { + mRebalanceCtrs.BusyOther(); + cblk.clear(); + // Move all chunks in the blok to the back of + // the list, in order to skip them on this + // re-balance pass. + MoveChunkBlockBack(cblk, mChunkToServerMap); + continue; + } + Servers& srvs = serversTmp.Get(); + mChunkToServerMap.GetServers(entry, srvs); + double maxUtil = -1; + int srcCnt = 0; + int srvPos = -1; + int rackPos = -1; + int loadPos = -1; + for (Servers::const_iterator it = srvs.begin(); + it != srvs.end(); + ++it) { + ChunkServer& srv = **it; + if (srv.GetReplicationReadLoad() < + mMaxConcurrentReadReplicationsPerNode && + srv.IsResponsiveServer()) { + srcCnt++; + } + if (srvPos < 0 && + (placement.IsServerExcluded(srv) && + placement.GetExcludedServersCount() < + mChunkServers.size())) { + srvPos = (int)(it - srvs.begin()); + } + if (! 
placement.ExcludeServerAndRack(srv, cid) && + rackPos < 0 && + placement.SearchCandidateRacks() + ) { + rackPos = (int)(it - srvs.begin()); + } + if (srvPos >= 0 || rackPos >= 0) { + continue; + } + const double util = + srv.GetSpaceUtilization(mUseFsTotalSpaceFlag); + if (util > max(maxUtil, + mMaxRebalanceSpaceUtilThreshold)) { + loadPos = (int)(it - srvs.begin()); + maxUtil = util; + } + } + if (srcCnt <= 0) { + if (srvPos >= 0 || rackPos >= 0 || loadPos >= 0) { + mRebalanceCtrs.NoSource(); + } else { + mRebalanceCtrs.ServerOk(); + } + continue; + } + const char* reason = 0; + if (srvPos >= 0 || rackPos >= 0) { + srvs.clear(); + double maxUtilization = mMinRebalanceSpaceUtilThreshold; + for (int i = 0; ; i++) { + if (i > 0) { + if (mMaxSpaceUtilizationThreshold <= + maxUtilization) { + break; + } + maxUtilization = + mMaxSpaceUtilizationThreshold; + } + placement.FindRebalanceCandidates( + maxUtilization); + if (srvPos < 0) { + if (placement.IsUsingRackExcludes()) { + continue; + } + const RackId rackId = + placement.GetRackId(); + if (rackId < 0 || rackId == srvs[ + rackPos]->GetRack()) { + continue; + } + } + const bool kCanIgnoreServerExcludesFlag = false; + const ChunkServerPtr srv = placement.GetNext( + kCanIgnoreServerExcludesFlag); + if (! srv) { + continue; + } + srvs.push_back(srv); + reason = srvPos >= 0 ? + "re-balance server placement" : + "re-balance rack placement"; + break; + } + } else if (loadPos >= 0) { + const RackId rackId = srvs[loadPos]->GetRack(); + placement.FindRebalanceCandidates( + mMinRebalanceSpaceUtilThreshold, + rackId + ); + const bool kCanIgnoreServerExcludesFlag = false; + const ChunkServerPtr srv = placement.GetNext( + kCanIgnoreServerExcludesFlag); + srvs.clear(); + if (srv && (srv->GetRack() >= 0 || rackId < 0) && + ((placement.GetRackId() >= 0 && + ! 
placement.IsUsingRackExcludes() + ) || + (placement.GetCandidateRackCount() <= + 0 && + placement.GetExcludedRacksCount() + + numReplicas >= mRacks.size()))) { + srvs.push_back(srv); + reason = "re-balance utilization"; + } + } else { + mRebalanceCtrs.ServerOk(); + continue; + } + const bool noCandidatesFlag = srvs.empty(); + if (srvPos >= 0) { + mRebalanceCtrs.ServerNeeded(); + if (noCandidatesFlag) { + mRebalanceCtrs.NoServerFound(); + } + } else if (rackPos >= 0) { + mRebalanceCtrs.RackNeeded(); + if (noCandidatesFlag) { + mRebalanceCtrs.NoRackFound(); + } + } else { + mRebalanceCtrs.NonLoadedServerNeeded(); + if (noCandidatesFlag) { + mRebalanceCtrs.NoNonLoadedServerFound(); + } + } + if (noCandidatesFlag) { + continue; + } + if (ReplicateChunk(entry, 1, srvs, recoveryInfo, reason) > 0) { + mRebalanceCtrs.ReplicationStarted(); + MoveChunkBlockBack(cblk, mChunkToServerMap); + cblk.clear(); + if (mRebalanceReplicationsThresholdCount <= + mNumOngoingReplications) { + break; + } + } else { + mRebalanceCtrs.NoReplicationStarted(); + } + } + if (rescheduleFlag) { + mChunkReplicator.ScheduleNext(mRebalanceRunInterval / 1024); + } +} + +int +LayoutManager::LoadRebalancePlan(const string& planFn) +{ + if (mRebalancePlanFileName == planFn && + mRebalancePlan.is_open()) { + return 0; + } + mRebalancePlan.close(); + mRebalancePlanFileName = planFn; + if (mRebalancePlanFileName.empty()) { + return 0; + } + mRebalancePlan.open(mRebalancePlanFileName.c_str(), istream::in); + if (! mRebalancePlan) { + int err = errno; + KFS_LOG_STREAM_ERROR << "re-balance plan: " << + mRebalancePlanFileName << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + return (err > 0 ? 
-err : -EINVAL); + } + mRebalancePlan.setf(istream::hex); + mIsExecutingRebalancePlan = true; + mRebalanceCtrs.StartPlan(); + KFS_LOG_STREAM_INFO << + "start executing re-balance plan: " << + mRebalancePlanFileName << + KFS_LOG_EOM; + return 0; +} + +bool +LayoutManager::ReadRebalancePlan(size_t nread) +{ + if (! mRebalancePlan.is_open()) { + return false; + } + chunkId_t chunkId; + ServerLocation loc; + bool addedFlag = false; + size_t i; + for (i = 0; i < nread && ! mRebalancePlan.eof(); i++) { + if (! (mRebalancePlan >> chunkId >> loc)) { + break; + } + mRebalanceCtrs.PlanLine(); + Servers::const_iterator const it = find_if( + mChunkServers.begin(), mChunkServers.end(), + MatchingServer(loc) + ); + if (it == mChunkServers.end()) { + mRebalanceCtrs.PlanNoServer(); + continue; + } + (*it)->AddToChunksToMove(chunkId); + mRebalanceCtrs.PlanAdded(); + addedFlag = true; + } + if (nread <= i) { + return true; + } + if (mRebalancePlan.eof()) { + KFS_LOG_STREAM_INFO << + "finished loading re-balance plan" << + KFS_LOG_EOM; + } else { + KFS_LOG_STREAM_ERROR << + "invalid re-balance plan line: " << + mRebalanceCtrs.GetPlanLine() << + " terminating plan loading" << + KFS_LOG_EOM; + } + mRebalancePlan.close(); + return addedFlag; +} + +void +LayoutManager::ExecuteRebalancePlan() +{ + if (! 
mIsExecutingRebalancePlan || + mRebalanceReplicationsThresholdCount <= + mNumOngoingReplications) { + return; + } + size_t rem = 0; + int maxScan = mMaxRebalanceScan; + int nextTimeCheck = maxScan - 32; + int64_t maxTime = microseconds() + mMaxRebalanceRunTime; + for (Servers::const_iterator it = mChunkServers.begin(); + maxScan > 0 && it != mChunkServers.end(); + ++it) { + bool serverDownFlag = true; + rem += ExecuteRebalancePlan( + *it, + serverDownFlag, + maxScan, + maxTime, + nextTimeCheck); + if (serverDownFlag || + mRebalanceReplicationsThresholdCount <= + mNumOngoingReplications) { + maxScan = -1; + break; + } + } + if (maxScan <= 0) { + mChunkReplicator.ScheduleNext(mRebalanceRunInterval / 1024); + return; + } + if (mMaxRebalancePlanRead <= 0 && mRebalancePlan) { + KFS_LOG_STREAM_INFO << + "terminating loading re-balance plan " << + mRebalancePlanFileName << + KFS_LOG_EOM; + mRebalancePlan.close(); + } + if (rem >= mMaxRebalancePlanRead) { + return; + } + if (ReadRebalancePlan(mMaxRebalancePlanRead - rem)) { + return; + } + if (rem <= 0) { + KFS_LOG_STREAM_INFO << + "finished execution of rebalance plan: " << + mRebalancePlanFileName << + KFS_LOG_EOM; + mIsExecutingRebalancePlan = false; + } +} + +size_t +LayoutManager::ExecuteRebalancePlan( + const ChunkServerPtr& c, bool& serverDownFlag, int& maxScan, + int64_t maxTime, int& nextTimeCheck) +{ + serverDownFlag = false; + if (! mIsExecutingRebalancePlan || c->IsRetiring() || c->IsDown()) { + c->ClearChunksToMove(); + return 0; + } + ChunkIdSet& chunksToMove = const_cast(c->GetChunksToMove()); + if (c->GetSpaceUtilization(mUseFsTotalSpaceFlag) > + mMaxSpaceUtilizationThreshold) { + KFS_LOG_STREAM_INFO << + "terminating re-balance plan execution for" + " overloaded server " << c->ServerID() << + " chunks left: " << c->GetChunksToMove().Size() << + KFS_LOG_EOM; + c->ClearChunksToMove(); + return 0; + } + if (chunksToMove.IsEmpty() || ! 
IsCandidateServer(*c)) { + return chunksToMove.Size(); + } + + StTmp placementTmp(mChunkPlacementTmp); + StTmp serversTmp(mServersTmp); + StTmp candidatesTmp(mServers2Tmp); + Servers& candidates = candidatesTmp.Get(); + candidates.push_back(c); + + const ChunkRecoveryInfo recoveryInfo; + const chunkId_t* it = chunksToMove.Next(); + size_t curScan = chunksToMove.Size(); + while (maxScan > 0 && curScan > 0) { + if (c->GetNumChunkReplications() >= + mMaxConcurrentWriteReplicationsPerNode) { + mRebalanceCtrs.PlanNoDest(); + break; + } + if (maxScan <= nextTimeCheck) { + if (maxTime < microseconds()) { + maxScan = -1; + mRebalanceCtrs.PlanTimeout(); + break; + } + nextTimeCheck = maxScan - 32; + } + if (! (it = chunksToMove.Next())) { + const size_t sz = chunksToMove.Size(); + if (sz <= 0) { + break; + } + curScan = min(curScan, sz); + chunksToMove.First(); + continue; + } + curScan--; + maxScan--; + mRebalanceCtrs.PlanScanned(); + chunkId_t const cid = *it; + CSMap::Entry* const ci = mChunkToServerMap.Find(cid); + if (! ci) { + // Chunk got deleted from the time the plan was created. + c->MovingChunkDone(cid); + mRebalanceCtrs.PlanNoChunk(); + continue; + } + Servers& srvs = serversTmp.Get(); + mChunkToServerMap.GetServers(*ci, srvs); + bool foundFlag = false; + int srcCnt = 0; + for (Servers::const_iterator ci = srvs.begin(); + ci != srvs.end(); + ++ci) { + if (*ci == c) { + foundFlag = true; + break; + } + if ((*ci)->GetReplicationReadLoad() < + mMaxConcurrentReadReplicationsPerNode && + (*ci)->IsResponsiveServer()) { + srcCnt++; + } + } + if (foundFlag) { + c->MovingChunkDone(cid); // Already there. + continue; + } + if (srcCnt <= 0) { + mRebalanceCtrs.PlanNoSrc(); + continue; + } + int extraReplicas = 0; + ChunkPlacement& placement = placementTmp.Get(); + if (mChunkToServerMap.GetState(*ci) != + CSMap::Entry::kStateNone || + GetInFlightChunkModificationOpCount(cid) > 0 || + ! 
CanReplicateChunkNow( + *ci, extraReplicas, placement)) { + mRebalanceCtrs.PlanBusy(); + continue; + } + placement.clear(); + const bool kIncludeThisChunkFlag = false; + const bool kStopIfHasAnyReplicationsInFlight = true; + if (ci->GetFattr()->numReplicas <= 1 && + ! GetPlacementExcludes( + *ci, + placement, + kIncludeThisChunkFlag, + kStopIfHasAnyReplicationsInFlight)) { + mRebalanceCtrs.PlanBusyOther(); + continue; + } + if ((placement.IsServerExcluded(c) && + placement.GetExcludedServersCount() < + mChunkServers.size()) || + (placement.IsRackExcluded(c) && + placement.HasCandidateRacks())) { + // Chunk cannot be moved due to rack aware placement + // constraints. + c->MovingChunkDone(cid); + KFS_LOG_STREAM_INFO << + "cannot move" + " chunk: " << cid << + " to: " << c->GetServerLocation() << + " excluded: " << + " servers: " << + placement.GetExcludedServersCount() << + " racks: " << + placement.GetExcludedRacksCount() << + KFS_LOG_EOM; + mRebalanceCtrs.PlanCannotMove(); + continue; + } + if (ReplicateChunk(*ci, 1, candidates, recoveryInfo, + "re-balance plan") > 0) { + mRebalanceCtrs.PlanReplicationStarted(); + if (mRebalanceReplicationsThresholdCount <= + mNumOngoingReplications) { + break; + } + } else { + mRebalanceCtrs.PlanNoReplicationStarted(); + } + // Always use smart pointer copy here, instead of a reference, + // as reference might become invalid if the chunk server goes + // down as result of queuing replication op. + if (candidates.front()->IsDown()) { + serverDownFlag = true; + return 0; + } + } + return chunksToMove.Size(); +} + +void +LayoutManager::GetOpenFiles( + MetaOpenFiles::ReadInfo& openForRead, + MetaOpenFiles::WriteInfo& openForWrite) +{ + mChunkLeases.GetOpenFiles(openForRead, openForWrite, mChunkToServerMap); +} + +bool +LayoutManager::HasEnoughFreeBuffers(MetaRequest* /* req = 0*/) +{ + // This has to be re-entrant with req == 0. Racy check is OK though. 
+ return (GetFreeIoBufferByteCount() > + SyncAddAndFetch(mIoBufPending, int64_t(0)) + + mMinIoBufferBytesToProcessRequest); +} + +void +LayoutManager::SetUserAndGroupSelf(const MetaRequest& req, + kfsUid_t& user, kfsGid_t& group) +{ + const string& ip = req.clientIp; + if (ip.empty()) { + return; + } + if (ip == mLastUidGidRemap.mIp && + mLastUidGidRemap.mUser == user && + mLastUidGidRemap.mGroup == group) { + if (user != kKfsUserNone) { + user = mLastUidGidRemap.mToUser; + } + if (group != kKfsGroupNone) { + group = mLastUidGidRemap.mToGroup; + } + return; + } + mLastUidGidRemap.mIp = ip; + mLastUidGidRemap.mUser = user; + mLastUidGidRemap.mGroup = group; + for (HostUserGroupRemap::const_iterator + it = mHostUserGroupRemap.begin(); + it != mHostUserGroupRemap.end(); + ++it) { + if (! it->mHostPrefix.Match(ip)) { + continue; + } + if (user != kKfsUserNone) { + HostUserGroupMapEntry::UserMap::const_iterator + const ui = it->mUserMap.find(user); + if (ui != it->mUserMap.end()) { + user = ui->second; + } + } + if (group != kKfsGroupNone) { + HostUserGroupMapEntry::GroupMap::const_iterator + const gi = it->mGroupMap.find(user); + if (gi != it->mGroupMap.end()) { + group = gi->second; + } + } + break; + } + mLastUidGidRemap.mToUser = user; + mLastUidGidRemap.mToGroup = group; +} + +void +LayoutManager::CSMapUnitTest(const Properties& props) +{ + const char* const kUniteTestPropName = "metaServer.csmap.unittest"; + const int unitTestPropVal = props.getValue(kUniteTestPropName, 0); + if (unitTestPropVal == 0) { + return; + } + + if (mChunkToServerMap.Size() > 0 || + mChunkToServerMap.GetServerCount() > 0) { + KFS_LOG_STREAM_INFO << "not running CSMap unit test:" + " chunks: " << mChunkToServerMap.Size() << + " servers: " << mChunkToServerMap.GetServerCount() << + KFS_LOG_EOM; + return; + } + KFS_LOG_STREAM_WARN << "running CSMap unit test: " << + kUniteTestPropName << " = " << unitTestPropVal << + KFS_LOG_EOM; + + const chunkId_t kChunks = 1000; + const int kServers = 
100; + + mChunkToServerMap.SetDebugValidate(true); + MetaFattr* const fattr = MetaFattr::create(KFS_FILE, 1, 1, + kKfsUserRoot, kKfsGroupRoot, 0644); + chunkId_t cid; + for (cid = 1; cid <= kChunks; cid++) { + bool newEntryFlag = false; + if (! mChunkToServerMap.Insert( + fattr, (chunkOff_t)cid * CHUNKSIZE, cid, 1, + newEntryFlag) || ! newEntryFlag) { + panic("duplicate chunk id"); + break; + } + } + for (int i = 0; i < kServers; i++) { + mChunkServers.push_back(ChunkServerPtr( + new ChunkServer(NetConnectionPtr( + new NetConnection( + new TcpSocket(), 0))))); + if (! mChunkToServerMap.AddServer(mChunkServers.back())) { + panic("failed to add server"); + } + } + if (! mChunkToServerMap.RemoveServer(mChunkServers.front())) { + panic("failed to remove server"); + } + if (! mChunkToServerMap.RemoveServer(mChunkServers.back())) { + panic("failed to remove server"); + } + if (! mChunkToServerMap.AddServer(mChunkServers.front())) { + panic("failed to add server"); + } + if (! mChunkToServerMap.AddServer(mChunkServers.back())) { + panic("failed to add server"); + } + if (mChunkToServerMap.GetServerCount() != mChunkServers.size()) { + panic("server count don't match"); + } + Servers expected; + for (int i = 0; i < 4; i++) { + expected.push_back(mChunkServers[i]); + } + for (cid = 1; cid <= kChunks; cid++) { + CSMap::Entry* const cur = mChunkToServerMap.Find(cid); + if (! cur) { + panic("missing chunk entry"); + break; + } + CSMap::Entry& entry = *cur; + if (! mChunkToServerMap.AddServer(mChunkServers[5], entry)) { + panic("failed to add server to entry"); + break; + } + for (int i = 0; i < 5; i++) { + if (! mChunkToServerMap.AddServer( + mChunkServers[i], entry)) { + panic("failed to add server to entry"); + break; + } + } + if (! mChunkToServerMap.RemoveServer(mChunkServers[4], entry)) { + panic("failed to remove server to entry"); + break; + } + if (! 
mChunkToServerMap.RemoveServer(mChunkServers[5], entry)) { + panic("failed to remove server to entry"); + break; + } + if (mChunkToServerMap.GetServers(entry) != expected) { + panic("servers don't match"); + break; + } + mChunkToServerMap.SetServers(mChunkServers, entry); + if (mChunkToServerMap.GetServers(entry) != mChunkServers) { + panic("servers don't match"); + break; + } + if (! mChunkToServerMap.RemoveServer( + mChunkServers[10], entry)) { + panic("failed to remove server to entry"); + break; + } + if (cid % 3 == 0) { + continue; + } + for (int i = 0; i < kServers; i++) { + if (i == 10) { + continue; + } + if (! mChunkToServerMap.RemoveServer( + mChunkServers[i], entry)) { + panic("failed to remove server"); + } + } + if (mChunkToServerMap.ServerCount(entry) != 0) { + panic("invalid server count"); + } + } + cid = 1; + for (int i = 11; i < kServers && i < 30; i++) { + if (i == 10) { + continue; + } + if (! mChunkToServerMap.RemoveServer(mChunkServers[i])) { + panic("failed to remove server"); + } + if (mChunkToServerMap.RemoveServerCleanup(1)) { + KFS_LOG_STREAM_DEBUG << + "more cleanup " << i << + KFS_LOG_EOM; + } + if (! mChunkToServerMap.SetState(cid++, + CSMap::Entry::kStatePendingReplication)) { + panic("failed to move to pending replication"); + } + } + cid = 1000000; + vector idxs; + for (int i = 30; i < kServers && i < 60; i++) { + if (i == 10) { + continue; + } + size_t idx = 0; + if (! mChunkToServerMap.SetHibernated(mChunkServers[i], idx)) { + panic("failed to hibernate server"); + } + idxs.push_back(idx); + if (mChunkToServerMap.RemoveServerCleanup(1)) { + KFS_LOG_STREAM_DEBUG << + "hibernate more cleanup: " << i << + " server: " << idx << + KFS_LOG_EOM; + } + bool newEntryFlag = false; + cid++; + if (! mChunkToServerMap.Insert( + fattr, (chunkOff_t)cid * CHUNKSIZE, cid, 1, + newEntryFlag) || + ! newEntryFlag) { + panic("duplicate chunk id"); + } + if (! 
mChunkToServerMap.SetState(cid, + CSMap::Entry::kStateCheckReplication)) { + panic("failed to move into check replication"); + } + } + expected = mChunkToServerMap.GetServers(3); + for (chunkId_t cid = 1; cid <= kChunks; cid++) { + if (cid % 3 == 0) { + if (mChunkToServerMap.GetServers(cid) != expected) { + panic("invalid servers"); + } + } else { + if (mChunkToServerMap.HasServers(cid)) { + panic("invalid server count"); + } + } + } + expected.clear(); + if (mChunkToServerMap.GetHibernatedCount() != idxs.size()) { + panic("invalid hibernated servers count"); + } + for (size_t i = 0; i < idxs.size(); i++) { + if (! mChunkToServerMap.RemoveHibernatedServer(idxs[i])) { + panic("failed to remove hibernated server"); + } + } + if (mChunkToServerMap.GetHibernatedCount() != 0) { + panic("invalid hibernated servers count"); + } + while (mChunkToServerMap.RemoveServerCleanup(3)) { + KFS_LOG_STREAM_DEBUG << "final cleanup" << KFS_LOG_EOM; + } + KFS_LOG_STREAM_DEBUG << + "servers: " << mChunkToServerMap.GetServerCount() << + " replication: " << mChunkToServerMap.GetCount( + CSMap::Entry::kStateCheckReplication) << + " pending: " << mChunkToServerMap.GetCount( + CSMap::Entry::kStatePendingReplication) << + KFS_LOG_EOM; + mChunkToServerMap.RemoveServerCleanup(0); + mChunkToServerMap.Clear(); + for (int i = 0; i < kServers; i++) { + if (mChunkServers[i]->GetIndex() < 0) { + continue; + } + if (! 
mChunkToServerMap.RemoveServer(mChunkServers[i])) { + panic("failed to remove server"); + } + mChunkServers[i]->ForceDown(); + } + if (mChunkToServerMap.GetServerCount() != 0) { + panic("failed to remove all servers"); + } + if (CSMap::Entry::GetAllocBlockCount() != 0) { + panic("server list allocation leak"); + } + if (CSMap::Entry::GetAllocByteCount() != 0) { + panic("server list allocation byte count mismatch"); + } + mChunkServers.clear(); + fattr->destroy(); + + KFS_LOG_STREAM_WARN << "passed CSMap unit test" << + KFS_LOG_EOM; +} + +bool +LayoutManager::AddReplica(CSMap::Entry& ci, const ChunkServerPtr& s) +{ + return AddHosted(ci, s); +} + +void +LayoutManager::CheckChunkReplication(CSMap::Entry& entry) +{ + return CheckReplication(entry); +} + +ostream& +LayoutManager::RebalanceCtrs::Show( + ostream& os, const char* prefix, const char* suffix) +{ + const char* const pref = prefix ? prefix : " "; + const char* const suf = suffix ? suffix : " "; + os << + "RoundCount" << pref << mRoundCount << suf << + "NoSource" << pref << mNoSource << suf << + "ServerNeeded" << pref << mServerNeeded << suf << + "NoServerFound" << pref << mNoServerFound << suf << + "RackNeeded" << pref << mRackNeeded << suf << + "NoRackFound" << pref << mNoRackFound << suf << + "NonLoadedServerNeeded" << pref << mNonLoadedServerNeeded << suf << + "NoNonLoadedServerFound" << pref << mNoNonLoadedServerFound << suf << + "Ok" << pref << mOk << suf << + "Scanned" << pref << mScanned << suf << + "Busy" << pref << mBusy << suf << + "BusyOther" << pref << mBusyOther << suf << + "ReplicationStarted" << pref << mReplicationStarted << suf << + "NoReplicationStarted" << pref << mNoReplicationStarted << suf << + "ScanTimeout" << pref << mScanTimeout << suf << + "TotalNoSource" << pref << mTotalNoSource << suf << + "TotalServerNeeded" << pref << mTotalServerNeeded << suf << + "TotalNoServerFound" << pref << mTotalNoServerFound << suf << + "TotalRackNeeded" << pref << mTotalRackNeeded << suf << + 
"TotalNoRackFound" << pref << mTotalNoRackFound << suf << + "TotalNonLoadedServerNeeded" << pref << mTotalNonLoadedServerNeeded << suf << + "TotalNoNonLoadedServerFound" << pref << mTotalNoNonLoadedServerFound << suf << + "TotalOk" << pref << mTotalOk << suf << + "TotalScanned" << pref << mTotalScanned << suf << + "TotalBusy" << pref << mTotalBusy << suf << + "TotalBusyOther" << pref << mTotalBusyOther << suf << + "TotalReplicationStarted" << pref << mTotalReplicationStarted << suf << + "TotalNoReplicationStarted" << pref << mTotalNoReplicationStarted << suf << + "TotalScanTimeout" << pref << mTotalScanTimeout << suf << + "Plan" << pref << mPlan << suf << + "PlanNoDest" << pref << mPlanNoDest << suf << + "PlanTimeout" << pref << mPlanTimeout << suf << + "PlanScanned" << pref << mPlanScanned << suf << + "PlanNoChunk" << pref << mPlanNoChunk << suf << + "PlanNoSrc" << pref << mPlanNoSrc << suf << + "PlanBusy" << pref << mPlanBusy << suf << + "PlanBusyOther" << pref << mPlanBusyOther << suf << + "PlanCannotMove" << pref << mPlanCannotMove << suf << + "PlanReplicationStarted" << pref << mPlanReplicationStarted << suf << + "PlanNoReplicationStarted" << pref << mPlanNoReplicationStarted << suf << + "PlanLine" << pref << mPlanLine << suf << + "PlanNoServer" << pref << mPlanNoServer << suf << + "PlanAdded" << pref << mPlanAdded << suf << + "TotalPlanNoDest" << pref << mTotalPlanNoDest << suf << + "TotalPlanTimeout" << pref << mTotalPlanTimeout << suf << + "TotalPlanScanned" << pref << mTotalPlanScanned << suf << + "TotalPlanNoChunk" << pref << mTotalPlanNoChunk << suf << + "TotalPlanNoSrc" << pref << mTotalPlanNoSrc << suf << + "TotalPlanBusy" << pref << mTotalPlanBusy << suf << + "TotalPlanBusyOther" << pref << mTotalPlanBusyOther << suf << + "TotalPlanCannotMove" << pref << mTotalPlanCannotMove << suf << + "TotalPlanReplicationStarted" << pref << mTotalPlanReplicationStarted << suf << + "TotalPlanNoReplicationStarted" << pref << mTotalPlanNoReplicationStarted << suf << + 
"TotalPlanLine" << pref << mTotalPlanLine << suf << + "TotalPlanNoServer" << pref << mTotalPlanNoServer << suf << + "TotalPlanAdded" << pref << mTotalPlanAdded << suf + ; + return os; +} + +} // namespace KFS diff --git a/src/cc/meta/LayoutManager.h b/src/cc/meta/LayoutManager.h new file mode 100644 index 000000000..649f0a670 --- /dev/null +++ b/src/cc/meta/LayoutManager.h @@ -0,0 +1,1918 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/06 +// Author: Sriram Rao +// Mike Ovsiannikov +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file LayoutManager.h +// \brief Layout manager is responsible for laying out chunks on chunk +// servers. Model is that, when a chunkserver connects to the meta +// server, the layout manager gets notified; the layout manager then +// uses the chunk server for data placement. 
+// +//---------------------------------------------------------------------------- + +#ifndef META_LAYOUTMANAGER_H +#define META_LAYOUTMANAGER_H + +#include "kfstypes.h" +#include "meta.h" +#include "ChunkServer.h" + +#include "kfsio/Counter.h" +#include "common/Properties.h" +#include "common/StdAllocator.h" +#include "common/kfsatomic.h" +#include "common/StTmp.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/ITimeout.h" +#include "kfsio/event.h" +#include "kfsio/Globals.h" +#include "MetaRequest.h" +#include "CSMap.h" +#include "ChunkPlacement.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +class QCIoBufferPool; +namespace KFS +{ +using std::string; +using std::map; +using std::vector; +using std::pair; +using std::list; +using std::set; +using std::make_pair; +using std::less; +using std::equal_to; +using std::deque; +using std::ostream; +using std::ostringstream; +using std::find; +using std::ifstream; +using libkfsio::globalNetManager; + +/// Model for leases: metaserver assigns write leases to chunkservers; +/// clients/chunkservers can grab read lease on a chunk at any time. +/// The server will typically renew unexpired leases whenever asked. +/// As long as the lease is valid, server promises not to chnage +/// the lease's version # (also, chunk won't disappear as long as +/// lease is valid). 
+class ARAChunkCache;
+class ChunkLeases
+{
+public:
+ typedef int64_t LeaseId;
+ struct ReadLease
+ {
+ ReadLease(LeaseId id = -1, time_t exp = 0)
+ : leaseId(id),
+ expires(exp)
+ {}
+ ReadLease(const ReadLease& lease)
+ : leaseId(lease.leaseId),
+ expires(lease.expires)
+ {}
+ ReadLease& operator=(const ReadLease& lease)
+ {
+ Mutable(leaseId) = lease.leaseId;
+ // Copy the expiration time. This previously read
+ // "expires = leaseId", assigning the lease id into
+ // the expiration and corrupting lease timeouts on
+ // assignment.
+ expires = lease.expires;
+ return *this;
+ }
+ const LeaseId leaseId;
+ time_t expires;
+ // Restored template header / cast argument lost in transit.
+ template<typename T> static T& Mutable(const T& val)
+ { return const_cast<T&>(val); }
+ };
+ struct WriteLease : public ReadLease
+ {
+ WriteLease(
+ LeaseId i,
+ seq_t cvers,
+ const ChunkServerPtr& c,
+ const string& p,
+ bool append,
+ bool stripedFile,
+ const MetaAllocate* alloc,
+ time_t exp)
+ : ReadLease(i, exp),
+ chunkVersion(cvers),
+ chunkServer(c),
+ pathname(p),
+ appendFlag(append),
+ stripedFileFlag(stripedFile),
+ relinquishedFlag(false),
+ ownerWasDownFlag(false),
+ allocInFlight(alloc)
+ {}
+ WriteLease(const WriteLease& lease)
+ : ReadLease(lease),
+ chunkVersion(lease.chunkVersion),
+ chunkServer(lease.chunkServer),
+ pathname(lease.pathname),
+ appendFlag(lease.appendFlag),
+ stripedFileFlag(lease.stripedFileFlag),
+ relinquishedFlag(lease.relinquishedFlag),
+ ownerWasDownFlag(lease.ownerWasDownFlag),
+ allocInFlight(lease.allocInFlight)
+ {}
+ WriteLease& operator=(const WriteLease& lease)
+ {
+ ReadLease::operator=(lease);
+ Mutable(chunkVersion) = lease.chunkVersion;
+ Mutable(chunkServer) = lease.chunkServer;
+ Mutable(pathname) = lease.pathname;
+ Mutable(appendFlag) = lease.appendFlag;
+ Mutable(stripedFileFlag) = lease.stripedFileFlag;
+ relinquishedFlag = lease.relinquishedFlag;
+ ownerWasDownFlag = lease.ownerWasDownFlag;
+ allocInFlight = lease.allocInFlight;
+ return *this;
+ }
+ const seq_t chunkVersion;
+ const ChunkServerPtr chunkServer;
+ // record the pathname; we can use the path to traverse
+ // the dir. 
tree and update space used at each level of + // the tree + const string pathname; + const bool appendFlag:1; + const bool stripedFileFlag:1; + bool relinquishedFlag:1; + bool ownerWasDownFlag:1; + const MetaAllocate* allocInFlight; + }; + + ChunkLeases(); + + inline const WriteLease* GetWriteLease( + chunkId_t chunkId) const; + inline const WriteLease* GetValidWriteLease( + chunkId_t chunkId) const; + inline const WriteLease* RenewValidWriteLease( + chunkId_t chunkId); + inline bool HasValidWriteLease( + chunkId_t chunkId) const; + inline bool HasValidLease( + chunkId_t chunkId) const; + inline bool HasWriteLease( + chunkId_t chunkId) const; + inline bool HasLease( + chunkId_t chunkId) const; + inline int ReplicaLost( + chunkId_t chunkId, + const ChunkServer* chunkServer); + inline bool NewReadLease( + chunkId_t chunkId, + time_t expires, + LeaseId& leaseId); + inline bool NewWriteLease( + chunkId_t chunkId, + seq_t chunkVersion, + time_t expires, + const ChunkServerPtr& server, + const string& path, + bool append, + bool stripedFileFlag, + const MetaAllocate* allocInFlight, + LeaseId& leaseId); + inline bool DeleteWriteLease( + chunkId_t chunkId, + LeaseId leaseId); + inline int Renew( + chunkId_t chunkId, + LeaseId leaseId, + bool allocDoneFlag = false); + inline bool Delete(chunkId_t chunkId); + inline bool ExpiredCleanup( + chunkId_t chunkId, + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap); + inline const char* FlushWriteLease( + chunkId_t chunkId, + ARAChunkCache& arac, + CSMap& csmap); + inline bool Timer( + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap); + inline int LeaseRelinquish( + const MetaLeaseRelinquish& req, + ARAChunkCache& arac, + CSMap& csmap); + inline void SetMaxLeaseId( + LeaseId id); + inline void GetOpenFiles( + MetaOpenFiles::ReadInfo& openForRead, + MetaOpenFiles::WriteInfo& openForWrite, + const CSMap& csmap) const; + inline void ServerDown( + const ChunkServerPtr& 
chunkServer, + ARAChunkCache& arac, + CSMap& csmap); + inline bool UpdateReadLeaseReplicationCheck( + chunkId_t chunkId, + bool setScheduleReplicationCheckFlag); +private: + typedef list< + ReadLease, + StdFastAllocator + > ChunkReadLeases; + struct ChunkReadLeasesHead + { + ChunkReadLeasesHead() + : mLeases(), + mScheduleReplicationCheckFlag(false) + {} + ChunkReadLeases mLeases; + bool mScheduleReplicationCheckFlag; + }; + typedef std::tr1::unordered_map < + chunkId_t, + ChunkReadLeasesHead, + std::tr1::hash, + equal_to, + StdFastAllocator< + pair + > + > ReadLeases; + typedef std::tr1::unordered_map < + chunkId_t, + WriteLease, + std::tr1::hash, + equal_to, + StdFastAllocator< + pair + > + > WriteLeases; + /// A rolling counter for tracking leases that are issued to + /// to clients/chunkservers for reading/writing chunks + LeaseId mLeaseId; + ReadLeases mReadLeases; + WriteLeases mWriteLeases; + WriteLeases::iterator mCurWrIt; + bool mTimerRunningFlag; + + inline bool ExpiredCleanup( + ReadLeases::iterator it, + time_t now); + inline bool ExpiredCleanup( + WriteLeases::iterator it, + time_t now, + int ownerDownExpireDelay, + ARAChunkCache& arac, + CSMap& csmap); + inline int ReplicaLost( + ChunkLeases::WriteLease& wl, + const ChunkServer* chunkServer); + inline void Erase( + WriteLeases::iterator it); + inline void Erase( + ReadLeases::iterator it); + inline bool IsReadLease( + LeaseId leaseId); + inline bool IsWriteLease( + LeaseId leaseId); + inline LeaseId NewReadLeaseId(); + inline LeaseId NewWriteLeaseId(); +}; + +// Chunks are made stable by a message from the metaserver -> +// chunkservers. To prevent clients from seeing non-stable chunks, the +// metaserver delays issuing a read lease to a client if there is a make +// stable message in-flight. Whenever the metaserver receives acks for +// the make stable message, it needs to update its state. This structure +// tracks the # of messages sent out and how many have been ack'ed. 
+// When all the messages have been ack'ed the entry for a particular +// chunk can be cleaned up. +struct MakeChunkStableInfo +{ + MakeChunkStableInfo( + int nServers = 0, + bool beginMakeStable = false, + string name = string(), + seq_t cvers = -1, + bool stripedFile = false, + bool updateMTime = false) + : beginMakeStableFlag(beginMakeStable), + logMakeChunkStableFlag(false), + serverAddedFlag(false), + stripedFileFlag(stripedFile), + updateMTimeFlag(updateMTime), + numServers(nServers), + numAckMsg(0), + pathname(name), + chunkChecksum(0), + chunkSize(-1), + chunkVersion(cvers) + {} + bool beginMakeStableFlag:1; + bool logMakeChunkStableFlag:1; + bool serverAddedFlag:1; + const bool stripedFileFlag:1; + const bool updateMTimeFlag:1; + int numServers; + int numAckMsg; + const string pathname; + uint32_t chunkChecksum; + chunkOff_t chunkSize; + seq_t chunkVersion; +}; +typedef map , + StdFastAllocator< + pair > +> NonStableChunksMap; + +typedef map , + StdFastAllocator< + pair > +> PendingBeginMakeStable; + +// Pending make stable -- chunks with no replicas at the moment. +// Persistent across restarts -- serialized onto transaction log and +// checkpoint. See make stable protocol description in LayoutManager.cc +struct PendingMakeStableEntry +{ + PendingMakeStableEntry( + chunkOff_t size = -1, + bool hasChecksum = false, + uint32_t checksum = 0, + seq_t version = -1) + : mSize(size), + mHasChecksum(hasChecksum), + mChecksum(checksum), + mChunkVersion(version) + {} + chunkOff_t mSize; + bool mHasChecksum; + uint32_t mChecksum; + seq_t mChunkVersion; +}; +typedef map , + StdFastAllocator< + pair > +> PendingMakeStableMap; + +// "Rack" (failure group) state aggregation for rack aware replica placement. 
+class RackInfo
+{
+public:
+ typedef ChunkServer::RackId RackId;
+ typedef double RackWeight;
+ typedef CSMap::Servers Servers;
+
+ RackInfo(RackId id,
+ RackWeight weight,
+ const ChunkServerPtr& server)
+ : mRackId(id),
+ mPossibleCandidatesCount(0),
+ // Honor the caller-supplied weight; it was previously
+ // ignored and hard coded to 1.0, so configured rack
+ // weights never took effect at construction.
+ mRackWeight(weight),
+ mServers()
+ { RackInfo::addServer(server); }
+ RackId id() const {
+ return mRackId;
+ }
+ void addServer(const ChunkServerPtr& server) {
+ mServers.push_back(server);
+ }
+ void removeServer(const ChunkServerPtr& server) {
+ Servers::iterator const iter = find(
+ mServers.begin(), mServers.end(), server);
+ if (iter != mServers.end()) {
+ mServers.erase(iter);
+ }
+ }
+ const Servers& getServers() const {
+ return mServers;
+ }
+ int getPossibleCandidatesCount() const {
+ return mPossibleCandidatesCount;
+ }
+ void updatePossibleCandidatesCount(int delta) {
+ mPossibleCandidatesCount += delta;
+ assert(mPossibleCandidatesCount >= 0);
+ }
+ RackWeight getWeight() const {
+ return mRackWeight;
+ }
+ void setWeight(RackWeight weight) {
+ mRackWeight = weight;
+ }
+ int64_t getWeightedPossibleCandidatesCount() const {
+ return (int64_t)(mRackWeight * mPossibleCandidatesCount);
+ }
+private:
+ RackId mRackId;
+ int mPossibleCandidatesCount;
+ RackWeight mRackWeight;
+ Servers mServers;
+};
+
+// Template arguments restored (stripped in transit): maps chunk id to
+// the chunk version to roll back to.
+typedef map<
+ chunkId_t,
+ seq_t,
+ less<chunkId_t>,
+ StdFastAllocator<
+ pair<const chunkId_t, seq_t>
+ >
+> ChunkVersionRollBack;
+
+//
+// For maintenance reasons, we'd like to schedule downtime for a server.
+// When the server is taken down, a promise is made---the server will go
+// down now and come back up by a specified time. During this window, we
+// are willing to tolerate reduced # of copies for a block. Now, if the
+// server doesn't come up by the promised time, the metaserver will
+// initiate re-replication of blocks on that node. This ability allows
+// us to schedule downtime on a node without having to incur the
+// overhead of re-replication. 
+// +struct HibernatingServerInfo_t +{ + HibernatingServerInfo_t() + : location(), + sleepEndTime(), + csmapIdx(~size_t(0)) + {} + bool IsHibernated() const { return (csmapIdx != ~size_t(0)) ; } + // the server we put in hibernation + ServerLocation location; + // when is it likely to wake up + time_t sleepEndTime; + // CSMap server index to remove hibernated server. + size_t csmapIdx; +}; +typedef vector< + HibernatingServerInfo_t, + StdAllocator +> HibernatedServerInfos; + +// Atomic record append (write append) chunk allocation cache. +// The cache includes completed and in-flight chunk allocation requests. +// The cache has single entry per file. +// The cache is used to "allocate" chunks on behalf of multiple clients, and +// then "broadcast" the result of the allocation. +class ARAChunkCache +{ +public: + struct Entry { + Entry( + chunkId_t cid = -1, + seq_t cv = -1, + chunkOff_t co = -1, + time_t now = 0, + MetaAllocate* req = 0, + Permissions perms = Permissions()) + : chunkId(cid), + chunkVersion(cv), + offset(co), + lastAccessedTime(now), + lastDecayTime(now), + spaceReservationSize(0), + numAppendersInChunk(0), + master(req ? 
req->master : ChunkServerPtr()), + permissions(perms), + lastPendingRequest(req), + responseStr() + {} + bool AddPending(MetaAllocate& req); + bool IsAllocationPending() const { + return (lastPendingRequest != 0); + } + // index into chunk->server map to work out where the block lives + chunkId_t chunkId; + seq_t chunkVersion; + // the file offset corresponding to the last chunk + chunkOff_t offset; + // when was this info last accessed; use this to cleanup + time_t lastAccessedTime; + time_t lastDecayTime; + // chunk space reservation approximation + int spaceReservationSize; + // # of appenders to which this chunk was used for allocation + int numAppendersInChunk; + ChunkServerPtr master; + Permissions permissions; + private: + MetaAllocate* lastPendingRequest; + string responseStr; + friend class ARAChunkCache; + }; + typedef map , + StdFastAllocator< + pair > + > Map; + typedef Map::const_iterator const_iterator; + typedef Map::iterator iterator; + + ARAChunkCache() + : mMap() + {} + ~ARAChunkCache() + { mMap.clear(); } + void RequestNew(MetaAllocate& req); + void RequestDone(const MetaAllocate& req); + void Timeout(time_t now); + inline bool Invalidate(fid_t fid); + inline bool Invalidate(fid_t fid, chunkId_t chunkId); + inline bool Invalidate(iterator it); + iterator Find(fid_t fid) { + return mMap.find(fid); + } + const_iterator Find(fid_t fid) const { + return mMap.find(fid); + } + const Entry* Get(const_iterator it) const { + return (it == mMap.end() ? 0 : &it->second); + } + Entry* Get(iterator it) { + return (it == mMap.end() ? 0 : &it->second); + } + const Entry* Get(fid_t fid) const { + return Get(Find(fid)); + } + Entry* Get(fid_t fid) { + return Get(Find(fid)); + } + size_t GetSize() const { + return mMap.size(); + } +private: + Map mMap; +}; + +// Run operation on a timer. 
+// Template parameter list restored (stripped in transit): OPTYPE is
+// the meta request type submitted on each timer tick.
+template<typename OPTYPE>
+class PeriodicOp : public KfsCallbackObj, public ITimeout
+{
+public:
+ PeriodicOp(int intervalMs)
+ : mInProgress(false),
+ mCmdIntervalMs(intervalMs),
+ mOp(1, this) {
+ SET_HANDLER(this, &PeriodicOp::HandleEvent);
+ SetTimeoutInterval(mCmdIntervalMs);
+ globalNetManager().RegisterTimeoutHandler(this);
+ }
+ virtual ~PeriodicOp() {
+ assert(! mInProgress);
+ globalNetManager().UnRegisterTimeoutHandler(this);
+ }
+ void SetTimeoutInterval(int ms) {
+ mCmdIntervalMs = ms;
+ ITimeout::SetTimeoutInterval(mCmdIntervalMs);
+ }
+ int GetTimeoutInterval() const {
+ return mCmdIntervalMs;
+ }
+ int HandleEvent(int code, void *data) {
+ assert(mInProgress && code == EVENT_CMD_DONE && data == &mOp);
+ mInProgress = false;
+ return 0;
+ }
+ virtual void Timeout() {
+ if (mInProgress) {
+ return;
+ }
+ mOp.opSeqno++;
+ mInProgress = true;
+ ITimeout::SetTimeoutInterval(mCmdIntervalMs);
+ submit_request(&mOp);
+ }
+ OPTYPE& GetOp() {
+ return mOp;
+ }
+ void ScheduleNext(int ms = 0) {
+ const int intervalMs = max(0, ms);
+ if (mCmdIntervalMs < intervalMs) {
+ return;
+ }
+ ITimeout::SetTimeoutInterval(intervalMs);
+ if (intervalMs <= 0) {
+ globalNetManager().Wakeup();
+ }
+ }
+private:
+ /// If op is in progress, skip a send
+ bool mInProgress;
+ int mCmdIntervalMs;
+ /// The op for checking
+ OPTYPE mOp;
+private:
+ PeriodicOp(const PeriodicOp&);
+ PeriodicOp& operator=(const PeriodicOp&);
+};
+
+// Chunk recovery state information. 
+struct ChunkRecoveryInfo +{ +ChunkRecoveryInfo() + : offset(-1), + version(-1), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + numStripes(0), + numRecoveryStripes(0), + stripeSize(0), + fileSize(-1) + {} + bool HasRecovery() const + { return (numRecoveryStripes > 0); } + void Clear() + { + striperType = KFS_STRIPED_FILE_TYPE_NONE; + offset = -1; + version = -1; + numStripes = 0; + numRecoveryStripes = 0; + stripeSize = 0; + fileSize = -1; + } + + chunkOff_t offset; + seq_t version; + int16_t striperType; + int16_t numStripes; + int16_t numRecoveryStripes; + int32_t stripeSize; + chunkOff_t fileSize; +}; + +/// +/// LayoutManager is responsible for chunk allocation, re-replication, recovery, +/// and space re-balancing. +/// +/// Allocating space for a chunk is a 3-way communication: +/// 1. Client sends a request to the meta server for +/// allocation +/// 2. Meta server picks a chunkserver to hold the chunk and +/// then sends an RPC to that chunkserver to create a chunk. +/// 3. The chunkserver creates a chunk and replies to the +/// meta server's RPC. +/// 4. Finally, the metaserver logs the allocation request +/// and then replies to the client. +/// +/// In this model, the layout manager picks the chunkserver +/// location and queues the RPC to the chunkserver. +/// +class LayoutManager : public ITimeout +{ +public: + typedef CSMap::Servers Servers; + typedef ChunkServer::ChunkIdSet ChunkIdSet; + typedef RackInfo::RackId RackId; + + LayoutManager(); + + virtual ~LayoutManager(); + + void Shutdown(); + + /// A new chunk server has joined and sent a HELLO message. + /// Use it to configure information about that server + /// @param[in] r The MetaHello request sent by the + /// new chunk server. + void AddNewServer(MetaHello *r); + + /// Our connection to a chunkserver went down. So, + /// for all chunks hosted on this server, update the + /// mapping table to indicate that we can't + /// get to the data. 
+ /// @param[in] server The server that is down + void ServerDown(const ChunkServerPtr& server); + + /// A server is being taken down: if downtime is > 0, it is a + /// value in seconds that specifies the time interval within + /// which the server will connect back. If it doesn't connect + /// within that interval, the server is assumed to be down and + /// re-replication will start. + int RetireServer(const ServerLocation &loc, int downtime); + + /// Allocate space to hold a chunk on some + /// chunkserver. + /// @param[in] r The request associated with the + /// write-allocation call. + /// @retval 0 on success; -1 on failure + int AllocateChunk(MetaAllocate *r, const vector& chunkBlock); + + bool IsAllocationAllowed(MetaAllocate* req); + + /// When allocating a chunk for append, we try to re-use an + /// existing chunk for a which a valid write lease exists. + /// @param[in/out] r The request associated with the + /// write-allocation call. When an existing chunk is re-used, + /// the chunkid/version is returned back to the caller. + /// @retval 0 on success; -1 on failure + int AllocateChunkForAppend(MetaAllocate *r); + + void ChangeChunkFid(MetaFattr* srcFattr, MetaFattr* dstFattr, + MetaChunkInfo* chunk); + + /// A chunkid has been previously allocated. The caller + /// is trying to grab the write lease on the chunk. If a valid + /// lease exists, we return it; otherwise, we assign a new lease, + /// bump the version # for the chunk and notify the caller. + /// + /// @param[in] r The request associated with the + /// write-allocation call. + /// @param[out] isNewLease True if a new lease has been + /// issued, which tells the caller that a version # bump + /// for the chunk has been done. + /// @retval status code + int GetChunkWriteLease(MetaAllocate *r, bool &isNewLease); + + /// Delete a chunk on the server that holds it. 
+ /// @param[in] chunkId The id of the chunk being deleted + void DeleteChunk(CSMap::Entry& entry); + void DeleteChunk(MetaAllocate *req); + bool InvalidateAllChunkReplicas(fid_t fid, chunkOff_t offset, + chunkId_t chunkId, seq_t& chunkVersion); + + /// A chunkserver is notifying us that a chunk it has is + /// corrupt; so update our tables to reflect that the chunk isn't + /// hosted on that chunkserver any more; re-replication will take + /// care of recovering that chunk. + /// @param[in] r The request that describes the corrupted chunk + void ChunkCorrupt(MetaChunkCorrupt *r); + void ChunkCorrupt(chunkId_t chunkId, const ChunkServerPtr& server, + bool notifyStale = true); + void ChunkEvacuate(MetaChunkEvacuate* r); + /// Handlers to acquire and renew leases. Unexpired leases + /// will typically be renewed. + int GetChunkReadLeases(MetaLeaseAcquire& req); + int GetChunkReadLease(MetaLeaseAcquire *r); + int LeaseRenew(MetaLeaseRenew *r); + + /// Handler to let a lease owner relinquish a lease. + int LeaseRelinquish(MetaLeaseRelinquish *r); + + bool Validate(MetaAllocate* r); + void CommitOrRollBackChunkVersion(MetaAllocate* op); + + /// Is a valid lease issued on any of the chunks in the + /// vector of MetaChunkInfo's? 
+ bool IsValidLeaseIssued(const vector &c); + + void MakeChunkStableInit( + const CSMap::Entry& entry, + seq_t chunkVersion, + string pathname, + bool beginMakeStableFlag, + chunkOff_t chunkSize, + bool hasChunkChecksum, + uint32_t chunkChecksum, + bool stripedFileFlag, + bool appendFlag, + bool leaseRelinquishFlag); + bool AddServerToMakeStable( + CSMap::Entry& placementInfo, + ChunkServerPtr server, + chunkId_t chunkId, + seq_t chunkVersion, + const char*& errMsg); + void BeginMakeChunkStableDone(const MetaBeginMakeChunkStable* req); + void LogMakeChunkStableDone(const MetaLogMakeChunkStable* req); + void MakeChunkStableDone(const MetaChunkMakeStable* req); + void ReplayPendingMakeStable( + chunkId_t chunkId, + seq_t chunkVersion, + chunkOff_t chunkSize, + bool hasChunkChecksum, + uint32_t chunkChecksum, + bool addFlag); + bool ReplayBeginChangeChunkVersion( + fid_t fid, + chunkId_t chunkId, + seq_t chunkVersion); + int WritePendingChunkVersionChange(ostream& os) const; + int WritePendingMakeStable(ostream& os) const; + void CancelPendingMakeStable(fid_t fid, chunkId_t chunkId); + int GetChunkSizeDone(MetaChunkSize* req); + bool IsChunkStable(chunkId_t chunkId); + const char* AddNotStableChunk( + const ChunkServerPtr& server, + fid_t allocFileId, + chunkId_t chunkId, + seq_t chunkVersion, + bool appendFlag, + const string& logPrefix); + void ProcessPendingBeginMakeStable(); + + /// Add a mapping from chunkId -> server. + /// @param[in] chunkId chunkId that has been stored + /// on server c + /// @param[in] fid fileId associated with this chunk. + MetaChunkInfo* AddChunkToServerMapping(MetaFattr* fattr, + chunkOff_t offset, chunkId_t chunkId, seq_t chunkVersion, + bool& newEntryFlag); + + /// Update the mapping from chunkId -> server. + /// @param[in] chunkId chunkId that has been stored + /// on server c + /// @param[in] c server that stores chunk chunkId. 
+ /// @retval 0 if update is successful; -1 otherwise + /// Update will fail if chunkId is not present in the + /// chunkId -> server mapping table. + int UpdateChunkToServerMapping(chunkId_t chunkId, const ChunkServerPtr& s); + + /// Get the mapping from chunkId -> server. + /// @param[in] chunkId chunkId that has been stored + /// on some server(s) + /// @param[out] c server(s) that stores chunk chunkId + /// @retval 0 if a mapping was found; -1 otherwise + /// + int GetChunkToServerMapping(MetaChunkInfo& chunkInfo, Servers &c, + MetaFattr*& fa, bool* orderReplicasFlag = 0); + + /// Get the mapping from chunkId -> file id. + /// @param[in] chunkId chunkId + /// @param[out] fileId file id the chunk belongs to + /// @retval true if a mapping was found; false otherwise + /// + bool GetChunkFileId(chunkId_t chunkId, fid_t& fileId, + const MetaChunkInfo** chunkInfo = 0, const MetaFattr** fa = 0, + LayoutManager::Servers* srvs = 0); + + /// Dump out the chunk location map to a file. The file is + /// written to the specified dir. The filename: + /// /chunkmap.txt. + /// + void DumpChunkToServerMap(const string &dir); + + /// Dump out the chunk location map to a string stream. + void DumpChunkToServerMap(ostream &os); + + /// Dump out the list of chunks that are currently replication + /// candidates. + void DumpChunkReplicationCandidates(MetaDumpChunkReplicationCandidates* op); + + /// Check the replication level of all the blocks and report + /// back files that are under-replicated. + /// Returns true if the system is healthy. + int FsckStreamCount(bool reportAbandonedFilesFlag) const; + void Fsck(ostream** os, bool reportAbandonedFilesFlag); + + /// For monitoring purposes, dump out state of all the + /// connected chunk servers. + void Ping(IOBuffer& buf, bool wormModeFlag); + + /// Return a list of alive chunk servers + void UpServers(ostream &os); + + /// Periodically, walk the table of chunk -> [location, lease] + /// and remove out dead leases. 
+ void LeaseCleanup(); + + /// Periodically, re-check the replication level of all chunks + /// the system; this call initiates the checking work, which + /// gets done over time. + void InitCheckAllChunks(); + + /// Is an expensive call; use sparingly + void CheckAllLeases(); + + /// Cleanup the lease for a particular chunk + /// @param[in] chunkId the chunk for which leases need to be cleaned up + /// @param[in] v the placement/lease info for the chunk + void LeaseCleanup(chunkId_t chunkId, CSMap::Entry &v); + bool ExpiredLeaseCleanup(chunkId_t chunkId); + + /// Handler that loops thru the chunk->location map and determines + /// if there are sufficient copies of each chunk. Those chunks with + /// fewer copies are (re) replicated. + void ChunkReplicationChecker(); + + /// A set of nodes have been put in hibernation by an admin. + /// This is done for scheduled downtime. During this period, we + /// don't want to pro-actively replicate data on the down nodes; + /// if the node doesn't come back as promised, we then start + /// re-replication. Periodically, check the status of + /// hibernating nodes. + void CheckHibernatingServersStatus(); + + /// A chunk replication operation finished. If the op was successful, + /// then, we update the chunk->location map to record the presence + /// of a new replica. + /// @param[in] req The op that we sent to a chunk server asking + /// it to do the replication. + void ChunkReplicationDone(MetaChunkReplicate *req); + + /// Degree of replication for chunk has changed. When the replication + /// checker runs, have it check the status for this chunk. + /// @param[in] chunkId chunk whose replication level needs checking + /// + void ChangeChunkReplication(chunkId_t chunkId); + + /// Get all the fid's for which there is an open lease (read/write). + /// This is useful for reporting purposes. 
+ /// @param[out] openForRead, openForWrite: the pathnames of files + /// that are open for reading/writing respectively + void GetOpenFiles( + MetaOpenFiles::ReadInfo& openForRead, + MetaOpenFiles::WriteInfo& openForWrite); + + void InitRecoveryStartTime() { + mRecoveryStartTime = time(0); + } + + void SetMinChunkserversToExitRecovery(uint32_t n) { + mMinChunkserversToExitRecovery = n; + } + + void ToggleRebalancing(bool v) { + mIsRebalancingEnabled = v; + } + + /// Methods for doing "planned" rebalancing of data. + /// Read in the file that lays out the plan + /// Return 0 if we can open the file; -1 otherwise + int LoadRebalancePlan(const string& planFn); + + /// Execute the plan for all servers + void ExecuteRebalancePlan(); + + /// Execute planned rebalance for server c + size_t ExecuteRebalancePlan( + const ChunkServerPtr& c, + bool& serverDownFlag, + int& maxScan, + int64_t maxTime, + int& nextTimeCheck); + + void SetParameters(const Properties& props, int clientPort = -1); + void SetChunkServersProperties(const Properties& props); + + void GetChunkServerCounters(IOBuffer& buf); + + void AllocateChunkForAppendDone(MetaAllocate& req) { + mARAChunkCache.RequestDone(req); + } + + uint32_t GetConcurrentWritesPerNodeWatermark() const { + return mConcurrentWritesPerNodeWatermark; + } + double GetMaxSpaceUtilizationThreshold() const { + return mMaxSpaceUtilizationThreshold; + } + int GetInFlightChunkOpsCount(chunkId_t chunkId, MetaOp opType) const; + int GetInFlightChunkModificationOpCount(chunkId_t chunkId, + Servers* srvs = 0) const; + int GetInFlightChunkOpsCount(chunkId_t chunkId, const MetaOp* opTypes, + Servers* srvs = 0) const; + void DoCheckpoint() { + mCheckpoint.GetOp().ScheduleNow(); + mCheckpoint.Timeout(); + } + void SetBufferPool(QCIoBufferPool* pool) + { mBufferPool = pool; } + QCIoBufferPool* GetBufferPool() + { return mBufferPool; } + int64_t GetFreeIoBufferByteCount() const; + void Done(MetaChunkVersChange& req); + virtual void Timeout(); + bool 
Validate(MetaHello& r) const; + void UpdateDelayedRecovery(const MetaFattr& fa, bool forceUpdateFlag = false); + bool HasWriteAppendLease(chunkId_t chunkId) const; + void ScheduleRestartChunkServers(); + bool IsRetireOnCSRestart() const + { return mRetireOnCSRestartFlag; } + void UpdateSrvLoadAvg(ChunkServer& srv, int64_t delta, + bool canBeCandidateFlag = true); + int16_t GetMaxReplicasPerFile() const + { return mMaxReplicasPerFile; } + int16_t GetMaxReplicasPerRSFile() const + { return mMaxReplicasPerRSFile; } + int64_t GetMaxFsckTime() const + { return mMaxFsckTime; } + bool HasEnoughFreeBuffers(MetaRequest* req = 0); + int GetMaxResponseSize() const + { return mMaxResponseSize; } + int GetReadDirLimit() const + { return mReadDirLimit; } + void ChangeIoBufPending(int64_t delta) + { SyncAddAndFetch(mIoBufPending, delta); } + bool IsCandidateServer(const ChunkServer& c, + double writableChunksThresholdRatio = 1.0); + bool GetPanicOnInvalidChunkFlag() const + { return mPanicOnInvalidChunkFlag; } + + // Chunk placement. 
+ enum { kSlaveScaleFracBits = 8 }; + typedef vector > RackInfos; + + bool GetSortCandidatesBySpaceUtilizationFlag() const + { return mSortCandidatesBySpaceUtilizationFlag; } + bool GetSortCandidatesByLoadAvgFlag() const + { return mSortCandidatesByLoadAvgFlag; } + bool GetUseFsTotalSpaceFlag() const + { return mUseFsTotalSpaceFlag; } + int64_t GetSlavePlacementScale(); + int GetMaxConcurrentWriteReplicationsPerNode() const + { return mMaxConcurrentWriteReplicationsPerNode; } + const Servers& GetChunkServers() const + { return mChunkServers; } + const RackInfos& GetRacks() const + { return mRacks; } + int64_t Rand(int64_t interval); + void UpdateChunkWritesPerDrive(ChunkServer& srv, + int deltaNumChunkWrites, int deltaNumWritableDrives); + + // Unix style permissions + kfsUid_t GetDefaultUser() const + { return mDefaultUser; } + kfsGid_t GetDefaultGroup() const + { return mDefaultGroup; } + kfsMode_t GetDefaultFileMode() const + { return mDefaultFileMode; } + kfsMode_t GetDefaultDirMode() const + { return mDefaultDirMode; } + kfsUid_t GetDefaultLoadUser() const + { return mDefaultLoadUser; } + kfsGid_t GetDefaultLoadGroup() const + { return mDefaultLoadGroup; } + kfsMode_t GetDefaultLoadFileMode() const + { return mDefaultLoadFileMode; } + kfsMode_t GetDefaultLoadDirMode() const + { return mDefaultLoadDirMode; } + bool VerifyAllOpsPermissions() const + { return mVerifyAllOpsPermissionsFlag; } + void SetEUserAndEGroup(MetaRequest& req) + { + SetUserAndGroup(req, req.euser, req.egroup); + if (mForceEUserToRootFlag) { + req.euser = kKfsUserRoot; + } + if (req.euser != kKfsUserRoot || mRootHosts.empty()) { + return; + } + if (mRootHosts.find(req.clientIp) == mRootHosts.end()) { + req.euser = kKfsUserNone; + } + } + void SetUserAndGroup(const MetaRequest& req, + kfsUid_t& user, kfsGid_t& group) + { + if (mHostUserGroupRemap.empty() || req.clientIp.empty()) { + return; + } + SetUserAndGroupSelf(req, user, group); + } +protected: + typedef vector< + int, + StdAllocator + 
> RackIds; + class RebalanceCtrs + { + public: + typedef int64_t Counter; + + RebalanceCtrs() + : mRoundCount(0), + mNoSource(0), + mServerNeeded(0), + mNoServerFound(0), + mRackNeeded(0), + mNoRackFound(0), + mNonLoadedServerNeeded(0), + mNoNonLoadedServerFound(0), + mOk(0), + mScanned(0), + mBusy(0), + mBusyOther(0), + mReplicationStarted(0), + mNoReplicationStarted(0), + mScanTimeout(0), + mTotalNoSource(0), + mTotalServerNeeded(0), + mTotalNoServerFound(0), + mTotalRackNeeded(0), + mTotalNoRackFound(0), + mTotalNonLoadedServerNeeded(0), + mTotalNoNonLoadedServerFound(0), + mTotalOk(0), + mTotalScanned(0), + mTotalBusy(0), + mTotalBusyOther(0), + mTotalReplicationStarted(0), + mTotalNoReplicationStarted(0), + mTotalScanTimeout(0), + mPlan(0), + mPlanNoDest(0), + mPlanTimeout(0), + mPlanScanned(0), + mPlanNoChunk(0), + mPlanNoSrc(0), + mPlanBusy(0), + mPlanBusyOther(0), + mPlanCannotMove(0), + mPlanReplicationStarted(0), + mPlanNoReplicationStarted(0), + mPlanLine(0), + mPlanAdded(0), + mPlanNoServer(0), + mTotalPlanNoDest(0), + mTotalPlanTimeout(0), + mTotalPlanScanned(0), + mTotalPlanNoChunk(0), + mTotalPlanNoSrc(0), + mTotalPlanBusy(0), + mTotalPlanBusyOther(0), + mTotalPlanCannotMove(0), + mTotalPlanReplicationStarted(0), + mTotalPlanNoReplicationStarted(0), + mTotalPlanLine(0), + mTotalPlanAdded(0), + mTotalPlanNoServer(0) + {} + void Clear() + { + *this = RebalanceCtrs(); + } + void NoSource() + { + mNoSource++; + mTotalNoSource++; + } + void ServerOk() + { + mOk++; + mTotalOk++; + } + void ServerNeeded() + { + mServerNeeded++; + mTotalServerNeeded++; + } + void NoServerFound() + { + mNoServerFound++; + mTotalNoServerFound++; + } + void RackNeeded() + { + mRackNeeded++; + mTotalRackNeeded++; + } + void NoRackFound() + { + mNoRackFound++; + mTotalNoRackFound++; + } + void NonLoadedServerNeeded() + { + mNonLoadedServerNeeded++; + mTotalNonLoadedServerNeeded++; + } + void NoNonLoadedServerFound() + { + mNoNonLoadedServerFound++; + 
mTotalNoNonLoadedServerFound++; + } + void ReplicationStarted() + { + mReplicationStarted++; + mTotalReplicationStarted++; + } + void NoReplicationStarted() + { + mNoReplicationStarted++; + mTotalNoReplicationStarted++; + } + void Scanned() + { + mScanned++; + mTotalScanned++; + } + void Busy() + { + mBusy++; + mTotalBusy++; + } + void BusyOther() + { + mBusyOther++; + mTotalBusyOther++; + } + + void ScanTimeout() + { + mScanTimeout++; + mTotalScanTimeout++; + } + void NextRound() + { + mRoundCount++; + mServerNeeded = 0; + mNoServerFound = 0; + mRackNeeded = 0; + mNoRackFound = 0; + mNonLoadedServerNeeded = 0; + mNoNonLoadedServerFound = 0; + mOk = 0; + mScanned = 0; + mBusy = 0; + mBusyOther = 0; + mReplicationStarted = 0; + mNoReplicationStarted = 0; + mScanTimeout = 0; + } + void StartPlan() + { + mPlan++; + mPlanNoDest = 0; + mPlanTimeout = 0; + mPlanScanned = 0; + mPlanNoChunk = 0; + mPlanNoSrc = 0; + mPlanBusy = 0; + mPlanBusyOther = 0; + mPlanCannotMove = 0; + mPlanReplicationStarted = 0; + mPlanNoReplicationStarted = 0; + mPlanLine = 0; + mPlanAdded = 0; + mPlanNoServer = 0; + } + void PlanNoDest() + { + mPlanNoDest++; + mTotalPlanNoDest++; + } + void PlanTimeout() + { + mPlanTimeout++; + mTotalPlanTimeout++; + } + void PlanScanned() + { + mPlanScanned++; + mTotalPlanScanned++; + } + void PlanNoChunk() + { + mPlanNoChunk++; + mTotalPlanNoChunk++; + } + void PlanNoSrc() + { + mPlanNoSrc++; + mTotalPlanNoSrc++; + } + void PlanBusy() + { + mPlanBusy++; + mTotalPlanBusy++; + } + void PlanBusyOther() + { + mPlanBusyOther++; + mTotalPlanBusyOther++; + } + void PlanCannotMove() + { + mPlanCannotMove++; + mTotalPlanCannotMove++; + } + void PlanReplicationStarted() + { + mPlanReplicationStarted++; + mTotalPlanReplicationStarted++; + } + void PlanNoReplicationStarted() + { + mPlanNoReplicationStarted++; + mTotalPlanNoReplicationStarted++; + } + void PlanLine() + { + mPlanLine++; + mTotalPlanLine++; + } + void PlanAdded() + { + mPlanAdded++; + mTotalPlanAdded++; + } 
+ void PlanNoServer() + { + mPlanNoServer++; + mTotalPlanNoServer++; + } + Counter GetPlanLine() const + { + return mPlanLine; + } + Counter GetTotalScanned() const + { + return mTotalScanned; + } + Counter GetRoundCount() const + { + return mRoundCount; + } + ostream& Show(ostream& os, + const char* prefix = 0, const char* suffix = 0); + private: + Counter mRoundCount; + Counter mNoSource; + Counter mServerNeeded; + Counter mNoServerFound; + Counter mRackNeeded; + Counter mNoRackFound; + Counter mNonLoadedServerNeeded; + Counter mNoNonLoadedServerFound; + Counter mOk; + Counter mScanned; + Counter mBusy; + Counter mBusyOther; + Counter mReplicationStarted; + Counter mNoReplicationStarted; + Counter mScanTimeout; + Counter mTotalNoSource; + Counter mTotalServerNeeded; + Counter mTotalNoServerFound; + Counter mTotalRackNeeded; + Counter mTotalNoRackFound; + Counter mTotalNonLoadedServerNeeded; + Counter mTotalNoNonLoadedServerFound; + Counter mTotalOk; + Counter mTotalScanned; + Counter mTotalBusy; + Counter mTotalBusyOther; + Counter mTotalReplicationStarted; + Counter mTotalNoReplicationStarted; + Counter mTotalScanTimeout; + Counter mPlan; + Counter mPlanNoDest; + Counter mPlanTimeout; + Counter mPlanScanned; + Counter mPlanNoChunk; + Counter mPlanNoSrc; + + Counter mPlanBusy; + Counter mPlanBusyOther; + Counter mPlanCannotMove; + Counter mPlanReplicationStarted; + Counter mPlanNoReplicationStarted; + Counter mPlanLine; + Counter mPlanAdded; + Counter mPlanNoServer; + Counter mTotalPlanNoDest; + Counter mTotalPlanTimeout; + Counter mTotalPlanScanned; + Counter mTotalPlanNoChunk; + Counter mTotalPlanNoSrc; + Counter mTotalPlanBusy; + Counter mTotalPlanBusyOther; + Counter mTotalPlanCannotMove; + Counter mTotalPlanReplicationStarted; + Counter mTotalPlanNoReplicationStarted; + Counter mTotalPlanLine; + Counter mTotalPlanAdded; + Counter mTotalPlanNoServer; + }; + + // Chunk servers counters -- aggregated from chunk server heartbeat + // responses. 
+ typedef map , + less, + StdFastAllocator< + pair > + > + > CSCounters; + + // Striped (Reed-Solomon) files allocations in flight used for chunk + // placment. + typedef set< + pair, chunkId_t>, + less, chunkId_t> >, + StdFastAllocator, chunkId_t> > + > StripedFilesAllocationsInFlight; + + class FilesChecker; + + /// A counter to track the # of ongoing chunk replications + int mNumOngoingReplications; + + /// A switch to toggle rebalancing: if the system is under load, + /// we'd like to turn off rebalancing. We can enable it a + /// suitable time. + bool mIsRebalancingEnabled; + + /// For the purposes of rebalancing, what is the range we want + /// a node to be in. If a node is outside the range, it is + /// either underloaded (in which case, it can take blocks) or it + /// is overloaded (in which case, it can give up blocks). + double mMaxRebalanceSpaceUtilThreshold; + double mMinRebalanceSpaceUtilThreshold; + + /// Set when a rebalancing plan is being excuted. + bool mIsExecutingRebalancePlan; + + /// After a crash, track the recovery start time. For a timer + /// period that equals the length of lease interval, we only grant + /// lease renews and new leases to new chunks. We however, + /// disallow granting new leases to existing chunks. This is + /// because during the time period that corresponds to a lease interval, + /// we may learn about leases that we had handed out before crashing. + time_t mRecoveryStartTime; + /// To keep track of uptime. + const time_t mStartTime; + + /// Defaults to the width of a lease window + int mRecoveryIntervalSec; + + /// Periodically clean out dead leases + PeriodicOp mLeaseCleaner; + + /// Similar to the lease cleaner: periodically check if there are + /// sufficient copies of each chunk. + PeriodicOp mChunkReplicator; + PeriodicOp mCheckpoint; + + uint32_t mMinChunkserversToExitRecovery; + + /// List of connected chunk servers. 
+ Servers mChunkServers; + + /// List of servers that are hibernating; if they don't wake up + /// the time the hibernation period ends, the blocks on those + /// nodes needs to be re-replicated. This provides us the ability + /// to take a node down for maintenance and bring it back up + /// without incurring re-replication overheads. + HibernatedServerInfos mHibernatingServers; + + /// Track when servers went down so we can report it + typedef deque DownServers; + DownServers mDownServers; + + /// State about how each rack (such as, servers/space etc) + RackInfos mRacks; + + /// Mapping from a chunk to its location(s). + CSMap mChunkToServerMap; + + StripedFilesAllocationsInFlight mStripedFilesAllocationsInFlight; + + /// chunks to which a lease has been handed out; whenever we + /// cleanup the leases, this set is walked + ChunkLeases mChunkLeases; + + /// For files that are being atomic record appended to, track the last + /// chunk of the file that we can use for subsequent allocations + ARAChunkCache mARAChunkCache; + + /// Set of chunks that are in the process being made stable: a + /// message has been sent to the associated chunkservers which are + /// flushing out data to disk. + NonStableChunksMap mNonStableChunks; + PendingBeginMakeStable mPendingBeginMakeStable; + PendingMakeStableMap mPendingMakeStable; + /// In memory representation of chunk versions roll back. + ChunkVersionRollBack mChunkVersionRollBack; + + /// Counters to track chunk replications + Counter *mOngoingReplicationStats; + Counter *mTotalReplicationStats; + /// how much todo before we are all done (estimate of the size + /// of the chunk-replication candidates set). 
+ Counter *mReplicationTodoStats; + /// # of chunks for which there is only a single copy + /// Track the # of replication ops that failed + Counter *mFailedReplicationStats; + /// Track the # of stale chunks we have seen so far + Counter *mStaleChunkCount; + size_t mMastersCount; + size_t mSlavesCount; + bool mAssignMasterByIpFlag; + int mLeaseOwnerDownExpireDelay; + // Write append space reservation accounting. + int mMaxReservationSize; + int mReservationDecayStep; + int mChunkReservationThreshold; + int mAllocAppendReuseInFlightTimeoutSec; + int mMinAppendersPerChunk; + int mMaxAppendersPerChunk; + double mReservationOvercommitFactor; + // Delay replication when connection breaks. + int mServerDownReplicationDelay; + uint64_t mMaxDownServersHistorySize; + // Chunk server properties broadcasted to all chunk servers. + Properties mChunkServersProps; + string mChunkServersPropsFileName; + bool mReloadChunkServersPropertiesFlag; + // Chunk server restart logic. + int mCSToRestartCount; + int mMastersToRestartCount; + int mMaxCSRestarting; + bool mRetireOnCSRestartFlag; + int64_t mMaxCSUptime; + int64_t mCSRestartTime; + int64_t mCSGracefulRestartTimeout; + int64_t mCSGracefulRestartAppendWithWidTimeout; + int64_t mLastReplicationCheckTime; + // "instant du" + time_t mLastRecomputeDirsizeTime; + int mRecomputeDirSizesIntervalSec; + /// Max # of concurrent read/write replications per node + /// -- write: is the # of chunks that the node can pull in from outside + /// -- read: is the # of chunks that the node is allowed to send out + /// + int mMaxConcurrentWriteReplicationsPerNode; + int mMaxConcurrentReadReplicationsPerNode; + bool mUseEvacuationRecoveryFlag; + int64_t mReplicationFindWorkTimeouts; + /// How much do we spend on each internal RPC in chunk-replication-check to handout + /// replication work. 
+ int64_t mMaxTimeForChunkReplicationCheck; + int64_t mMinChunkReplicationCheckInterval; + int64_t mLastReplicationCheckRunEndTime; + int64_t mReplicationCheckTimeouts; + int64_t mNoServersAvailableForReplicationCount; + /// Periodically (once a week), check the replication of all blocks in the system + int64_t mFullReplicationCheckInterval; + bool mCheckAllChunksInProgressFlag; + + /// + /// When placing chunks, we see the space available on the node as well as + /// we take our estimate of the # of writes on + /// the node as a hint for choosing servers; if a server is "loaded" we should + /// avoid sending traffic to it. This value defines a watermark after which load + /// begins to be an issue. + /// + uint32_t mConcurrentWritesPerNodeWatermark; + + double mMaxSpaceUtilizationThreshold; + bool mUseFsTotalSpaceFlag; + int64_t mChunkAllocMinAvailSpace; + + int64_t mCompleteReplicationCheckInterval; + int64_t mCompleteReplicationCheckTime; + int64_t mPastEofRecoveryDelay; + size_t mMaxServerCleanupScan; + int mMaxRebalanceScan; + double mRebalanceReplicationsThreshold; + int64_t mRebalanceReplicationsThresholdCount; + int64_t mMaxRebalanceRunTime; + int64_t mLastRebalanceRunTime; + int64_t mRebalanceRunInterval; + size_t mMaxRebalancePlanRead; + string mRebalancePlanFileName; + RebalanceCtrs mRebalanceCtrs; + ifstream mRebalancePlan; + bool mCleanupScheduledFlag; + + int mCSCountersUpdateInterval; + time_t mCSCountersUpdateTime; + CSCounters mCSCounters; + IOBuffer mCSCountersResponse; + int mPingUpdateInterval; + time_t mPingUpdateTime; + IOBuffer mPingResponse; + ostringstream mStringStream; + IOBuffer::WOStream mWOstream; + QCIoBufferPool* mBufferPool; + bool mMightHaveRetiringServersFlag; + + class HostPrefix + { + public: + HostPrefix() + : mLen(0), + mMinLen(0) + {} + bool operator==(const HostPrefix& other) const + { + return (mLen == other.mLen && + mMinLen == other.mMinLen && + memcmp(mPrefix, other.mPrefix, mLen) == 0); + } + bool Match(const string& 
host) const + { + return (host.length() >= mMinLen && + memcmp(host.data(), mPrefix, mLen) == 0); + + } + size_t Parse(const string& pref) + { + // Allow to position prefix with trailing ?? + // For example: 10.6.34.2? + mMinLen = min(sizeof(mPrefix), pref.length()); + mLen = pref.find('?'); + if (mLen == string::npos || mMinLen < mLen) { + mLen = mMinLen; + } + memcpy(mPrefix, pref.data(), mLen); + return mMinLen; + } + private: + char mPrefix[64]; + size_t mLen; + size_t mMinLen; + + }; + typedef vector > RackPrefixes; + typedef map RackWeights; + typedef vector ChunkServersMd5sums; + bool mRackPrefixUsePortFlag; + RackPrefixes mRackPrefixes; + RackWeights mRackWeights; + ChunkServersMd5sums mChunkServerMd5sums; + string mClusterKey; + + int64_t mDelayedRecoveryUpdateMaxScanCount; + bool mForceDelayedRecoveryUpdateFlag; + bool mSortCandidatesBySpaceUtilizationFlag; + bool mSortCandidatesByLoadAvgFlag; + int64_t mMaxFsckFiles; + int64_t mFsckAbandonedFileTimeout; + int64_t mMaxFsckTime; + bool mFullFsckFlag; + int64_t mMTimeUpdateResolution; + int64_t mMaxPendingRecoveryMsgLogInfo; + bool mAllowLocalPlacementFlag; + bool mAllowLocalPlacementForAppendFlag; + bool mInRackPlacementForAppendFlag; + bool mInRackPlacementFlag; + bool mAllocateDebugVerifyFlag; + + CSMap::Entry* mChunkEntryToChange; + MetaFattr* mFattrToChangeTo; + + int64_t mCSLoadAvgSum; + int64_t mCSMasterLoadAvgSum; + int64_t mCSSlaveLoadAvgSum; + int mCSTotalPossibleCandidateCount; + int mCSMasterPossibleCandidateCount; + int mCSSlavePossibleCandidateCount; + bool mUpdateCSLoadAvgFlag; + bool mUpdatePlacementScaleFlag; + int64_t mCSMaxGoodCandidateLoadAvg; + int64_t mCSMaxGoodMasterCandidateLoadAvg; + int64_t mCSMaxGoodSlaveCandidateLoadAvg; + double mCSMaxGoodCandidateLoadRatio; + double mCSMaxGoodMasterLoadRatio; + double mCSMaxGoodSlaveLoadRatio; + int64_t mSlavePlacementScale; + int64_t mMaxSlavePlacementRange; + int16_t mMaxReplicasPerFile; + int16_t mMaxReplicasPerRSFile; + bool 
mGetAllocOrderServersByLoadFlag; + int mMinChunkAllocClientProtoVersion; + + int mMaxResponseSize; + int64_t mMinIoBufferBytesToProcessRequest; + int mReadDirLimit; + bool mAllowChunkServerRetireFlag; + bool mPanicOnInvalidChunkFlag; + int mAppendCacheCleanupInterval; + int mTotalChunkWrites; + int mTotalWritableDrives; + int mMinWritesPerDrive; + int mMaxWritesPerDriveThreshold; + double mMaxWritesPerDriveRatio; + double mMaxLocalPlacementWeight; + double mTotalWritableDrivesMult; + string mConfig; + + kfsUid_t mDefaultUser; + kfsGid_t mDefaultGroup; + kfsMode_t mDefaultFileMode; + kfsMode_t mDefaultDirMode; + kfsUid_t mDefaultLoadUser; + kfsGid_t mDefaultLoadGroup; + kfsMode_t mDefaultLoadFileMode; + kfsMode_t mDefaultLoadDirMode; + bool mForceEUserToRootFlag; // Turns off permission verification. + bool mVerifyAllOpsPermissionsFlag; // If true, then the + // following won't work: + // write(open("/file", O_RDWR | O_CREAT, 0000), "1", 1); + typedef set RootHosts; + RootHosts mRootHosts; + struct HostUserGroupMapEntry + { + HostUserGroupMapEntry() + : mHostPrefix(), + mUserMap(), + mGroupMap() + {} + typedef map UserMap; + typedef map GroupMap; + HostPrefix mHostPrefix; + UserMap mUserMap; + GroupMap mGroupMap; + }; + typedef vector HostUserGroupRemap; + HostUserGroupRemap mHostUserGroupRemap; + struct LastUidGidRemap + { + string mIp; + kfsUid_t mUser; + kfsGid_t mGroup; + kfsUid_t mToUser; + kfsGid_t mToGroup; + LastUidGidRemap() + : mIp(), + mUser(kKfsUserNone), + mGroup(kKfsGroupNone), + mToUser(kKfsUserNone), + mToGroup(kKfsGroupNone) + {} + }; + LastUidGidRemap mLastUidGidRemap; + + volatile int64_t mIoBufPending; + + StTmp >::Tmp mChunkInfosTmp; + StTmp >::Tmp mChunkInfos2Tmp; + StTmp::Tmp mServersTmp; + StTmp::Tmp mServers2Tmp; + StTmp::Tmp mServers3Tmp; + StTmp::Tmp mServers4Tmp; + + struct ChunkPlacement : public KFS::ChunkPlacement + { + typedef KFS::ChunkPlacement Super; + ChunkPlacement(); + }; + StTmp::Tmp mChunkPlacementTmp; + + typedef 
boost::mt19937 Random; + Random mRandom; + const Random::result_type mRandMin; + const uint64_t mRandInterval; + + /// Check the # of copies for the chunk and return true if the + /// # of copies is less than targeted amount. We also don't replicate a chunk + /// if it is currently being written to (i.e., if a write lease + /// has been issued). + /// @param[in] clli The location information about the chunk. + /// @param[out] extraReplicas The target # of additional replicas for the chunk + /// @retval true if the chunk is to be replicated; false otherwise + bool CanReplicateChunkNow( + CSMap::Entry& clli, + int& extraReplicas, + ChunkPlacement& chunkPlacement, + int* hibernatedReplicaCount = 0, + ChunkRecoveryInfo* recoveryInfo = 0, + bool forceRecoveryFlag = false); + + /// Replicate a chunk. This involves finding a new location for + /// the chunk that is different from the existing set of replicas + /// and asking the chunkserver to get a copy. + /// @param[in] chunkId The id of the chunk which we are checking + /// @param[in] clli The lease/location information about the chunk. + /// @param[in] extraReplicas The target # of additional replicas for the chunk + /// @param[in] candidates The set of servers on which the additional replicas + /// should be stored + /// @retval The # of actual replications triggered + int ReplicateChunk( + CSMap::Entry& clli, + int extraReplicas, + ChunkPlacement& chunkPlacement, + const ChunkRecoveryInfo& recoveryInfo); + int ReplicateChunk( + CSMap::Entry& clli, + int extraReplicas, + const Servers& candidates, + const ChunkRecoveryInfo& recoveryInfo, + const char* reasonMsg = 0); + + /// From the candidates, handout work to nodes. If any chunks are + /// over-replicated/chunk is deleted from system, add them to delset. + bool HandoutChunkReplicationWork(); + + /// There are more replicas of a chunk than the requested amount. So, + /// delete the extra replicas and reclaim space. 
When deleting the addtional + /// copies, find the servers that are low on space and delete from there. + /// As part of deletion, we update our mapping of where the chunk is stored. + /// @param[in] chunkId The id of the chunk which we are checking + /// @param[in] clli The lease/location information about the chunk. + /// @param[in] extraReplicas The # of replicas that need to be deleted + void DeleteAddlChunkReplicas(CSMap::Entry& entry, int extraReplicas, + ChunkPlacement& placement); + + /// Helper function to check set membership. + /// @param[in] hosters Set of servers hosting a chunk + /// @param[in] server The server we want to check for membership in hosters. + /// @retval true if server is a member of the set of hosters; + /// false otherwise + bool IsChunkHostedOnServer(const Servers &hosters, + const ChunkServerPtr &server); + + /// Periodically, update our estimate of how much space is + /// used/available in each rack. + void UpdateRackSpaceUsageCounts(); + + /// Does any server have space/write-b/w available for + /// re-replication + int CountServersAvailForReReplication() const; + + /// Periodically, rebalance servers by moving chunks around from + /// "over utilized" servers to "under utilized" servers. + void RebalanceServers(); + void UpdateReplicationsThreshold(); + + /// For a time period that corresponds to the length of a lease interval, + /// we are in recovery after a restart. + /// Also, if the # of chunkservers that are connected to us is + /// less than some threshold, we are in recovery mode. 
+ inline bool InRecovery() const; + inline bool InRecoveryPeriod() const; + + inline bool IsChunkServerRestartAllowed() const; + void ScheduleChunkServersRestart(); + inline bool AddHosted(CSMap::Entry& entry, const ChunkServerPtr& c); + inline bool AddHosted(chunkId_t chunkId, CSMap::Entry& entry, const ChunkServerPtr& c); + bool AddReplica(CSMap::Entry& entry, const ChunkServerPtr& c); + void CheckChunkReplication(CSMap::Entry& entry); + inline void UpdateReplicationState(CSMap::Entry& entry); + inline void SetReplicationState(CSMap::Entry& entry, CSMap::Entry::State state); + + inline seq_t GetChunkVersionRollBack(chunkId_t chunkId); + inline seq_t IncrementChunkVersionRollBack(chunkId_t chunkId); + inline ostream& ClearStringStream(); + inline static const string& BoolToString(bool flag); + inline CSCounters::mapped_type& CSCountersMakeRow( + const string& name, size_t width, CSCounters::iterator& it); + inline void UpdatePendingRecovery(CSMap::Entry& entry); + inline void CheckReplication(CSMap::Entry& entry); + bool GetPlacementExcludes(const CSMap::Entry& entry, ChunkPlacement& placement, + bool includeThisChunkFlag = true, + bool stopIfHasAnyReplicationsInFlight = false, + vector* chunkBlock = 0); + void UpdateChunkServerCounters(); + void ProcessInvalidStripes(MetaChunkReplicate& req); + RackId GetRackId(const ServerLocation& loc); + RackId GetRackId(const string& loc); + void ScheduleCleanup(size_t maxScanCount = 1); + void RemoveRetiring(CSMap::Entry& ci, Servers& servers, int numReplicas, + bool deleteRetiringFlag = false); + void DeleteChunk(fid_t fid, chunkId_t chunkId, const Servers& servers); + void UpdateGoodCandidateLoadAvg(); + bool CanBeCandidateServer(const ChunkServer& c) const; + inline static CSMap::Entry& GetCsEntry(MetaChunkInfo& chunkInfo); + inline static CSMap::Entry* GetCsEntry(MetaChunkInfo* chunkInfo); + bool CanBeRecovered(const CSMap::Entry& entry, + bool& incompleteChunkBlockFlag, + bool* incompleteChunkBlockWriteHasLeaseFlag, + 
vector& cblk) const; + HibernatingServerInfo_t* FindHibernatingServer( + const ServerLocation& loc); + void CSMapUnitTest(const Properties& props); + int64_t GetMaxCSUptime() const; + bool ReadRebalancePlan(size_t nread); + void Fsck(ostream &os, bool reportAbandonedFilesFlag); + void CheckFile( + FilesChecker& fsck, + const MetaDentry& de, + const MetaFattr& fa); + static Random::result_type RandSeed(); + class RandGen; + template void LoadIdRemap( + istream& fs, T OT::* map); + void SetUserAndGroupSelf(const MetaRequest& req, + kfsUid_t& user, kfsGid_t& group); +}; + +extern LayoutManager& gLayoutManager; +} + +#endif // META_LAYOUTMANAGER_H diff --git a/src/cc/meta/Logger.cc b/src/cc/meta/Logger.cc new file mode 100644 index 000000000..1d215f32a --- /dev/null +++ b/src/cc/meta/Logger.cc @@ -0,0 +1,254 @@ +/*! + * $Id$ + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * \file logger.cc + * \brief metadata transaction logger. + * \author Sriram Rao (Quantcast Corp) and Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov -- implement transaction log checksum. 
+ */
+
+#include "Logger.h"
+#include "Checkpoint.h"
+#include "util.h"
+#include "Replay.h"
+#include "common/MsgLogger.h"
+#include "kfsio/Globals.h"
+#include "NetDispatch.h"
+
+// NOTE(review): the header name after this #include was lost in extraction
+// (the line is empty after the directive); likely <cassert> for the
+// assert() calls below -- confirm against the upstream file.
+#include
+
+namespace KFS
+{
+using std::hex;
+using std::dec;
+using std::ofstream;
+using std::ifstream;
+using libkfsio::globalNetManager;
+
+// default values
+string LOGDIR("./kfslog");
+string LASTLOG(LOGDIR + "/last");
+
+Logger oplog(LOGDIR);
+
+/*!
+ * \brief assign the next log sequence number to the request, append it
+ * to the transaction log when it is a successful mutation, then hand it
+ * to gNetDispatch for further processing.
+ */
+void
+Logger::dispatch(MetaRequest *r)
+{
+ r->seqno = ++nextseq;
+ if (r->mutation && r->status == 0) {
+ if (log(r) < 0) {
+ panic("Logger::dispatch", true);
+ }
+ cp.note_mutation();
+ }
+ gNetDispatch.Dispatch(r);
+}
+
+/*!
+ * \brief log the request and flush the result to the fs buffer.
+*/
+int
+Logger::log(MetaRequest *r)
+{
+ const int res = r->log(logstream);
+ if (res >= 0) {
+ flushResult(r);
+ }
+ return res;
+}
+
+/*!
+ * \brief flush log entries to disk
+ *
+ * Make sure that all of the log entries are on disk and
+ * update the highest sequence number logged.
+ */
+void
+Logger::flushLog()
+{
+ // capture the sequence number before flushing; 'committed' only
+ // advances after the flush succeeds (fail() panics instead).
+ seq_t last = nextseq;
+
+ logstream.flush();
+ if (fail()) {
+ panic("Logger::flushLog", true);
+ }
+ committed = last;
+}
+
+/*!
+ * \brief set the log filename/log # to seqno
+ * \param[in] seqno the next log sequence number (lognum)
+ */
+void
+Logger::setLog(int seqno)
+{
+ assert(seqno >= 0);
+ lognum = seqno;
+ logname = logfile(lognum);
+}
+
+/*!
+ * \brief open a new log file for writing
+ * \param[in] seqno the next log sequence number (lognum)
+ * \return 0 if successful, negative on I/O error
+ */
+int
+Logger::startLog(int seqno, bool appendFlag /* = false */,
+ int logAppendIntBase /* = -1 */)
+{
+ assert(seqno >= 0);
+ lognum = seqno;
+ logname = logfile(lognum);
+ if (appendFlag) {
+ // following log replay, until the next CP, we
+ // should continue to append to the logfile that we replayed.
+ // seqno will be set to the value we got from the chkpt file.
+ // So, don't overwrite the log file.
+ KFS_LOG_STREAM_INFO <<
+ "log append:" <<
+ " int base: " << logAppendIntBase <<
+ " file: " << logname <<
+ KFS_LOG_EOM;
+ logf.open(logname.c_str(), ofstream::app | ofstream::binary);
+ // md tracks the running log checksum (see finishLog's GetMd());
+ // "WriteTrough" [sic] is the project API spelling.
+ md.SetStream(&logf);
+ md.SetWriteTrough(false);
+ // the replayed log already declared its integer base; mirror it so
+ // appended records stay consistent with the existing ones.
+ switch (logAppendIntBase) {
+ case 10: logstream << dec; break;
+ case 16: logstream << hex; break;
+ default:
+ panic("invalid int base parameter", false);
+ logf.close();
+ return -EINVAL;
+ }
+ return (fail() ? -EIO : 0);
+ }
+ logf.open(logname.c_str(),
+ ofstream::out | ofstream::binary | ofstream::trunc);
+ md.SetWriteTrough(false);
+ md.Reset(&logf);
+ logstream <<
+ "version/" << VERSION << "\n"
+ "checksum/last-line\n"
+ "setintbase/16\n";
+ // NOTE(review): the lone ';' below is a stray empty statement -- harmless.
+ ;
+ logstream << "time/" << DisplayIsoDateTime() << '\n';
+ logstream << hex;
+ logstream.flush();
+ return (fail() ? -EIO : 0);
+}
+
+/*!
+ * \brief close current log file and begin a new one
+ */
+int
+Logger::finishLog()
+{
+ // if there has been no update to the log since the last roll, don't
+ // roll the file over; otherwise, we'll have a file every N mins
+ if (incp == committed) {
+ return 0;
+ }
+ logstream << "time/" << DisplayIsoDateTime() << '\n';
+ logstream.flush();
+ const string checksum = md.GetMd();
+ // the checksum trailer is written directly to logf, bypassing the
+ // md-wrapped stream -- presumably so the trailer is not folded into
+ // its own digest; confirm against MdStream semantics.
+ logf << "checksum/" << checksum << '\n';
+ logf.close();
+ if (fail()) {
+ panic("Logger::finishLog, close", true);
+ }
+ if (link_latest(logname, LASTLOG)) {
+ panic("Logger::finishLog, link", true);
+ }
+ incp = committed;
+ const int status = startLog(lognum + 1);
+ if (status < 0) {
+ panic("Logger::finishLog, startLog", true);
+ }
+ cp.resetMutationCount();
+ return status;
+}
+
+/*!
+ * \brief make sure result is on disk
+ * \param[in] r the result of interest
+ *
+ * If this result has a higher sequence number than what is
+ * currently known to be on disk, flush the log to disk.
+ */ +void +Logger::flushResult(MetaRequest *r) +{ + if (r->seqno > committed) { + flushLog(); + assert(r->seqno <= committed); + } +} + +void +logger_setup_paths(const string& logdir) +{ + if (! logdir.empty()) { + LOGDIR = logdir; + LASTLOG = LOGDIR + "/last"; + oplog.setLogDir(LOGDIR); + } +} + +class LogRotater : public ITimeout +{ +public: + LogRotater() + : ITimeout() + {} + void SetInterval(int rotateIntervalSec) + { SetTimeoutInterval(rotateIntervalSec * 1000); }; + virtual void Timeout() + { oplog.finishLog(); } + +}; +static LogRotater logRotater; + +void +logger_set_rotate_interval(int rotateIntervalSec) +{ + logRotater.SetInterval(rotateIntervalSec); +} + +void +logger_init(int rotateIntervalSec) +{ + const int num = replayer.logno(); + const bool appendFlag = replayer.getAppendToLastLogFlag(); + if (num > 0 && ! file_exists(LASTLOG)) { + const string logfn = oplog.logfile(num - 1); + if (file_exists(logfn) && link_latest(logfn, LASTLOG)) { + panic("KFS::logger_init, link " + + logfn + " " + LASTLOG, true); + } + } + if (oplog.startLog(num, appendFlag, + replayer.getLastLogIntBase()) != 0) { + panic("KFS::logger_init, startLog", true); + } + logger_set_rotate_interval(rotateIntervalSec); + globalNetManager().RegisterTimeoutHandler(&logRotater); +} + +} // namespace KFS. diff --git a/src/cc/meta/Logger.h b/src/cc/meta/Logger.h new file mode 100644 index 000000000..3ad2924e2 --- /dev/null +++ b/src/cc/meta/Logger.h @@ -0,0 +1,146 @@ +/*! + * $Id$ + * + * \file Logger.h + * \brief metadata logger + * \author Blake Lewis (Kosmix Corp.) + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ +#if !defined(KFS_LOGGER_H) +#define KFS_LOGGER_H + +#include +#include +#include + +#include "kfstypes.h" +#include "MetaRequest.h" +#include "util.h" +#include "common/MdStream.h" + +#include "kfsio/ITimeout.h" + +namespace KFS +{ +using std::string; +using std::ostringstream; +using std::ofstream; + +/*! + * \brief Class for logging metadata updates + * + * - RPCs when they are done are logged (if necessary, such as, they mutate the + * tree) and are then dispatched to the sender. + * - a timer that periodically causes log rollover. Whenever + * the log rollover occurs, after we close the log file, we create a link from + * "LAST" to the recently closed log file. This is used by the log compactor + * to determine the set of files that can be compacted. + */ + +class Logger +{ +public: + static const int VERSION = 1; + Logger(string d) + : logdir(d), + lognum(-1), + logname(), + logf(), + md(), + logstream(md), + nextseq(0), + committed(0), + incp(0) + {} + ~Logger() + { + logstream.flush(); + logf.close(); + } + void setLogDir(const string &d) + { + logdir = d; + } + string logfile(int n) //!< generate a log file name + { + return makename(logdir, "log", n); + } + /*! 
+ * \brief check whether request is stored on disk + * \param[in] r the request of interest + * \return whether it is on disk + */ + bool iscommitted(MetaRequest *r) + { + return r->seqno != 0 && r->seqno <= committed; + } + //!< log a request + int log(MetaRequest *r); + //!< add to the log and dispatch downstream to netdispatcher + void dispatch(MetaRequest *r); + seq_t checkpointed() { return incp; } //!< highest seqno in CP + void setLog(int seqno); //!< set the log filename based on seqno + //!< create or open log file + int startLog(int seqno, + bool appendFlag = false, int logAppendIntBase = -1); + int finishLog(); //!< rollover the log file + const string name() const { return logname; } //!< name of log file + /*! + * \brief set initial sequence numbers at startup + * \param[in] last last sequence number from checkpoint or log + */ + void set_seqno(seq_t last) + { + incp = committed = nextseq = last; + } + MdStream& getMdStream() { return md; } +private: + string logdir; //!< directory where logs are kept + int lognum; //!< for generating log file names + string logname; //!< name of current log file + ofstream logf; //!< the current log file + MdStream md; + ostream& logstream; + seq_t nextseq; //!< next request sequence no. + seq_t committed; //!< highest request known to be on disk + seq_t incp; //!< highest request in a checkpoint + string genfile(int n) //!< generate a log file name + { + ostringstream f(ostringstream::out); + f << n; + return logdir + "/log." + f.str(); + } + bool fail() const { return (logf.fail() || md.fail()); } + void flushLog(); + void flushResult(MetaRequest *r); +private: + // No copy. 
+ Logger(const Logger&); + Logger& operator=(Logger&); +}; + +extern string LOGDIR; +extern string LASTLOG; +extern Logger oplog; +extern void logger_setup_paths(const string& logdir); +extern void logger_init(int rotateIntervalSec); +extern void logger_set_rotate_interval(int rotateIntervalSec); + +} +#endif // !defined(KFS_LOGGER_H) diff --git a/src/cc/meta/MetaNode.h b/src/cc/meta/MetaNode.h new file mode 100644 index 000000000..dacb45a23 --- /dev/null +++ b/src/cc/meta/MetaNode.h @@ -0,0 +1,113 @@ +/*! + * $Id$ + * + * \file MetaNode.h + * \brief Base class for KFS metadata nodes. + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov -- implement pool allocator. + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ +#if !defined(META_NODE_H) +#define META_NODE_H + +#include "kfstypes.h" +#include "common/PoolAllocator.h" + +#include + +namespace KFS { + +// MetaNode flag values +static const int META_CPBIT = 1;//!< CP parity bit +static const int META_NEW = 2; //!< new since start of CP +static const int META_ROOT = 4; //!< root node +static const int META_LEVEL1 = 8; //!< children are leaves +static const int META_SKIP = 16; //!< exclude from current CP + +/*! 
+ * \brief base class for both internal and leaf nodes + */ +class MetaNode { +private: + MetaType nodetype; + int flagbits; + MetaNode& operator=(const MetaNode&); + MetaNode(const MetaNode&); +protected: + virtual ~MetaNode() {} + template + class Allocator + { + public: + typedef PoolAllocator< + sizeof(T), // size_t TItemSize, + size_t(8) << 20, // size_t TMinStorageAlloc, + size_t(128) << 20, // size_t TMaxStorageAlloc, + // no explicit ~Tree() or cleanup implemented yet. + false // bool TForceCleanupFlag + > Alloc; + Allocator() : alloc() {} + void* allocate() { + return alloc.Allocate(); + } + void deallocate(void* ptr) { + alloc.Deallocate(ptr); + } + const Alloc& getPoolAllocator() const { + return alloc; + } + private: + Alloc alloc; + }; + template + static Allocator& getAllocator(T* ptr = 0) + { + static Allocator allocator; + return allocator; + } + template + static void* allocate(T* type = 0) + { + return getAllocator(type).allocate(); + } + template + static void deallocate(T* ptr) + { + getAllocator(ptr).deallocate(ptr); + } +public: + virtual void destroy() = 0; + MetaNode(MetaType t): nodetype(t), flagbits(0) { } + MetaNode(MetaType t, int f): nodetype(t), flagbits(f) { } + MetaType metaType() const { return nodetype; } + virtual const Key key() const = 0; //!< cons up key value for node + virtual std::ostream& show(std::ostream& os) const = 0; + int flags() const { return flagbits; } + void setflag(int bit) { flagbits |= bit; } + void clearflag(int bit) { flagbits &= ~bit; } + bool testflag(int bit) const { return (flagbits & bit) != 0; } + template static const typename Allocator::Alloc& + getPoolAllocator(T* type = 0) { + return getAllocator(type).getPoolAllocator(); + } +}; + +} + +#endif // !defined(META_NODE_H) diff --git a/src/cc/meta/MetaRequest.cc b/src/cc/meta/MetaRequest.cc new file mode 100644 index 000000000..13b414053 --- /dev/null +++ b/src/cc/meta/MetaRequest.cc @@ -0,0 +1,4262 @@ +/*! 
+ * $Id$ + * + * \file MetaRequest.cc + * \brief Meta server request handlers. + * \author Blake Lewis and Sriram Rao + * Mike Ovsiannikov + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "kfstree.h" +#include "MetaRequest.h" +#include "Logger.h" +#include "Checkpoint.h" +#include "util.h" +#include "LayoutManager.h" +#include "ChildProcessTracker.h" +#include "NetDispatch.h" +#include "Restorer.h" +#include "AuditLog.h" + +#include "kfsio/Globals.h" +#include "kfsio/checksum.h" +#include "common/MsgLogger.h" +#include "common/RequestParser.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" +#include "common/time.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace KFS { + +using std::map; +using std::string; +using std::istringstream; +using std::ifstream; +using std::min; +using std::max; +using std::make_pair; +using std::numeric_limits; +using KFS::libkfsio::globals; + +static bool gWormMode = false; +static string gChunkmapDumpDir("."); +static const char* const ftypes[] = { "empty", "file", "dir" }; + +static bool +CanAccessFile(const MetaFattr* fa, MetaRequest& op) +{ + if (! 
fa) { + op.status = -ENOENT; + op.statusMsg = "no such file"; + return false; + } + if (fa->IsStriped() && op.clientProtoVers < + KFS_CLIENT_MIN_STRIPED_FILE_SUPPORT_PROTO_VERS) { + op.status = -EINVAL; + op.statusMsg = "striped file, client upgrade required"; + return false; + } + return true; +} + +/*! + * Specially named files (such as, those that end with ".tmp") can be + * mutated by remove/rename. Otherwise, in WORM no deletes/renames are allowed. + */ +static inline bool +IsWormMutationAllowed(const string &pathname) +{ + return (pathname.length() >= 4 && + pathname.compare(pathname.length() - 4, 4, ".tmp") == 0); +} + +static inline bool +startsWith(const string& str, const string& prefix) +{ + const size_t len = prefix.length(); + return (str.length() >= len && str.compare(0, len, prefix) == 0); +} + +/* + * Set WORM mode. In WORM mode, deletes are disabled. + */ +void +setWORMMode(bool value) +{ + gWormMode = value; +} + +void +setChunkmapDumpDir(string d) +{ + gChunkmapDumpDir = d; +} + +inline static bool +OkHeader(const MetaRequest* op, ostream &os, bool checkStatus = true) +{ + os << + "OK\r\n" + "Cseq: " << op->opSeqno + ; + if (op->status == 0 && op->statusMsg.empty()) { + os << + "\r\n" + "Status: 0\r\n" + ; + return true; + } + os << + "\r\n" + "Status: " << op->status << "\r\n" + ; + if (! op->statusMsg.empty()) { + const size_t p = op->statusMsg.find('\r'); + assert( + string::npos == p && + op->statusMsg.find('\n') == string::npos + ); + os << "Status-message: " << + (p == string::npos ? 
+ op->statusMsg : + op->statusMsg.substr(0, p)) << + "\r\n"; + } + if (checkStatus && op->status < 0) { + os << "\r\n"; + } + return (op->status >= 0); +} + +inline static ostream& +PutHeader(const MetaRequest* op, ostream &os) +{ + OkHeader(op, os, false); + return os; +} + +inline static bool +IsValidUser(kfsUid_t user) +{ + return (user != kKfsUserNone); +} + +inline static bool +IsValidGroup(kfsGid_t group) +{ + return (group != kKfsGroupNone); +} + +inline static bool +IsValidMode(kfsMode_t mode) +{ + return (mode != kKfsModeUndef && + (mode & ~((kfsMode_t(1) << 3 * 3) - 1)) == 0); +} + +inline static bool +IsGroupMember(kfsUid_t user, kfsGid_t group) +{ + return IsValidGroup(group); // The client lib does group validation +} + +inline static void +SetEUserAndEGroup(MetaRequest& req) +{ + gLayoutManager.SetEUserAndEGroup(req); +} + +template inline void +SetUserAndGroup(T& req) +{ + SetEUserAndEGroup(req); + if (req.user != kKfsUserNone || req.group != kKfsGroupNone) { + gLayoutManager.SetUserAndGroup(req, req.user, req.group); + } +} + +inline static void +FattrReply(const MetaFattr* fa, MFattr& ofa) +{ + if (! fa) { + return; + } + ofa = *fa; + if (fa->filesize < 0 && + fa->type == KFS_FILE && + fa->chunkcount() > 0 && + fa->nextChunkOffset() >= (chunkOff_t)CHUNKSIZE && + ! fa->IsStriped()) { + MetaChunkInfo* ci = 0; + if (metatree.getalloc(fa->id(), + fa->nextChunkOffset() - CHUNKSIZE, &ci) == 0 && + ci && + gLayoutManager.HasWriteAppendLease( + ci->chunkId)) { + // Reduce getlayout calls, return the same value to the + // client as in the case of getlayout followed by + // getsize on unstable write append chunk. + // Chunk servers always return CHUNKSIZE in such case + // to allow reading file opened for append while the + // file is being written into. 
+ ofa.filesize = fa->nextChunkOffset(); + } + } +} + +inline static ostream& +FattrReply(ostream& os, const MFattr& fa) +{ + os << + "File-handle: " << fa.id() << "\r\n" + "Type: " << ftypes[fa.type] << "\r\n" + "File-size: " << fa.filesize << "\r\n" + "Replication: " << fa.numReplicas << "\r\n"; + if (fa.type == KFS_FILE) { + os << "Chunk-count: " << fa.chunkcount() << "\r\n"; + } else if (fa.type == KFS_DIR) { + os << + "File-count: " << fa.fileCount() << "\r\n" + "Dir-count: " << fa.dirCount() << "\r\n"; + } + sendtime(os, "M-Time: ", fa.mtime, "\r\n"); + sendtime(os, "C-Time: ", fa.ctime, "\r\n"); + sendtime(os, "CR-Time: ", fa.crtime, "\r\n"); + if (fa.IsStriped()) { + os << + "Striper-type: " << int32_t(fa.striperType) << "\r\n" + "Num-stripes: " << fa.numStripes << "\r\n" + "Num-recovery-stripes: " << fa.numRecoveryStripes << "\r\n" + "Stripe-size: " << fa.stripeSize << "\r\n"; + } + os << + "User: " << fa.user << "\r\n" + "Group: " << fa.group << "\r\n" + "Mode: " << fa.mode << "\r\n"; + return os; +} + +template +class RequestWaitQueue : public ITimeout +{ +public: + RequestWaitQueue( + NetManager& netManager, + CondT& cond, + bool ownsCondFlag = false) + : ITimeout(), + mCond(cond), + mFront(0), + mBack(0), + mCur(0), + mDeletedFlag(0), + mNetManager(netManager), + mOwnsCondFlag(ownsCondFlag), + mMaxOpsPerLoop(2) + {} + virtual ~RequestWaitQueue() + { + if (mFront) { + mNetManager.UnRegisterTimeoutHandler(this); + // leave requests suspended + } + if (mOwnsCondFlag) { + delete &mCond; + } + if (mDeletedFlag) { + *mDeletedFlag = true; + } + } + virtual void Timeout() + { + int opsCount = 0; + bool deletedFlag = false; + mDeletedFlag = &deletedFlag; + while (mFront && mCond(*mFront)) { + mCur = mFront; + mFront = mCur->next; + if (! 
mFront) { + assert(mCur == mBack); + mBack = 0; + } + mCur->next = 0; + mCur->suspended = false; + submit_request(mCur); + if (deletedFlag) { + return; + } + mCur = 0; + if (mMaxOpsPerLoop <= ++opsCount) { + if (mFront && mCond(*mFront)) { + mNetManager.Wakeup(); + } + break; + } + } + mDeletedFlag = 0; + if (! mFront) { + mNetManager.UnRegisterTimeoutHandler(this); + } + } + bool SuspendIfNeeded(MetaRequest& req) + { + if (mCur == &req || (! HasPendingRequests() && mCond(req))) { + return false; + } + Add(&req); + return true; + } + void Add(MetaRequest* req) + { + if (! req || req->next || req == mFront || req == mBack) { + panic("request is null " + "or already in this or another queue", false); + return; + } + req->suspended = true; + if (mBack) { + assert(mFront); + mBack->next = req; + mBack = req; + } else { + assert(! mFront); + mFront = req; + mBack = req; + mNetManager.RegisterTimeoutHandler(this); + } + } + bool HasPendingRequests() const + { return (mFront != 0); } + void Wakeup() + { + if (! HasPendingRequests()) { + return; + } + mNetManager.Wakeup(); + } + void SetParameters(const Properties& props, const char* prefix) + { + mMaxOpsPerLoop = props.getValue( + (prefix ? 
prefix : "") + string("maxOpsPerLoop"), + mMaxOpsPerLoop + ); + } +private: + CondT& mCond; + MetaRequest* mFront; + MetaRequest* mBack; + MetaRequest* mCur; + bool* mDeletedFlag; + NetManager& mNetManager; + bool mOwnsCondFlag; + int mMaxOpsPerLoop; + + RequestWaitQueue(const RequestWaitQueue&); + RequestWaitQueue& operator=(const RequestWaitQueue&); +}; + +class EnoughBuffersCond +{ +public: + bool operator() (MetaRequest& req) + { + return gLayoutManager.HasEnoughFreeBuffers(&req); + } +}; + +static EnoughBuffersCond& +GetEnoughBuffersCond() +{ + static EnoughBuffersCond sEnoughBuffersCond; + return sEnoughBuffersCond; +} + +typedef RequestWaitQueue BuffersWaitQueue; +static BuffersWaitQueue sBuffersWaitQueue( + globalNetManager(), GetEnoughBuffersCond()); + +void +CheckIfIoBuffersAvailable() +{ + if (sBuffersWaitQueue.HasPendingRequests() && + gLayoutManager.HasEnoughFreeBuffers()) { + sBuffersWaitQueue.Wakeup(); + } +} + +void +SetRequestParameters(const Properties& props) +{ + sBuffersWaitQueue.SetParameters(props, "metaServer.buffersWaitQueue."); +} + +static bool +HasEnoughIoBuffersForResponse(MetaRequest& req) +{ + return (! sBuffersWaitQueue.SuspendIfNeeded(req)); +} + +class ResponseWOStream : private IOBuffer::WOStream +{ +public: + ResponseWOStream() + : IOBuffer::WOStream() + {} + ostream& Set(IOBuffer& buf) + { + return IOBuffer::WOStream::Set( + buf, gLayoutManager.GetMaxResponseSize()); + } + void Reset() + { IOBuffer::WOStream::Reset(); } +}; +static ResponseWOStream sWOStream; + +/* virtual */ void +MetaLookup::handle() +{ + SetEUserAndEGroup(*this); + MetaFattr* fa = 0; + if ((status = metatree.lookup(dir, name, euser, egroup, fa)) == 0) { + FattrReply(fa, fattr); + } +} + +/* virtual */ void +MetaLookupPath::handle() +{ + SetEUserAndEGroup(*this); + MetaFattr* fa = 0; + if ((status = metatree.lookupPath( + root, path, euser, egroup, fa)) == 0) { + FattrReply(fa, fattr); + } +} + +template inline static bool +CheckUserAndGroup(T& req) +{ + if (! 
IsValidUser(req.user)) { + req.status = -EINVAL; + req.statusMsg = "invalid user"; + return false; + } + if (! IsValidGroup(req.group)) { + req.status = -EINVAL; + req.statusMsg = "invalid group"; + return false; + } + if (req.user != req.euser && req.euser != kKfsUserRoot) { + req.status = -EPERM; + req.statusMsg = "user different from effective user"; + return false; + } + if (req.euser != kKfsUserRoot && req.egroup != req.group && + ! IsGroupMember(req.user, req.group)) { + req.status = -EPERM; + req.statusMsg = "user is not in the group"; + return false; + } + return true; +} + +template inline static bool +CheckCreatePerms(T& req) +{ + if (req.euser == kKfsUserNone) { + req.euser = gLayoutManager.GetDefaultUser(); + } + if (req.egroup == kKfsGroupNone) { + req.egroup = gLayoutManager.GetDefaultGroup(); + } + if (req.user == kKfsUserNone) { + req.user = req.euser; + } + if (req.group == kKfsGroupNone) { + req.group = req.egroup; + } + if (req.mode == kKfsModeUndef) { + req.mode = req.op == META_MKDIR ? + gLayoutManager.GetDefaultDirMode() : + gLayoutManager.GetDefaultFileMode(); + } + if (! 
IsValidMode(req.mode)) { + req.status = -EINVAL; + req.statusMsg = "invalid mode"; + return false; + } + return CheckUserAndGroup(req); +} + +const string kInvalidChunksPath("/proc/invalid_chunks"); +const string kInvalidChunksPrefix(kInvalidChunksPath + "/"); + +/* virtual */ void +MetaCreate::handle() +{ + SetUserAndGroup(*this); + const bool invalChunkFlag = dir == ROOTFID && + startsWith(name, kInvalidChunksPrefix); + bool rootUserFlag = false; + if (invalChunkFlag) { + name = name.substr(kInvalidChunksPrefix.length()); + const char* chunk = name.c_str(); + for (const char* p = chunk; ; p++) { + const int sym = *p & 0xff; + if ((sym < '0' || sym > '9') && sym != '.') { + if (sym == 0 && p > chunk) { + break; + } + statusMsg = "invalid chunk id: " + name; + status = -EINVAL; + return; + } + } + const string msg("detected invalid chunk: " + name); + KFS_LOG_STREAM_ERROR << msg << KFS_LOG_EOM; + if (gLayoutManager.GetPanicOnInvalidChunkFlag()) { + panic(msg, false); + } + MetaFattr* fa = 0; + if ((status = metatree.lookupPath(ROOTFID, kInvalidChunksPath, + kKfsUserRoot, kKfsGroupRoot, fa)) != 0 + || ! fa || fa->type != KFS_DIR) { + if (status == 0) { + status = -ENOENT; + statusMsg = kInvalidChunksPath + + ": no such directory"; + } + return; + } + dir = fa->id(); + if (user == kKfsUserNone) { + user = euser != kKfsUserNone ? euser : kKfsUserRoot; + group = egroup != kKfsGroupNone ? + egroup : kKfsGroupRoot; + } + mode = 0; + rootUserFlag = true; + } else { + if (! CheckCreatePerms(*this)) { + return; + } + } + + if (! invalChunkFlag && gWormMode && ! IsWormMutationAllowed(name)) { + // Do not create a file that we can not write into. 
+ statusMsg = "worm mode"; + status = -EPERM; + return; + } + fid = 0; + todumpster = -1; + if (striperType == KFS_STRIPED_FILE_TYPE_RS && numRecoveryStripes > 0) { + numReplicas = min(numReplicas, + gLayoutManager.GetMaxReplicasPerRSFile()); + } else { + numReplicas = min(numReplicas, + gLayoutManager.GetMaxReplicasPerFile()); + } + status = metatree.create( + dir, + name, + &fid, + numReplicas, + exclusive, + striperType, + numStripes, + numRecoveryStripes, + stripeSize, + todumpster, + user, + group, + mode, + rootUserFlag ? kKfsUserRoot : euser, + egroup + ); +} + +/* virtual */ void +MetaMkdir::handle() +{ + SetUserAndGroup(*this); + if (! CheckCreatePerms(*this)) { + return; + } + fid = 0; + status = metatree.mkdir(dir, name, user, group, mode, euser, egroup, &fid); +} + +static int +LookupAbsPath(fid_t& dir, string& name, kfsUid_t euser, kfsGid_t egroup) +{ + if (dir != ROOTFID || name.empty() || + *name.begin() != '/' || *name.rbegin() == '/') { + return 0; + } + const size_t nameStart = name.rfind('/'); + size_t pos = nameStart; + while (pos > 0 && name[pos-1] == '/') { + pos--; + } + if (pos == 0) { + name = name.substr(nameStart + 1); + } else { + const string parentDir = name.substr(0, pos); + MetaFattr* fa = 0; + const int status = metatree.lookupPath(ROOTFID, parentDir, + euser, egroup, fa); + if (status != 0) { + return status; + } + if (fa && fa->type == KFS_DIR) { + dir = fa->id(); + name = name.substr(nameStart + 1); + } + } + return 0; +} + +/*! + * \brief Remove a file in a directory. Also, remove the chunks + * associated with the file. For removing chunks, we send off + * RPCs to the appropriate chunkservers. + */ + +/* virtual */ void +MetaRemove::handle() +{ + if (gWormMode && ! 
IsWormMutationAllowed(name)) { + // deletes are disabled in WORM mode except for specially named + // files + statusMsg = "worm mode"; + status = -EPERM; + return; + } + todumpster = -1; + SetEUserAndEGroup(*this); + if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) { + return; + } + status = metatree.remove(dir, name, pathname, todumpster, + euser, egroup); +} + +/* virtual */ void +MetaRmdir::handle() +{ + if (gWormMode && ! IsWormMutationAllowed(name)) { + // deletes are disabled in WORM mode + statusMsg = "worm mode"; + status = -EPERM; + return; + } + SetEUserAndEGroup(*this); + if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) { + return; + } + status = metatree.rmdir(dir, name, pathname, euser, egroup); +} + +static vector& +GetReadDirTmpVec() +{ + static vector sReaddirRes; + sReaddirRes.clear(); + sReaddirRes.reserve(1024); + return sReaddirRes; +} + +class IOBufferWriter +{ +public: + IOBufferWriter(IOBuffer& b) + : ioBuf(b), + buf(), + cur(buf.Producer()), + end(cur + buf.SpaceAvailable()) + {} + void Write(const char* data, size_t len) + { + for (; ;) { + if (cur + len <= end) { + memcpy(cur, data, len); + cur += len; + return; + } + const size_t ncp = end - cur; + memcpy(cur, data, ncp); + data += ncp; + len -= ncp; + buf.Fill((int)(cur - buf.Producer() + ncp)); + ioBuf.Append(buf); + buf = IOBufferData(); + cur = buf.Producer(); + end = cur + buf.SpaceAvailable(); + } + } + void Write(const string& str) + { + Write(str.data(), str.length()); + } + void Close() + { + const int len = (int)(cur - buf.Producer()); + if (len > 0) { + buf.Fill(len); + ioBuf.Append(buf); + } + } + int GetSize() const + { + return ioBuf.BytesConsumable(); + } +private: + IOBuffer& ioBuf; + IOBufferData buf; + char* cur; + char* end; +private: + IOBufferWriter(const IOBufferWriter&); + IOBufferWriter& operator=(const IOBufferWriter&); +}; + +inline const MetaFattr* +GetDirAttr(fid_t dir, const vector& v) +{ + const MetaFattr* fa = v.empty() ? 
0 : (v.front()->getName() == ".." ? + v.back()->getFattr() : v.front()->getFattr()); + if (fa && fa->id() != dir) { + fa = fa->parent; + if (fa && fa->id() != dir) { + fa = 0; + } + } + return (fa ? fa : metatree.getFattr(dir)); +} + +/* virtual */ void +MetaReaddir::handle() +{ + if (! HasEnoughIoBuffersForResponse(*this)) { + return; + } + const bool oldFormatFlag = numEntries < 0; + int maxEntries = gLayoutManager.GetReadDirLimit(); + if (numEntries > 0 && + (maxEntries <= 0 || numEntries < maxEntries)) { + maxEntries = numEntries; + } + numEntries = 0; + resp.Clear(); + vector& v = GetReadDirTmpVec(); + if ((status = fnameStart.empty() ? + metatree.readdir(dir, v, + maxEntries, &hasMoreEntriesFlag) : + metatree.readdir(dir, fnameStart, v, + maxEntries, hasMoreEntriesFlag) + ) != 0) { + if (status == -ENOENT) { + const MetaFattr* const fa = metatree.getFattr(dir); + if (fa && fa->type != KFS_DIR) { + status = -ENOTDIR; + } + } + return; + } + SetEUserAndEGroup(*this); + const MetaFattr* const fa = GetDirAttr(dir, v); + if (! fa || ! fa->CanRead(euser, egroup)) { + status = -EACCES; + return; + } + if (oldFormatFlag && hasMoreEntriesFlag) { + status = -ENOMEM; + statusMsg = "response exceeds max. allowed number of entries" + " consider updating kfs client lib"; + return; + } + const int extSize = IOBufferData::GetDefaultBufferSize() + + int(MAX_FILE_NAME_LENGTH); + int maxSize = gLayoutManager.GetMaxResponseSize(); + if (! oldFormatFlag && extSize * 2 < maxSize) { + maxSize -= extSize; + } + IOBufferWriter writer(resp); + vector::const_iterator it; + for (it = v.begin(); + it != v.end() && writer.GetSize() <= maxSize; + ++it) { + const string& name = (*it)->getName(); + // Supress "/" dentry for "/". 
+ if (dir == ROOTFID && name == "/") { + continue; + } + writer.Write(name); + writer.Write("\n", 1); + ++numEntries; + } + writer.Close(); + if (resp.BytesConsumable() > maxSize) { + if (oldFormatFlag) { + resp.Clear(); + numEntries = 0; + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + } else { + if (it != v.end()) { + hasMoreEntriesFlag = true; + } + } + } +} + +class EnumerateLocations +{ + ServerLocations& v; +public: + EnumerateLocations(ServerLocations& result) + : v(result) + {} + void operator()(const ChunkServerPtr& c) const + { + v.push_back(c->GetServerLocation()); + } +}; + +class ListServerLocations +{ + ostream& os; +public: + ListServerLocations(ostream &out) + : os(out) + {} + void operator()(const ServerLocation& s) const + { + os << " " << s; + } +}; + +template +class ReaddirPlusWriter +{ +public: + typedef MetaReaddirPlus::DEntry DEntry; + typedef MetaReaddirPlus::DEntries DEntries; + typedef MetaReaddirPlus::CInfos CInfos; +private: + enum { kNumBufSize = 64 }; + + typedef LayoutManager::Servers Servers; + typedef PropertiesTokenizer::Token Token; + + class PropName : public Token + { + public: + PropName(const char* shortName, const char* longName) + : Token(ShortFormatFlag ? 
shortName : longName) + {} + }; + + IOBufferWriter writer; + const int maxSize; + const bool getLastChunkInfoOnlyIfSizeUnknown; + char* const nBufEnd; + char nBuf[kNumBufSize]; + + static const PropName kMtime; + static const PropName kCtime; + static const PropName kCrtime; + static const PropName kBeginEntry; + static const PropName kName; + static const PropName kId; + static const PropName kType; + static const PropName kNL; + static const PropName kSType; + static const PropName kSCount; + static const PropName kSRecov; + static const PropName kSSize; + static const PropName kCCnt; + static const PropName kFSize; + static const PropName kRepl; + static const PropName kUser; + static const PropName kGroup; + static const PropName kMode; + static const PropName kLOff; + static const PropName kLId; + static const PropName kLVers; + static const PropName kLRCnt; + static const PropName kLRepl; + static const PropName kFileCount; + static const PropName kDirCount; + static const PropName kSpace; + static const Token kFileType[]; + + template static + char* ToString(T val, char* bufEnd) + { + if (! 
ShortFormatFlag) { + return toString(int64_t(val), bufEnd + 1); + } + char* p = bufEnd; + *p = 0; + char* const s = p - sizeof(val) * 2; + do { + *--p = "0123456789ABCDEF"[val & 0xF]; + val >>= 4; + } while (val != 0 && s < p); + return p; + } + void Write(const Token& name) + { + Write(name.mPtr, name.mLen); + } + template + void WriteInt(T val) + { + const char* const b = ToString(val, nBufEnd); + Write(b, nBufEnd - b); + } + void WriteTime(int64_t t) + { + if (ShortFormatFlag) { + WriteInt(t); + return; + } + const int64_t kMicroseconds = 1000 * 1000; + char* const s = ToString(t % kMicroseconds, nBufEnd) - 1; + const char* const b = ToString(t / kMicroseconds, s); + *s = ' '; + Write(b, nBufEnd - b); + } + void Write(const char* data, size_t len) + { + writer.Write(data, len); + } + void Write(const string& str) + { + writer.Write(str); + } + void Write(const DEntry& entry, CInfos::const_iterator& lci, + bool noAttrsFlag) + { + Write(kBeginEntry); + Write(kName); + Write(entry.name); + if (noAttrsFlag) { + Write(kNL); + return; + } + Write(kId); + WriteInt(entry.id()); + Write(kType); + Write(kFileType[entry.type]); + Write(kMtime); + WriteTime(entry.mtime); + if (! 
ShortFormatFlag || entry.ctime != entry.crtime) { + Write(kCtime); + WriteTime(entry.ctime); + } + Write(kCrtime); + WriteTime(entry.crtime); + Write(kUser); + WriteInt(entry.user); + Write(kGroup); + WriteInt(entry.group); + Write(kMode); + WriteInt(entry.mode); + if (entry.type == KFS_DIR) { + if (entry.filesize >= 0) { + Write(kFSize); + WriteInt(entry.filesize); + } + Write(kFileCount); + WriteInt(entry.fileCount()); + Write(kDirCount); + WriteInt(entry.dirCount()); + Write(kNL); + return; + } + if (entry.IsStriped()) { + Write(kSType); + WriteInt(int(entry.striperType)); + Write(kSCount); + WriteInt(entry.numStripes); + Write(kSRecov); + WriteInt(entry.numRecoveryStripes); + Write(kSSize); + WriteInt(entry.stripeSize); + } + Write(kCCnt); + WriteInt(entry.chunkcount()); + Write(kFSize); + WriteInt(entry.filesize); + Write(kRepl); + WriteInt(entry.numReplicas); + if (entry.type == KFS_DIR || entry.IsStriped() || + (getLastChunkInfoOnlyIfSizeUnknown && + entry.filesize >= 0)) { + Write(kNL); + return; + } + const ChunkLayoutInfo& lc = *lci++; + if (lc.chunkId <= 0) { + Write(kNL); + return; + } + // Tell the client the info about the last chunk of the + // file. If the file size is not known then, client can use this + // info to calculate the size. The most common case is when the + // chunk is being written into, and the file is not striped. 
+ Write(kLOff); + WriteInt(lc.offset); + Write(kLId); + WriteInt(lc.chunkId); + Write(kLVers); + WriteInt(lc.chunkVersion); + Write(kLRCnt); + WriteInt(lc.locations.size()); + Write(kLRepl); + for (ServerLocations::const_iterator it = lc.locations.begin(); + it != lc.locations.end(); + ++it) { + Write(kSpace); + Write(it->hostname); + Write(kSpace); + WriteInt(it->port); + } + Write(kNL); + } + ReaddirPlusWriter(const ReaddirPlusWriter&); + ReaddirPlusWriter& operator=(const ReaddirPlusWriter&); +public: + ReaddirPlusWriter(IOBuffer& b, int ms, bool f) + : writer(b), + maxSize(ms), + getLastChunkInfoOnlyIfSizeUnknown(f), + nBufEnd(nBuf + kNumBufSize - 1) + {} + size_t Write(const DEntries& entries, const CInfos& cinfos, + bool noAttrsFlag) + { + CInfos::const_iterator cit = cinfos.begin(); + DEntries::const_iterator it; + for (it = entries.begin(); + it != entries.end() && + writer.GetSize() <= maxSize; + ++it) { + Write(*it, cit, noAttrsFlag); + } + if (cinfos.end() < cit) { + panic("dentry and last chunk info mismatch", false); + } + writer.Close(); + return (it - entries.begin()); + } +}; + +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kMtime( + "\nM:" , "\r\nM-Time: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kCtime( + "\nC:" , "\r\nC-Time: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kCrtime( + "\nCR:" , "\r\nCR-Time: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kBeginEntry( + "B" , "Begin-entry"); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kName( + "\nN:" , "\r\nName: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kId( + "\nH:" , "\r\nFile-handle: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kType( + "\nT:" , "\r\nType: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kNL( + "\n" , "\r\n"); +template const 
typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kSType( + "\nST:" , "\r\nStriper-type: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kSCount( + "\nSC:" , "\r\nNum-stripes: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kSRecov( + "\nSR:" , "\r\nNum-recovery-stripes: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kSSize( + "\nSS:" , "\r\nStripe-size: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kCCnt( + "\nCC:" , "\r\nChunk-count: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kFSize( + "\nS:" , "\r\nFile-size: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kRepl( + "\nR:" , "\r\nReplication: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kUser( + "\nU:" , "\r\nUser: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kGroup( + "\nG:" , "\r\nGroup: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kMode( + "\nA:" , "\r\nMode: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kLOff( + "\nLO:" , "\r\nChunk-offset: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kLId( + "\nLH:" , "\r\nChunk-handle: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kLVers( + "\nLV:" , "\r\nChunk-version: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kLRCnt( + "\nLN:" , "\r\nNum-replicas: "); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kLRepl( + "\nLR:" , "\r\nReplicas:"); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kFileCount( + "\nFC:" , "\r\nFile-count"); +template const typename ReaddirPlusWriter::PropName + ReaddirPlusWriter::kDirCount( + "\nDC:" , "\r\nDir-count"); +template const typename ReaddirPlusWriter::PropName + 
ReaddirPlusWriter::kSpace( + " " , " "); +template const typename ReaddirPlusWriter::Token + ReaddirPlusWriter::kFileType[] = { "empty", "file", "dir" }; + +MetaReaddirPlus::~MetaReaddirPlus() +{ + if (ioBufPending > 0) { + gLayoutManager.ChangeIoBufPending(-ioBufPending); + } +} + +/* virtual */ void +MetaReaddirPlus::handle() +{ + if (! HasEnoughIoBuffersForResponse(*this)) { + return; + } + dentries.clear(); + lastChunkInfos.clear(); + if (ioBufPending > 0) { + gLayoutManager.ChangeIoBufPending(-ioBufPending); + } + ioBufPending = 0; + int maxEntries = gLayoutManager.GetReadDirLimit(); + if (numEntries > 0 && + (maxEntries <= 0 || numEntries < maxEntries)) { + maxEntries = numEntries; + } + vector& res = GetReadDirTmpVec(); + if ((status = fnameStart.empty() ? + metatree.readdir(dir, res, + maxEntries, &hasMoreEntriesFlag) : + metatree.readdir(dir, fnameStart, res, + maxEntries, hasMoreEntriesFlag) + ) != 0) { + if (status == -ENOENT) { + MetaFattr * const fa = metatree.getFattr(dir); + if (fa && fa->type != KFS_DIR) { + status = -ENOTDIR; + } + } + return; + } + const MetaFattr* const fa = GetDirAttr(dir, res); + SetEUserAndEGroup(*this); + if (! fa || ! fa->CanRead(euser, egroup)) { + status = -EACCES; + return; + } + noAttrsFlag = ! fa->CanSearch(euser, egroup); + if (numEntries < 0 && hasMoreEntriesFlag) { + status = -ENOMEM; + statusMsg = "response exceeds max. allowed number of entries" + " consider updating kfs client lib"; + return; + } + maxRespSize = max(0, gLayoutManager.GetMaxResponseSize()); + const int extSize = IOBufferData::GetDefaultBufferSize() + + int(MAX_FILE_NAME_LENGTH); + const size_t maxSize = + (size_t)((numEntries >= 0 && extSize * 2 < maxRespSize) ? + maxRespSize - extSize : maxRespSize); + dentries.reserve(res.size()); + const size_t avgDirExtraSize = numEntries < 0 ? 148 : 64; + const size_t avgFileExtraSize = numEntries < 0 ? 272 : 128; + const size_t avgChunkInfoSize = numEntries < 0 ? 
82 : 24; + const size_t avgLocationSize = 22; + size_t responseSize = 0; + vector::const_iterator it; + for (it = res.begin(); + it != res.end() && responseSize <= maxSize; + ++it) { + const MetaDentry* const entry = *it; + const MetaFattr* const fa = metatree.getFattr(entry); + if (! fa) { + continue; + } + // Supress "/" dentry for "/". + const string& name = entry->getName(); + if (fa->id() == ROOTFID && name == "/") { + continue; + } + responseSize += name.length() + (fa->type == KFS_DIR ? + avgDirExtraSize : avgFileExtraSize); + dentries.push_back(DEntry(*fa, name)); + if (noAttrsFlag || fa->type == KFS_DIR || fa->IsStriped() || + (getLastChunkInfoOnlyIfSizeUnknown && + fa->filesize >= 0)) { + continue; + } + responseSize += avgChunkInfoSize; + lastChunkInfos.push_back(ChunkLayoutInfo()); + ChunkLayoutInfo& lc = lastChunkInfos.back(); + // for a file, get the layout and provide location of last chunk + // so that the client can compute filesize + MetaChunkInfo* lastChunk = 0; + MetaFattr* cfa = 0; + if (metatree.getLastChunkInfo( + fa->id(), false, cfa, lastChunk) != 0 || + ! lastChunk) { + lc.offset = -1; + lc.chunkId = -1; + lc.chunkVersion = -1; + continue; + } + if (fa != cfa) { + panic("readdirplus: file attribute mismatch", false); + } + lc.offset = lastChunk->offset; + lc.chunkId = lastChunk->chunkId; + lc.chunkVersion = lastChunk->chunkVersion; + Servers c; + MetaFattr* lfa = 0; + if (gLayoutManager.GetChunkToServerMapping( + *lastChunk, c, lfa) != 0) { + // All the servers hosting the chunk are down. + continue; + } + lc.locations.reserve(c.size()); + for_each(c.begin(), c.end(), EnumerateLocations(lc.locations)); + responseSize += lc.locations.size() * avgLocationSize; + } + if (maxSize < responseSize) { + if (numEntries < 0) { + status = -ENOMEM; + statusMsg = "response exceeds max. 
size"; + lastChunkInfos.clear(); + dentries.clear(); + responseSize = 0; + } else if (it != res.end()) { + hasMoreEntriesFlag = true; + } + } + ioBufPending = (int64_t)responseSize; + if (ioBufPending > 0) { + gLayoutManager.ChangeIoBufPending(ioBufPending); + maxRespSize = max(maxRespSize, (int)ioBufPending + + IOBufferData::GetDefaultBufferSize()); + } +} + +/*! + * \brief Get the allocation information for a specific chunk in a file. + */ +/* virtual */ void +MetaGetalloc::handle() +{ + if (offset < 0) { + status = -EINVAL; + statusMsg = "negative offset"; + return; + } + MetaChunkInfo* chunkInfo = 0; + status = metatree.getalloc(fid, offset, &chunkInfo); + if (status != 0) { + KFS_LOG_STREAM_DEBUG << + "handle_getalloc(" << fid << "," << offset << + ") = " << status << ": kfsop failed" << + KFS_LOG_EOM; + return; + } + + chunkId = chunkInfo->chunkId; + chunkVersion = chunkInfo->chunkVersion; + Servers c; + MetaFattr* fa = 0; + replicasOrderedFlag = false; + const int err = gLayoutManager.GetChunkToServerMapping( + *chunkInfo, c, fa, &replicasOrderedFlag); + if (! fa) { + panic("invalid chunk to server map", false); + } + if (! CanAccessFile(fa, *this)) { + return; + } + if (! fromChunkServerFlag && gLayoutManager.VerifyAllOpsPermissions()) { + SetEUserAndEGroup(*this); + if (! fa->CanRead(euser, egroup)) { + status = -EACCES; + return; + } + } + if (err) { + statusMsg = "no replicas available"; + status = -EAGAIN; + KFS_LOG_STREAM_ERROR << + "getalloc " + "<" << fid << "," << chunkId << "," << offset << ">" + " " << statusMsg << + KFS_LOG_EOM; + return; + } + locations.reserve(c.size()); + for_each(c.begin(), c.end(), EnumerateLocations(locations)); + status = 0; +} + +/*! + * \brief Get the allocation information for a file. Determine + * how many chunks there and where they are located. + */ +/* virtual */ void +MetaGetlayout::handle() +{ + if (! 
HasEnoughIoBuffersForResponse(*this)) { + return; + } + vector chunkInfo; + MetaFattr* fa = 0; + if (lastChunkInfoOnlyFlag) { + bool kOnlyForNonStripedFileFlag = false; + MetaChunkInfo* ci = 0; + status = metatree.getLastChunkInfo( + fid, kOnlyForNonStripedFileFlag, fa, ci); + if (status == 0 && ci) { + chunkInfo.push_back(ci); + } + } else if (startOffset <= 0) { + status = metatree.getalloc(fid, fa, chunkInfo, + maxResCnt > 0 ? maxResCnt + 1 : maxResCnt); + } else { + status = metatree.getalloc(fid, startOffset, chunkInfo, + maxResCnt > 0 ? maxResCnt + 1 : maxResCnt); + } + if (status != 0) { + return; + } + if (! fa) { + if (chunkInfo.empty()) { + panic("MetaGetlayout::handle -- getalloc no chunks"); + status = -EFAULT; + return; + } + fa = CSMap::Entry::GetCsEntry(chunkInfo.front())->GetFattr(); + if (! fa) { + panic("MetaGetlayout::handle -- invalid chunk entry"); + status = -EFAULT; + return; + } + } + if (! CanAccessFile(fa, *this)) { + return; + } + if (gLayoutManager.VerifyAllOpsPermissions()) { + SetEUserAndEGroup(*this); + if (! fa->CanRead(euser, egroup)) { + status = -EACCES; + return; + } + } + numChunks = (int)chunkInfo.size(); + if ((hasMoreChunksFlag = maxResCnt > 0 && maxResCnt < numChunks)) { + numChunks = maxResCnt; + } + ostream& os = sWOStream.Set(resp); + const char* prefix = ""; + Servers c; + ChunkLayoutInfo l; + for (int i = 0; i < numChunks; i++) { + l.locations.clear(); + l.offset = chunkInfo[i]->offset; + l.chunkId = chunkInfo[i]->chunkId; + l.chunkVersion = chunkInfo[i]->chunkVersion; + if (! omitLocationsFlag) { + MetaFattr* cfa = 0; + const int err = gLayoutManager.GetChunkToServerMapping( + *(chunkInfo[i]), c, cfa); + assert(! fa || cfa == fa); + if (err) { + resp.Clear(); + status = -EHOSTUNREACH; + statusMsg = "chunk: " + toString(l.chunkId) + + " no replicas available"; + break; + } + for_each(c.begin(), c.end(), + EnumerateLocations(l.locations)); + } + if (! 
(os << prefix << l)) { + break; + } + prefix = " "; + } + os.flush(); + if (status == 0 && ! os) { + resp.Clear(); + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + } + sWOStream.Reset(); +} + +/*! + * \brief handle an allocation request for a chunk in a file. + * \param[in] r write allocation request + * + * Write allocation proceeds as follows: + * 1. The client has sent a write allocation request which has been + * parsed and turned into an RPC request (which is handled here). + * 2. We first get a unique chunk identifier (after validating the + * fileid). + * 3. We send the request to the layout manager to pick a location + * for the chunk. + * 4. The layout manager picks a location and sends an RPC to the + * corresponding chunk server to create the chunk. + * 5. When the RPC is going on, processing for this request is + * suspended. + * 6. When the RPC reply is received, this request gets re-activated + * and we come back to this function. + * 7. Assuming that the chunk server returned a success, we update + * the metatree to link the chunkId with the fileid (from this + * request). + * 8. Processing for this request is now complete; it is logged and + * a reply is sent back to the client. + * + * Versioning/Leases introduces a few wrinkles to the above steps: + * In step #2, the metatree could return -EEXIST if an allocation + * has been done for the . In such a case, we need to + * check with the layout manager to see if a new lease is required. + * If a new lease is required, the layout manager bumps up the version + * # for the chunk and notifies the chunkservers. The message has to + * be suspended until the chunkservers ack. After the message is + * restarted, we need to update the metatree to reflect the newer + * version # before notifying the client. + * + * On the other hand, if a new lease isn't required, then the layout + * manager tells us where the data has been placed; the process for + * the request is therefore complete. 
+ */ +/* virtual */ void +MetaAllocate::handle() +{ + suspended = false; + if (layoutDone) { + return; + } + KFS_LOG_STREAM_DEBUG << "Starting layout for req: " << opSeqno << + KFS_LOG_EOM; + if (gWormMode && ! IsWormMutationAllowed(pathname.GetStr())) { + statusMsg = "worm mode"; + status = -EPERM; + return; + } + if (! gLayoutManager.IsAllocationAllowed(this)) { + if (status >= 0) { + statusMsg = "allocation not allowed"; + status = -EPERM; + } + return; + } + if (gLayoutManager.VerifyAllOpsPermissions()) { + SetEUserAndEGroup(*this); + } + if (appendChunk) { + if (invalidateAllFlag) { + statusMsg = "chunk invalidation" + " is not supported with append"; + status = -EINVAL; + return; + } + // pick a chunk for which a write lease exists + status = gLayoutManager.AllocateChunkForAppend(this); + if (status == 0) { + // all good + KFS_LOG_STREAM_DEBUG << + "For append re-using chunk " << chunkId << + (suspended ? "; allocation in progress" : "") << + KFS_LOG_EOM; + logFlag = false; // Do not emit redundant log record. + return; + } + if (status == -EACCES) { + return; + } + offset = -1; // Allocate a new chunk past eof. + } + // force an allocation + chunkId = 0; + initialChunkVersion = -1; + vector chunkBlock; + MetaFattr* fa = 0; + // start at step #2 above. + status = metatree.allocateChunkId( + fid, offset, + &chunkId, + &chunkVersion, + &numReplicas, + &stripedFileFlag, + &chunkBlock, + &chunkBlockStart, + gLayoutManager.VerifyAllOpsPermissions() ? + euser : kKfsUserRoot, + egroup, + &fa + ); + if (status != 0 && (status != -EEXIST || appendChunk)) { + // we have a problem + return; + } + permissions = *fa; + if (stripedFileFlag && appendChunk) { + status = -EINVAL; + statusMsg = "append is not supported with striped files"; + return; + } + if (invalidateAllFlag) { + if (! 
stripedFileFlag) { + status = -EINVAL; + statusMsg = "chunk invalidation" + " is not supported for non striped files"; + return; + } + if (status != -EEXIST) { + // Allocate the chunk if doesn't exist to trigger the + // recovery later. + status = metatree.assignChunkId( + fid, offset, chunkId, chunkVersion, 0); + if (status == 0) { + // Add the chunk to the recovery queue. + gLayoutManager.ChangeChunkReplication(chunkId); + } + if (status != -EEXIST) { + // Presently chunk can not possibly exist, as + // metatree.allocateChunkId() the above + // returned success. + return; + } + } + initialChunkVersion = chunkVersion; + if (gLayoutManager.InvalidateAllChunkReplicas( + fid, offset, chunkId, chunkVersion)) { + // Add the chunk to the recovery queue. + gLayoutManager.ChangeChunkReplication(chunkId); + status = 0; + return; + } + panic("failed to invalidate existing chunk", false); + status = -ENOENT; + return; + } + int ret; + if (status == -EEXIST) { + initialChunkVersion = chunkVersion; + bool isNewLease = false; + // Get a (new) lease if possible + status = gLayoutManager.GetChunkWriteLease(this, isNewLease); + if (status != 0) { + // couln't get the lease...bail + return; + } + if (!isNewLease) { + KFS_LOG_STREAM_DEBUG << "Got valid lease for req:" << opSeqno << + KFS_LOG_EOM; + // we got a valid lease. so, return + return; + } + // new lease and chunkservers have been notified + // so, wait for them to ack + } else if ((ret = gLayoutManager.AllocateChunk(this, chunkBlock)) < 0) { + // we have a problem + status = ret; + return; + } + // we have queued an RPC to the chunkserver. so, hold + // off processing (step #5) + // If all allocate ops fail synchronously (all servers are down), then + // the op is not suspended, and can proceed immediately. + suspended =! 
layoutDone; +} + +void +MetaAllocate::LayoutDone(int64_t chunkAllocProcessTime) +{ + const bool wasSuspended = suspended; + suspended = false; + layoutDone = true; + KFS_LOG_STREAM_DEBUG << + "Layout is done for req: " << opSeqno << " status: " << status << + KFS_LOG_EOM; + if (status == 0) { + // Check if all servers are still up, and didn't go down + // and reconnected back. + // In the case of reconnect smart pointers should be different: + // the the previous server incarnation always taken down on + // reconnect. + // Since the chunk is "dangling" up until this point, then in + // the case of reconnect the chunk becomes "stale", and chunk + // server is instructed to delete its replica of this new chunk. + for (Servers::const_iterator i = servers.begin(); + i != servers.end(); ++i) { + if ((*i)->IsDown()) { + KFS_LOG_STREAM_DEBUG << (*i)->ServerID() << + " went down during allocation, alloc failed" << + KFS_LOG_EOM; + status = -EIO; + break; + } + } + } + // Ensure that the op isn't stale. + // Invalidate all replicas might make it stale if it comes while this op + // is in flight. Do not do any cleanup if the op is invalid: all required + // cleanup has already been done. + if (gLayoutManager.Validate(this) && status != 0) { + // we have a problem: it is possible that the server + // went down. ask the client to retry.... + status = -EALLOCFAILED; + if (initialChunkVersion >= 0) { + gLayoutManager.CommitOrRollBackChunkVersion(this); + } else { + // this is the first time the chunk was allocated. + // since the allocation failed, remove existence of this chunk + // on the metaserver. + gLayoutManager.DeleteChunk(this); + } + // processing for this message is all done + } + if (status == 0) { + // layout is complete (step #6) + + // update the tree (step #7) and since we turned off the + // suspend flag, the request will be logged and go on its + // merry way. 
+ // + // There could be more than one append allocation request set + // (each set one or more request with the same chunk) in flight. + // The append request sets can finish in any order. + // The append request sets can potentially all have the same + // offset: the eof at the time the first request in each set + // started. + // For append requests assignChunkId assigns past eof offset, + // if it succeeds, and returns the value in appendOffset. + chunkOff_t appendOffset = offset; + chunkId_t curChunkId = chunkId; + status = metatree.assignChunkId(fid, offset, + chunkId, chunkVersion, + appendChunk ? &appendOffset : 0, + &curChunkId); + if (status == 0) { + // Offset can change in the case of append. + offset = appendOffset; + gLayoutManager.CancelPendingMakeStable(fid, chunkId); + // assignChunkId() forces a recompute of the file's size. + } else { + KFS_LOG_STREAM((appendChunk && status == -EEXIST) ? + MsgLogger::kLogLevelERROR : + MsgLogger::kLogLevelDEBUG) << + "Assign chunk id failed for" + " <" << fid << "," << offset << ">" + " status: " << status << + KFS_LOG_EOM; + if (appendChunk && status == -EEXIST) { + panic("append chunk allocation internal error", + false); + } else if (status == -EEXIST && curChunkId != chunkId) { + gLayoutManager.DeleteChunk(this); + } + } + gLayoutManager.CommitOrRollBackChunkVersion(this); + } + if (appendChunk) { + if (status >= 0 && responseStr.empty()) { + ostringstream os; + responseSelf(os); + responseStr = os.str(); + } + gLayoutManager.AllocateChunkForAppendDone(*this); + } + if (status >= 0 && pendingLeaseRelinquish) { + pendingLeaseRelinquish->clnt = clnt; + clnt = this; + } + if (! wasSuspended) { + // Don't need need to resume, if it wasn't suspended: this + // method is [indirectly] invoked from handle(). + // Presently the only way to get here is from synchronous chunk + // server allocation failure. The request cannot possibly have + // non empty request list in this case, as it wasn't ever + // suspened. 
+ if (next) { + panic( + "non empty allocation queue," + " for request that was not suspended", + false + ); + } + return; + } + // Currently the ops queue only used for append allocations. + assert(appendChunk || ! next); + // Update the process time, charged from MetaChunkAllocate. + const int64_t now = microseconds(); + processTime += now - chunkAllocProcessTime; + // Clone status for all ops in the queue. + // Submit the replies in the same order as requests. + // "this" might get deleted after submit_request() + MetaAllocate* n = this; + do { + MetaAllocate& c = *n; + n = c.next; + c.next = 0; + if (n) { + MetaAllocate& q = *n; + assert(q.fid == c.fid); + q.status = c.status; + q.statusMsg = c.statusMsg; + q.suspended = false; + q.fid = c.fid; + q.offset = c.offset; + q.chunkId = c.chunkId; + q.chunkVersion = c.chunkVersion; + q.pathname = c.pathname; + q.numReplicas = c.numReplicas; + q.layoutDone = c.layoutDone; + q.appendChunk = c.appendChunk; + q.stripedFileFlag = c.stripedFileFlag; + q.numServerReplies = c.numServerReplies; + q.responseStr = c.responseStr; + if (q.responseStr.empty()) { + q.servers = c.servers; + q.master = c.master; + } + } + submit_request(&c); + } while (n); +} + +int +MetaAllocate::logOrLeaseRelinquishDone(int code, void* data) +{ + if (code != EVENT_CMD_DONE || + (data != this && data != pendingLeaseRelinquish)) { + panic("MetaChunkAllocate::logDone invalid invocation"); + return 1; + } + if (data == this) { + clnt = pendingLeaseRelinquish->clnt; + if (status >= 0) { + pendingLeaseRelinquish->clnt = this; + submit_request(pendingLeaseRelinquish); + return 0; + } + } + pendingLeaseRelinquish->clnt = 0; + delete pendingLeaseRelinquish; + pendingLeaseRelinquish = 0; + submit_request(this); + return 0; +} + +/* virtual */ void +MetaChunkAllocate::handle() +{ + assert(req && req->op == META_ALLOCATE); + + // if there is a non-zero status, don't throw it away + MetaAllocate& alloc = *req; + if (alloc.status == 0 && status < 0) { + 
alloc.status = status; + // In the case of version change failure take the first failed + // server out, otherwise allocation might never succeed. + if (alloc.initialChunkVersion >= 0 && + alloc.servers.size() > 1) { + for (size_t i = 0; i < alloc.servers.size(); i++) { + if (alloc.servers[i].get() == server.get() && + ! alloc.servers[i]->IsDown()) { + gLayoutManager.ChunkCorrupt( + alloc.chunkId, alloc.servers[i]); + break; + } + } + } + } + + alloc.numServerReplies++; + // wait until we get replies from all servers + if (alloc.numServerReplies == alloc.servers.size()) { + // The ops are no longer suspended + alloc.LayoutDone(processTime); + processTime = microseconds(); // The time was charged to alloc. + } +} + +string +MetaAllocate::Show() const +{ + ostringstream os; + os << "allocate:" + " seq: " << opSeqno << + " path: " << pathname << + " fid: " << fid << + " chunkId: " << chunkId << + " offset: " << offset << + " client: " << clientHost << + " replicas: " << numReplicas << + " append: " << appendChunk << + " log: " << logFlag + ; + for (Servers::const_iterator i = servers.begin(); + i != servers.end(); + ++i) { + os << " " << (*i)->ServerID(); + } + return os.str(); +} + +/* virtual */ void +MetaChunkVersChange::handle() +{ + gLayoutManager.Done(*this); +} + +/* virtual */ void +MetaTruncate::handle() +{ + if (gWormMode && ! IsWormMutationAllowed(pathname.GetStr())) { + statusMsg = "worm mode"; + status = -EPERM; + return; + } + mtime = microseconds(); + kfsUid_t eu; + if (gLayoutManager.VerifyAllOpsPermissions()) { + SetEUserAndEGroup(*this); + eu = euser; + } else { + eu = kKfsUserRoot; + } + if (pruneBlksFromHead) { + status = metatree.pruneFromHead(fid, offset, &mtime, eu, egroup); + return; + } + const string path(pathname.GetStr()); + status = metatree.truncate(fid, offset, path, &mtime, eu, egroup); +} + +/* virtual */ void +MetaRename::handle() +{ + MetaFattr* fa = 0; + status = 0; + SetEUserAndEGroup(*this); + if (gWormMode && (! 
IsWormMutationAllowed(oldname) || + (status = metatree.lookupPath( + ROOTFID, newname, euser, egroup, fa) + ) != -ENOENT)) { + if (status == 0) { + // renames are disabled in WORM mode: otherwise, we + // ocould verwrite an existing file + statusMsg = "worm mode"; + status = -EPERM; + } + return; + } + todumpster = -1; + status = metatree.rename(dir, oldname, newname, + oldpath, overwrite, todumpster, euser, egroup); +} + +/* virtual */ void +MetaSetMtime::handle() +{ + if (fid > 0 && status == 0) { + // mtime is already set, only log. + return; + } + SetEUserAndEGroup(*this); + MetaFattr* fa = 0; + status = metatree.lookupPath(ROOTFID, pathname, + euser, egroup, fa); + if (status != 0) { + return; + } + if (! fa->CanWrite(euser, egroup)) { + status = -EACCES; + return; + } + fa->mtime = mtime; + fid = fa->id(); +} + +/* virtual */ void +MetaChangeFileReplication::handle() +{ + MetaFattr* const fa = metatree.getFattr(fid); + if (! fa) { + statusMsg = "no such file"; + status = -ENOENT; + } + if (! CanAccessFile(fa, *this)) { + return; + } + SetEUserAndEGroup(*this); + if (! fa->CanWrite(euser, egroup)) { + status = -EACCES; + return; + } + numReplicas = min(numReplicas, + max(int16_t(fa->numReplicas), + (fa->striperType == KFS_STRIPED_FILE_TYPE_RS && + fa->numRecoveryStripes > 0) ? + gLayoutManager.GetMaxReplicasPerRSFile() : + gLayoutManager.GetMaxReplicasPerFile() + )); + status = metatree.changeFileReplication(fa, numReplicas); + if (status == 0) { + numReplicas = fa->numReplicas; // update for log() + } +} + +/* + * Move chunks from src file into the end chunk boundary of the dst file. + */ +/* virtual */ void +MetaCoalesceBlocks::handle() +{ + dstStartOffset = -1; + mtime = microseconds(); + SetEUserAndEGroup(*this); + status = metatree.coalesceBlocks( + srcPath, dstPath, srcFid, dstFid, + dstStartOffset, &mtime, numChunksMoved, + euser, egroup); + KFS_LOG_STREAM(status == 0 ? 
+ MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + "coalesce blocks " << srcPath << "->" << dstPath << + " " << srcFid << "->" << dstFid << + " status: " << status << + " offset: " << dstStartOffset << + " chunks moved: " << numChunksMoved << + KFS_LOG_EOM; +} + +/* virtual */ void +MetaRetireChunkserver::handle() +{ + status = gLayoutManager.RetireServer(location, nSecsDown); +} + +/* virtual */ void +MetaToggleWORM::handle() +{ + KFS_LOG_STREAM_INFO << "Toggle WORM: " << value << KFS_LOG_EOM; + setWORMMode(value); + status = 0; +} + +/* virtual */ void +MetaHello::handle() +{ + if (! server) { + // This is likely coming from the ClientSM. + KFS_LOG_STREAM_DEBUG << "no server invalid cmd: " << Show() << + KFS_LOG_EOM; + status = -EINVAL; + } + if (status < 0) { + // bad hello request...possible cluster key mismatch + return; + } + gLayoutManager.AddNewServer(this); +} + +/* virtual */ void +MetaBye::handle() +{ + gLayoutManager.ServerDown(server); +} + +/* virtual */ void +MetaLeaseAcquire::handle() +{ + if (gLayoutManager.VerifyAllOpsPermissions()) { + SetEUserAndEGroup(*this); + } + status = gLayoutManager.GetChunkReadLease(this); +} + +/* virtual */ void +MetaLeaseRenew::handle() +{ + status = gLayoutManager.LeaseRenew(this); +} + +/* virtual */ void +MetaLeaseRelinquish::handle() +{ + status = gLayoutManager.LeaseRelinquish(this); + KFS_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << + Show() << " status: " << status << + KFS_LOG_EOM; +} + +/* virtual */ void +MetaLeaseCleanup::handle() +{ + gLayoutManager.LeaseCleanup(); + // Some leases might be expired or relinquished: try to cleanup the + // dumpster. + // FIXME: + // Dumpster cleanup needs to be logged, otherwise all files that ever + // got there will accumulate during replay. Log compactor doesn't help + // as it does not, and can not empty the dumpster. + // Only meta server empties dumpster. 
+ // Checkpoints from the forked copy should alleviate the problem. + // Defer this for now assuming that checkpoints from forked copy is + // the default operating mode. + metatree.cleanupDumpster(); + metatree.cleanupPathToFidCache(); + status = 0; +} + +class PrintChunkServerLocations { + ostream &os; +public: + PrintChunkServerLocations(ostream &out): os(out) { } + void operator () (const ChunkServerPtr &s) + { + os << ' ' << s->GetServerLocation(); + } +}; + +/* virtual */ void +MetaGetPathName::handle() +{ + ostringstream os; + const MetaFattr* fa = 0; + if (fid < 0) { + const MetaChunkInfo* chunkInfo = 0; + LayoutManager::Servers srvs; + if (! gLayoutManager.GetChunkFileId( + chunkId, fid, &chunkInfo, &fa, &srvs)) { + status = -ENOENT; + statusMsg = "no such chunk"; + return; + } + if (chunkInfo) { + os << + "Chunk-offset: " << chunkInfo->offset << "\r\n" + "Chunk-version: " << chunkInfo->chunkVersion << "\r\n" + ; + } + os << "Num-replicas: " << srvs.size() << "\r\n"; + if (! srvs.empty()) { + os << "Replicas:"; + for_each(srvs.begin(), srvs.end(), + PrintChunkServerLocations(os)); + os << "\r\n"; + } + } else { + fa = metatree.getFattr(fid); + if (! fa) { + status = -ENOENT; + statusMsg = "no such file"; + return; + } + } + if (fa) { + os << "Path-name: " << metatree.getPathname(fa) << "\r\n"; + FattrReply(fa, fattr); + } + result = os.str(); +} + +/* virtual */ void +MetaChmod::handle() +{ + MetaFattr* const fa = metatree.getFattr(fid); + if (! fa) { + status = -ENOENT; + return; + } + if (IsValidMode(mode)) { + status = -EINVAL; + } + SetEUserAndEGroup(*this); + if (fa->user != euser && euser != kKfsUserRoot) { + status = -EACCES; + return; + } + status = 0; + fa->mode = mode; +} + +/* virtual */ void +MetaChown::handle() +{ + MetaFattr* const fa = metatree.getFattr(fid); + if (! 
fa) {
        status = -ENOENT;
        return;
    }
    // Resolve both the effective and the requested user/group ids.
    SetUserAndGroup(*this);
    // Only the current owner or root may change ownership.
    if (fa->user != euser && euser != kKfsUserRoot) {
        status = -EACCES;
        return;
    }
    // A non-root user cannot give the file to somebody else.
    if (user != kKfsUserNone && euser != user && euser != kKfsUserRoot) {
        status = -EACCES;
        return;
    }
    // A non-root user may only assign a group it belongs to.
    if (group != kKfsGroupNone && euser != kKfsUserRoot &&
            ! IsGroupMember(euser, group)) {
        statusMsg = "user not a member of a group";
        status = -EACCES;
        return;
    }
    status = 0;
    // kKfsUserNone / kKfsGroupNone mean "leave unchanged".
    if (user != kKfsUserNone) {
        fa->user = user;
    }
    if (group != kKfsGroupNone) {
        fa->group = group;
    }
}

// Chunk server reports a corrupt chunk and/or a chunk directory status
// change; only requests that arrived over a chunk server connection are
// honored.
/* virtual */ void
MetaChunkCorrupt::handle()
{
    if (server) {
        if (! chunkDir.empty()) {
            server->SetChunkDirStatus(chunkDir, dirOkFlag);
        }
        if (chunkId > 0) {
            gLayoutManager.ChunkCorrupt(this);
        }
    } else {
        // This is likely coming from the ClientSM.
        KFS_LOG_STREAM_DEBUG << "no server invalid cmd: " << Show() <<
        KFS_LOG_EOM;
        status = -EINVAL;
    }
}

// Chunk server asks to have chunks evacuated (planned maintenance etc).
/* virtual */ void
MetaChunkEvacuate::handle()
{
    if (server) {
        gLayoutManager.ChunkEvacuate(this);
    } else {
        // This is likely coming from the ClientSM.
        KFS_LOG_STREAM_DEBUG << "no server invalid cmd: " << Show() <<
        KFS_LOG_EOM;
        status = -EINVAL;
    }
}

// Periodic driver for the re-replication / recovery checker.
/* virtual */ void
MetaChunkReplicationCheck::handle()
{
    gLayoutManager.ChunkReplicationChecker();
    status = 0;
}

// Chunk server completed the "begin make chunk stable" phase.
/* virtual */ void
MetaBeginMakeChunkStable::handle()
{
    gLayoutManager.BeginMakeChunkStableDone(this);
    status = 0;
}

// Log-completion callback for the make-chunk-stable log record.
// Deletes itself on completion; returns non-zero only on a bad invocation.
int
MetaLogMakeChunkStable::logDone(int code, void *data)
{
    if (code != EVENT_CMD_DONE || data != this) {
        panic("MetaLogMakeChunkStable::logDone invalid invocation");
        return 1;
    }
    if (op == META_LOG_MAKE_CHUNK_STABLE) {
        gLayoutManager.LogMakeChunkStableDone(this);
    }
    delete this;
    return 0;
}

// Chunk server completed the make-chunk-stable protocol.
/* virtual */ void
MetaChunkMakeStable::handle()
{
    gLayoutManager.MakeChunkStableDone(this);
    status = 0;
}

// Human-readable description used by debug / trace logging.
/* virtual */ string
MetaChunkMakeStable::Show() const
{
    ostringstream os;
    os <<
        "make-chunk-stable:"
        " server: " << server->GetServerLocation() <<
        " seq: " << opSeqno <<
        " status: " << status <<
        (statusMsg.empty() ? "" : " ") << statusMsg <<
        " fileid: " << fid <<
        " chunkid: " << chunkId <<
        " chunkvers: " << chunkVersion <<
        " chunkSize: " << chunkSize <<
        " chunkChecksum: " << chunkChecksum
    ;
    return os.str();
}

// Chunk server answered a chunk size query.
/* virtual */ void
MetaChunkSize::handle()
{
    status = gLayoutManager.GetChunkSizeDone(this);
}

// Replication / recovery of a chunk finished on the destination server.
/* virtual */ void
MetaChunkReplicate::handle()
{
    gLayoutManager.ChunkReplicationDone(this);
}

// Human-readable description used by debug / trace logging.
/* virtual */ string
MetaChunkReplicate::Show() const
{
    ostringstream os;
    os <<
    (numRecoveryStripes > 0 ? "recover" : "replicate:") <<
    " chunk: " << chunkId <<
    " version: " << chunkVersion <<
    " file: " << fid <<
    " fileSize: " << fileSize <<
    " path: " << pathname <<
    " recovStripes: " << numRecoveryStripes <<
    " seq: " << opSeqno <<
    " from: " << (dataServer ?
        dataServer->GetServerLocation() : ServerLocation()) <<
    " to: " << (server ?
server->GetServerLocation() : ServerLocation())
    ;
    return os.str();
}

// Fill in the ping response with layout / WORM status.
/* virtual */ void
MetaPing::handle()
{
    status = 0;
    gLayoutManager.Ping(resp, gWormMode);

}

// Emit the list of connected (up) chunk servers into the response buffer.
/* virtual */ void
MetaUpServers::handle()
{
    if (! HasEnoughIoBuffersForResponse(*this)) {
        return;
    }
    ostream& os = sWOStream.Set(resp);
    gLayoutManager.UpServers(os);
    os.flush();
    sWOStream.Reset();
    if (! os) {
        // Stream failure means the io-buffer backed stream hit its limit.
        resp.Clear();
        status = -ENOMEM;
        statusMsg = "response exceeds max. size";
    }
}

// Recompute cumulative directory sizes over the whole tree.
/* virtual */ void
MetaRecomputeDirsize::handle()
{
    status = 0;
    KFS_LOG_STREAM_INFO << "Processing a recompute dir size..." << KFS_LOG_EOM;
    metatree.recomputeDirSize();
}

// Forked children exit immediately on SIGHUP (parent went away) ...
static void SigHupHandler(int /* sinum */)
{
    _exit(1);
}

// ... and on SIGALRM (child exceeded its allotted run time).
static void SigAlarmHandler(int /* sinum */)
{
    _exit(2);
}

// Runs in the child right after fork(): arm the watchdog signal handlers,
// optionally set an alarm so a wedged child cannot run forever, and let
// the loggers / network dispatch drop state inherited from the parent.
static void ChildAtFork(int childTimeLimit)
{
    signal(SIGHUP, &SigHupHandler);
    signal(SIGALRM, &SigAlarmHandler);
    if (childTimeLimit > 0) {
        alarm(childTimeLimit);
    }
    if (MsgLogger::GetLogger()) {
        MsgLogger::GetLogger()->ChildAtFork();
    }
    AuditLog::ChildAtFork();
    globalNetManager().ChildAtFork();
    gNetDispatch.ChildAtFork();
}

// fork() wrapper: quiesces loggers/dispatch before forking, fixes up the
// child via ChildAtFork(), and re-arms the parent afterwards.
// Returns the fork() result (0 in the child, pid or -1 in the parent).
static int DoFork(int childTimeLimit)
{
    gNetDispatch.PrepareCurrentThreadToFork();
    AuditLog::PrepareToFork();
    MsgLogger* const logger = MsgLogger::GetLogger();
    if (logger) {
        logger->PrepareToFork();
    }
    const int ret = fork();
    if (ret == 0) {
        ChildAtFork(childTimeLimit);
    } else {
        if (logger) {
            logger->ForkDone();
        }
        AuditLog::ForkDone();
    }
    return ret;
}

// Dump the chunk -> server map from a forked child so the meta server can
// keep serving RPCs; the request stays suspended until the child finishes
// and handle() is invoked a second time (pid > 0 branch).
/* virtual */ void
MetaDumpChunkToServerMap::handle()
{
    suspended = false;
    if (pid > 0) {
        pid = -1;
        return; // Child finished.
    }
    if (gChildProcessTracker.GetProcessCount() > 0) {
        statusMsg = "another child process running";
        status = -EAGAIN;
        return;
    }
    if ((pid = DoFork(20 * 60)) == 0) {
        // let the child write out the map; if the map is large, this'll
        // take several seconds. we get the benefits of writing out the
        // map in the background while the metaserver continues to
        // process other RPCs
        gLayoutManager.DumpChunkToServerMap(gChunkmapDumpDir);
        _exit(0); // Child does not do graceful exit.
    }
    KFS_LOG_STREAM_INFO << "chunk to server map writer pid: " << pid <<
    KFS_LOG_EOM;
    // if fork() failed, let the sender know
    if (pid < 0) {
        status = -1;
        return;
    }
    // hold on to the request until the child finishes
    ostringstream os;
    os << gChunkmapDumpDir << "/chunkmap.txt";
    chunkmapFile = os.str();
    suspended = true;
    gChildProcessTracker.Track(pid, this);
}

// Stream the re-replication candidate list into the response buffer.
/* virtual */ void
MetaDumpChunkReplicationCandidates::handle()
{
    if (! HasEnoughIoBuffersForResponse(*this)) {
        return;
    }
    gLayoutManager.DumpChunkReplicationCandidates(this);
}

// File system consistency check, run in a forked child.
// First pass (pid <= 0): create temp files, fork; the child writes the
// fsck report streams and exits, while the parent suspends the request.
// Second pass (pid > 0, after the child finished): read the report files
// back into the response buffer, truncating at the configured limit.
/* virtual */ void
MetaFsck::handle()
{
    suspended = false;
    resp.Clear();
    if (pid > 0) {
        if (! HasEnoughIoBuffersForResponse(*this)) {
            return;
        }
        // Child finished.
        pid = -1;
        if (status == 0) {
            int maxReadSize = min(
                gLayoutManager.GetMaxResponseSize(),
                sMaxFsckResponseSize);
            if (maxReadSize <= 0) {
                statusMsg = "out of io buffers";
                status = -ENOMEM;
            } else {
                if (fd.empty()) {
                    statusMsg = "internal error";
                    status = -EINVAL;
                    return;
                }
                for (Fds::const_iterator it = fd.begin();
                        maxReadSize > 0 &&
                        it != fd.end();
                        ++it) {
                    struct stat st = {0};
                    if (fstat(*it, &st) < 0) {
                        status = -errno;
                        statusMsg = QCUtils::SysError(
                            -status);
                        if (status >= 0) {
                            status = -EIO;
                        }
                        break;
                    }
                    // Zero length first stream signals child failure.
                    if (st.st_size <= 0 &&
                            it == fd.begin()) {
                        status = -EIO;
                        break;
                    }
                    const int nRead =
                        resp.Read(*it, maxReadSize);
                    if (st.st_size > maxReadSize &&
                            maxReadSize == nRead) {
                        ostream& os =
                            sWOStream.Set(resp);
                        os <<
                            "\nWARNING: output"
                            " truncated to " <<
                            maxReadSize << " bytes"
                            "\n";
                        os.flush();
                        if (! os) {
                            resp.Clear();
                            statusMsg ="out of io "
                                "buffers";
                            status = -ENOMEM;
                        }
                        sWOStream.Reset();
                        break;
                    }
                    if (nRead < st.st_size) {
                        statusMsg = "short read";
                        status = -EIO;
                        break;
                    }
                    maxReadSize -= nRead;
                }
            }
        } else if (status > 0) {
            status = -status;
        }
        for (Fds::const_iterator it = fd.begin();
                it != fd.end();
                ++it) {
            close(*it);
        }
        fd.clear();
        if (status != 0) {
            status = status < 0 ? status : -EIO;
            if (statusMsg.empty()) {
                statusMsg = "fsck io failure";
            }
            resp.Clear();
        }
        return;
    }
    if (gChildProcessTracker.GetProcessCount() > 0) {
        statusMsg = "another child process running";
        status = -EAGAIN;
        return;
    }
    const int cnt = gLayoutManager.FsckStreamCount(
        reportAbandonedFilesFlag);
    if (cnt <= 0) {
        statusMsg = "internal error";
        status = -EINVAL;
        return;
    }
    const char* const suffix = ".XXXXXX";
    const size_t suffixLen = strlen(suffix);
    // NOTE(review): template arguments were lost in formatting here and
    // below (StBufferT / vector / ostream containers); restore from
    // revision history before building.
    StBufferT buf;
    vector names;
    names.reserve(cnt);
    fd.reserve(cnt);
    for (int i = 0; i < cnt; i++) {
        char* const ptr = buf.Resize(sTmpName.length() + suffixLen + 1);
        memcpy(ptr, sTmpName.data(), sTmpName.size());
        strcpy(ptr + sTmpName.size(), suffix);
        const int tfd = mkstemp(ptr);
        if (tfd < 0) {
            status = errno > 0 ? -errno : -EINVAL;
            statusMsg = "failed to create temporary file";
            while (--i >= 0) {
                close(fd[i]);
                unlink(names[i].c_str());
            }
            return;
        }
        fd.push_back(tfd);
        names.push_back(string(ptr));
    }
    if ((pid = DoFork((int)(gLayoutManager.GetMaxFsckTime() /
            (1000 * 1000)))) == 0) {
        // Child: re-open the temp files as ofstreams, unlink them (the
        // parent still holds its own fds to the same inodes), run fsck,
        // and exit without graceful cleanup.
        StBufferT streamsPtrBuf;
        ostream** const ptr = streamsPtrBuf.Resize(cnt + 1);
        ofstream* const streams = new ofstream[cnt];
        bool failedFlag = false;
        for (int i = 0; i < cnt; i++) {
            const char* const name = names[i].c_str();
            if (! failedFlag) {
                streams[i].open(name);
                ptr[i] = streams + i;
            }
            close(fd[i]);
            unlink(name);
            failedFlag = failedFlag || ! streams[i];
        }
        if (! failedFlag) {
            ptr[cnt] = 0;
            gLayoutManager.Fsck(ptr, reportAbandonedFilesFlag);
            failedFlag = false;
            for (int i = 0; i < cnt; i++) {
                streams[i].flush();
                streams[i].close();
                if (! streams[i]) {
                    failedFlag = true;
                    break;
                }
            }
            for (int i = 0; i < cnt; i++) {
                if (failedFlag) {
                    // Zero length file means error.
                    // NOTE(review): fd[i] was already closed in the
                    // loop above, so this ftruncate() appears to be a
                    // no-op on a stale descriptor -- confirm intent.
                    ftruncate(fd[i], 0);
                }
            }
        }
        delete [] streams;
        _exit(failedFlag ? 3 : 0); // Child does not do graceful close.
    }
    if (pid < 0) {
        status = errno > 0 ? -errno : -EINVAL;
        statusMsg = "fork failure";
        for (int i = 0; i < cnt; i++) {
            close(fd[i]);
            unlink(names[i].c_str());
        }
        return;
    }
    KFS_LOG_STREAM_INFO << "fsck pid: " << pid <<
    KFS_LOG_EOM;
    suspended = true;
    gChildProcessTracker.Track(pid, this);
}

// Pick up fsck related configuration values.
void
MetaFsck::SetParameters(const Properties& props)
{
    sTmpName = props.getValue(
        "metaServer.fsck.tmpfile", sTmpName);
    sMaxFsckResponseSize = props.getValue(
        "metaServer.fsck.maxFsckResponseSize", sMaxFsckResponseSize);
}

string MetaFsck::sTmpName("/tmp/kfsfsck.tmp");
int MetaFsck::sMaxFsckResponseSize(20 << 20);

// Force an immediate scan of all leases.
/* virtual */ void
MetaCheckLeases::handle()
{
    status = 0;
    gLayoutManager.CheckAllLeases();
}

// Collect the global counters into the stats string.
/* virtual */ void
MetaStats::handle()
{
    ostringstream os;
    status = 0;
    globals().counterManager.Show(os);
    stats = os.str();

}

// Emit the lists of files currently open for read and for write.
/* virtual */ void
MetaOpenFiles::handle()
{
    if (! HasEnoughIoBuffersForResponse(*this)) {
        return;
    }
    ReadInfo openForRead;
    WriteInfo openForWrite;
    gLayoutManager.GetOpenFiles(openForRead, openForWrite);
    status = 0;
    ostream& os = sWOStream.Set(resp);
    for (ReadInfo::const_iterator it = openForRead.begin();
            it != openForRead.end();
            ++it) {
        os << it->first;
        // NOTE(review): the iterator's template arguments were lost in
        // formatting; restore from revision history.
        for (std::vector >::const_iterator
                i = it->second.begin();
                i != it->second.end();
                ++i) {
            if (!
(os << " " << i->first << " " << i->second)) { + break; + } + } + os << "\n"; + } + os << "\n"; + for (WriteInfo::const_iterator it = openForWrite.begin(); + it != openForWrite.end(); + ++it) { + os << it->first; + for (std::vector::const_iterator + i = it->second.begin(); + i != it->second.end(); + ++i) { + if (! (os << " " << *i)) { + break; + } + } + os << "\n"; + } + os.flush(); + if (! os) { + resp.Clear(); + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + } else { + openForReadCnt = openForRead.size(); + openForWriteCnt = openForWrite.size(); + } + sWOStream.Reset(); +} + +/* virtual */ void +MetaSetChunkServersProperties::handle() +{ + status = (int)properties.size(); + gLayoutManager.SetChunkServersProperties(properties); +} + +/* virtual */ void +MetaGetChunkServersCounters::handle() +{ + if (! HasEnoughIoBuffersForResponse(*this)) { + return; + } + status = 0; + gLayoutManager.GetChunkServerCounters(resp); +} + +/* virtual */ void +MetaGetRequestCounters::handle() +{ + if (! HasEnoughIoBuffersForResponse(*this)) { + return; + } + status = 0; + gNetDispatch.GetStatsCsv(resp); + userCpuMicroSec = gNetDispatch.GetUserCpuMicroSec(); + systemCpuMicroSec = gNetDispatch.GetSystemCpuMicroSec(); +} + +/* virtual */ void +MetaCheckpoint::handle() +{ + suspended = false; + if (pid > 0) { + // Child finished. + KFS_LOG_STREAM(status == 0 ? + MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelERROR) << + "checkpoint: " << lastCheckpointId << + " pid: " << pid << + " done; status: " << status << + " failures: " << failedCount << + KFS_LOG_EOM; + if (status < 0) { + failedCount++; + } else { + failedCount = 0; + lastCheckpointId = runningCheckpointId; + } + if (lockFd >= 0) { + close(lockFd); + } + if (failedCount > maxFailedCount) { + panic("checkpoint failures", false); + } + runningCheckpointId = -1; + pid = -1; + return; + } + status = 0; + if (intervalSec <= 0) { + return; // Disabled. 
+ } + const time_t now = globalNetManager().Now(); + if (lastCheckpointId < 0) { + // First call -- init. + lastCheckpointId = oplog.checkpointed(); + lastRun = now; + return; + } + if (now < lastRun + intervalSec) { + return; + } + if (oplog.checkpointed() == lastCheckpointId && + ! cp.isCPNeeded()) { + return; + } + if (lockFd >= 0) { + close(lockFd); + } + if (! lockFileName.empty() && + (lockFd = try_to_acquire_lockfile(lockFileName)) < 0) { + KFS_LOG_STREAM_INFO << "checkpoint: " << + " failed to acquire lock: " << lockFileName << + " " << QCUtils::SysError(lockFd) << + KFS_LOG_EOM; + return; // Retry later. + } + status = oplog.finishLog(); + if (status != 0) { + KFS_LOG_STREAM_ERROR << "failed to finish log:" + " status:" << status << + KFS_LOG_EOM; + if (lockFd >= 0) { + close(lockFd); + } + return; + } + lastRun = now; + // If logger decided not to start new log it won't reset checkpoint + // mutation count. + if (cp.isCPNeeded()) { + if (lockFd >= 0) { + close(lockFd); + } + if (lastCheckpointId != oplog.checkpointed()) { + panic("finish log failure", false); + return; + } + KFS_LOG_STREAM_WARN << "finish log: no new log started" + "; delaying checkpoint" << + KFS_LOG_EOM; + return; + } + runningCheckpointId = oplog.checkpointed(); + if ((pid = DoFork(chekpointWriteTimeoutSec)) == 0) { + metatree.disableFidToPathname(); + metatree.recomputeDirSize(); + cp.setWriteSyncFlag(chekpointWriteSyncFlag); + cp.setWriteBufferSize(chekpointWriteBufferSize); + status = cp.do_CP(); + // Child does not attempt graceful exit. + _exit(status == 0 ? 0 : 1); + } + KFS_LOG_STREAM(pid > 0 ? 
+ MsgLogger::kLogLevelINFO : + MsgLogger::kLogLevelERROR) << + "checkpoint: " << lastCheckpointId << + " pid: " << pid << + KFS_LOG_EOM; + if (pid < 0) { + status = -1; + return; + } + suspended = true; + gChildProcessTracker.Track(pid, this); +} + +void +MetaCheckpoint::ScheduleNow() +{ + lastRun = globalNetManager().Now() - intervalSec - 1; +} + +void +MetaCheckpoint::SetParameters(const Properties& props) +{ + intervalSec = props.getValue( + "metaServer.checkpoint.interval", intervalSec); + lockFileName = props.getValue( + "metaServer.checkpoint.lockFileName", lockFileName); + maxFailedCount = max(0, props.getValue( + "metaServer.checkpoint.maxFailedCount", maxFailedCount)); + chekpointWriteTimeoutSec = max(0, props.getValue( + "metaServer.chekpointWriteTimeoutSec", + chekpointWriteTimeoutSec)); + chekpointWriteSyncFlag = max(0, props.getValue( + "metaServer.chekpointWriteSync", + chekpointWriteSyncFlag)); + chekpointWriteBufferSize = props.getValue( + "metaServer.chekpointWriteBufferSize", + chekpointWriteBufferSize); +} + +/*! + * \brief add a new request to the queue: we used to have threads before; at + * that time, the requests would be dropped into the queue and the request + * processor would pick it up. We have taken out threads; so this method is + * just pass thru + * \param[in] r the request + */ +void +submit_request(MetaRequest *r) +{ + const int64_t start = microseconds(); + if (r->submitCount++ == 0) { + r->submitTime = start; + r->processTime = start; + } else { + // accumulate processing time. + r->processTime = start - r->processTime; + } + r->handle(); + if (r->suspended) { + r->processTime = microseconds() - r->processTime; + } else { + oplog.dispatch(r); + } +} + +/*! + * \brief print out the leaf nodes for debugging + */ +void +printleaves() +{ + metatree.printleaves(); +} + +/*! + * \brief log lookup request (nop) + */ +int +MetaLookup::log(ostream& /* file */) const +{ + return 0; +} + +/*! 
+ * \brief log lookup path request (nop) + */ +int +MetaLookupPath::log(ostream& /* file */) const +{ + return 0; +} + +/*! + * \brief log a file create + */ +int +MetaCreate::log(ostream &file) const +{ + // use the log entry time as a proxy for when the file was created + file << "create" + "/dir/" << dir << + "/name/" << name << + "/id/" << fid << + "/numReplicas/" << numReplicas << + "/ctime/" << ShowTime(microseconds()) + ; + if (striperType != KFS_STRIPED_FILE_TYPE_NONE) { + file << + "/striperType/" << striperType << + "/numStripes/" << numStripes << + "/numRecoveryStripes/" << numRecoveryStripes << + "/stripeSize/" << stripeSize + ; + } + if (todumpster > 0) { + file << "/todumpster/" << todumpster; + } + file << "/user/" << user << + "/group/" << group << + "/mode/" << mode << + '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a directory create + */ +int +MetaMkdir::log(ostream &file) const +{ + file << "mkdir" + "/dir/" << dir << + "/name/" << name << + "/id/" << fid << + "/ctime/" << ShowTime(microseconds()) << + "/user/" << user << + "/group/" << group << + "/mode/" << mode << + '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a file deletion + */ +int +MetaRemove::log(ostream &file) const +{ + file << "remove/dir/" << dir << "/name/" << name; + if (todumpster > 0) { + file << "/todumpster/" << todumpster; + } + file << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a directory deletion + */ +int +MetaRmdir::log(ostream &file) const +{ + file << "rmdir/dir/" << dir << "/name/" << name << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log directory read (nop) + */ +int +MetaReaddir::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief log directory read (nop) + */ +int +MetaReaddirPlus::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief log getalloc (nop) + */ +int +MetaGetalloc::log(ostream &file) const +{ + return 0; +} + +/*! 
+ * \brief log getlayout (nop) + */ +int +MetaGetlayout::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief log a chunk allocation + */ +int +MetaAllocate::log(ostream &file) const +{ + if (! logFlag) { + return 0; + } + // use the log entry time as a proxy for when the block was created/file + // was modified + file << "allocate/file/" << fid << "/offset/" << offset + << "/chunkId/" << chunkId + << "/chunkVersion/" << chunkVersion + << "/mtime/" << ShowTime(microseconds()) + << "/append/" << (appendChunk ? 1 : 0) + << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a file truncation + */ +int +MetaTruncate::log(ostream &file) const +{ + // use the log entry time as a proxy for when the file was modified + if (pruneBlksFromHead) { + file << "pruneFromHead/file/" << fid << "/offset/" << offset + << "/mtime/" << ShowTime(mtime) << '\n'; + } else { + file << "truncate/file/" << fid << "/offset/" << offset + << "/mtime/" << ShowTime(mtime) << '\n'; + } + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a rename + */ +int +MetaRename::log(ostream &file) const +{ + file << "rename" + "/dir/" << dir << + "/old/" << oldname << + "/new/" << newname + ; + if (todumpster > 0) { + // Insert sentinel empty entry for pop_path() to work. + file << "//todumpster/" << todumpster; + } + file << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a block coalesce + */ +int +MetaCoalesceBlocks::log(ostream &file) const +{ + file << "coalesce" + "/old/" << srcFid << + "/new/" << dstFid << + "/count/" << numChunksMoved << + "/mtime/" << ShowTime(mtime) << + '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log a setmtime + */ +int +MetaSetMtime::log(ostream &file) const +{ + file << "setmtime/file/" << fid + << "/mtime/" << ShowTime(mtime) << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! 
+ * \brief log change file replication + */ +int +MetaChangeFileReplication::log(ostream &file) const +{ + file << "setrep/file/" << fid << "/replicas/" << numReplicas << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief log retire chunkserver (nop) + */ +int +MetaRetireChunkserver::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief log toggling of metaserver WORM state (nop) + */ +int +MetaToggleWORM::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a chunkserver hello, there is nothing to log + */ +int +MetaHello::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a chunkserver's death, there is nothing to log + */ +int +MetaBye::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief When asking a chunkserver for a chunk's size, there is + * write out the estimate of the file's size. + */ +int +MetaChunkSize::log(ostream &file) const +{ + if (filesize < 0) + return 0; + + file << "size/file/" << fid << "/filesize/" << filesize << '\n'; + return file.fail() ? -EIO : 0; +} + +/*! + * \brief for a ping, there is nothing to log + */ +int +MetaPing::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a request of upserver, there is nothing to log + */ +int +MetaUpServers::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a stats request, there is nothing to log + */ +int +MetaStats::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a map dump request, there is nothing to log + */ +int +MetaDumpChunkToServerMap::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a fsck request, there is nothing to log + */ +int +MetaFsck::log(ostream &file) const +{ + return 0; +} + + +/*! + * \brief for a recompute dir size request, there is nothing to log + */ +int +MetaRecomputeDirsize::log(ostream &file) const +{ + return 0; +} + +/*! 
+ * \brief for a check all leases request, there is nothing to log + */ +int +MetaCheckLeases::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a dump chunk replication candidates request, there is nothing to log + */ +int +MetaDumpChunkReplicationCandidates::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for an open files request, there is nothing to log + */ +int +MetaOpenFiles::log(ostream &file) const +{ + return 0; +} + +int +MetaSetChunkServersProperties::log(ostream & /* file */) const +{ + return 0; +} + +int +MetaGetChunkServersCounters::log(ostream & /* file */) const +{ + return 0; +} + +/*! + * \brief for an open files request, there is nothing to log + */ +int +MetaChunkCorrupt::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a lease acquire request, there is nothing to log + */ +int +MetaLeaseAcquire::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a lease renew request, there is nothing to log + */ +int +MetaLeaseRenew::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a lease renew relinquish, there is nothing to log + */ +int +MetaLeaseRelinquish::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief for a lease cleanup request, there is nothing to log + */ +int +MetaLeaseCleanup::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief This is an internally generated op. There is + * nothing to log. + */ +int +MetaChunkReplicationCheck::log(ostream &file) const +{ + return 0; +} + +/*! + * \brief This is an internally generated op. Log chunk id, size, and checksum. + */ +int +MetaLogMakeChunkStable::log(ostream &file) const +{ + if (chunkVersion < 0) { + KFS_LOG_STREAM_WARN << "invalid chunk version ignoring: " << + Show() << + KFS_LOG_EOM; + return 0; + } + file << "mkstable" << + (op == META_LOG_MAKE_CHUNK_STABLE ? 
"" : "done") << + "/fileId/" << fid << + "/chunkId/" << chunkId << + "/chunkVersion/" << chunkVersion << + "/size/" << chunkSize << + "/checksum/" << chunkChecksum << + "/hasChecksum/" << (hasChunkChecksum ? 1 : 0) << + '\n'; + return file.fail() ? -EIO : 0; +} + +int +MetaChmod::log(ostream& file) const +{ + file << "chmod" << + "/file/" << fid << + "/mode/" << mode << + '\n'; + return file.fail() ? -EIO : 0; +} + +int +MetaChown::log(ostream& file) const +{ + file << "chown" << + "/file/" << fid << + "/user/" << user << + "/group/" << group << + '\n'; + return file.fail() ? -EIO : 0; +} + +bool +MetaSetChunkServersProperties::ValidateRequestHeader( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen, + bool hasChecksum, + uint32_t checksum) +{ + if (! MetaRequest::ValidateRequestHeader( + name, + nameLen, + header, + headerLen, + hasChecksum, + checksum)) { + return false; + } + BufferInputStream is(header, headerLen); + const char separator = ':'; + Properties prop; + prop.loadProperties(is, separator, false); + prop.copyWithPrefix("chunkServer.", properties); + return true; +} + +/*! + * \brief Various parse handlers. All of them follow the same model: + * If parse is successful, returns a dynamically + * allocated meta request object. It is the callers responsibility to dispose + * of this pointer. 
+ */ + +void +MetaRequest::Init() +{ + MetaRequestsList::Init(*this); + QCStMutexLocker locker(gNetDispatch.GetClientManagerMutex()); + MetaRequestsList::PushBack(sMetaRequestsPtr, *this); + sMetaRequestCount++; +} + +/* virtual */ +MetaRequest::~MetaRequest() +{ + QCStMutexLocker locker(gNetDispatch.GetClientManagerMutex()); + MetaRequestsList::Remove(sMetaRequestsPtr, *this); + sMetaRequestCount--; +} + +/* virtual */ void +MetaRequest::handle() +{ + status = -ENOSYS; // Not implemented +} + +/* static */ void +MetaRequest::SetParameters(const Properties& props) +{ + sRequireHeaderChecksumFlag = props.getValue( + "metaServer.request.requireHeaderChecksum", 0) != 0; + sVerifyHeaderChecksumFlag = props.getValue( + "metaServer.request.verifyHeaderChecksum", 1) != 0; +} + +/* static */ uint32_t +MetaRequest::Checksum( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen) +{ + return ComputeBlockChecksum( + ComputeBlockChecksum(name, nameLen), header, headerLen); +} + +bool MetaRequest::sRequireHeaderChecksumFlag = false; +bool MetaRequest::sVerifyHeaderChecksumFlag = true; +int MetaRequest::sMetaRequestCount = 0; +MetaRequest* MetaRequest::sMetaRequestsPtr[1] = {0}; + +typedef RequestHandler MetaRequestHandler; +static const MetaRequestHandler& MakeMetaRequestHandler() +{ + static MetaRequestHandler sHandler; + return sHandler + .MakeParser("LOOKUP") + .MakeParser("LOOKUP_PATH") + .MakeParser("CREATE") + .MakeParser("MKDIR") + .MakeParser("REMOVE") + .MakeParser("RMDIR") + .MakeParser("READDIR") + .MakeParser("READDIRPLUS") + .MakeParser("GETALLOC") + .MakeParser("GETLAYOUT") + .MakeParser("ALLOCATE") + .MakeParser("TRUNCATE") + .MakeParser("RENAME") + .MakeParser("SET_MTIME") + .MakeParser("CHANGE_FILE_REPLICATION") + .MakeParser("COALESCE_BLOCKS") + .MakeParser("RETIRE_CHUNKSERVER") + + // Meta server <-> Chunk server ops + .MakeParser("HELLO") + .MakeParser("CORRUPT_CHUNK") + .MakeParser("EVACUATE_CHUNK") + + // Lease related ops + 
.MakeParser("LEASE_ACQUIRE") + .MakeParser("LEASE_RENEW") + .MakeParser("LEASE_RELINQUISH") + + .MakeParser("CHECK_LEASES") + .MakeParser("PING") + .MakeParser("UPSERVERS") + .MakeParser("TOGGLE_WORM") + .MakeParser("STATS") + .MakeParser("RECOMPUTE_DIRSIZE") + .MakeParser("DUMP_CHUNKTOSERVERMAP") + .MakeParser< + MetaDumpChunkReplicationCandidates >("DUMP_CHUNKREPLICATIONCANDIDATES") + .MakeParser("FSCK") + .MakeParser("OPEN_FILES") + .MakeParser< + MetaGetChunkServersCounters >("GET_CHUNK_SERVERS_COUNTERS") + .MakeParser< + MetaSetChunkServersProperties >("SET_CHUNK_SERVERS_PROPERTIES") + .MakeParser("GET_REQUEST_COUNTERS") + .MakeParser("DISCONNECT") + .MakeParser("GETPATHNAME") + .MakeParser("CHOWN") + .MakeParser("CHMOD") + ; +} +static const MetaRequestHandler& sMetaRequestHandler = MakeMetaRequestHandler(); + +/*! + * \brief parse a command sent by a client + * + * Commands are of the form: + * \r\n + * {header: value \r\n}+\r\n + * + * @param[in] ioBuf: buffer containing the request sent by the client + * @param[in] len: length of cmdBuf + * @param[out] res: A piece of memory allocated by calling new that + * contains the data for the request. It is the caller's + * responsibility to delete the memory returned in res. + * @retval 0 on success; -1 if there is an error + */ +int +ParseCommand(const IOBuffer& ioBuf, int len, MetaRequest **res, + char* threadParseBuffer /* = 0 */) +{ + // Main thread's buffer + static char tempBuf[MAX_RPC_HEADER_LEN]; + + *res = 0; + if (len <= 0 || len > MAX_RPC_HEADER_LEN) { + return -1; + } + // Copy if request header spans two or more buffers. + // Requests on average are over a magnitude shorter than single + // io buffer (4K page), thus the copy should be infrequent, and + // small enough. With modern cpu the copy should be take less + // cpu cycles than buffer boundary handling logic (or one symbol + // per call processing), besides the requests header are small + // enough to fit into cpu cache. 
+ int reqLen = len; + const char* const buf = ioBuf.CopyOutOrGetBufPtr( + threadParseBuffer ? threadParseBuffer : tempBuf, reqLen); + assert(reqLen == len); + *res = reqLen == len ? sMetaRequestHandler.Handle(buf, reqLen) : 0; + return (*res ? 0 : -1); +} + +bool MetaCreate::Validate() +{ + return (dir >= 0 && ! name.empty() && numReplicas > 0); +} + +/*! + * \brief Generate response (a string) for various requests that + * describes the result of the request execution. The generated + * response string is based on the KFS protocol. All follow the same + * model: + * @param[out] os: A string stream that contains the response. + */ +void +MetaLookup::response(ostream& os) +{ + if (! OkHeader(this, os)) { + return; + } + FattrReply(os, fattr) << "\r\n"; +} + +void +MetaLookupPath::response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + FattrReply(os, fattr) << "\r\n"; +} + +void +MetaCreate::response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << "File-handle: " << fid << "\r\n"; + if (striperType != KFS_STRIPED_FILE_TYPE_NONE) { + os << "Striper-type: " << striperType << "\r\n"; + } + os << + "User: " << user << "\r\n" + "Group: " << group << "\r\n" + "Mode: " << mode << "\r\n" + "\r\n"; +} + +void +MetaRemove::response(ostream &os) +{ + PutHeader(this, os) << "\r\n"; +} + +void +MetaMkdir::response(ostream &os) +{ + if (! OkHeader(this, os)) { + return; + } + os << + "File-handle: " << fid << "\r\n" + "User: " << user << "\r\n" + "Group: " << group << "\r\n" + "Mode: " << mode << "\r\n" + "\r\n"; +} + +void +MetaRmdir::response(ostream &os) +{ + PutHeader(this, os) << "\r\n"; +} + +void +MetaReaddir::response(ostream& os, IOBuffer& buf) +{ + if (! OkHeader(this, os)) { + return; + } + os << + "Num-Entries: " << numEntries << "\r\n" + "Has-more-entries: " << (hasMoreEntriesFlag ? 
1 : 0) << "\r\n" + "Content-length: " << resp.BytesConsumable() << "\r\n" + "\r\n"; + os.flush(); + buf.Move(&resp); +} + +void +MetaReaddirPlus::response(ostream& os, IOBuffer& buf) +{ + if (! OkHeader(this, os)) { + return; + } + size_t entryCount; + IOBuffer resp; + if (numEntries >= 0) { + ReaddirPlusWriter writer( + resp, + maxRespSize, + getLastChunkInfoOnlyIfSizeUnknown); + entryCount = writer.Write(dentries, lastChunkInfos, + noAttrsFlag); + } else { + ReaddirPlusWriter writer( + resp, + maxRespSize, + getLastChunkInfoOnlyIfSizeUnknown); + entryCount = writer.Write(dentries, lastChunkInfos, + noAttrsFlag); + } + hasMoreEntriesFlag = hasMoreEntriesFlag || entryCount < dentries.size(); + dentries.clear(); + lastChunkInfos.clear(); + if (ioBufPending > 0) { + gLayoutManager.ChangeIoBufPending(-ioBufPending); + ioBufPending = 0; + } + if (hasMoreEntriesFlag && numEntries < 0) { + resp.Clear(); + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + OkHeader(this, os); + return; + } + os << + "Num-Entries: " << entryCount << "\r\n" + "Has-more-entries: " << (hasMoreEntriesFlag ? 1 : 0) << "\r\n" + "Content-length: " << resp.BytesConsumable() << "\r\n" + "\r\n"; + os.flush(); + buf.Move(&resp); +} + +void +MetaRename::response(ostream &os) +{ + PutHeader(this, os) << "\r\n"; +} + +void +MetaSetMtime::response(ostream &os) +{ + PutHeader(this, os) << "\r\n"; +} + +void +MetaGetalloc::response(ostream& os) +{ + if (! OkHeader(this, os)) { + return; + } + os << + "Chunk-handle: " << chunkId << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n"; + if (replicasOrderedFlag) { + os << "Replicas-ordered: 1\r\n"; + } + os << "Num-replicas: " << locations.size() << "\r\n"; + + assert(locations.size() > 0); + + os << "Replicas:"; + for_each(locations.begin(), locations.end(), ListServerLocations(os)); + os << "\r\n\r\n"; +} + +void +MetaGetlayout::response(ostream& os, IOBuffer& buf) +{ + if (! 
    // (continuation of MetaGetlayout::response, whose head is in the
    // preceding chunk)
       OkHeader(this, os)) {
        return;
    }
    if (hasMoreChunksFlag) {
        os << "Has-more-chunks: 1\r\n";
    }
    os <<
        "Num-chunks: " << numChunks << "\r\n"
        "Content-length: " << resp.BytesConsumable() << "\r\n"
    "\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaAllocate::response(ostream &os)
{
    if (! OkHeader(this, os)) {
        return;
    }
    responseSelf(os);
}

// Emits the allocation reply body. A previously formatted reply may be
// cached in responseStr, in which case it is written out verbatim.
void
MetaAllocate::responseSelf(ostream &os)
{
    if (status < 0) {
        return;
    }
    if (! responseStr.empty()) {
        os.write(responseStr.data(), responseStr.size());
        return;
    }
    os << "Chunk-handle: " << chunkId << "\r\n";
    os << "Chunk-version: " << chunkVersion << "\r\n";
    if (appendChunk) {
        os << "Chunk-offset: " << offset << "\r\n";
    }
    assert((! servers.empty() && master) || invalidateAllFlag);
    if (master) {
        os << "Master: " << master->GetServerLocation() << "\r\n";
    }
    os << "Num-replicas: " << servers.size() << "\r\n";
    if (! servers.empty()) {
        os << "Replicas:";
        for_each(servers.begin(), servers.end(),
            PrintChunkServerLocations(os));
    }
    os << "\r\n\r\n";
}

void
MetaLeaseAcquire::response(ostream &os)
{
    if (! OkHeader(this, os)) {
        return;
    }
    if (leaseId >= 0) {
        os << "Lease-id: " << leaseId << "\r\n";
    }
    if (! leaseIds.empty()) {
        os << "Lease-ids:" << leaseIds << "\r\n";
    }
    os << "\r\n";
}

void
MetaLeaseRenew::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaLeaseRelinquish::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaCoalesceBlocks::response(ostream &os)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os <<
        "Dst-start-offset: " << dstStartOffset << "\r\n"
        "M-Time: " << ShowTime(mtime) << "\r\n"
    "\r\n";
}

void
MetaHello::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaChunkCorrupt::response(ostream &os)
{
    // Some corrupt-chunk notifications are fire-and-forget.
    if (noReplyFlag) {
        return;
    }
    PutHeader(this, os) << "\r\n";
}

void
MetaChunkEvacuate::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaTruncate::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaChangeFileReplication::response(ostream &os)
{
    PutHeader(this, os) <<
        "Num-replicas: " << numReplicas << "\r\n\r\n";
}

void
MetaRetireChunkserver::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaToggleWORM::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaPing::response(ostream &os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os.flush();
    buf.Move(&resp);
}

void
MetaUpServers::response(ostream& os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os << "Content-length: " << resp.BytesConsumable() << "\r\n\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaStats::response(ostream &os)
{
    PutHeader(this, os) << stats << "\r\n";
}

void
MetaCheckLeases::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaRecomputeDirsize::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaDumpChunkToServerMap::response(ostream &os)
{
    PutHeader(this, os) << "Filename: " << chunkmapFile << "\r\n\r\n";
}

void
MetaDumpChunkReplicationCandidates::response(ostream &os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os <<
        "Num-replication: " << numReplication << "\r\n"
        "Num-pending-recovery: " << numPendingRecovery << "\r\n"
        "Content-length: " << resp.BytesConsumable() << "\r\n"
    "\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaFsck::response(ostream &os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os << "Content-length: " << resp.BytesConsumable() << "\r\n\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaOpenFiles::response(ostream& os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os <<
        "Read: " << openForReadCnt << "\r\n"
        "Write: " << openForWriteCnt << "\r\n"
        "Content-length: " << resp.BytesConsumable() << "\r\n"
    "\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaSetChunkServersProperties::response(ostream &os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaGetChunkServersCounters::response(ostream &os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os << "Content-length: " << resp.BytesConsumable() << "\r\n\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaGetRequestCounters::response(ostream &os, IOBuffer& buf)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os <<
        "Content-length: " << resp.BytesConsumable() << "\r\n"
        "User-cpu-micro-sec: " << userCpuMicroSec << "\r\n"
        // NOTE(review): "mcro" (sic) is the existing wire key; correcting the
        // spelling would break any client that parses it -- leave as is.
        "System-cpu-mcro-sec: " << systemCpuMicroSec << "\r\n"
    "\r\n";
    os.flush();
    buf.Move(&resp);
}

void
MetaGetPathName::response(ostream& os)
{
    if (! OkHeader(this, os)) {
        return;
    }
    os << result;
    FattrReply(os, fattr) << "\r\n";
}

void
MetaChmod::response(ostream& os)
{
    PutHeader(this, os) << "\r\n";
}

void
MetaChown::response(ostream& os)
{
    PutHeader(this, os) << "\r\n";
}

/*!
 * \brief Generate request (a string) that should be sent to the chunk
 * server. The generated request string is based on the KFS
 * protocol. All follow the same model:
 * @param[out] os: A string stream that contains the response.
 */
void
MetaChunkAllocate::request(ostream &os)
{
    assert(req);

    os << "ALLOCATE \r\n";
    os << "Cseq: " << opSeqno << "\r\n";
    os << "Version: KFS/1.0\r\n";
    os << "File-handle: " << req->fid << "\r\n";
    os << "Chunk-handle: " << req->chunkId << "\r\n";
    os << "Chunk-version: " << req->chunkVersion << "\r\n";
    if (leaseId >= 0) {
        os << "Lease-id: " << leaseId << "\r\n";
    }
    os << "Chunk-append: " << (req->appendChunk ? 1 : 0) << "\r\n";

    os << "Num-servers: " << req->servers.size() << "\r\n";
    assert(req->servers.size() > 0);

    os << "Servers:";
    for_each(req->servers.begin(), req->servers.end(),
        PrintChunkServerLocations(os));
    os << "\r\n\r\n";
}

void
MetaChunkDelete::request(ostream &os)
{
    os << "DELETE \r\n";
    os << "Cseq: " << opSeqno << "\r\n";
    os << "Version: KFS/1.0\r\n";
    os << "Chunk-handle: " << chunkId << "\r\n\r\n";
}

void
MetaChunkHeartbeat::request(ostream &os)
{
    os <<
    "HEARTBEAT \r\n"
    "Cseq: " << opSeqno << "\r\n"
    "Version: KFS/1.0\r\n"
    "Num-evacuate: " << evacuateCount << "\r\n"
    "\r\n"
    ;
}

// Formats id into the buffer ending at "end" and returns a pointer to the
// first character. Hex digits are filled in from the back of the buffer;
// decimal formatting is delegated to toString().
static inline char*
ChunkIdToString(chunkId_t id, bool hexFormatFlag, char* end)
{
    if (hexFormatFlag) {
        char* p = end - 1;
        chunkId_t val = id;
        char* const s = p - sizeof(val) * 2;
        do {
            *--p = "0123456789ABCDEF"[val & 0xF];
            val >>= 4;
        } while (val != 0 && s < p);
        return p;
    }
    return toString(id, end);
}

void
MetaChunkStaleNotify::request(ostream& os, IOBuffer& buf)
{
    const size_t count = staleChunkIds.GetSize();
    os <<
    "STALE_CHUNKS \r\n"
    "Cseq: " << opSeqno << "\r\n"
    "Version: KFS/1.0\r\n"
    "Num-chunks: " << count << "\r\n"
    ;
    if (evacuatedFlag) {
        os << "Evacuated: 1\r\n";
    }
    if (hexFormatFlag) {
        os << "HexFormat: 1\r\n";
    }
    const int kBufEnd = 30;
    char tmpBuf[kBufEnd + 1];
    char* const end = tmpBuf + kBufEnd + 1;
    // Zero or one id: format in place, no intermediate IOBuffer needed.
    if (count <= 1) {
        char* const p = count < 1 ? end - 1 :
            ChunkIdToString(staleChunkIds.Front(), hexFormatFlag, end);
        size_t len = end - p - 1;
        os << "Content-length: " << len << "\r\n\r\n";
        os.write(p, len);
        return;
    }

    // Many ids: write " "-separated ids into an IOBuffer via the writer;
    // tmpBuf's last byte acts as the separator included in each Write().
    ChunkIdQueue::ConstIterator it(staleChunkIds);
    const chunkId_t* id;
    IOBuffer ioBuf;
    IOBufferWriter writer(ioBuf);
    tmpBuf[kBufEnd] = (char)' ';
    while ((id = it.Next())) {
        char* const p = ChunkIdToString(*id, hexFormatFlag, end);
        if (! hexFormatFlag) {
            tmpBuf[kBufEnd] = (char)' ';
        }
        writer.Write(p, (int)(end - p));
    }
    writer.Close();
    const int len = ioBuf.BytesConsumable();
    os << "Content-length: " << len << "\r\n\r\n";
    // Payloads that fit comfortably in one buffer are copied straight into
    // the stream; larger chains are handed over without copying.
    IOBuffer::iterator const bi = ioBuf.begin();
    const int defsz = IOBufferData::GetDefaultBufferSize();
    if (len < defsz - defsz / 4 &&
            bi != ioBuf.end() && len == bi->BytesConsumable()) {
        os.write(bi->Consumer(), len);
    } else {
        os.flush();
        buf.Move(&ioBuf);
    }
}

void
MetaChunkRetire::request(ostream &os)
{
    os << "RETIRE \r\n";
    os << "Cseq: " << opSeqno << "\r\n";
    os << "Version: KFS/1.0\r\n\r\n";
}

void
MetaChunkVersChange::request(ostream &os)
{
    os <<
    "CHUNK_VERS_CHANGE \r\n"
    "Cseq: " << opSeqno << "\r\n"
    "Version: KFS/1.0\r\n"
    "File-handle: " << fid << "\r\n"
    "Chunk-handle: " << chunkId << "\r\n"
    "From-chunk-version: " << fromVersion << "\r\n"
    "Chunk-version: " << chunkVersion << "\r\n"
    ;
    if (makeStableFlag) {
        os << "Make-stable: 1\r\n";
    }
    os << "\r\n";
}

void
MetaBeginMakeChunkStable::request(ostream &os)
{
    os << "BEGIN_MAKE_CHUNK_STABLE\r\n"
        "Cseq: " << opSeqno << "\r\n"
        "Version: KFS/1.0\r\n"
        "File-handle: " << fid << "\r\n"
        "Chunk-handle: " << chunkId << "\r\n"
        "Chunk-version: " << chunkVersion << "\r\n"
    "\r\n";
}

void
MetaChunkMakeStable::request(ostream &os)
{
    os << "MAKE_CHUNK_STABLE \r\n";
    os << "Cseq: " << opSeqno << "\r\n";
    os << "Version: KFS/1.0\r\n";
    os << "File-handle: " << fid << "\r\n";
    os << "Chunk-handle: " << chunkId << "\r\n";
    os << "Chunk-version: " << chunkVersion << "\r\n";
    os << "Chunk-size: " << chunkSize << "\r\n";
    if (hasChunkChecksum) {
        os << "Chunk-checksum: " << chunkChecksum << "\r\n";
    }
    os << "\r\n";
}

static const string sReplicateCmdName("REPLICATE");

void
MetaChunkReplicate::request(ostream& os)
{
    // The body is staged in a string first so its checksum can be emitted
    // on the command line ahead of it.
    ostringstream rs;
    rs <<
    "Cseq: " << opSeqno << "\r\n"
    "Version: KFS/1.0\r\n"
    "File-handle: " << fid << "\r\n"
    "Chunk-handle: " << chunkId << "\r\n"
    ;
    if (numRecoveryStripes > 0) {
        // Recovery (RS) replication: describe the striping geometry.
        rs <<
        "Chunk-version: 0\r\n"
        "Chunk-offset: " << chunkOffset << "\r\n"
        "Striper-type: " << striperType << "\r\n"
        "Num-stripes: " << numStripes << "\r\n"
        "Num-recovery-stripes: " << numRecoveryStripes << "\r\n"
        "Stripe-size: " << stripeSize << "\r\n"
        "Meta-port: " << srcLocation.port << "\r\n"
        ;
        if (fileSize > 0) {
            rs << "File-size: " << fileSize << "\r\n";
        }
    } else {
        // Plain copy: name the source replica location.
        rs << "Chunk-location: " << srcLocation << "\r\n";
    }
    rs << "\r\n";
    const string req = rs.str();
    os << sReplicateCmdName << " " << Checksum(
        sReplicateCmdName.data(),
        sReplicateCmdName.size(),
        req.data(),
        req.size()) <<
    "\r\n";
    os.write(req.data(), req.size());
}

void
MetaChunkReplicate::handleReply(const Properties& prop)
{
    if (status == 0) {
        const seq_t cVers = prop.getValue("Chunk-version", seq_t(0));
        if (numRecoveryStripes <= 0) {
            chunkVersion = cVers;
        } else if (cVers != 0) {
            status    = -EINVAL;
            statusMsg = "invalid chunk version in reply";
            return;
        }
    }
    fid = prop.getValue("File-handle", fid_t(0));
    invalidStripes.clear();
    const int sc = numStripes + numRecoveryStripes;
    if (status == 0 || sc <= 0) {
        return;
    }
    // On failure, parse the optional list of invalid stripes:
    // whitespace-separated (index, chunkId, chunkVersion) triples.
    const string idxStr(prop.getValue("Invalid-stripes", string()));
    if (idxStr.empty()) {
        return;
    }
    istringstream is(idxStr);
    is >> std::ws;
    while (! is.eof()) {
        int idx = -1;
        chunkId_t chunkId = -1;
        seq_t chunkVers = -1;
        if (!
(is >> idx >> chunkId >> chunkVers >> std::ws) || + idx < 0 || sc <= idx || + (int)invalidStripes.size() >= sc - 1) { + KFS_LOG_STREAM_ERROR << "replicate reply: parse error:" + " pos: " << invalidStripes.size() << + " Invalid-stripes: " << idxStr << + KFS_LOG_EOM; + invalidStripes.clear(); + break; + } + invalidStripes.insert( + make_pair(idx, make_pair(chunkId, chunkVers))); + } +} + +void +MetaChunkSize::request(ostream &os) +{ + os << + "SIZE \r\n" + "Cseq: " << opSeqno << "\r\n" + "Version: KFS/1.0\r\n" + "File-handle: " << fid << "\r\n" + "Chunk-version: " << chunkVersion << "\r\n" + "Chunk-handle: " << chunkId << "\r\n" + "\r\n"; +} + +void +MetaChunkSetProperties::request(ostream &os) +{ + os << + "CMD_SET_PROPERTIES\r\n" + "Cseq: " << opSeqno << "\r\n" + "Version: KFS/1.0\r\n" + "Content-length: " << serverProps.length() << "\r\n\r\n" << + serverProps + ; +} + +void +MetaChunkServerRestart::request(ostream &os) +{ + os << + "RESTART_CHUNK_SERVER\r\n" + "Cseq: " << opSeqno << "\r\n" + "Version: KFS/1.0\r\n" + "\r\n" + ; +} + +} /* namespace KFS */ diff --git a/src/cc/meta/MetaRequest.h b/src/cc/meta/MetaRequest.h new file mode 100644 index 000000000..a4ae19f6b --- /dev/null +++ b/src/cc/meta/MetaRequest.h @@ -0,0 +1,2578 @@ +/*! + * $Id$ + * + * \file MetaRequest.h + * \brief protocol requests to KFS metadata server + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov + * + * The model is that various receiver threads handle network + * connections and extract RPC parameters, then queue a request + * of the appropriate type for the metadata server to process. + * When the operation is finished, the server calls back to the + * receiver with status and any results. + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ +#if !defined(KFS_REQUEST_H) +#define KFS_REQUEST_H + +#include "common/kfsdecls.h" +#include "kfstypes.h" +#include "meta.h" +#include "util.h" + +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/IOBuffer.h" +#include "common/Properties.h" +#include "common/StBuffer.h" +#include "common/StdAllocator.h" +#include "common/DynamicArray.h" +#include "qcdio/QCDLList.h" + +#include +#include +#include +#include + +namespace KFS { + +using std::ostream; +using std::vector; +using std::map; +using std::pair; +using std::ostringstream; +using std::dec; +using std::oct; + +/*! 
+ * \brief Metadata server operations + */ +#define KfsForEachMetaOpId(f) \ + /* Client -> Metadata server ops */ \ + f(LOOKUP) \ + f(LOOKUP_PATH) \ + f(CREATE) \ + f(MKDIR) \ + f(REMOVE) \ + f(RMDIR) \ + f(READDIR) \ + f(READDIRPLUS) \ + f(GETALLOC) \ + f(GETLAYOUT) \ + f(ALLOCATE) \ + f(TRUNCATE) \ + f(RENAME) \ + f(SETMTIME) /* Set the mtime on a specific file to support cp -p */ \ + f(CHANGE_FILE_REPLICATION) /* Client is asking for a change in file's replication factor */ \ + f(COALESCE_BLOCKS) /* Client is asking for blocks from one file to be coalesced with another */ \ + /* Admin is notifying us to retire a chunkserver */ \ + f(RETIRE_CHUNKSERVER) \ + f(TOGGLE_WORM) /* Toggle metaserver's WORM mode */ \ + /* Metadata server <-> Chunk server ops */ \ + f(HELLO) /* Hello RPC sent by chunkserver on startup */ \ + f(BYE) /* Internally generated op whenever a chunkserver goes down */ \ + f(CHUNK_HEARTBEAT) /* Periodic heartbeat from meta->chunk */ \ + f(CHUNK_ALLOCATE) /* Allocate chunk RPC from meta->chunk */ \ + f(CHUNK_DELETE) /* Delete chunk RPC from meta->chunk */ \ + f(CHUNK_STALENOTIFY) /* Stale chunk notification RPC from meta->chunk */ \ + f(BEGIN_MAKE_CHUNK_STABLE) \ + f(CHUNK_MAKE_STABLE) /* Notify a chunkserver to make a chunk stable */ \ + f(CHUNK_COALESCE_BLOCK) /* Notify a chunkserver to coalesce a chunk from file to another */ \ + f(CHUNK_VERSCHANGE) /* Notify chunkserver of version # change from meta->chunk */ \ + f(CHUNK_REPLICATE) /* Ask chunkserver to replicate a chunk */ \ + f(CHUNK_SIZE) /* Ask chunkserver for the size of a chunk */ \ + f(CHUNK_REPLICATION_CHECK) /* Internally generated */ \ + f(CHUNK_CORRUPT) /* Chunkserver is notifying us that a chunk is corrupt */ \ + /* All the blocks on the retiring server have been evacuated and the */ \ + /* server can safely go down. 
We are asking the server to take a graceful bow */ \ + f(CHUNK_RETIRE) \ + /* Lease related messages */ \ + f(LEASE_ACQUIRE) \ + f(LEASE_RENEW) \ + f(LEASE_RELINQUISH) \ + /* Internally generated to cleanup leases */ \ + f(LEASE_CLEANUP) \ + /* Metadata server monitoring */ \ + f(PING) /* Print out chunkserves and their configs */ \ + f(STATS) /* Print out whatever statistics/counters we have */ \ + f(RECOMPUTE_DIRSIZE) /* Do a top-down size update */ \ + f(DUMP_CHUNKTOSERVERMAP) /* Dump out the chunk -> location map */ \ + f(DUMP_CHUNKREPLICATIONCANDIDATES) /* Dump out the list of chunks being re-replicated */ \ + f(FSCK) /* Check all blocks and report files that have missing blocks */ \ + f(CHECK_LEASES) /* Check all the leases and clear out expired ones */ \ + f(OPEN_FILES) /* Print out open files---for which there is a valid read/write lease */ \ + f(UPSERVERS) /* Print out live chunk servers */ \ + f(LOG_MAKE_CHUNK_STABLE) /* Emit log record with chunk length and checksum */ \ + f(LOG_MAKE_CHUNK_STABLE_DONE) /* Emit log record with successful completion of make chunk stable. */ \ + f(SET_CHUNK_SERVERS_PROPERTIES) \ + f(CHUNK_SERVER_RESTART) \ + f(CHUNK_SET_PROPERTIES) \ + f(GET_CHUNK_SERVERS_COUNTERS) \ + f(LOG_CHUNK_VERSION_CHANGE) \ + f(GET_REQUEST_COUNTERS) \ + f(CHECKPOINT) \ + f(DISCONNECT) \ + f(GETPATHNAME) \ + f(CHUNK_EVACUATE) \ + f(CHMOD) \ + f(CHOWN) + +enum MetaOp { +#define KfsMakeMetaOpEnumEntry(name) META_##name, + KfsForEachMetaOpId(KfsMakeMetaOpEnumEntry) +#undef KfsMakeMetaOpEnumEntry + META_NUM_OPS_COUNT // must be the last one +}; + + +class ChunkServer; +typedef boost::shared_ptr ChunkServerPtr; +typedef DynamicArray ChunkIdQueue; + +/*! + * \brief Meta request base class + */ +struct MetaRequest { + typedef vector< + ChunkServerPtr, + StdAllocator + > Servers; + + const MetaOp op; //!< type of request + int status; //!< returned status + int clientProtoVers; //!< protocol version # sent by client + int submitCount; //!< for time tracking. 
+ int64_t submitTime; //!< to time requests, optional. + int64_t processTime; //!< same as previous + string statusMsg; //!< optional human readable status message + seq_t opSeqno; //!< command sequence # sent by the client + seq_t seqno; //!< sequence no. in log + const bool mutation; //!< mutates metatree + bool suspended; //!< is this request suspended somewhere + bool fromChunkServerFlag; + string clientIp; + IOBuffer reqHeaders; + kfsUid_t euser; + kfsGid_t egroup; + MetaRequest* next; + KfsCallbackObj* clnt; //!< a handle to the client that generated this request. + MetaRequest(MetaOp o, bool mu, seq_t opSeq = -1) + : op(o), + status(0), + clientProtoVers(0), + submitCount(0), + submitTime(0), + processTime(0), + statusMsg(), + opSeqno(opSeq), + seqno(0), + mutation(mu), + suspended(false), + fromChunkServerFlag(false), + clientIp(), + reqHeaders(), + euser(kKfsUserNone), + egroup(kKfsGroupNone), + next(0), + clnt(0) + { MetaRequest::Init(); } + virtual ~MetaRequest(); + virtual void handle(); + //!< when an op finishes execution, we send a response back to + //!< the client. This function should generate the appropriate + //!< response to be sent back as per the KFS protocol. + virtual void response(ostream& /* os */) {} + virtual void response(ostream& os, IOBuffer& /* buf */) { response(os); } + virtual int log(ostream &file) const = 0; //!< write request to log + virtual string Show() const { return string(); } + virtual void setChunkServer(const ChunkServerPtr& /* cs */) {}; + bool ValidateRequestHeader( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen, + bool hasChecksum, + uint32_t checksum) + { + return ( + hasChecksum ? + (! sVerifyHeaderChecksumFlag || + Checksum(name, nameLen, header, headerLen) == checksum) : + (! sRequireHeaderChecksumFlag || ! 
mutation) + ); + } + template static T& ParserDef(T& parser) + { + return parser + .Def("Cseq", &MetaRequest::opSeqno, seq_t(-1)) + .Def("Client-Protocol-Version", &MetaRequest::clientProtoVers, int(0)) + .Def("From-chunk-server", &MetaRequest::fromChunkServerFlag, false) + .Def("UserId", &MetaRequest::euser, kKfsUserNone) + .Def("GroupId", &MetaRequest::egroup, kKfsGroupNone) + ; + } + static void SetParameters(const Properties& props); + static uint32_t Checksum( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen); + static int GetRequestCount() + { return sMetaRequestCount; } +private: + MetaRequest* mPrevPtr[1]; + MetaRequest* mNextPtr[1]; + + static bool sRequireHeaderChecksumFlag; + static bool sVerifyHeaderChecksumFlag; + static int sMetaRequestCount; + static MetaRequest* sMetaRequestsPtr[1]; + + friend class QCDLListOp; + typedef QCDLList MetaRequestsList; + void Init(); +}; + +void submit_request(MetaRequest *r); + +/*! + * \brief look up a file name + */ +struct MetaLookup: public MetaRequest { + fid_t dir; //!< parent directory fid + string name; //!< name to look up + MFattr fattr; + MetaLookup() + : MetaRequest(META_LOOKUP, false), + dir(-1), + name(), + fattr() + {} + virtual void handle(); + virtual int log(ostream& file) const; + virtual void response(ostream& os); + virtual string Show() const + { + ostringstream os; + + os << "lookup: name = " << name; + os << " (parent fid = " << dir << ")"; + return os.str(); + } + bool Validate() + { + return (dir >= 0 && ! name.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaLookup::dir, fid_t(-1)) + .Def("Filename", &MetaLookup::name ) + ; + } +}; + +/*! 
+ * \brief look up a complete path + */ +struct MetaLookupPath: public MetaRequest { + fid_t root; //!< fid of starting directory + string path; //!< path to look up + MFattr fattr; + MetaLookupPath() + : MetaRequest(META_LOOKUP_PATH, false), + root(-1), + path(), + fattr() + {} + virtual void handle(); + virtual int log(ostream& file) const; + virtual void response(ostream& os); + virtual string Show() const + { + ostringstream os; + + os << "lookup_path: path = " << path; + os << " (root fid = " << root << ")"; + return os.str(); + } + bool Validate() + { + return (root >= 0 && ! path.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Root File-handle", &MetaLookupPath::root, fid_t(-1)) + .Def("Pathname", &MetaLookupPath::path ) + ; + } +}; + +/*! + * \brief create a file + */ +struct MetaCreate: public MetaRequest { + fid_t dir; //!< parent directory fid + fid_t fid; //!< file ID of new file + int16_t numReplicas; //!< desired degree of replication + int32_t striperType; + int32_t numStripes; + int32_t numRecoveryStripes; + int32_t stripeSize; + bool exclusive; //!< model the O_EXCL flag + fid_t todumpster; //!< moved existing to dumpster + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + seq_t reqId; + string name; //!< name to create + MetaCreate() + : MetaRequest(META_CREATE, true), + dir(-1), + numReplicas(1), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + numStripes(0), + numRecoveryStripes(0), + stripeSize(0), + exclusive(false), + todumpster(-1), + user(kKfsUserNone), + group(kKfsGroupNone), + mode(kKfsModeUndef), + reqId(-1), + name() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + os << "create:" + " name: " << name << + " parent: " << dir << + " replication: " << numReplicas << + " striper: " << striperType << + " stripes: " << numStripes << + " recovery: " << numRecoveryStripes 
<< + " stripe-size: " << stripeSize << + " todumpster: " << todumpster << + " user: " << user << + " group: " << group << + " mode: " << oct << mode << dec + ; + return os.str(); + } + bool Validate(); + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaCreate::dir, fid_t(-1)) + .Def("Num-replicas", &MetaCreate::numReplicas, int16_t( 1)) + .Def("Striper-type", &MetaCreate::striperType, int32_t(KFS_STRIPED_FILE_TYPE_NONE)) + .Def("Num-stripes", &MetaCreate::numStripes, int32_t(0)) + .Def("Num-recovery-stripes", &MetaCreate::numRecoveryStripes, int32_t(0)) + .Def("Stripe-size", &MetaCreate::stripeSize, int32_t(0)) + .Def("Exclusive", &MetaCreate::exclusive, false) + .Def("Filename", &MetaCreate::name ) + .Def("Owner", &MetaCreate::user, kKfsUserNone) + .Def("Group", &MetaCreate::group, kKfsGroupNone) + .Def("Mode", &MetaCreate::mode, kKfsModeUndef) + .Def("ReqId", &MetaCreate::reqId, seq_t(-1)) + ; + } +}; + +/*! + * \brief create a directory + */ +struct MetaMkdir: public MetaRequest { + fid_t dir; //!< parent directory fid + fid_t fid; //!< file ID of new directory + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + seq_t reqId; + string name; //!< name to create + MetaMkdir() + : MetaRequest(META_MKDIR, true), + dir(-1), + fid(-1), + user(kKfsUserNone), + group(kKfsGroupNone), + mode(kKfsModeUndef), + reqId(-1), + name() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "mkdir:" + " name: " << name << + " parent: " << dir << + " user: " << user << + " group: " << group << + " mode: " << oct << mode << dec << + " euser: " << euser << + " egroup: " << egroup + ; + return os.str(); + } + bool Validate() + { + return (dir >= 0 && ! 
name.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaMkdir::dir, fid_t(-1)) + .Def("Directory", &MetaMkdir::name ) + .Def("Owner", &MetaMkdir::user, kKfsUserNone) + .Def("Group", &MetaMkdir::group, kKfsGroupNone) + .Def("Mode", &MetaMkdir::mode, kKfsModeUndef) + .Def("ReqId", &MetaMkdir::reqId, seq_t(-1)) + ; + } +}; + +/*! + * \brief remove a file + */ +struct MetaRemove: public MetaRequest { + fid_t dir; //!< parent directory fid + string name; //!< name to remove + string pathname; //!< full pathname to remove + fid_t todumpster; + MetaRemove() + : MetaRequest(META_REMOVE, true), + dir(-1), + name(), + pathname(), + todumpster(-1) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "remove:" + " path: " << pathname << + " name: " << name << + " dir: " << dir << + " todumpster: " << todumpster + ; + return os.str(); + } + bool Validate() + { + return (dir >= 0 && ! name.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaRemove::dir, fid_t(-1)) + .Def("Filename", &MetaRemove::name ) + .Def("Pathname", &MetaRemove::pathname ) + ; + } +}; + +/*! + * \brief remove a directory + */ +struct MetaRmdir: public MetaRequest { + fid_t dir; //!< parent directory fid + string name; //!< name to remove + string pathname; //!< full pathname to remove + MetaRmdir() + : MetaRequest(META_RMDIR, true), + dir(-1), + name(), + pathname() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "rmdir: path = " << pathname << " (name = " << name << ")"; + os << " (parent fid = " << dir << ")"; + return os.str(); + } + bool Validate() + { + return (dir >= 0 && ! 
name.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaRmdir::dir, fid_t(-1)) + .Def("Directory", &MetaRmdir::name ) + .Def("Pathname", &MetaRmdir::pathname ) + ; + } +}; + +/*! + * \brief read directory contents + */ +struct MetaReaddir: public MetaRequest { + fid_t dir; //!< directory to read + IOBuffer resp; + int numEntries; + bool hasMoreEntriesFlag; + string fnameStart; + MetaReaddir() + : MetaRequest(META_READDIR, false), + dir(-1), + resp(), + numEntries(-1), + hasMoreEntriesFlag(false), + fnameStart() + {} + virtual void handle(); + virtual int log(ostream& file) const; + virtual void response(ostream& os, IOBuffer& buf); + virtual string Show() const + { + ostringstream os; + os << "readdir: dir fid = " << dir; + return os.str(); + } + bool Validate() + { + return (dir >= 0); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Directory File-handle", &MetaReaddir::dir, fid_t(-1)) + .Def("Max-entries", &MetaReaddir::numEntries, 0) + .Def("Fname-start", &MetaReaddir::fnameStart) + ; + } +}; + +typedef vector< + ServerLocation, + StdAllocator +> ServerLocations; + +/*! + * \brief layout information for a chunk + */ +struct ChunkLayoutInfo { + chunkOff_t offset; //!< offset of chunk within file + chunkId_t chunkId; //!< Id of the chunk corresponding to offset + seq_t chunkVersion; //!< version # assigned to this chunk + ServerLocations locations; //!< where the copies of the chunks are + ostream& show(ostream& os) const + { + os << offset << + " " << chunkId << + " " << chunkVersion << + " " << locations.size(); + for (ServerLocations::size_type i = 0; + i < locations.size(); + ++i) { + os << + " " << locations[i].hostname << + " " << locations[i].port; + } + return os; + } +}; + +inline static ostream& operator<<(ostream& os, const ChunkLayoutInfo& li) { + return li.show(os); +} + +/*! 
+ * \brief read directory contents and get file attributes + */ +struct MetaReaddirPlus: public MetaRequest { + struct DEntry : public MFattr + { + DEntry() + : MFattr(), + name() + {} + DEntry(const MFattr& fa, const string& n) + : MFattr(fa), + name(n) + {} + string name; + }; + typedef vector > DEntries; + typedef vector > CInfos; + + fid_t dir; //!< directory to read + int numEntries; //!< max number of entres to return + int maxRespSize; + bool getLastChunkInfoOnlyIfSizeUnknown; + bool hasMoreEntriesFlag; + bool noAttrsFlag; + int64_t ioBufPending; + string fnameStart; + DEntries dentries; + CInfos lastChunkInfos; + + MetaReaddirPlus() + : MetaRequest(META_READDIRPLUS, false), + dir(-1), + numEntries(-1), + maxRespSize(-1), + getLastChunkInfoOnlyIfSizeUnknown(false), + hasMoreEntriesFlag(false), + noAttrsFlag(false), + ioBufPending(0), + fnameStart(), + dentries(), + lastChunkInfos() + {} + ~MetaReaddirPlus(); + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream& os, IOBuffer& buf); + virtual string Show() const + { + ostringstream os; + + os << "readdir plus: dir fid = " << dir; + return os.str(); + } + bool Validate() + { + return (dir >= 0); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Directory File-handle", &MetaReaddirPlus::dir, fid_t(-1)) + .Def("GetLastChunkInfoOnlyIfSizeUnknown", + &MetaReaddirPlus::getLastChunkInfoOnlyIfSizeUnknown, false) + .Def("Max-entries", &MetaReaddirPlus::numEntries, 0) + .Def("Fname-start", &MetaReaddirPlus::fnameStart) + ; + } +}; + +/*! + * \brief get allocation info. 
a chunk for a file + */ +struct MetaGetalloc: public MetaRequest { + fid_t fid; //!< file for alloc info is needed + chunkOff_t offset; //!< offset of chunk within file + chunkId_t chunkId; //!< Id of the chunk corresponding to offset + seq_t chunkVersion; //!< version # assigned to this chunk + ServerLocations locations; //!< where the copies of the chunks are + StringBufT<256> pathname; //!< pathname of the file (useful to print in debug msgs) + bool replicasOrderedFlag; + MetaGetalloc() + : MetaRequest(META_GETALLOC, false), + fid(-1), + offset(-1), + chunkId(-1), + chunkVersion(-1), + locations(), + pathname(), + replicasOrderedFlag(false) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "getalloc: " << pathname << " (fid = " << fid << ")"; + os << " offset = " << offset; + return os.str(); + } + bool Validate() + { + return (fid >= 0 && offset >= 0); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaGetalloc::fid, fid_t(-1)) + .Def("Chunk-offset", &MetaGetalloc::offset, chunkOff_t(-1)) + .Def("Pathname", &MetaGetalloc::pathname ) + ; + } +}; + +/*! + * \brief get allocation info. 
for all chunks of a file + */ +struct MetaGetlayout: public MetaRequest { + fid_t fid; //!< file for layout info is needed + chunkOff_t startOffset; + bool omitLocationsFlag; + bool lastChunkInfoOnlyFlag; + int maxResCnt; + int numChunks; + bool hasMoreChunksFlag; + IOBuffer resp; //!< result + MetaGetlayout() + : MetaRequest(META_GETLAYOUT, false), + fid(-1), + startOffset(0), + omitLocationsFlag(false), + lastChunkInfoOnlyFlag(false), + maxResCnt(-1), + numChunks(-1), + hasMoreChunksFlag(false), + resp() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + ostringstream os; + + os << "getlayout: fid = " << fid; + return os.str(); + } + bool Validate() + { + return (fid >= 0); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaGetlayout::fid, fid_t(-1)) + .Def("Start-offset", &MetaGetlayout::startOffset, chunkOff_t(0)) + .Def("Omit-locations", &MetaGetlayout::omitLocationsFlag, false) + .Def("Last-chunk-only", &MetaGetlayout::lastChunkInfoOnlyFlag, false) + .Def("Max-chunks", &MetaGetlayout::maxResCnt, -1) + ; + } +}; + +/*! + * \brief Op for relinquishing a lease on a chunk of a file. + */ +struct MetaLeaseRelinquish: public MetaRequest { + LeaseType leaseType; //!< input + chunkId_t chunkId; //!< input + int64_t leaseId; //!< input + chunkOff_t chunkSize; + bool hasChunkChecksum; + uint32_t chunkChecksum; + MetaLeaseRelinquish() + : MetaRequest(META_LEASE_RELINQUISH, false), + leaseType(READ_LEASE), + chunkId(-1), + leaseId(-1), + chunkSize(-1), + hasChunkChecksum(false), + chunkChecksum(0), + chunkChecksumHdr(-1), + leaseTypeStr() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + os << + "relinquish " << + (leaseType == READ_LEASE ? 
"read" : "write") << + " lease: " << leaseId << + " chunk: " << chunkId << + " chunkSize: " << chunkSize << + " checksum: " << (hasChunkChecksum ? + int64_t(chunkChecksum) : int64_t(-1)) + ; + return os.str(); + } + bool Validate() + { + leaseType = (leaseTypeStr == "WRITE_LEASE") ? + WRITE_LEASE : READ_LEASE; + hasChunkChecksum = chunkChecksumHdr >= 0; + chunkChecksum = hasChunkChecksum ? + (uint32_t)chunkChecksumHdr : (uint32_t)0; + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Lease-type", &MetaLeaseRelinquish::leaseTypeStr ) + .Def("Chunk-handle", &MetaLeaseRelinquish::chunkId, chunkId_t(-1)) + .Def("Lease-id", &MetaLeaseRelinquish::leaseId, int64_t(-1)) + .Def("Chunk-size", &MetaLeaseRelinquish::chunkSize, chunkOff_t(-1)) + .Def("Chunk-checksum", &MetaLeaseRelinquish::chunkChecksumHdr, int64_t(-1)) + ; + } +private: + int64_t chunkChecksumHdr; + StringBufT<32> leaseTypeStr; +}; + +/*! + * \brief allocate a chunk for a file + */ +struct MetaAllocate: public MetaRequest, public KfsCallbackObj { + fid_t fid; //!< file for which space has to be allocated + chunkOff_t offset; //!< offset of chunk within file + chunkId_t chunkId; //!< Id of the chunk that was allocated + seq_t chunkVersion; //!< version # assigned to this chunk + seq_t initialChunkVersion; + int16_t numReplicas; //!< inherited from file's fattr + bool stripedFileFlag; + bool layoutDone; //!< Has layout of chunk been done + //!< when set, the allocation request is asking the metaserver to append + //!< a chunk to the file and let the client know the offset at which it was + //!< appended. + bool appendChunk; + //!< Write append only: the space reservation size that will follow the + //!< chunk allocation. 
+ int spaceReservationSize; + //!< Suggested max # of concurrent appenders per chunk + int maxAppendersPerChunk; + //!< Server(s) on which this chunk has been placed + Servers servers; + //!< For replication, the master that runs the transaction + //!< for completing the write. + ChunkServerPtr master; + uint32_t numServerReplies; + bool logFlag; + bool invalidateAllFlag; + MetaAllocate* next; + int64_t leaseId; + chunkOff_t chunkBlockStart; + Permissions permissions; + MetaLeaseRelinquish* pendingLeaseRelinquish; + string responseStr; // Cached response + // With StringBufT instead of string the append allocation (presently + // the most frequent allocation type) saves malloc() calls. + StringBufT<64> clientHost; //!< the host from which request was received + StringBufT<256> pathname; //!< full pathname that corresponds to fid + MetaAllocate(seq_t s = -1, fid_t f = -1, chunkOff_t o = -1) + : MetaRequest(META_ALLOCATE, true, s), + KfsCallbackObj(), + fid(f), + offset(o), + chunkId(-1), + chunkVersion(-1), + initialChunkVersion(-1), + numReplicas(0), + stripedFileFlag(false), + layoutDone(false), + appendChunk(false), + spaceReservationSize(1 << 20), + maxAppendersPerChunk(64), + servers(), + master(), + numServerReplies(0), + logFlag(true), + invalidateAllFlag(false), + next(0), + leaseId(-1), + chunkBlockStart(-1), + permissions(), + pendingLeaseRelinquish(0), + responseStr(), + clientHost(), + pathname() + { + SET_HANDLER(this, &MetaAllocate::logOrLeaseRelinquishDone); + } + virtual ~MetaAllocate() + { delete pendingLeaseRelinquish; } + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const; + void responseSelf(ostream &os); + void LayoutDone(int64_t chunkAllocProcessTime); + int logOrLeaseRelinquishDone(int code, void *data); + bool Validate() + { + return (fid >= 0 && (offset >= 0 || appendChunk)); + } + template static T& ParserDef(T& parser) + { + return 
MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaAllocate::fid, fid_t(-1)) + .Def("Chunk-append", &MetaAllocate::appendChunk, false) + .Def("Chunk-offset", &MetaAllocate::offset, chunkOff_t(-1)) + .Def("Pathname", &MetaAllocate::pathname ) + .Def("Client-host", &MetaAllocate::clientHost ) + .Def("Space-reserve", &MetaAllocate::spaceReservationSize, int(1<<20)) + .Def("Max-appenders", &MetaAllocate::maxAppendersPerChunk, int(64)) + .Def("Invalidate-all", &MetaAllocate::invalidateAllFlag, false) + ; + } +}; + +/*! + * \brief truncate a file + */ +struct MetaTruncate: public MetaRequest { + fid_t fid; //!< file for which space has to be allocated + chunkOff_t offset; //!< offset to truncate the file to + //!< set if the blks from the beginning of the file to the offset have + //!< to be deleted. + bool pruneBlksFromHead; + StringBufT<256> pathname; //!< full pathname for file being truncated + int64_t mtime; + MetaTruncate() + : MetaRequest(META_TRUNCATE, true), + fid(-1), + offset(-1), + pruneBlksFromHead(false), + pathname(), + mtime() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + os << + (pruneBlksFromHead ? + "prune from head:" : "truncate:") << + " path: " << pathname << + " fid: " << fid << + " offset: " << offset + ; + return os.str(); + } + bool Validate() + { + return (fid >= 0 && offset >= 0); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaTruncate::fid, fid_t(-1)) + .Def("Offset", &MetaTruncate::offset, chunkOff_t(-1)) + .Def("Pathname", &MetaTruncate::pathname ) + .Def("Prune-from-head", &MetaTruncate::pruneBlksFromHead, false) + ; + } +}; + +/*! 
+ * \brief rename a file or directory + */ +struct MetaRename: public MetaRequest { + fid_t dir; //!< parent directory + string oldname; //!< old file name + string newname; //!< new file name + string oldpath; //!< fully-qualified old pathname + bool overwrite; //!< overwrite newname if it exists + fid_t todumpster; //!< moved original to dumpster + MetaRename() + : MetaRequest(META_RENAME, true), + dir(-1), + oldname(), + newname(), + oldpath(), + overwrite(false), + todumpster(-1) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "rename:" + " dir: " << dir << + " from: " << oldpath << + " to: " << newname << + " todumpster: " << todumpster + ; + return os.str(); + } + bool Validate() + { + return (dir >= 0 && ! oldname.empty() && ! newname.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Parent File-handle", &MetaRename::dir, fid_t(-1)) + .Def("Old-name", &MetaRename::oldname ) + .Def("New-path", &MetaRename::newname ) + .Def("Old-path", &MetaRename::oldpath ) + .Def("Overwrite", &MetaRename::overwrite, false) + ; + } +}; + +/*! + * \brief set the mtime for a file or directory + */ +struct MetaSetMtime: public MetaRequest { + fid_t fid; //!< stash the fid for logging + string pathname; //!< absolute path for which we want to set the mtime + int64_t mtime; + MetaSetMtime(fid_t id = -1, int64_t mtime = 0) + : MetaRequest(META_SETMTIME, true), + fid(id), + pathname(), + mtime(mtime), + sec(0), + usec(0) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + os << "setmtime:" + " path: " << pathname << + " mtime: " << ShowTime(mtime) + ; + return os.str(); + } + bool Validate() + { + mtime = sec * 1000 * 1000 + usec; + return (! 
pathname.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Mtime-sec", &MetaSetMtime::sec ) + .Def("Mtime-usec", &MetaSetMtime::usec ) + .Def("Pathname", &MetaSetMtime::pathname) + ; + } +private: + int64_t sec; + int64_t usec; +}; + +/*! + * \brief change a file's replication factor + */ +struct MetaChangeFileReplication: public MetaRequest { + fid_t fid; //!< fid whose replication has to be changed + int16_t numReplicas; //!< desired degree of replication + MetaChangeFileReplication() + : MetaRequest(META_CHANGE_FILE_REPLICATION, true), + fid(-1), + numReplicas(1) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "change-file-replication: fid = " << fid; + os << " new # of replicas: " << numReplicas << ' '; + return os.str(); + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaChangeFileReplication::fid, fid_t(-1)) + .Def("Num-replicas", &MetaChangeFileReplication::numReplicas, int16_t(1)) + ; + } +}; + +/*! + * \brief coalesce blocks of one file with another by appending the blocks from + * src->dest. After the coalesce is done, src will be of size 0. + */ +struct MetaCoalesceBlocks: public MetaRequest { + string srcPath; //!< fully-qualified pathname + string dstPath; //!< fully-qualified pathname + fid_t srcFid; + fid_t dstFid; + //!< output: the offset in dst at which the first + //!< block of src was moved to. 
+ chunkOff_t dstStartOffset; + size_t numChunksMoved; + int64_t mtime; + MetaCoalesceBlocks() + : MetaRequest(META_COALESCE_BLOCKS, true), + srcPath(), + dstPath(), + srcFid(-1), + dstFid(-1), + dstStartOffset(-1), + numChunksMoved(0), + mtime(0) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "coalesce blocks: src = " << srcPath; + os << " dst = " << dstPath; + return os.str(); + } + bool Validate() + { + return (! srcPath.empty() && ! dstPath.empty()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Src-path", &MetaCoalesceBlocks::srcPath) + .Def("Dest-path", &MetaCoalesceBlocks::dstPath) + ; + } +}; + +/*! + * \brief Notification to hibernate/retire a chunkserver: + * Hibernation: when the server is put + * in hibernation mode, the server is taken down temporarily with a promise that + * it will come back N secs later; if the server doesnt' come up as promised + * then re-replication starts. + * + * Retirement: is extended downtime. The server is taken down and we don't know + * if it will ever come back. In this case, we use this server (preferably) + * to evacuate/re-replicate all the blocks off it before we take it down. + */ + +struct MetaRetireChunkserver : public MetaRequest, public ServerLocation { + ServerLocation& location; // 0 ? + "hibernating server: " : "retiring server: " + ) + location.ToString()); + } + bool Validate() + { + return (location.IsValid()); + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Chunk-server-name", &ServerLocation::hostname ) + .Def("Chunk-server-port", &ServerLocation::port, -1) + .Def("Downtime", &MetaRetireChunkserver::nSecsDown, -1) + ; + } +}; + +/*! 
/* NOTE(review): a large interior span of MetaHello (the member declarations
 * after `location`, the constructor, and the handle/log/response/Show
 * declarations) was swallowed by markup corruption in this patch hunk --
 * everything between the `location` comment and `static T& ParserDef` is
 * missing, and the typedef's template arguments were stripped. Do not edit
 * this region by hand; restore it from the upstream MetaRequest.h. The
 * ParserDef() fragment below is the authoritative list of the parsed fields
 * (clusterKey, md5sum, totalSpace, totalFsSpace, usedSpace, rackId, uptime,
 * numChunks, numNotStableAppendChunks, numNotStableChunks,
 * numAppendsWithWid, contentLength, contentIntBase,
 * staleChunksHexFormatFlag). */
 * \brief hello RPC from a chunk server on startup + */ +struct MetaHello : public MetaRequest, public ServerLocation { + struct ChunkInfo { + fid_t allocFileId; // file id when chunk was allocated + chunkId_t chunkId; + seq_t chunkVersion; + }; + typedef vector > ChunkInfos; + + ChunkServerPtr server; //!< The chunkserver that sent the hello message + ServerLocation& location; // static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Chunk-server-name", &ServerLocation::hostname ) + .Def("Chunk-server-port", &ServerLocation::port, int(-1)) + .Def("Cluster-key", &MetaHello::clusterKey ) + .Def("MD5Sum", &MetaHello::md5sum ) + .Def("Total-space", &MetaHello::totalSpace, int64_t(0)) + .Def("Total-fs-space", &MetaHello::totalFsSpace, int64_t(0)) + .Def("Used-space", &MetaHello::usedSpace, int64_t(0)) + .Def("Rack-id", &MetaHello::rackId, int(-1)) + .Def("Uptime", &MetaHello::uptime, int64_t(0)) + .Def("Num-chunks", &MetaHello::numChunks, int(0)) + .Def("Num-not-stable-append-chunks", &MetaHello::numNotStableAppendChunks, int(0)) + .Def("Num-not-stable-chunks", &MetaHello::numNotStableChunks, int(0)) + .Def("Num-appends-with-wids", &MetaHello::numAppendsWithWid, int64_t(0)) + .Def("Content-length", &MetaHello::contentLength, int(0)) + .Def("Content-int-base", &MetaHello::contentIntBase, int(10)) + .Def("Stale-chunks-hex-format", &MetaHello::staleChunksHexFormatFlag, false) + ; + } +}; + +/*! + * \brief whenever a chunk server goes down, this message is used to clean up state.
+ */ +struct MetaBye: public MetaRequest { + ChunkServerPtr server; //!< The chunkserver that went down + MetaBye(seq_t s, const ChunkServerPtr& c): + MetaRequest(META_BYE, false, s), server(c) { } + virtual void handle(); + virtual int log(ostream &file) const; + virtual string Show() const + { + return "Chunkserver Bye"; + } +}; + +struct MetaGetPathName: public MetaRequest { + fid_t fid; + chunkId_t chunkId; + MFattr fattr; + string result; + MetaGetPathName() + : MetaRequest(META_GETPATHNAME, false), + fid(-1), + chunkId(-1), + fattr(), + result() + {} + virtual void handle(); + virtual void response(ostream &os); + virtual int log(ostream& /* file */) const { return 0; } + virtual string Show() const + { + ostringstream os; + os << + "get pathname:" + " fid: " << fid << + " chunkId: " << chunkId << + " status: " << status + ; + return os.str(); + } + bool Validate() + { return (fid >= 0 || chunkId >= 0); } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaGetPathName::fid, fid_t(-1)) + .Def("Chunk-handle", &MetaGetPathName::chunkId, chunkId_t(-1)) + ; + } +}; + +struct MetaChmod: public MetaRequest { + fid_t fid; + kfsMode_t mode; + MetaChmod() + : MetaRequest(META_CHMOD, true), + fid(-1), + mode(kKfsModeUndef) + {} + virtual void handle(); + virtual void response(ostream &os); + virtual int log(ostream& file) const; + virtual string Show() const + { + ostringstream os; + os << + "chmod:" + " fid: " << fid << + " mode: " << oct << mode << dec << + " status: " << status + ; + return os.str(); + } + bool Validate() + { return (fid >= 0 && mode != kKfsModeUndef); } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaChmod::fid, fid_t(-1)) + .Def("Mode", &MetaChmod::mode, kKfsModeUndef) + ; + } +}; + +struct MetaChown: public MetaRequest { + fid_t fid; + kfsUid_t user; + kfsGid_t group; + MetaChown() + : MetaRequest(META_CHOWN, true), + 
fid(-1), + user(kKfsUserNone), + group(kKfsGroupNone) + {} + virtual void handle(); + virtual void response(ostream &os); + virtual int log(ostream& file) const; + virtual string Show() const + { + ostringstream os; + os << + "chown:" + " euser: " << euser << + " egroup: " << egroup << + " fid: " << fid << + " user: " << user << + " group: " << group << + " status: " << status + ; + return os.str(); + } + bool Validate() + { return (fid >= 0); } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaChown::fid, fid_t(-1)) + .Def("Owner", &MetaChown::user, kKfsUserNone) + .Def("Group", &MetaChown::group, kKfsGroupNone) + ; + } +}; + +/*! + * \brief RPCs that go from meta server->chunk server are + * MetaRequest's that define a method to generate the RPC + * request. + */ +struct MetaChunkRequest: public MetaRequest { + const chunkId_t chunkId; + const ChunkServerPtr server; // The "owner". + MetaChunkRequest(MetaOp o, seq_t s, bool mu, + const ChunkServerPtr& c, chunkId_t cid) + : MetaRequest(o, mu, s), + chunkId(cid), + server(c) + {} + //!< generate a request message (in string format) as per the + //!< KFS protocol. + virtual int log(ostream& /* file */) const { return 0; } + virtual void request(ostream& os, IOBuffer& /* buf */) { request(os); } + virtual void handleReply(const Properties& prop) {} + virtual void handle() {} + void resume() + { + submit_request(this); + } +protected: + virtual void request(ostream& /* os */) {} +}; + +/*! 
+ * \brief Allocate RPC from meta server to chunk server + */ +struct MetaChunkAllocate: public MetaChunkRequest { + const int64_t leaseId; + MetaAllocate* const req; + MetaChunkAllocate(seq_t n, MetaAllocate *r, + const ChunkServerPtr& s, int64_t l) + : MetaChunkRequest(META_CHUNK_ALLOCATE, n, false, s, r->chunkId), + leaseId(l), + req(r) + {} + virtual void handle(); + virtual void request(ostream &os); + virtual string Show() const + { + return "meta->chunk allocate: "; + } +}; + +/*! + * \brief Delete RPC from meta server to chunk server + */ +struct MetaChunkDelete: public MetaChunkRequest { + MetaChunkDelete(seq_t n, const ChunkServerPtr& s, chunkId_t c) + : MetaChunkRequest(META_CHUNK_DELETE, n, false, s, c) + {} + virtual void request(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "meta->chunk delete: chunkId: " << chunkId; + return os.str(); + } +}; + +struct MetaChunkVersChange; + +/*! + * \brief Replicate RPC from meta server to chunk server. This + * message is sent to a "destination" chunk server---that is, a chunk + * server is told to create a copy of chunk from some source that is + * already hosting the chunk. This model allows the destination to + * replicate the chunk at its convenieance. 
+ */ +struct MetaChunkReplicate: public MetaChunkRequest { + typedef map > InvalidStripes; + + fid_t fid; //!< input: we tell the chunkserver what it is + seq_t chunkVersion; //!< io: the chunkservers tells us what it did + chunkOff_t chunkOffset; //!< input: chunk recovery parameters + int16_t striperType; + int16_t numStripes; + int16_t numRecoveryStripes; + int32_t stripeSize; + ChunkServerPtr dataServer; //!< where to get a copy from + ServerLocation srcLocation; + string pathname; + int64_t fileSize; + InvalidStripes invalidStripes; + MetaChunkVersChange* versChange; + MetaChunkReplicate(seq_t n, const ChunkServerPtr& s, + fid_t f, chunkId_t c, const ServerLocation& loc, + const ChunkServerPtr& src) + : MetaChunkRequest(META_CHUNK_REPLICATE, n, false, s, c), + fid(f), + chunkVersion(-1), + chunkOffset(-1), + striperType(KFS_STRIPED_FILE_TYPE_NONE), + numStripes(0), + numRecoveryStripes(0), + stripeSize(0), + dataServer(src), + srcLocation(loc), + pathname(), + fileSize(-1), + invalidStripes(), + versChange(0) + {} + virtual ~MetaChunkReplicate() { assert(! versChange); } + virtual void handle(); + virtual void request(ostream &os); + virtual void handleReply(const Properties& prop); + virtual string Show() const; +}; + +/*! + * \brief Chunk version # change RPC from meta server to chunk server + */ +struct MetaChunkVersChange: public MetaChunkRequest { + fid_t fid; + seq_t chunkVersion; //!< version # assigned to this chunk + seq_t fromVersion; + bool makeStableFlag; + bool pendingAddFlag; + MetaChunkReplicate* replicate; + + MetaChunkVersChange( + seq_t n, + const ChunkServerPtr& s, + fid_t f, + chunkId_t c, + seq_t v, + seq_t fromVers, + bool mkStableFlag, + bool pendAddFlag, + MetaChunkReplicate* repl = 0) + : MetaChunkRequest(META_CHUNK_VERSCHANGE, n, false, s, c), + fid(f), + chunkVersion(v), + fromVersion(fromVers), + makeStableFlag(mkStableFlag), + pendingAddFlag(pendAddFlag), + replicate(repl) + { + if (replicate) { + assert(! 
replicate->versChange); + replicate->versChange = this; + } + } + virtual ~MetaChunkVersChange() { assert(! replicate); } + virtual void handle(); + virtual void request(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << + "meta->chunk vers change:" + " fid: " << fid << + " chunkId: " << chunkId << + " version: from: " << fromVersion << + " => to: " << chunkVersion << + " make stable: " << makeStableFlag + ; + return os.str(); + } +}; + +/*! + * \brief As a chunkserver for the size of a particular chunk. We use this RPC + * to compute the filesize: whenever the lease on the last chunk of the file + * expires, we get the chunk's size and then determine the filesize. + */ +struct MetaChunkSize: public MetaChunkRequest { + fid_t fid; //!< input: we use the tuple to + //!< find the entry we need. + seq_t chunkVersion; + chunkOff_t chunkSize; //!< output: the chunk size + chunkOff_t filesize; //!< for logging purposes: the size of the file + /// input: given the pathname, we can update space usage for the path + /// hierarchy corresponding to pathname; this will enable us to make "du" + /// instantaneous. + string pathname; + bool retryFlag; + MetaChunkSize(seq_t n, const ChunkServerPtr& s, fid_t f, + chunkId_t c, seq_t v, const string &p, bool retry) + : MetaChunkRequest(META_CHUNK_SIZE, n, true, s, c), + fid(f), + chunkVersion(v), + chunkSize(-1), + filesize(-1), + pathname(p), + retryFlag(retry) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void request(ostream &os); + virtual void handleReply(const Properties& prop) + { + chunkSize = prop.getValue("Size", (chunkOff_t) -1); + } + virtual string Show() const + { + ostringstream os; + os << + "meta->chunk size: " << pathname << + " fid: " << fid << + " chunkId: " << chunkId << + " chunkVersion: " << chunkVersion << + " size: " << chunkSize; + return os.str(); + } +}; + +/*! + * \brief Heartbeat RPC from meta server to chunk server. 
We can + * ask the chunk server for lots of stuff; for now, we ask it + * how much is available/used up. + */ +struct MetaChunkHeartbeat: public MetaChunkRequest { + int64_t evacuateCount; + MetaChunkHeartbeat(seq_t n, const ChunkServerPtr& s, + int64_t evacuateCnt) + : MetaChunkRequest(META_CHUNK_HEARTBEAT, n, false, s, -1), + evacuateCount(evacuateCnt) + {} + virtual void request(ostream &os); + virtual string Show() const + { + return "meta->chunk heartbeat"; + } +}; + +/*! + * \brief Stale chunk notification message from meta->chunk. This + * tells the chunk servers the id's of stale chunks, which the chunk + * server should get rid of. + */ +struct MetaChunkStaleNotify: public MetaChunkRequest { + MetaChunkStaleNotify(seq_t n, const ChunkServerPtr& s, + bool evacFlag, bool hexFmtFlag) + : MetaChunkRequest(META_CHUNK_STALENOTIFY, n, false, s, -1), + staleChunkIds(), + evacuatedFlag(evacFlag), + hexFormatFlag(hexFmtFlag) + {} + ChunkIdQueue staleChunkIds; //!< chunk ids that are stale + bool evacuatedFlag; + bool hexFormatFlag; + virtual void request(ostream& os, IOBuffer& buf); + virtual string Show() const + { + return "meta->chunk stale notify"; + } +}; + +struct MetaBeginMakeChunkStable : public MetaChunkRequest { + const fid_t fid; // input + const seq_t chunkVersion; // input + const ServerLocation serverLoc; // processing this cmd + int64_t chunkSize; // output + uint32_t chunkChecksum; // output + MetaBeginMakeChunkStable(seq_t n, const ChunkServerPtr& s, + const ServerLocation& l, fid_t f, chunkId_t c, seq_t v) : + MetaChunkRequest(META_BEGIN_MAKE_CHUNK_STABLE, n, false, s, c), + fid(f), chunkVersion(v), serverLoc(l), + chunkSize(-1), chunkChecksum(0) + {} + virtual void handle(); + virtual void request(ostream &os); + virtual void handleReply(const Properties& prop) + { + chunkSize = prop.getValue("Chunk-size", (int64_t) -1); + chunkChecksum = (uint32_t)prop.getValue("Chunk-checksum", (uint64_t)0); + } + virtual string Show() const { + ostringstream 
os; + os << "begin-make-chunk-stable:" + " server: " << serverLoc << + " seq: " << opSeqno << + " status: " << status << + (statusMsg.empty() ? "" : " ") << statusMsg << + " fileid: " << fid << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " chunkSize: " << chunkSize << + " chunkChecksum: " << chunkChecksum; + return os.str(); + } +}; + +struct MetaLogMakeChunkStable : public MetaRequest, public KfsCallbackObj { + const fid_t fid; // input + const chunkId_t chunkId; // input + const seq_t chunkVersion; // input + const int64_t chunkSize; // input + const uint32_t chunkChecksum; // input + const bool hasChunkChecksum; // input + MetaLogMakeChunkStable(fid_t fileId, chunkId_t id, seq_t version, + int64_t size, bool hasChecksum, uint32_t checksum, seq_t seqNum, + bool logDoneTypeFlag = false) + : MetaRequest(logDoneTypeFlag ? + META_LOG_MAKE_CHUNK_STABLE_DONE : + META_LOG_MAKE_CHUNK_STABLE, true, seqNum), + KfsCallbackObj(), + fid(fileId), + chunkId(id), + chunkVersion(version), + chunkSize(size), + chunkChecksum(checksum), + hasChunkChecksum(hasChecksum) + { + SET_HANDLER(this, &MetaLogMakeChunkStable::logDone); + clnt = this; + } + virtual void handle() { status = 0; } + virtual string Show() const { + ostringstream os; + os << (op == META_LOG_MAKE_CHUNK_STABLE ? + "log-make-chunk-stable:" : + "log-make-chunk-stable-done:") << + " fleid: " << fid << + " chunkid: " << chunkId << + " chunkvers: " << chunkVersion << + " chunkSize: " << chunkSize << + " chunkChecksum: " << (hasChunkChecksum ? + int64_t(chunkChecksum) : int64_t(-1)); + return os.str(); + } + virtual int log(ostream &file) const; + int logDone(int code, void *data); +}; + +struct MetaLogMakeChunkStableDone : public MetaLogMakeChunkStable { + MetaLogMakeChunkStableDone(fid_t fileId, chunkId_t id, seq_t version, + int64_t size, bool hasChecksum, uint32_t checksum, seq_t seqNum) + : MetaLogMakeChunkStable(fileId, id, version, size, hasChecksum, + checksum, seqNum, true) + {} +}; + +/*! 
+ * \brief Notification message from meta->chunk asking the server to make a + * chunk. This tells the chunk server that the writes to a chunk are done and + * that the chunkserver should flush any dirty data. + */ +struct MetaChunkMakeStable: public MetaChunkRequest { + MetaChunkMakeStable( + seq_t inSeqNo, + const ChunkServerPtr& inServer, + fid_t inFileId, + chunkId_t inChunkId, + seq_t inChunkVersion, + chunkOff_t inChunkSize, + bool inHasChunkChecksum, + uint32_t inChunkChecksum, + bool inAddPending) + : MetaChunkRequest(META_CHUNK_MAKE_STABLE, + inSeqNo, false, inServer, inChunkId), + fid(inFileId), + chunkVersion(inChunkVersion), + chunkSize(inChunkSize), + hasChunkChecksum(inHasChunkChecksum), + addPending(inAddPending), + chunkChecksum(inChunkChecksum) + {} + const fid_t fid; //!< input: we tell the chunkserver what it is + const seq_t chunkVersion; //!< The version tha the chunk should be in + const chunkOff_t chunkSize; + const bool hasChunkChecksum:1; + const bool addPending:1; + const uint32_t chunkChecksum; + virtual void handle(); + virtual void request(ostream &os); + virtual string Show() const; +}; + + +/*! + * For scheduled downtime, we evacaute all the chunks on a server; when + * we know that the evacuation is finished, we tell the chunkserver to retire. 
+ */ +struct MetaChunkRetire: public MetaChunkRequest { + MetaChunkRetire(seq_t n, const ChunkServerPtr& s): + MetaChunkRequest(META_CHUNK_RETIRE, n, false, s, -1) { } + virtual void request(ostream &os); + virtual string Show() const + { + return "chunkserver retire"; + } +}; + +struct MetaChunkSetProperties: public MetaChunkRequest { + const string serverProps; + MetaChunkSetProperties(seq_t n, const ChunkServerPtr& s, + const Properties& props) + : MetaChunkRequest(META_CHUNK_SET_PROPERTIES, n, false, s, -1), + serverProps(Properties2Str(props)) + {} + virtual void request(ostream &os); + virtual string Show() const + { + return "chunkserver set properties"; + } + static string Properties2Str(const Properties& props) + { + string ret; + props.getList(ret, ""); + return ret; + } +}; + +struct MetaChunkServerRestart : public MetaChunkRequest { + MetaChunkServerRestart(seq_t n, const ChunkServerPtr& s) + : MetaChunkRequest(META_CHUNK_SERVER_RESTART, n, false, s, -1) + {} + virtual void request(ostream &os); + virtual string Show() const + { + return "chunkserver restart"; + } +}; + +/*! + * \brief For monitoring purposes, a client/tool can send a PING + * request. In response, the server replies with the list of all + * connected chunk servers and their locations as well as some state + * about each of those servers. + */ +struct MetaPing : public MetaRequest { + IOBuffer resp; + MetaPing() + : MetaRequest(META_PING, false), + resp() + { + // Suppress warning with requests with no version filed. + clientProtoVers = KFS_CLIENT_PROTO_VERS; + } + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + return "ping"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! 
+ * \brief For monitoring purposes, a client/tool can request metaserver + * to provide a list of live chunkservers. + */ +struct MetaUpServers: public MetaRequest { + IOBuffer resp; + MetaUpServers() + : MetaRequest(META_UPSERVERS, false), + resp() + {} + virtual void handle(); + virtual int log(ostream& file) const; + virtual void response(ostream& os, IOBuffer& buf); + virtual string Show() const + { + return "upservers"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! + * \brief To toggle WORM mode of metaserver a client/tool can send a + * TOGGLE_WORM request. In response, the server changes its WORM state. + */ +struct MetaToggleWORM: public MetaRequest { + bool value; // !< Enable/disable WORM + MetaToggleWORM() + : MetaRequest(META_TOGGLE_WORM, false), + value(false) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + return (value ? "Toggle WORM: Enabled" : "Toggle WORM: Disabled"); + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Toggle-WORM", &MetaToggleWORM::value, false) + ; + } +}; + +/*! + * \brief For monitoring purposes, a client/tool can send a STATS + * request. In response, the server replies with the list of all + * counters it keeps. + */ +struct MetaStats: public MetaRequest { + string stats; //!< result + MetaStats() + : MetaRequest(META_STATS, false), + stats() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + return "stats"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! 
+ * \brief For debugging purposes, recompute the size of the dir tree + */ +struct MetaRecomputeDirsize: public MetaRequest { + MetaRecomputeDirsize() + : MetaRequest(META_RECOMPUTE_DIRSIZE, false) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + return "recompute dir size"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! + * \brief For debugging purposes, dump out the chunk->location map + * to a file. + */ +struct MetaDumpChunkToServerMap: public MetaRequest { + string chunkmapFile; //!< file to which the chunk map was written to + int pid; + MetaDumpChunkToServerMap() + : MetaRequest(META_DUMP_CHUNKTOSERVERMAP, false), + chunkmapFile(), + pid(-1) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + return "dump chunk2server map"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! + * \brief For debugging purposes, check the status of all the leases + */ +struct MetaCheckLeases: public MetaRequest { + MetaCheckLeases() + : MetaRequest(META_CHECK_LEASES, false) + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + return "checking all leases"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! + * \brief For debugging purposes, dump out the set of blocks that are currently + * being re-replicated. 
+ */ +struct MetaDumpChunkReplicationCandidates: public MetaRequest { + MetaDumpChunkReplicationCandidates() + : MetaRequest(META_DUMP_CHUNKREPLICATIONCANDIDATES, false), + numReplication(0), + numPendingRecovery(0), + resp() + {} + // list of blocks that are being re-replicated + size_t numReplication; + size_t numPendingRecovery; + IOBuffer resp; + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + return "dump chunk replication candidates"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +/*! + * \brief Check the replication level of all blocks in the system. Return back + * a list of files that have blocks missing. +*/ +struct MetaFsck: public MetaRequest { + MetaFsck() + : MetaRequest(META_FSCK, false), + reportAbandonedFilesFlag(true), + pid(-1), + fd(), + resp() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + return "fsck"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Report-Abandoned-Files", &MetaFsck::reportAbandonedFilesFlag) + ; + } + static void SetParameters(const Properties& props); +private: + typedef vector Fds; + + bool reportAbandonedFilesFlag; + int pid; + Fds fd; + IOBuffer resp; + static string sTmpName; + static int sMaxFsckResponseSize; +}; + +/*! + * \brief For monitoring purposes, a client/tool can send a OPEN FILES + * request. 
In response, the server replies with the list of all + * open files---files for which there is a valid lease + */ +struct MetaOpenFiles: public MetaRequest { + typedef map< + fid_t, + vector > + > ReadInfo; + typedef map< + fid_t, + vector + > WriteInfo; + size_t openForReadCnt; //!< result + size_t openForWriteCnt; //!< result + IOBuffer resp; + MetaOpenFiles() + : MetaRequest(META_OPEN_FILES, false), + openForReadCnt(0), + openForWriteCnt(0), + resp() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream& os, IOBuffer& buf); + virtual string Show() const + { + return "open files"; + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +struct MetaSetChunkServersProperties : public MetaRequest { + Properties properties; // input + MetaSetChunkServersProperties() + : MetaRequest(META_SET_CHUNK_SERVERS_PROPERTIES, false), + properties() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + string ret("set chunk servers properties "); + properties.getList(ret, "", ";"); + return ret; + } + // RequestParser::Parse creates object of this type, overload is + // sufficient, i.e. ValidateRequestHeader does not have to be "virtual". + bool ValidateRequestHeader( + const char* name, + size_t nameLen, + const char* header, + size_t headerLen, + bool hasChecksum, + uint32_t checksum); + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +}; + +struct MetaGetChunkServersCounters : public MetaRequest { + MetaGetChunkServersCounters() + : MetaRequest(META_GET_CHUNK_SERVERS_COUNTERS, false), + resp() + { + // Suppress warning with requests with no version filed. 
+ clientProtoVers = KFS_CLIENT_PROTO_VERS; + } + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + return string("get chunk servers counters "); + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +private: + IOBuffer resp; +}; + +struct MetaGetRequestCounters : public MetaRequest { + MetaGetRequestCounters() + : MetaRequest(META_GET_REQUEST_COUNTERS, false), + resp(), + userCpuMicroSec(0), + systemCpuMicroSec(0) + {} + virtual void handle(); + virtual int log(ostream &file) const + { + return 0; + } + virtual void response(ostream &os, IOBuffer& buf); + virtual string Show() const + { + return string("get request counters "); + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + ; + } +private: + IOBuffer resp; + int64_t userCpuMicroSec; + int64_t systemCpuMicroSec; +}; + +struct MetaCheckpoint : public MetaRequest { + MetaCheckpoint(seq_t s, KfsCallbackObj* c) + : MetaRequest(META_CHECKPOINT, false, s), + lockFileName(), + lockFd(-1), + intervalSec(60 * 60), + pid(-1), + failedCount(0), + maxFailedCount(2), + chekpointWriteTimeoutSec(60 * 60), + chekpointWriteSyncFlag(true), + chekpointWriteBufferSize(16 << 20), + lastCheckpointId(-1), + runningCheckpointId(-1), + lastRun(0) + { clnt = c; } + virtual void handle(); + virtual int log(ostream &file) const + { + return 0; + } + virtual string Show() const + { + return string("checkpoint"); + } + void SetParameters(const Properties& props); + void ScheduleNow(); +private: + string lockFileName; + int lockFd; + int intervalSec; + int pid; + int failedCount; + int maxFailedCount; + int chekpointWriteTimeoutSec; + bool chekpointWriteSyncFlag; + size_t chekpointWriteBufferSize; + seq_t lastCheckpointId; + seq_t runningCheckpointId; + time_t lastRun; +}; + 
+/*! + * \brief Op to initiate connection close by the meta server. To use with netcat + * and such. + */ +struct MetaDisconnect : public MetaRequest { + MetaDisconnect() + : MetaRequest(META_DISCONNECT, false) + { + // Suppress warning with requests with no version filed. + clientProtoVers = KFS_CLIENT_PROTO_VERS; + } + virtual void handle() {} + virtual string Show() const { return string("disconnect"); } + virtual int log(ostream &file) const { return 0; } + bool Validate() { return true; } +}; + +/*! + * \brief Op for handling a notify of a corrupt chunk + */ +struct MetaChunkCorrupt: public MetaRequest { + fid_t fid; //!< input + chunkId_t chunkId; //!< input + bool isChunkLost; //!< input + bool noReplyFlag; //!< input + bool dirOkFlag; //!< input + string chunkDir; //!< input + ChunkServerPtr server; //!< The chunkserver that sent us this message + MetaChunkCorrupt(seq_t s = -1, fid_t f = -1, chunkId_t c = -1) + : MetaRequest(META_CHUNK_CORRUPT, false, s), + fid(f), + chunkId(c), + isChunkLost(false), + noReplyFlag(false), + dirOkFlag(false), + chunkDir(), + server() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << (isChunkLost ? "lost" : "corrupt") << " chunk: fid = " << fid << " chunkid = " << chunkId; + return os.str(); + } + virtual void setChunkServer(const ChunkServerPtr& cs) { server = cs; } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("File-handle", &MetaChunkCorrupt::fid, fid_t(-1)) + .Def("Chunk-handle", &MetaChunkCorrupt::chunkId, chunkId_t(-1)) + .Def("Is-chunk-lost", &MetaChunkCorrupt::isChunkLost, false) + .Def("No-reply", &MetaChunkCorrupt::noReplyFlag, false) + .Def("Chunk-dir", &MetaChunkCorrupt::chunkDir) + .Def("Dir-ok", &MetaChunkCorrupt::dirOkFlag, false) + ; + } +}; + +/*! 
+ * \brief chunk server chunks evacuate request + */ +struct MetaChunkEvacuate: public MetaRequest { + int64_t totalSpace; + int64_t totalFsSpace; + int64_t usedSpace; + int numDrives; + int numWritableDrives; + int numEvacuateInFlight; + StringBufT<21 * 32> chunkIds; //!< input + ChunkServerPtr server; + MetaChunkEvacuate(seq_t s = -1) + : MetaRequest(META_CHUNK_EVACUATE, false, s), + totalSpace(-1), + totalFsSpace(-1), + usedSpace(-1), + numDrives(-1), + numWritableDrives(-1), + numEvacuateInFlight(-1), + chunkIds(), + server() + {} + virtual void handle(); + virtual int log(ostream &file) const + { + return 0; + } + virtual void response(ostream &os); + virtual string Show() const + { + return ("evacuate: " + chunkIds.GetStr()); + } + virtual void setChunkServer(const ChunkServerPtr& cs) { server = cs; } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Chunk-ids", &MetaChunkEvacuate::chunkIds) + .Def("Total-space", &MetaChunkEvacuate::totalSpace, int64_t(-1)) + .Def("Total-fs-space", &MetaChunkEvacuate::totalFsSpace, int64_t(-1)) + .Def("Used-space", &MetaChunkEvacuate::usedSpace, int64_t(-1)) + .Def("Num-drives", &MetaChunkEvacuate::numDrives, int(-1)) + .Def("Num-wr-drives", &MetaChunkEvacuate::numWritableDrives, int(-1)) + .Def("Num-evacuate", &MetaChunkEvacuate::numEvacuateInFlight, int(-1)) + ; + } +}; + +/*! + * \brief Op for acquiring a lease on a chunk of a file. 
+ */ +struct MetaLeaseAcquire: public MetaRequest { + const LeaseType leaseType; //!< input + string pathname; //!< full pathname of the file that owns chunk + chunkId_t chunkId; //!< input + bool flushFlag; //!< input + int leaseTimeout; //!< input + int64_t leaseId; //!< result + StringBufT<21 * 8> chunkIds; //!< input + string leaseIds; //!< result + MetaLeaseAcquire() + : MetaRequest(META_LEASE_ACQUIRE, false), + leaseType(READ_LEASE), + pathname(), + chunkId(-1), + flushFlag(false), + leaseTimeout(LEASE_INTERVAL_SECS), + leaseId(-1), + chunkIds(), + leaseIds() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << + "lease acquire:" + " chunkId: " << chunkId << + " " << pathname << + (leaseType == READ_LEASE ? " read lease " : " write lease ") << + (flushFlag ? " flush" : "") + ; + return os.str(); + } + bool Validate() + { + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Pathname", &MetaLeaseAcquire::pathname ) + .Def("Chunk-handle", &MetaLeaseAcquire::chunkId, chunkId_t(-1) ) + .Def("Flush-write-lease", &MetaLeaseAcquire::flushFlag, false ) + .Def("Lease-timeout", &MetaLeaseAcquire::leaseTimeout, LEASE_INTERVAL_SECS) + .Def("Chunk-ids", &MetaLeaseAcquire::chunkIds) + ; + } +}; + +/*! + * \brief Op for renewing a lease on a chunk of a file. 
+ */ +struct MetaLeaseRenew: public MetaRequest { + LeaseType leaseType; //!< input + string pathname; //!< full pathname of the file that owns chunk + chunkId_t chunkId; //!< input + int64_t leaseId; //!< input + MetaLeaseRenew() + : MetaRequest(META_LEASE_RENEW, false), + leaseType(READ_LEASE), + pathname(), + chunkId(-1), + leaseId(-1), + leaseTypeStr() + {} + virtual void handle(); + virtual int log(ostream &file) const; + virtual void response(ostream &os); + virtual string Show() const + { + ostringstream os; + + os << "lease renew: " << pathname << " "; + if (leaseType == READ_LEASE) + os << "read lease "; + else + os << "write lease "; + + os << " chunkId = " << chunkId; + return os.str(); + } + bool Validate() + { + leaseType = (leaseTypeStr == "WRITE_LEASE") ? + WRITE_LEASE : READ_LEASE; + return true; + } + template static T& ParserDef(T& parser) + { + return MetaRequest::ParserDef(parser) + .Def("Lease-type", &MetaLeaseRenew::leaseTypeStr ) + .Def("Lease-id", &MetaLeaseRenew::leaseId, int64_t(-1)) + .Def("Chunk-handle", &MetaLeaseRenew::chunkId, chunkId_t(-1)) + ; + } +private: + StringBufT<32> leaseTypeStr; +}; + +/*! + * \brief An internally generated op to force the cleanup of + * dead leases thru the main event processing loop. + */ +struct MetaLeaseCleanup: public MetaRequest { + MetaLeaseCleanup(seq_t s, KfsCallbackObj *c): + MetaRequest(META_LEASE_CLEANUP, false, s) { clnt = c; } + + virtual void handle(); + virtual int log(ostream &file) const; + virtual string Show() const + { + return "lease cleanup"; + } +}; + +/*! + * \brief An internally generated op to check that the degree + * of replication for each chunk is satisfactory. This op goes + * thru the main event processing loop. 
+ */ +struct MetaChunkReplicationCheck : public MetaRequest { + MetaChunkReplicationCheck(seq_t s, KfsCallbackObj *c): + MetaRequest(META_CHUNK_REPLICATION_CHECK, false, s) { clnt = c; } + + virtual void handle(); + virtual int log(ostream &file) const; + virtual string Show() const + { + return "chunk replication check"; + } +}; + +int ParseCommand(const IOBuffer& buf, int len, MetaRequest **res, + char* threadParseBuffer = 0); + +void printleaves(); + +void setClusterKey(const char *key); +void setMD5SumFn(const char *md5sumFn); +void setWORMMode(bool value); +void setMaxReplicasPerFile(int16_t value); +void setChunkmapDumpDir(string dir); +void CheckIfIoBuffersAvailable(); +void SetRequestParameters(const Properties& props); + +/* update counters for # of files/dirs/chunks in the system */ +void UpdateNumDirs(int count); +void UpdateNumFiles(int count); +void UpdateNumChunks(int count); +void UpdatePathToFidCacheMiss(int count); +void UpdatePathToFidCacheHit(int count); +int64_t GetNumFiles(); +int64_t GetNumDirs(); + +} +#endif /* !defined(KFS_REQUEST_H) */ diff --git a/src/cc/meta/NetDispatch.cc b/src/cc/meta/NetDispatch.cc new file mode 100644 index 000000000..9a18332e0 --- /dev/null +++ b/src/cc/meta/NetDispatch.cc @@ -0,0 +1,1057 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Created 2006/06/01 +// Author: Sriram Rao +// Mike Ovsiannikov. Re-implement. Implement "client threads". +// +// Copyright 2008-2012 Quantcast Corp. +// Copyright 2006-2008 Kosmix Corp. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file NetDispatch.cc +// +// \brief Meta-server request processing threads implementation. +// +//---------------------------------------------------------------------------- + +#include "NetDispatch.h" +#include "LayoutManager.h" +#include "ClientSM.h" +#include "Logger.h" + +#include "kfsio/Acceptor.h" +#include "kfsio/KfsCallbackObj.h" +#include "kfsio/Globals.h" +#include "kfsio/IOBuffer.h" +#include "common/Properties.h" +#include "common/MsgLogger.h" +#include "common/time.h" +#include "common/rusage.h" +#include "qcdio/QCThread.h" +#include "qcdio/QCUtils.h" +#include "qcdio/qcstutils.h" + +#include +#include +#include + +namespace KFS +{ +using std::max; +using std::vector; + +using KFS::libkfsio::globalNetManager; +using KFS::libkfsio::globals; + +NetDispatch gNetDispatch; + +NetDispatch::NetDispatch() + : mClientManager(), + mChunkServerFactory(), + mMutex(0), + mClientManagerMutex(0), + mRunningFlag(false), + mClientThreadCount(0), + mClientThreadsStartCpuAffinity(-1) +{ +} + +NetDispatch::~NetDispatch() +{ + delete mMutex; +} + +// +// Open up the server for connections. +// +bool +NetDispatch::Start(int clientAcceptPort, int chunkServerAcceptPort) +{ + mMutex = mClientThreadCount > 0 ? new QCMutex() : 0; + mClientManagerMutex = mClientThreadCount > 0 ? + &mClientManager.GetMutex() : 0; + mRunningFlag = true; + // Start the acceptors so that it sets up a connection with the net + // manager for listening. 
+ int err = 0; + if (mClientThreadsStartCpuAffinity >= 0 && + (err = QCThread::SetCurrentThreadAffinity( + QCThread::CpuAffinity(mClientThreadsStartCpuAffinity)))) { + KFS_LOG_STREAM_ERROR << + "failed to set main thread affinity: " << + mClientThreadsStartCpuAffinity << + " error: " << QCUtils::SysError(err) << + KFS_LOG_EOM; + } else if (mClientManager.StartAcceptor( + clientAcceptPort, + mClientThreadCount, + mClientThreadsStartCpuAffinity >= 0 ? + mClientThreadsStartCpuAffinity + 1 : + mClientThreadsStartCpuAffinity + ) && + mChunkServerFactory.StartAcceptor( + chunkServerAcceptPort)) { + // Start event processing. + globalNetManager().MainLoop(GetMutex()); + } else { + err = -EINVAL; + } + mClientManager.Shutdown(); + mRunningFlag = false; + mClientManagerMutex = 0; + delete mMutex; + mMutex = 0; + return (err == 0); +} + +void +NetDispatch::ChildAtFork() +{ + mClientManager.ChildAtFork(); +} + +void +NetDispatch::PrepareCurrentThreadToFork() +{ + mClientManager.PrepareCurrentThreadToFork(); +} + +// Counters for the various ops +struct MetaOpCounters : private map +{ + static void Update(MetaOp opName, int64_t time) + { + Counter* const c = GetCounter(opName); + if (! c) { + return; + } + c->Update(1); + c->UpdateTime(time); + } + static void UpdateNumDirs(int count) + { + if (sInstance) { + UpdateCtr(sInstance->mNumDirs, count); + } + } + static void UpdateNumFiles(int count) + { + if (sInstance) { + UpdateCtr(sInstance->mNumFiles, count); + } + } + static void UpdateNumChunks(int count) + { + if (sInstance) { + UpdateCtr(sInstance->mNumChunks, count); + } + } + static void UpdatePathToFidCacheHit(int count) + { + if (sInstance) { + UpdateCtr(sInstance->mPathToFidCacheHit, count); + } + } + static void UpdatePathToFidCacheMiss(int count) + { + if (sInstance) { + UpdateCtr(sInstance->mPathToFidCacheMiss, count); + } + } + static int64_t GetNumFiles() + { + return (sInstance ? 
+ sInstance->mNumFiles.GetValue() : int64_t(0)); + } + static int64_t GetNumDirs() + { + return (sInstance ? + sInstance->mNumDirs.GetValue() : int64_t(0)); + } + +private: + Counter mNumFiles; + Counter mNumDirs; + Counter mNumChunks; + Counter mPathToFidCacheHit; + Counter mPathToFidCacheMiss; + static MetaOpCounters* sInstance; + + MetaOpCounters() + : map(), + mNumFiles("Number of Files"), + mNumDirs("Number of Directories"), + mNumChunks("Number of Chunks"), + mPathToFidCacheHit("Number of Hits in Path->Fid Cache"), + mPathToFidCacheMiss("Number of Misses in Path->Fid Cache") + {} + ~MetaOpCounters() + { + for (iterator i = begin(); i != end(); ++i) { + if (sInstance == this) { + globals().counterManager.RemoveCounter(i->second); + } + delete i->second; + } + if (sInstance == this) { + globals().counterManager.RemoveCounter(&mNumFiles); + globals().counterManager.RemoveCounter(&mNumDirs); + globals().counterManager.RemoveCounter(&mNumChunks); + globals().counterManager.RemoveCounter(&mPathToFidCacheHit); + globals().counterManager.RemoveCounter(&mPathToFidCacheMiss); + sInstance = 0; + } + } + void AddCounter(const char *name, MetaOp opName) + { + Counter* const c = new Counter(name); + if (! insert(make_pair(opName, c)).second) { + delete c; + return; + } + globals().counterManager.AddCounter(c); + } + static Counter* GetCounter(MetaOp opName) + { + if (! 
sInstance) { + return 0; + } + MetaOpCounters::iterator iter = sInstance->find(opName); + if (iter == sInstance->end()) { + return 0; + } + return iter->second; + } + static void UpdateCtr(Counter& ctr, int count) + { + if ((int64_t) ctr.GetValue() + count < 0) { + ctr.Reset(); + } else { + ctr.Update(count); + } + } + static MetaOpCounters* MakeInstance() + { + // ensure that globals constructed first + globals(); + static MetaOpCounters instance; + instance.Init(); + return &instance; + } + void Init() + { + AddCounter("Get alloc", META_GETALLOC); + AddCounter("Get layout", META_GETLAYOUT); + AddCounter("Lookup", META_LOOKUP); + AddCounter("Lookup Path", META_LOOKUP_PATH); + AddCounter("Allocate", META_ALLOCATE); + AddCounter("Truncate", META_TRUNCATE); + AddCounter("Create", META_CREATE); + AddCounter("Remove", META_REMOVE); + AddCounter("Rename", META_RENAME); + AddCounter("Set Mtime", META_SETMTIME); + AddCounter("Mkdir", META_MKDIR); + AddCounter("Rmdir", META_RMDIR); + AddCounter("Change File Replication", META_CHANGE_FILE_REPLICATION); + AddCounter("Lease Acquire", META_LEASE_ACQUIRE); + AddCounter("Lease Renew", META_LEASE_RENEW); + AddCounter("Lease Cleanup", META_LEASE_CLEANUP); + AddCounter("Corrupt Chunk ", META_CHUNK_CORRUPT); + AddCounter("Chunkserver Hello ", META_HELLO); + AddCounter("Chunkserver Bye ", META_BYE); + AddCounter("Chunkserver Retire Start", META_RETIRE_CHUNKSERVER); + AddCounter("Replication Checker ", META_CHUNK_REPLICATION_CHECK); + AddCounter("Replication Done ", META_CHUNK_REPLICATE); + + globals().counterManager.AddCounter(&mNumFiles); + globals().counterManager.AddCounter(&mNumDirs); + globals().counterManager.AddCounter(&mNumChunks); + globals().counterManager.AddCounter(&mPathToFidCacheHit); + globals().counterManager.AddCounter(&mPathToFidCacheMiss); + } +}* MetaOpCounters::sInstance(MetaOpCounters::MakeInstance()); + +static class RequestStatsGatherer +{ +public: + RequestStatsGatherer() + : mNextTime(0), + 
mStatsIntervalMicroSec(30000000), + mOpTimeWarningThresholdMicroSec(200000), + mUserCpuMicroSec(0), + mSystemCpuMicroSec(0), + mWOStream() + {} + void OpDone( + const MetaRequest& op) + { + const int64_t timeNowUsec = microseconds(); + const int64_t reqTimeUsec = timeNowUsec - op.submitTime; + const int64_t reqProcTimeUsec = timeNowUsec - op.processTime; + MetaOpCounters::Update(op.op, reqProcTimeUsec); + if (reqProcTimeUsec > mOpTimeWarningThresholdMicroSec) { + KFS_LOG_STREAM_INFO << + "Time spent processing: " << op.Show() << + " is: " << (reqProcTimeUsec * 1e-6) << + " total: " << (reqTimeUsec * 1e-6) << + " was submitted: " << op.submitCount << + KFS_LOG_EOM; + } + const int idx = + ((op.op < 0 || op.op >= META_NUM_OPS_COUNT) ? + (int)kOtherReqId : + ((op.op == META_ALLOCATE && + ! static_cast(op).logFlag) ? + (int)kReqTypeAllocNoLog : (int)op.op + 1)); + const int64_t reqTime = reqTimeUsec > 0 ? reqTimeUsec : 0; + const int64_t reqProcTime = + reqProcTimeUsec > 0 ? reqProcTimeUsec : 0; + mRequest[ 0].mCnt++; + mRequest[ 0].mTime += reqTime; + mRequest[ 0].mProcTime += reqProcTime; + mRequest[idx].mCnt++; + mRequest[idx].mTime += reqTime; + mRequest[idx].mProcTime += reqProcTime; + if (op.status < 0) { + mRequest[ 0].mErr++; + mRequest[idx].mErr++; + } + if (timeNowUsec < mNextTime) { + return; + } + if (cputime(&mUserCpuMicroSec, &mSystemCpuMicroSec) < 0) { + mUserCpuMicroSec = -1; + mSystemCpuMicroSec = -1; + } + mNextTime = timeNowUsec + mStatsIntervalMicroSec; + const char* kDelim = " "; + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << "===request=counters:" << + kDelim << timeNowUsec << + kDelim << mUserCpuMicroSec << + kDelim << mSystemCpuMicroSec + ; + for (int i = 0; i <= kReqTypeAllocNoLog; i++) { + os << + kDelim << mRequest[i].mCnt << + kDelim << mRequest[i].mErr << + kDelim << mRequest[i].mTime << + kDelim << mRequest[i].mProcTime + ; + } + KFS_LOG_STREAM_END; + const bool kRusageSelfFlag = 
true; + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << "===rusage=self: "; + showrusage(os, ": ", kDelim, kRusageSelfFlag); + KFS_LOG_STREAM_END; + KFS_LOG_STREAM_START(MsgLogger::kLogLevelINFO, logStream); + ostream& os = logStream.GetStream(); + os << "===rusage=chidren: "; + showrusage(os, ": ", kDelim, ! kRusageSelfFlag); + KFS_LOG_STREAM_END; + } + void SetParameters( + const Properties& props) + { + mNextTime -= mStatsIntervalMicroSec; + mStatsIntervalMicroSec = props.getValue( + "metaServer.statsGatherer.statsIntervalMicroSec", + mStatsIntervalMicroSec + ); + mOpTimeWarningThresholdMicroSec = props.getValue( + "metaServer.statsGatherer.opTimeWarningThresholdMicroSec", + mOpTimeWarningThresholdMicroSec + ); + mNextTime += mStatsIntervalMicroSec; + } + void GetStatsCsv( + ostream& os) + { + if (cputime(&mUserCpuMicroSec, &mSystemCpuMicroSec) < 0) { + mUserCpuMicroSec = -1; + mSystemCpuMicroSec = -1; + mRequest[kCpuUser].mErr++; + mRequest[kCpuSys ].mErr++; + } else { + mRequest[kCpuUser].mTime = mUserCpuMicroSec; + mRequest[kCpuSys ].mTime = mSystemCpuMicroSec; + mRequest[kCpuUser].mProcTime = mUserCpuMicroSec; + mRequest[kCpuSys ].mProcTime = mSystemCpuMicroSec; + } + os << "Name,Total,%-total,Errors,%-errors,Time-total,Time-CPU\n"; + const double ptotal = + 100. / (double)max(int64_t(1), mRequest[0].mCnt); + const double perrors = + 100. 
/ (double)max(int64_t(1), mRequest[0].mErr); + const char* kDelim = ","; + for (int i = 0; i < kReqTypesCnt; i++) { + os << + GetRowName(i) << + kDelim << mRequest[i].mCnt << + kDelim << (mRequest[i].mCnt * ptotal) << + kDelim << mRequest[i].mErr << + kDelim << (mRequest[i].mErr * perrors) << + kDelim << mRequest[i].mTime << + kDelim << mRequest[i].mProcTime << + "\n" + ; + } + } + void GetStatsCsv( + IOBuffer& buf) + { + GetStatsCsv(mWOStream.Set(buf)); + mWOStream.Reset(); + } + int64_t GetUserCpuMicroSec() const + { return mUserCpuMicroSec; } + int64_t GetSystemCpuMicroSec() const + { return mSystemCpuMicroSec; } +private: + enum + { + kOtherReqId = META_NUM_OPS_COUNT + 1, + kReqTypeAllocNoLog = kOtherReqId + 1, + kCpuUser = kReqTypeAllocNoLog + 1, + kCpuSys = kCpuUser + 1, + kReqTypesCnt = kCpuSys + 1 + }; + struct Counter { + Counter() + : mCnt(0), + mErr(0), + mTime(0), + mProcTime(0) + {} + int64_t mCnt; + int64_t mErr; + int64_t mTime; + int64_t mProcTime; + }; + int64_t mNextTime; + int64_t mStatsIntervalMicroSec; + int64_t mOpTimeWarningThresholdMicroSec; + int64_t mUserCpuMicroSec; + int64_t mSystemCpuMicroSec; + Counter mRequest[kReqTypesCnt]; + IOBuffer::WOStream mWOStream; + + static const char* GetRowName( + int idx) + { + static const char* const kNames[kReqTypesCnt] = + { + "TOTAL", +# define KfsMakeMetaOpName(name) #name, + KfsForEachMetaOpId(KfsMakeMetaOpName) +# undef KfsMakeMetaOpName + "OTHER", + "ALLOCATE_NO_LOG", + "CPU_USER", + "CPU_SYS" + }; + return ((idx < 0 || idx >= kReqTypesCnt) ? "" : kNames[idx]); + } +} sReqStatsGatherer; + + +void NetDispatch::SetParameters(const Properties& props) +{ + if (! mRunningFlag) { + mClientThreadCount = props.getValue( + "metaServer.clientThreadCount", + mClientThreadCount); + mClientThreadsStartCpuAffinity = props.getValue( + "metaServer.clientThreadStartCpuAffinity", + mClientThreadsStartCpuAffinity); + } + + // Only main thread listens, and accepts. 
+ TcpSocket::SetDefaultRecvBufSize(props.getValue( + "metaServer.tcpSocket.recvBufSize", + TcpSocket::GetDefaultRecvBufSize())); + TcpSocket::SetDefaultSendBufSize(props.getValue( + "metaServer.tcpSocket.sendBufSize", + TcpSocket::GetDefaultSendBufSize())); + + globalNetManager().SetMaxAcceptsPerRead(props.getValue( + "metaServer.net.maxAcceptsPerRead", + globalNetManager().GetMaxAcceptsPerRead())); + + sReqStatsGatherer.SetParameters(props); +} + +void NetDispatch::GetStatsCsv(ostream& os) +{ + sReqStatsGatherer.GetStatsCsv(os); +} + +void NetDispatch::GetStatsCsv(IOBuffer& buf) +{ + sReqStatsGatherer.GetStatsCsv(buf); +} + +int64_t NetDispatch::GetUserCpuMicroSec() const +{ + return sReqStatsGatherer.GetUserCpuMicroSec(); +} + +int64_t NetDispatch::GetSystemCpuMicroSec() const +{ + return sReqStatsGatherer.GetSystemCpuMicroSec(); +} + +void +UpdateNumDirs(int count) +{ + MetaOpCounters::UpdateNumDirs(count); +} + +int64_t +GetNumFiles() +{ + return MetaOpCounters::GetNumFiles(); +} + +int64_t +GetNumDirs() +{ + return MetaOpCounters::GetNumDirs(); +} + +void +UpdateNumFiles(int count) +{ + MetaOpCounters::UpdateNumFiles(count); +} + +void +UpdateNumChunks(int count) +{ + MetaOpCounters::UpdateNumChunks(count); +} + +void +UpdatePathToFidCacheMiss(int count) +{ + MetaOpCounters::UpdatePathToFidCacheMiss(count); +} + +void +UpdatePathToFidCacheHit(int count) +{ + MetaOpCounters::UpdatePathToFidCacheHit(count); +} + +/// +/// Poll the logger to see if any op's have finished execution. For +/// such ops, send a response back to the client. Also, if there any +/// layout related RPCs, dispatch them now. +/// +void +NetDispatch::Dispatch(MetaRequest *r) +{ + sReqStatsGatherer.OpDone(*r); + // Reset count for requests like replication check, where the same + // request reused. + r->submitCount = 0; + // The Client will send out a response and destroy r. 
+ if (r->clnt) { + r->clnt->HandleEvent(EVENT_CMD_DONE, r); + } else { + delete r; + } +} + +class ClientManager::Impl : public IAcceptorOwner, public ITimeout +{ +public: + Impl() + : IAcceptorOwner(), + ITimeout(), + mAcceptor(0), + mClientThreads(0), + mClientThreadCount(-1), + mNextThreadIdx(0), + mMutex(), + mPrepareToForkDoneCond(), + mForkDoneCond(), + mPrepareToForkFlag(false), + mPrepareToForkCnt(0) + {}; + virtual ~Impl(); + bool StartAcceptor(int port, int threadCount, int startCpuAffinity); + virtual KfsCallbackObj* CreateKfsCallbackObj(NetConnectionPtr &conn); + void Shutdown(); + void ChildAtFork(); + QCMutex& GetMutex() + { return mMutex; } + void PrepareCurrentThreadToFork(); + inline void PrepareToFork(bool mainThreadFlag = false) + { + if (! mPrepareToForkFlag) { + return; + } + QCMutex* const mutex = gNetDispatch.GetMutex(); + if (! mutex) { + return; + } + assert(! mainThreadFlag || mutex->IsOwned()); + QCStMutexLocker locker(mainThreadFlag ? 0 : mutex); + // The prepare thread count includes "main" thread. + if (++mPrepareToForkCnt >= mClientThreadCount) { + mPrepareToForkDoneCond.Notify(); + } + mForkDoneCond.Wait(*mutex); + } + virtual void Timeout() + { + const bool kMainThreadFlag = true; + PrepareToFork(kMainThreadFlag); + } +private: + class ClientThread; + // The socket object which is setup to accept connections. + Acceptor* mAcceptor; + ClientManager::ClientThread* mClientThreads; + int mClientThreadCount; + int mNextThreadIdx; + QCMutex mMutex; + QCCondVar mPrepareToForkDoneCond; + QCCondVar mForkDoneCond; + volatile bool mPrepareToForkFlag; + volatile int mPrepareToForkCnt; +}; + +inline void +ClientManager::PrepareToFork() +{ + mImpl.PrepareToFork(); +} + +inline void +NetDispatch::PrepareToFork() +{ + mClientManager.PrepareToFork(); +} + +// All acceptors run in the main thread running global net manager event loop. +// New client "connections" are passed to the client threads via the queue. 
+// Each client thread runs each client "connection" (ClientSM instance) in its +// own net manager event loop. +// The core of the request processing submit_request() / MetaRequest::handle() +// is serialized with the mutex. The attempt is made to process requests in +// batches in order to reduce lock acquisition frequency. +// The client thread run loop is in Timeout() method below, which is invoked +// from NetManager::MainLoop(). +// The pending requests queue depth governed by the ClientSM parameters. +// ClientSM logic limits number of outstanding requests as well as pending io +// bytes to ensure request processing "fairness" in respect to all the client +// connections. +class ClientManager::ClientThread : + public QCRunnable, + public ITimeout +{ +public: + ClientThread() + : QCRunnable(), + ITimeout(), + mMutex(0), + mThread(), + mNetManager(), + mWOStream(), + mReqHead(0), + mReqTail(0), + mCliHead(0), + mCliTail(0), + mReqPendingHead(0), + mReqPendingTail(0), + mFlushQueue(8 << 10) + { + mNetManager.RegisterTimeoutHandler(this); + } + virtual ~ClientThread() + { + if (mThread.IsStarted()) { + mNetManager.Shutdown(); + mNetManager.Wakeup(); + mThread.Join(); + } + ClientThread::Timeout(); + assert(! mCliHead && ! mCliTail); + mNetManager.UnRegisterTimeoutHandler(this); + } + bool Start(QCMutex* mutex, int cpuIndex) + { + if (mThread.IsStarted()) { + return true; + } + mMutex = mutex; + const int kStackSize = 256 << 10; + const int err = mThread.TryToStart( + this, kStackSize, "ClientThread", + cpuIndex >= 0 ? + QCThread::CpuAffinity(cpuIndex) : + QCThread::CpuAffinity::None() + ); + if (err) { + KFS_LOG_STREAM_ERROR << QCUtils::SysError( + err, "failed to start thread") << + KFS_LOG_EOM; + } + return (err == 0); + } + virtual void Run() + { + mNetManager.MainLoop(); + } + virtual void Timeout() + { + gNetDispatch.PrepareToFork(); + MetaRequest* nextReq; + if (mReqPendingHead) { + // Dispatch requests. 
+ nextReq = mReqPendingHead; + mReqPendingHead = 0; + mReqPendingTail = 0; + QCStMutexLocker locker(gNetDispatch.GetMutex()); + while (nextReq) { + MetaRequest& op = *nextReq; + nextReq = op.next; + op.next = 0; + submit_request(&op); + } + } + ClientSM* nextCli; + { + QCStMutexLocker locker(mMutex); + nextReq = mReqHead; + mReqHead = 0; + mReqTail = 0; + nextCli = mCliHead; + mCliHead = 0; + mCliTail = 0; + } + // Send responses. Try to minimize number of system calls by + // attempting to send multiple responses in single write. + FlushQueue::iterator it = mFlushQueue.begin(); + NetConnectionPtr conn; + while (nextReq) { + MetaRequest& op = *nextReq; + nextReq = op.next; + op.next = &op; + const NetConnectionPtr& cn = GetConnection(op); + if (cn && ! cn->IsWriteReady()) { + conn = cn; // Has no data pending. + } + op.clnt->HandleEvent(EVENT_CMD_DONE, &op); + if (! conn) { + continue; + } + if (! conn->CanStartFlush()) { + conn.reset(); + continue; + } + if (it == mFlushQueue.end()) { + mFlushQueue.push_back(NetConnectionPtr()); + it = mFlushQueue.end(); + conn.swap(mFlushQueue.back()); + continue; + } + conn.swap(*it++); + } + for (FlushQueue::iterator cit = mFlushQueue.begin(); + cit != it; + ++cit) { + (*cit)->StartFlush(); + cit->reset(); + } + // Add new connections to the net manager. + const bool runningFlag = mNetManager.IsRunning(); + while (nextCli) { + ClientSM& cli = *nextCli; + nextCli = cli.GetNext(); + cli.GetNext() = 0; + const NetConnectionPtr& conn = cli.GetConnection(); + assert(conn); + conn->SetOwningKfsCallbackObj(&cli); + if (runningFlag) { + mNetManager.AddConnection(conn); + } else { + conn->HandleErrorEvent(); + } + } + // Wake main thread if need to process requests waiting for + // io buffers, if any. + CheckIfIoBuffersAvailable(); + } + void Enqueue(MetaRequest& op) + { + if (! 
op.clnt) {
+            delete &op;
+            return;
+        }
+        QCStMutexLocker locker(mMutex);
+        op.next = 0;
+        if (mReqTail) {
+            mReqTail->next = &op;
+            mReqTail = &op;
+            return;
+        }
+        mReqHead = &op;
+        mReqTail = &op;
+        locker.Unlock();
+        mNetManager.Wakeup();
+    }
+    void Add(NetConnectionPtr& conn)
+    {
+        if (! conn || ! conn->IsGood() || ! mThread.IsStarted()) {
+            return;
+        }
+        QCStMutexLocker locker(mMutex);
+        ClientSM* const cli = new ClientSM(
+            conn, this, &mWOStream, mParseBuffer);
+        assert(cli->GetConnection() == conn);
+        conn.reset(); // Take the ownership. ClientSM ref. self.
+        if (mCliTail) {
+            mCliTail->GetNext() = cli;
+            mCliTail = cli;
+            return;
+        }
+        mCliHead = cli;
+        mCliTail = cli;
+        locker.Unlock();
+        mNetManager.Wakeup();
+    }
+    void Add(MetaRequest& op)
+    {
+        // This method must be called from the client thread: ClientSM
+        // adds request to the pending processing queue.
+        if (mReqPendingTail) {
+            mReqPendingTail->next = &op;
+            mReqPendingTail = &op;
+            return;
+        }
+        mReqPendingHead = &op;
+        mReqPendingTail = &op;
+        mNetManager.Wakeup();
+    }
+    bool IsStarted() const
+        { return mThread.IsStarted(); }
+    void ChildAtFork()
+    {
+        mNetManager.ChildAtFork();
+    }
+    void Wakeup()
+        { mNetManager.Wakeup(); }
+private:
+    typedef vector<NetConnectionPtr> FlushQueue;
+
+    QCMutex*           mMutex;
+    QCThread           mThread;
+    NetManager         mNetManager;
+    IOBuffer::WOStream mWOStream;
+    MetaRequest*       mReqHead;
+    MetaRequest*       mReqTail;
+    ClientSM*          mCliHead;
+    ClientSM*          mCliTail;
+    MetaRequest*       mReqPendingHead;
+    MetaRequest*       mReqPendingTail;
+    FlushQueue         mFlushQueue;
+    char               mParseBuffer[MAX_RPC_HEADER_LEN];
+
+    const NetConnectionPtr& GetConnection(MetaRequest& op)
+    {
+        return static_cast<ClientSM*>(op.clnt)->GetConnection();
+    }
+private:
+    ClientThread(const ClientThread&);
+    ClientThread& operator=(const ClientThread&);
+};
+
+ClientManager::Impl::~Impl()
+{
+    Impl::Shutdown();
+}
+
+bool
+ClientManager::Impl::StartAcceptor(int port, int threadCount,
+    int startCpuAffinity)
+{
+    delete mAcceptor;
+    mAcceptor = 0;
+    mAcceptor =
new Acceptor(port, this); + if (! mAcceptor->IsAcceptorStarted()) { + return false; + } + if (mClientThreadCount >= 0 || mClientThreads) { + return true; + } + mClientThreadCount = max(threadCount, 0); + if (mClientThreadCount <= 0) { + return true; + } + int cpuIndex = startCpuAffinity; + mClientThreads = new ClientManager::ClientThread[mClientThreadCount]; + for (int i = 0; i < mClientThreadCount; i++) { + if (! mClientThreads[i].Start(&mMutex, cpuIndex)) { + delete [] mClientThreads; + mClientThreads = 0; + mClientThreadCount = -1; + return false; + } + if (cpuIndex >= 0) { + cpuIndex++; + } + } + if (mClientThreadCount > 0) { + globalNetManager().RegisterTimeoutHandler(this); + } + return true; +}; + +KfsCallbackObj* +ClientManager::Impl::CreateKfsCallbackObj(NetConnectionPtr &conn) +{ + if (mClientThreadCount < 0) { + return 0; + } else if (mClientThreadCount == 0) { + return new ClientSM(conn); + } + int idx = mNextThreadIdx; + if (idx >= mClientThreadCount || idx < 0) { + idx = 0; + mNextThreadIdx = idx + 1; + } else { + mNextThreadIdx++; + } + mClientThreads[idx].Add(conn); + return 0; +} + +void +ClientManager::Impl::Shutdown() +{ + if (mClientThreadCount > 0) { + globalNetManager().UnRegisterTimeoutHandler(this); + } + delete mAcceptor; + mAcceptor = 0; + delete [] mClientThreads; + mClientThreads = 0; + mClientThreadCount = -1; +} + +void +ClientManager::Impl::ChildAtFork() +{ + for (int i = 0; i < mClientThreadCount; i++) { + mClientThreads[i].ChildAtFork(); + } +} + +void +ClientManager::Impl::PrepareCurrentThreadToFork() +{ + QCMutex* const mutex = gNetDispatch.GetMutex(); + if (! mutex) { + return; + } + assert(! 
mPrepareToForkFlag && mutex->IsOwned());
+    mPrepareToForkFlag = true;
+    mPrepareToForkCnt = 0;
+    for (int i = 0; i < mClientThreadCount; i++) {
+        mClientThreads[i].Wakeup();
+    }
+    globalNetManager().Wakeup();
+    while (mPrepareToForkCnt < mClientThreadCount) {
+        mPrepareToForkDoneCond.Wait(*mutex);
+    }
+    mPrepareToForkFlag = false;
+    mPrepareToForkCnt = 0;
+    // Resume threads after fork completes and the lock gets released.
+    mForkDoneCond.NotifyAll();
+}
+
+ClientManager::ClientManager()
+    : mImpl(*(new Impl()))
+{
+}
+
+/* virtual */
+ClientManager::~ClientManager()
+{
+    delete &mImpl;
+};
+
+bool
+ClientManager::StartAcceptor(int port, int threadCount, int startCpuAffinity)
+{
+    return mImpl.StartAcceptor(port, threadCount, startCpuAffinity);
+}
+
+void
+ClientManager::Shutdown()
+{
+    mImpl.Shutdown();
+}
+
+void
+ClientManager::ChildAtFork()
+{
+    mImpl.ChildAtFork();
+}
+
+void
+ClientManager::PrepareCurrentThreadToFork()
+{
+    mImpl.PrepareCurrentThreadToFork();
+}
+
+QCMutex&
+ClientManager::GetMutex()
+{
+    return mImpl.GetMutex();
+}
+
+/* static */ bool
+ClientManager::EnqueueSelf(ClientManager::ClientThread* thread, MetaRequest& op)
+{
+    assert(thread);
+    if (! op.clnt) {
+        delete &op;
+    } else if (thread->IsStarted()) {
+        thread->Enqueue(op);
+    } else {
+        op.next = &op;
+        op.clnt->HandleEvent(EVENT_CMD_DONE, &op);
+    }
+    return true;
+}
+
+/* static */ void
+ClientManager::SubmitRequestSelf(ClientManager::ClientThread* thread,
+    MetaRequest& op)
+{
+    assert(thread);
+    thread->Add(op);
+}
+
+} // namespace KFS
diff --git a/src/cc/meta/NetDispatch.h b/src/cc/meta/NetDispatch.h
new file mode 100644
index 000000000..94c152032
--- /dev/null
+++ b/src/cc/meta/NetDispatch.h
@@ -0,0 +1,77 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Created 2006/06/01
+// Author: Sriram Rao, Mike Ovsiannikov
+//
+// Copyright 2008-2012 Quantcast Corp.
+// Copyright 2006-2008 Kosmix Corp.
+// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +// \file NetDispatch.h +// \brief Meta-server request processor(s). +// +//---------------------------------------------------------------------------- + +#ifndef META_NETDISPATCH_H +#define META_NETDISPATCH_H + +#include "ClientManager.h" +#include "ChunkServerFactory.h" + +#include + +class QCMutex; + +namespace KFS +{ +using std::ostream; +class Properties; +class IOBuffer; + +class NetDispatch +{ +public: + NetDispatch(); + ~NetDispatch(); + bool Start(int clientAcceptPort, int chunkServerAcceptPort); + //!< Dispatch completed request. 
+ void Dispatch(MetaRequest* r); + void SetParameters(const Properties& props); + void GetStatsCsv(ostream& os); + void GetStatsCsv(IOBuffer& buf); + int64_t GetUserCpuMicroSec() const; + int64_t GetSystemCpuMicroSec() const; + QCMutex* GetMutex() const { return mMutex; } + QCMutex* GetClientManagerMutex() const { return mClientManagerMutex; } + bool IsRunning() const { return mRunningFlag; } + void ChildAtFork(); + void PrepareCurrentThreadToFork(); + inline void PrepareToFork(); +private: + ClientManager mClientManager; //!< tracks the connected clients + ChunkServerFactory mChunkServerFactory; //!< creates chunk servers when they connect + QCMutex* mMutex; + QCMutex* mClientManagerMutex; + bool mRunningFlag; + int mClientThreadCount; + int mClientThreadsStartCpuAffinity; +}; + +extern NetDispatch gNetDispatch; +} + +#endif // META_NETDISPATCH_H diff --git a/src/cc/meta/Replay.cc b/src/cc/meta/Replay.cc new file mode 100644 index 000000000..1cc6e09b9 --- /dev/null +++ b/src/cc/meta/Replay.cc @@ -0,0 +1,1056 @@ +/*! + * $Id$ + * + * \file Replay.cc + * \brief transaction log replay + * \author Blake Lewis (Kosmix Corp.) + * Mike Ovsiannikov + * + * Copyright 2008-2012 Quantcast Corp. + * Copyright 2006-2008 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */
+
+#include "Logger.h"
+#include "Replay.h"
+#include "Restorer.h"
+#include "util.h"
+#include "DiskEntry.h"
+#include "kfstree.h"
+#include "LayoutManager.h"
+#include "common/MdStream.h"
+#include "common/MsgLogger.h"
+#include "qcdio/QCUtils.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+namespace KFS
+{
+using std::ostringstream;
+using std::atoi;
+
+Replay replayer;
+
+/*!
+ * \brief open saved log file for replay
+ * \param[in] p a path in the form "<logdir>/log.<number>"
+ */
+int
+Replay::openlog(const string &p)
+{
+    if (file.is_open()) {
+        file.close();
+    }
+    KFS_LOG_STREAM_INFO <<
+        "open log file: " << p.c_str() <<
+    KFS_LOG_EOM;
+    int num = -1;
+    const string::size_type dot = p.rfind('.');
+    if (dot != string::npos) {
+        const char* const ptr = p.c_str() + dot + 1;
+        if (*ptr != 0) {
+            char* end = 0;
+            const long val = strtol(ptr, &end, 10);
+            num = (int)val;
+            if (val != num || *end != 0) {
+                num = -1;
+            }
+        }
+    }
+    if (num < 0) {
+        KFS_LOG_STREAM_FATAL <<
+            p << ": invalid log file name" <<
+        KFS_LOG_EOM;
+        return -EINVAL;
+    }
+    file.open(p.c_str());
+    if (file.fail()) {
+        const int err = errno;
+        KFS_LOG_STREAM_FATAL <<
+            p << ": " << QCUtils::SysError(err) <<
+        KFS_LOG_EOM;
+        return (err > 0 ? -err : (err == 0 ? -1 : err));
+    }
+    number = num;
+    path = p;
+    return 0;
+}
+
+/*!
+ * \brief check log version
+ * format: version/<number>
+ */
+static bool
+replay_version(DETokenizer& c)
+{
+    fid_t vers;
+    bool ok = pop_fid(vers, "version", c, true);
+    return (ok && vers == Logger::VERSION);
+}
+
+/*!
+ * \brief handle common prefix for all log records
+ */
+static bool
+pop_parent(fid_t &id, DETokenizer& c)
+{
+    c.pop_front(); // get rid of record type
+    return pop_fid(id, "dir", c, true);
+}
+
+/*!
+ * \brief update the seed of a UniqueID with what is passed in.
+ * Since this function is called in the context of log replay, it + * better be the case that the seed passed in is higher than + * the id's seed (which was set from a checkpoint file). +*/ +static void +updateSeed(UniqueID &id, seqid_t seed) +{ + if (seed < id.getseed()) { + ostringstream os; + os << "seed from log: " << seed << + " < id's seed: " << id.getseed(); + panic(os.str(), false); + } + id.setseed(seed); +} + +/*! + * \brief replay a file create + * format: create/dir//name//id/{/ctime/