Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
Saving and restoring the NodeManager state
Browse files Browse the repository at this point in the history
Summary: We are able to save and restore the NodeManager state now.

Test Plan:
I could not find a way to test this code end to end. I tested it in two basic ways: (a) printing out what was being read, and (b) after recovering from safe mode
and reconstructing the state, persisting the state again to a different file and comparing the two files. The second method, however, requires turning off
compression and turning on pretty-printing.

Reviewers: dms, rvadali, aching

Reviewed By: dms

CC: security-diffs@lists

Task ID: 1112019
  • Loading branch information
gauravmenghani authored and Alex Feinberg committed Aug 14, 2012
1 parent 603e1b6 commit 80128bb
Show file tree
Hide file tree
Showing 10 changed files with 835 additions and 44 deletions.
2 changes: 1 addition & 1 deletion ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@
conf="common->master"/>
<dependency org="org.codehaus.jackson"
name="jackson-mapper-asl"
rev="1.0.1"
rev="1.7.9"
conf="common->default"/>
</dependencies>

Expand Down
2 changes: 1 addition & 1 deletion src/contrib/benchmark/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
conf="common->default"/>
<dependency org="org.codehaus.jackson"
name="jackson-mapper-asl"
rev="1.0.1"
rev="1.7.9"
conf="common->default"/>
</dependencies>
</ivy-module>
2 changes: 1 addition & 1 deletion src/contrib/corona/ivy/libraries.properties
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ checkstyle.version=5.0

guava.version=r09

jackson.version=1.0.1
jackson.version=1.7.9

json.version=20090211

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*/
package org.apache.hadoop.corona;

import java.io.*;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.*;

Expand All @@ -27,9 +27,11 @@
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.mapred.Clock;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.CoronaSerializer;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.thrift.TApplicationException;
import org.apache.thrift.TException;
import org.codehaus.jackson.JsonGenerator;

/**
* Manager of all the resources of the cluster.
Expand Down Expand Up @@ -87,20 +89,48 @@ public ClusterManager() { }
* Primary constructor.
*
* @param conf Configuration to be used
* @param recoverFromDisk True if we are restarting after going down while
* in Safe Mode
* @throws IOException
*/
public ClusterManager(Configuration conf, boolean recoverFromDisk)
throws IOException {
// Wrap the generic Configuration in a CoronaConf and delegate to the
// CoronaConf-based constructor, passing recoverFromDisk through unchanged.
this(new CoronaConf(conf), recoverFromDisk);
}

/**
* Constructs a ClusterManager when the caller does not specify whether we
* are restarting from persisted state. In this case the recoverFromDisk
* flag is assumed to be false.
*
* @param conf Configuration to be used
* @throws IOException
*/
public ClusterManager(Configuration conf) throws IOException {
this(new CoronaConf(conf));
this(new CoronaConf(conf), false);
}

/**
* Construct ClusterManager given {@link CoronaConf}
*
* @param conf the configuration for the ClusterManager
* @param recoverFromDisk true if we are restarting after going down while
* in Safe Mode
* @throws IOException
*/
public ClusterManager(CoronaConf conf) throws IOException {
public ClusterManager(CoronaConf conf, boolean recoverFromDisk)
throws IOException {
this.conf = conf;
HostsFileReader hostsReader =
new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile());

if (recoverFromDisk) {
recoverClusterManagerFromDisk(hostsReader);
} else {
nodeManager = new NodeManager(this, hostsReader);
nodeManager.setConf(conf);
}

initLegalTypes();

metrics = new ClusterManagerMetrics(getTypes());
Expand All @@ -111,11 +141,6 @@ public ClusterManager(CoronaConf conf) throws IOException {
sessionHistoryManager = new SessionHistoryManager();
sessionHistoryManager.setConf(conf);

HostsFileReader hostsReader =
new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile());
nodeManager = new NodeManager(this, hostsReader);
nodeManager.setConf(conf);

sessionNotifier = new SessionNotifier(sessionManager, this, metrics);
sessionNotifier.setConf(conf);

Expand All @@ -134,7 +159,34 @@ public ClusterManager(CoronaConf conf) throws IOException {

startTime = clock.getTime();
hostName = infoSocAddr.getHostName();
safeMode = false;
setSafeMode(false);
}

/**
* Restores the ClusterManager state by reading back the serialized state
* through a {@link CoronaSerializer} built from this manager's configuration.
*
* The reads below consume the stream in a fixed order: the start of the
* ClusterManager object, the "nodeManager" field, the NodeManager itself,
* and finally the end of the ClusterManager object. This mirrors the order
* in which persistState() writes them, so the two methods must be kept in
* sync when more components are persisted.
*
* Side effects: sets safeMode to true and assigns the nodeManager field.
*
* @param hostsReader The HostsFileReader instance handed to the restored
* NodeManager
* @throws IOException declared by this method; presumably raised when the
* persisted state cannot be read or parsed (confirm against CoronaSerializer)
*/
private void recoverClusterManagerFromDisk(HostsFileReader hostsReader)
throws IOException {
LOG.info("Recovering from Safe Mode");

// This will prevent the expireNodes thread from expiring the nodes
// The field is assigned directly here rather than through setSafeMode().
safeMode = true;

CoronaSerializer coronaSerializer = new CoronaSerializer(conf);

// Expecting the START_OBJECT token for ClusterManager
coronaSerializer.readStartObjectToken("ClusterManager");

// The "nodeManager" field is the only one persistState() writes at present.
coronaSerializer.readField("nodeManager");
// The NodeManager receives the same serializer, so it presumably continues
// reading its own state from the current stream position (confirm in
// NodeManager).
nodeManager = new NodeManager(this, hostsReader, coronaSerializer);
nodeManager.setConf(conf);
nodeManager.restoreAfterSafeModeRestart();

// Expecting the END_OBJECT token for ClusterManager
coronaSerializer.readEndObjectToken("ClusterManager");
}

/**
Expand Down Expand Up @@ -452,6 +504,10 @@ public synchronized boolean setSafeMode(boolean safeMode) {
return true;
}

/**
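* Saves the state of the ClusterManager to disk. The state can only be
* persisted while the ClusterManager is in safe mode; currently only the
* NodeManager state is written.
* @return True if the state was saved successfully; false if the
* ClusterManager is not in safe mode or the state could not be written.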
*/
@Override
public boolean persistState() {
if (!safeMode) {
Expand All @@ -460,6 +516,21 @@ public boolean persistState() {
return false;
}

try {
  JsonGenerator jsonGenerator =
    CoronaSerializer.createJsonGenerator(conf);
  try {
    // The whole ClusterManager state is written as one JSON object; the
    // field order here must match what recoverClusterManagerFromDisk reads.
    jsonGenerator.writeStartObject();

    jsonGenerator.writeFieldName("nodeManager");
    nodeManager.write(jsonGenerator);
    // TODO Write the sessionManager and other objects

    jsonGenerator.writeEndObject();
  } finally {
    // Always release the generator, even when a write fails part-way
    // through; otherwise it is leaked on the failure path.
    jsonGenerator.close();
  }
} catch (IOException e) {
  // Failing to persist the state is an error condition, not routine info.
  LOG.error("Could not persist the state: ", e);
  return false;
}
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import java.net.ServerSocket;
import java.net.Socket;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
Expand All @@ -16,7 +22,7 @@
public class ClusterManagerServer extends Thread {
public static final Log LOG = LogFactory.getLog(ClusterManagerServer.class);

static{
static {
Configuration.addDefaultResource("mapred-default.xml");
Configuration.addDefaultResource("mapred-site.xml");
Utilities.makeProcessExitOnUncaughtException(LOG);
Expand Down Expand Up @@ -62,10 +68,24 @@ public void run() {
}

public static void main(String[] args)
throws IOException, TTransportException {
throws IOException, TTransportException, ParseException {
StringUtils.startupShutdownMessage(ClusterManager.class, args, LOG);
Configuration conf = new Configuration();
ClusterManager cm = new ClusterManager(conf);
boolean recoverFromDisk = false;
// Check if we want to start the ClusterManager to restore the persisted
// state
Option recoverFromDiskOption =
new Option("recoverFromDisk",
"Used to restart the CM from the state persisted on disk");
Options options = new Options();
options.addOption(recoverFromDiskOption);
CommandLineParser parser = new GnuParser();
CommandLine line = parser.parse(options, args);

if (line.hasOption("recoverFromDisk")) {
recoverFromDisk = true;
}
ClusterManager cm = new ClusterManager(conf, recoverFromDisk);
try {
ClusterManagerServer server = new ClusterManagerServer(conf, cm);
server.start();
Expand Down
Loading

0 comments on commit 80128bb

Please sign in to comment.