[WIP] init trainer log #8

Open · wants to merge 2 commits into main
163 changes: 133 additions & 30 deletions auto_log/autolog.py
@@ -25,22 +25,23 @@
from .env import get_env_info
from .utils import Times
from .device import MemInfo, SubprocessGetMem
+from .smoothed_meter import SmoothedMeter


class RunConfig:
-    def __init(self,
+    def __init__(self,
run_devices="cpu",
ir_optim=False,
enable_tensorrt=False,
enable_mkldnn=False,
cpu_threads=0,
enable_mem_optim=True):

self.run_devices = run_devices
self.ir_optim = ir_optim
self.enable_mkldnn = enable_mkldnn
self.enable_tensorrt = enable_tensorrt
-        self.cpu_math_library_num_threads = self.cpu_threads
+        self.cpu_math_library_num_threads = cpu_threads
self.enable_mem_optim = enable_mem_optim


@@ -52,10 +53,12 @@ def __init__(self,
data_shape,
save_path,
inference_config=None,
-                 pids=None,
-                 process_name=None,
-                 gpu_ids=None,
-                 time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
+                 pids=None,
+                 process_name=None,
+                 gpu_ids=None,
+                 time_keys=[
+                     'preprocess_time', 'inference_time', 'postprocess_time'
+                 ],
warmup=0,
logger=None,
**kwargs):
@@ -71,17 +74,17 @@ def __init__(self,
self.config_status = self.parse_config(self.paddle_infer_config)

self.time_keys = time_keys
-        self.times = Times(keys=time_keys,warmup=warmup)
+        self.times = Times(keys=time_keys, warmup=warmup)

self.get_paddle_info()

self.logger = self.init_logger() if logger is None else logger

self.get_mem = SubprocessGetMem(pid=pids, gpu_id=gpu_ids)
self.start_subprocess_get_mem()
self.pids = pids
self.gpu_ids = gpu_ids

def start_subprocess_get_mem(self):
self.get_mem.get_mem_subprocess_run(0.2)

@@ -95,10 +98,10 @@ def end_subprocess_get_mem(self):
else:
self.gpu_infos = gpu_infos[str(self.gpu_ids)]
return self.cpu_infos, self.gpu_infos

def init_logger(self, name='root', log_level=logging.DEBUG):
log_file = self.save_path

logger = logging.getLogger(name)

formatter = logging.Formatter(
@@ -128,7 +131,8 @@ def parse_config(self, config) -> dict:
"""
config_status = {}
if config is not None and type(config) is not dict:
config_status['runtime_device'] = "gpu" if config.use_gpu() else "cpu"
config_status['runtime_device'] = "gpu" if config.use_gpu(
) else "cpu"
config_status['ir_optim'] = config.ir_optim()
config_status['enable_tensorrt'] = config.tensorrt_engine_enabled()
config_status['precision'] = self.precision
@@ -137,12 +141,18 @@ def parse_config(self, config) -> dict:
'cpu_math_library_num_threads'] = config.cpu_math_library_num_threads(
)
elif type(config) is dict:
-            config_status['runtime_device'] = config['runtime_device'] if 'runtime_device' in config else None
-            config_status['ir_optim'] = config['ir_optim'] if 'ir_optim' in config else None
-            config_status['enable_tensorrt'] = config['enable_tensorrt'] if 'enable_tensorrt' in config else None
-            config_status['precision'] = config['precision'] if 'precision' in config else None
-            config_status['enable_mkldnn'] = config['enable_mkldnn'] if 'enable_mkldnn' in config else None
-            config_status['cpu_math_library_num_threads'] = config['cpu_math_library_num_threads'] if 'cpu_math_library_num_threads' in config else None
+            # missing keys default to None
+            config_status['runtime_device'] = config.get('runtime_device')
+            config_status['ir_optim'] = config.get('ir_optim')
+            config_status['enable_tensorrt'] = config.get('enable_tensorrt')
+            config_status['precision'] = config.get('precision')
+            config_status['enable_mkldnn'] = config.get('enable_mkldnn')
+            config_status['cpu_math_library_num_threads'] = config.get(
+                'cpu_math_library_num_threads')
else:
config_status['runtime_device'] = "None"
config_status['ir_optim'] = "None"
@@ -178,27 +188,32 @@ def report(self, identifier=None):

# report memory
cpu_infos, gpu_infos = self.end_subprocess_get_mem()

cpu_rss_mb = self.cpu_infos['cpu_rss']
-        gpu_rss_mb = self.gpu_infos['memory.used'] if self.gpu_ids is not None else None
-        gpu_util = self.gpu_infos['utilization.gpu'] if self.gpu_ids is not None else None
+        gpu_rss_mb = (self.gpu_infos['memory.used']
+                      if self.gpu_ids is not None else None)
+        gpu_util = (self.gpu_infos['utilization.gpu']
+                    if self.gpu_ids is not None else None)

# report env
envs = get_env_info()

self.logger.info("\n")
self.logger.info(
"---------------------- Env info ----------------------")
        # envs['nvidia_driver_version'], envs['cudnn_version'], envs['cuda_version'], envs['os_info']
self.logger.info(f"{identifier} OS_version: {envs['os_info']}")
self.logger.info(f"{identifier} CUDA_version: {envs['cuda_version']}")
self.logger.info(f"{identifier} CUDNN_version: {envs['cudnn_version']}")
self.logger.info(f"{identifier} drivier_version: {envs['nvidia_driver_version']}")
self.logger.info(
f"{identifier} CUDNN_version: {envs['cudnn_version']}")
self.logger.info(
f"{identifier} drivier_version: {envs['nvidia_driver_version']}")
self.logger.info(
"---------------------- Paddle info ----------------------")
self.logger.info(f"{identifier} paddle_version: {self.paddle_version}")
self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}")
self.logger.info(f"{identifier} log_api_version: {self.autolog_version}")
self.logger.info(
f"{identifier} log_api_version: {self.autolog_version}")
self.logger.info(
"----------------------- Conf info -----------------------")
self.logger.info(
@@ -211,7 +226,8 @@ def report(self, identifier=None):
f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}"
)
self.logger.info(
f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}")
f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}"
)
self.logger.info(
f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}"
)
@@ -229,8 +245,7 @@ def report(self, identifier=None):
self.logger.info(
f"{identifier} cpu_rss(MB): {cpu_rss_mb}, gpu_rss(MB): {gpu_rss_mb}, gpu_util: {gpu_util}%"
)
-        self.logger.info(
-            f"{identifier} total time spent(s): {total_time_s}")
+        self.logger.info(f"{identifier} total time spent(s): {total_time_s}")
self.logger.info(
f"{identifier} preprocess_time(ms): {preprocess_time_ms}, inference_time(ms): {inference_time_ms}, postprocess_time(ms): {postprocess_time_ms}"
)
@@ -259,9 +274,97 @@ def print_help(self):
""")


class TrainerLogger(object):
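    # Aggregates per-batch training metrics (samples, reader cost, batch cost,
    # loss, and any fields registered via add_record_info) in SmoothedMeter
    # instances, logging "local" averages per logging window and "global"
    # averages at epoch end.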
def __init__(self, output_dir=None, mode="train"):
self.mode = mode.upper()
self.logger = self.init_logger(output_dir)

self.smoothed_infos = dict()

self.init_fixed_fields()

    def init_fixed_fields(self):
fixed_unit_maps = {
"avg_samples": "",
"avg_reader_cost": "sec",
"avg_batch_cost": "sec",
}
if self.mode == "TRAIN":
fixed_unit_maps["loss"] = ""

for key in fixed_unit_maps:
self.smoothed_infos[key] = SmoothedMeter(
key, unit=fixed_unit_maps[key])

def add_record_info(self, key, unit=""):
self.smoothed_infos[key] = SmoothedMeter(key, unit=unit)

def update(self, samples, reader_cost, run_cost, loss, **kwargs):
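        # extra metrics passed through **kwargs must be registered with
        # add_record_info() first, otherwise the key lookup below fails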
self.smoothed_infos["avg_samples"].update(samples)
self.smoothed_infos["avg_reader_cost"].update(reader_cost)
self.smoothed_infos["avg_batch_cost"].update(run_cost + reader_cost)
self.smoothed_infos["loss"].update(loss)
for key in kwargs:
self.smoothed_infos[key].update(kwargs[key])

def get_ips_info(self, scope="local"):
if self.smoothed_infos["avg_samples"].get_sum(scope=scope) == 0:
return ""
ips_info = "avg_ips: {:.7f} images/sec".format(
self.smoothed_infos["avg_samples"].get_sum(scope=scope) /
self.smoothed_infos["avg_batch_cost"].get_sum(scope=scope))
return ips_info

def log(self, epoch, batch_idx=None):
if batch_idx is None:
scope = "global"
epoch_info = "[end of epoch {}]".format(epoch)
else:
scope = "local"
epoch_info = "[epoch {}, batch_idx {}]".format(epoch, epoch)

mode_info = "[{}]".format(self.mode)
smooth_val_info = ", ".join([
self.smoothed_infos[key].log(scope=scope)
for key in self.smoothed_infos
])
ips_info = self.get_ips_info(scope=scope)

msg = "{}{}{}, {}".format(epoch_info, mode_info, smooth_val_info,
ips_info)
self.logger.info(msg)
self.reset(scope=scope)

def reset(self, scope="local"):
for key in self.smoothed_infos:
self.smoothed_infos[key].reset(scope=scope)

    def init_logger(self, save_path, name='root', log_level=logging.DEBUG):
        # output_dir defaults to None, so only set up the log file when a
        # directory is actually given
        log_file = None
        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            log_file = os.path.join(save_path, "train.log")

logger = logging.getLogger(name)

formatter = logging.Formatter(
'[%(asctime)s] %(name)s %(levelname)s: %(message)s',
datefmt="%Y/%m/%d %H:%M:%S")

stream_handler = logging.StreamHandler(stream=sys.stdout)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
if log_file is not None:
dir_name = os.path.dirname(log_file)
if len(dir_name) > 0 and not os.path.exists(dir_name):
os.makedirs(dir_name)
file_handler = logging.FileHandler(log_file, 'w')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(log_level)
return logger


# if __name__ == "__main__":
# get_os_info()
# print(envs['os_info'])
# get_cudnn_info()
# print(envs['cudnn_version'])

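A minimal usage sketch for the new TrainerLogger (not part of the diff; the
output directory, batch values, and import path are assumptions based on the
code above):

    from auto_log.autolog import TrainerLogger

    trainer_log = TrainerLogger(output_dir="./train_log", mode="train")
    for epoch in range(2):
        for batch_idx in range(100):
            # reader_cost / run_cost would normally come from timers around
            # data loading and the train step; constants keep the sketch short
            trainer_log.update(
                samples=32, reader_cost=0.01, run_cost=0.09, loss=0.5)
            if batch_idx % 10 == 0:
                trainer_log.log(epoch, batch_idx)  # local window, then reset
        trainer_log.log(epoch)  # global epoch averages, then full reset
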
59 changes: 59 additions & 0 deletions auto_log/smoothed_meter.py
@@ -0,0 +1,59 @@
class SmoothedMeter(object):
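    # Tracks a weighted running sum/count at two scopes: a "local" window
    # cleared by every reset() call, and a "global" accumulator cleared only
    # by reset(scope="global"); update(value, n) adds value * n and n to both.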
def __init__(self, key, unit=""):
# basic info
self.key = key
self.unit = unit
# value and count for local average
self.local_sum = 0
self.local_count = 0
        # value and count over all updates, for the global average
self.global_sum = 0
self.global_count = 0
self.fmt = None

def reset(self, scope="global"):
self.local_sum = 0
self.local_count = 0

if scope == "global":
self.global_sum = 0
self.global_count = 0
return

def update(self, value, n=1):
        assert isinstance(value, (int, float)), \
            "value must be int or float but got {}".format(value)
if self.fmt is None:
self.fmt = "{}: {:.7f} {}" if isinstance(value,
float) else "{}: {} {}"
self.local_count += n
self.local_sum += value * n

self.global_sum += value * n
self.global_count += n

def _average(self, scope="local"):
        assert self.fmt is not None, "update must be called before _average"
if scope == "local":
if self.local_count <= 0:
return ""
else:
return self.fmt.format(self.key, self.local_sum /
self.local_count, self.unit)
else:
if self.global_count <= 0:
return ""
else:
return self.fmt.format(self.key, self.global_sum /
self.global_count, self.unit)

def get_sum(self, scope="local"):
if scope == "local":
return self.local_sum
else:
return self.global_sum

def log(self, scope="local"):
info = self._average(scope=scope)
return info
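
A minimal sketch of the meter's two scopes (assumed usage, not part of the
diff):

    from auto_log.smoothed_meter import SmoothedMeter

    meter = SmoothedMeter("avg_batch_cost", unit="sec")
    for cost in [0.12, 0.10, 0.11]:
        meter.update(cost)
    print(meter.log(scope="local"))   # avg_batch_cost: 0.1100000 sec
    meter.reset(scope="local")        # clears the window, keeps global sums
    meter.update(0.09)
    print(meter.log(scope="local"))   # avg_batch_cost: 0.0900000 sec
    print(meter.log(scope="global"))  # 0.1050000: average of all four updates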