diff --git a/README.md b/README.md index 36a4267c..d8fd46bd 100644 --- a/README.md +++ b/README.md @@ -225,7 +225,7 @@ Updates to the parameters can be done by calling `NetworkConf$update.variables(. - `edge.attributes` * The list of edge-attribute names and information * a subset of the following as a single vector: - - timestamp information: *`"date"`*, + - timestamp information: *`"date"`*, `"date.offset"` - author information: `"author.name"`, `"author.email"` - committer information: `"committer.date"`, `"committer.name"`, `"committer.email"` - e-mail information: *`"message.id"`*, *`"thread"`*, `"subject"` diff --git a/showcase.R b/showcase.R index a3e1119c..06982619 100644 --- a/showcase.R +++ b/showcase.R @@ -140,6 +140,29 @@ y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) # plot.network(g) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Vertex attributes ------------------------------------------------------- + +# ## define bins for network construction +# mybins = c("2012-07-10 15:58:00", "2012-07-15 16:02:00", "2012-07-20 16:04:00", "2012-07-25 16:06:30") +# ## split data into ranges +# cf.data = split.data.time.based(x.data, bins = mybins) +# ## construct (author) networks from range data +# my.networks = lapply(cf.data, function(range.data) { +# y = NetworkBuilder$new(project.data = range.data, network.conf = net.conf) +# return (y$get.author.network()) +# }) +# ## add commit-count vertex attributes +# sample = add.vertex.attribute.commit.count.author(my.networks, x.data, aggregation.level = "range") +# sample.cumulative = add.vertex.attribute.commit.count.author(my.networks, x.data, aggregation.level = "cumulative") +# ## add email-address vertex attribute +# sample.mail = add.vertex.attribute.author.email(my.networks, x.data, "author.email") + +# ## add vertex attributes for the project-level network +# x.net.as.list = list("1970-01-01 00:00:00-2030-01-01 00:00:00" = x$get.author.network()) +# sample.entire = add.vertex.attribute.commit.count.author(x.net.as.list, x.data, aggregation.level = "complete") + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Bulk methods for Codeface ranges ---------------------------------------- diff --git a/tests/test-network-covariates.R b/tests/test-network-covariates.R new file mode 100644 index 00000000..d3e58737 --- /dev/null +++ b/tests/test-network-covariates.R @@ -0,0 +1,528 @@ +## (c) Felix Prasse, 2017 +## prassefe@fim.uni-passau.de + +## +## Context +## + +CF.DATA = file.path(".", "codeface-data") +CF.SELECTION.PROCESS = "testing" +CASESTUDY = "test" +ARTIFACT = "feature" +AGGREGATION.LEVELS = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete") + +## use only when debugging this file independently +if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Global information ------------------------------------------------------ + +mybins = as.POSIXct(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", "2016-08-31 18:00:00")) +myranges = construct.ranges(mybins, sliding.window = FALSE) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Utility functions ------------------------------------------------------- + +#' Load test data and generate test networks +#' +#' @return Tuple containing project data and list of networks 
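+#' @param network.type The type of networks to construct for each range, either "author" or
+#'                     "artifact" [default: "author"]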
+get.network.covariates.test.networks = function(network.type = c("author", "artifact")) {
+
+    network.type.function = paste("get", match.arg(network.type), "network", sep = ".")
+
+    ## configuration and data objects
+    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+    proj.conf$update.value("artifact.filter.base", FALSE)
+    net.conf = NetworkConf$new()
+    net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+
+    ## retrieve project data and network builder
+    project.data = ProjectData$new(proj.conf)
+
+    ## split data
+    input.data = split.data.time.based(project.data, bins = mybins)
+    input.data.networks = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)[[network.type.function]]())
+
+    return(list("networks" = input.data.networks, "project.data" = project.data))
+}
+
+#' Get test data split by networks, for each aggregation level
+#'
+#' @param network.type The type of networks to construct for each range, either "author" or
+#'                     "artifact" [default: "author"]
+#'
+#' @return The split test data, one result of \code{split.data.by.networks} per aggregation level
+get.network.covariates.test.networks.data = function(network.type = c("author", "artifact")) {
+    networks.and.data = get.network.covariates.test.networks(network.type)
+
+    ## split data by networks
+    results = lapply(AGGREGATION.LEVELS, function(level)
+        split.data.by.networks(networks.and.data[["networks"]], networks.and.data[["project.data"]], level)
+    )
+    names(results) = AGGREGATION.LEVELS
+
+    return(results)
+}
+
+#' Sample computation callback
+#'
+#' @param range The range identifier
+#' @param range.data The current range data
+#' @param current.network The current network
+#'
+#' @return A list containing the value 1 for each author except "Olaf"
+test.compute.attr = function(range, range.data, current.network) {
+    authors = range.data$get.authors()[["author.name"]]
+
+    ## Olaf should get default value
+    authors = authors[-which(authors == "Olaf")]
+
+    attributes = lapply(authors, function(name) 1)
+    names(attributes) = authors
+    return(attributes)
+}
+
+#' Build list with appropriate range names
+#'
+#' @param x Value for first range
+#' @param y Value for second range
+#' @param z Value for third range
+#'
+#' @return The list of x, y, z with range names
+network.covariates.test.build.expected = function(x, y, z) {
+    arguments = list(x, y, z)
+    names(arguments) = myranges
+
+    return(arguments)
+}
+
+#' Return the arguments as a list after calling as.POSIXct on them
+#'
+#' @param ... The arguments to convert
+#'
+#' @return The list containing the arguments as POSIXct
+posixList = function(...) 
{ + return(lapply(list(...), as.POSIXct)) +} + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Unit tests for author networks ------------------------------------------ + +#' Test the add.vertex.attribute method +test_that("Test add.vertex.attribute", { + + ## Test setup + + test.networks = get.network.covariates.test.networks.data() + expected.attributes = network.covariates.test.build.expected(list(1), list(42), list(42, 1, 1)) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute(test.networks[[level]], "test.attr", 42, + test.compute.attr) + + actual.attributes = lapply(networks.with.attr, function(net) igraph::V(net)$test.attr) + expect_identical(expected.attributes, actual.attributes) + }) +}) + +#' Test the split.and.add.vertex.attribute method +test_that("Test split.and.add.vertex.attribute", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = network.covariates.test.build.expected(list(1), list(42), list(42, 1, 1)) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = split.and.add.vertex.attribute(networks.and.data[["networks"]], + networks.and.data[["project.data"]], + "test.attr", level, 42, test.compute.attr) + + actual.attributes = lapply(networks.with.attr, function(net) igraph::V(net)$test.attr) + expect_identical(expected.attributes, actual.attributes) + }) +}) + +#' Test the add.vertex.attribute.commit.count.author method +test_that("Test add.vertex.attribute.commit.count.author", { + ## Test setup + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = list( + range = network.covariates.test.build.expected(list(1L), list(1L), list(1L, 1L, 1L)), + cumulative = network.covariates.test.build.expected(list(1L), list(1L), list(2L, 1L, 1L)), + all.ranges = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)), + project.cumulative = network.covariates.test.build.expected(list(1L), list(1L), list(2L, 1L, 1L)), + project.all.ranges = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)), + complete = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.commit.count.author( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "commit.count") + + expect_identical(expected.attributes[[level]], actual.attributes) + }) +}) + +#' Test the add.vertex.attribute.author.email method +test_that("Test add.vertex.attribute.author.email", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = network.covariates.test.build.expected( + list("hunsen@fim.uni-passau.de"), + list("olaf@example.org"), + list("olaf@example.org", "karl@example.org", "thomas@example.org") + ) + + ## Test + + networks.with.attr = add.vertex.attribute.author.email( + networks.and.data[["networks"]], networks.and.data[["project.data"]] + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "author.email") + + expect_identical(expected.attributes, actual.attributes) +}) + +#' Test the add.vertex.attribute.artifact.count method +test_that("Test add.vertex.attribute.artifact.count", { + + ## Test setup + + networks.and.data = 
get.network.covariates.test.networks() + + expected.attributes = list( + range = network.covariates.test.build.expected(list(1L), list(1L), list(1L, 1L, 1L)), + cumulative = network.covariates.test.build.expected(list(1L), list(1L), list(2L, 1L, 1L)), + all.ranges = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)), + project.cumulative = network.covariates.test.build.expected(list(1L), list(1L), list(2L, 1L, 1L)), + project.all.ranges = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)), + complete = network.covariates.test.build.expected(list(1L), list(2L), list(2L, 1L, 1L)) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.artifact.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "artifact.count") + + expect_identical(expected.attributes[[level]], actual.attributes) + }) +}) + +#' Test the add.vertex.attribute.first.activity method +test_that("Test add.vertex.attribute.first.activity", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = list( + range = list( + mails = network.covariates.test.build.expected( + list("2016-07-12 15:58:40"), + list(NA), + list("2016-07-12 16:05:37", NA, NA) + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list(NA), + list("2016-07-27 22:25:25", NA, "2016-07-14 02:03:14") + ) + ), + cumulative = list( + mails = network.covariates.test.build.expected( + list("2016-07-12 15:58:40"), + list("2016-07-12 15:58:50"), + list("2016-07-12 15:58:50", NA, "2016-07-12 16:04:40") + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:00:45", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list(NA), + list("2016-07-27 22:25:25", NA, "2016-07-12 15:59:25") + ) + ), + all.ranges = list( + mails = network.covariates.test.build.expected( + list("2016-07-12 15:58:40"), + list("2016-07-12 15:58:50"), + list("2016-07-12 15:58:50", NA, "2016-07-12 16:04:40") + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:00:45", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list("2016-07-27 22:25:25"), + list("2016-07-27 22:25:25", NA, "2016-07-12 15:59:25") + ) + ), + project.cumulative = list( + mails = network.covariates.test.build.expected( + list("2004-10-09 18:38:13"), + list("2016-07-12 15:58:50"), + list("2016-07-12 15:58:50", NA, "2016-07-12 16:04:40") + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:00:45", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list("2013-05-25 20:02:08"), + list("2013-05-25 20:02:08", "2013-04-21 23:52:09", "2016-07-12 15:59:25") + ) + ), + project.all.ranges = list( + mails = 
network.covariates.test.build.expected( + list("2004-10-09 18:38:13"), + list("2016-07-12 15:58:50"), + list("2016-07-12 15:58:50", NA, "2016-07-12 16:04:40") + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:00:45", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list("2013-05-25 20:02:08"), + list("2013-05-25 20:02:08", "2013-04-21 23:52:09", "2016-07-12 15:59:25") + ) + ), + complete = list( + mails = network.covariates.test.build.expected( + list("2004-10-09 18:38:13"), + list("2016-07-12 15:58:50"), + list("2016-07-12 15:58:50", NA, "2016-07-12 16:04:40") + ), + commits = network.covariates.test.build.expected( + list("2016-07-12 15:58:59"), + list("2016-07-12 16:00:45"), + list("2016-07-12 16:00:45", "2016-07-12 16:06:10", "2016-07-12 16:06:32") + ), + issues = network.covariates.test.build.expected( + list("2016-07-12 15:59:25"), + list("2013-05-25 20:02:08"), + list("2013-05-25 20:02:08", "2013-04-21 23:52:09", "2016-07-12 15:59:25") + ) + ) + ) + + ## convert date strings to POSIXct + expected.attributes = lapply(expected.attributes, function(types) { + lapply(types, function(times) { + lapply(times, function(date.list) { + lapply(date.list, function(date) { + as.POSIXct(date) + }) + }) + }) + }) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + attr = lapply(c("mails", "commits", "issues"), function(type) { + + networks.with.attr = add.vertex.attribute.first.activity( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level, activity.type = type + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "first.activity") + + expect_equal(expected.attributes[[level]][[type]], actual.attributes) + }) + }) +}) + +#' Test the add.vertex.attribute.active.ranges method +test_that("Test add.vertex.attribute.active.ranges", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = network.covariates.test.build.expected( + list(myranges[1]), list(myranges[2:3]), list(myranges[2:3], myranges[3], myranges[3]) + ) + + ## Test + + networks.with.attr = add.vertex.attribute.active.ranges( + networks.and.data[["networks"]], networks.and.data[["project.data"]] + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "active.ranges") + + expect_identical(expected.attributes, actual.attributes) +}) + +#' Test the add.vertex.attribute.author.role.simple method +test_that("Test add.vertex.attribute.author.role.simple", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks() + + expected.attributes = list( + range = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ) + ), + cumulative = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = network.covariates.test.build.expected( + list("core"), list("peripheral"), list("core", "core", "peripheral") + ) + ), + all.ranges = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = 
network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ) + ), + project.cumulative = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = network.covariates.test.build.expected( + list("core"), list("peripheral"), list("core", "core", "peripheral") + ) + ), + project.all.ranges = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ) + ), + complete = list( + commit.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ), + loc.count = network.covariates.test.build.expected( + list("core"), list("core"), list("core", "core", "peripheral") + ) + ) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + lapply(c("commit.count", "loc.count"), function(type) { + networks.with.attr = add.vertex.attribute.author.role.simple( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + type = type, aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "author.role") + + expect_identical(expected.attributes[[level]][[type]], actual.attributes) + }) + }) +}) + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Unit tests for artifact networks ---------------------------------------- + +#' Test the add.vertex.attribute.artifact.editor.count method +test_that("Test add.vertex.attribute.artifact.editor.count", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact") + + expected.attributes = network.covariates.test.build.expected(list(1L), list(1L), list(3L)) + + ## Test + + networks.with.attr = add.vertex.attribute.artifact.editor.count(networks.and.data[["networks"]], + networks.and.data[["project.data"]]) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "editor.count") + + expect_identical(expected.attributes, actual.attributes) +}) + +#' Test the add.vertex.attribute.artifact.first.occurrence method +test_that("Test add.vertex.attribute.artifact.first.occurrence", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact") + + expected.attributes = network.covariates.test.build.expected( + posixList("2016-07-12 15:58:59 UTC"), posixList("2016-07-12 16:00:45 UTC"), posixList("2016-07-12 16:05:41 UTC") + ) + + ## Test + + networks.with.attr = add.vertex.attribute.artifact.first.occurrence( + networks.and.data[["networks"]], networks.and.data[["project.data"]] + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "first.occurrence") + + expect_equal(expected.attributes, actual.attributes) +}) + +#' Test the add.vertex.attribute.artifact.change.count method +test_that("Test add.vertex.attribute.artifact.change.count", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact") + + expected.attributes = network.covariates.test.build.expected(list(1L), list(1L), list(3L)) + + ## Test + + networks.with.attr = add.vertex.attribute.artifact.change.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]] + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = 
"change.count") + + expect_identical(expected.attributes, actual.attributes) +}) diff --git a/tests/test-split.R b/tests/test-split.R index 2f2ac965..a48583aa 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -1,6 +1,7 @@ ## (c) Claus Hunsen, 2017 ## hunsen@fim.uni-passau.de - +## (c) Felix Prasse, 2017 +## prassefe@fim.uni-passau.de context("Splitting functionality.") @@ -1454,3 +1455,68 @@ test_that("Check and correct duplicate range names during network activity-based expect_identical(result, expected, info = "Removal of duplicate ranges.") }) + + +## +## Test splitting data by network names. +## +test_that("Test splitting data by networks", { + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("artifact.filter.base", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## construct project data + project.data = ProjectData$new(proj.conf) + + ## split data + mybins = as.POSIXct(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", "2016-10-05 09:00:00")) + input.data = split.data.time.based(project.data, bins = mybins) + input.data.network = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) + + ## split data by networks + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete") + results = lapply(aggregation.level, function(level) + split.data.by.networks(input.data.network, project.data, level) + ) + names(results) = aggregation.level + + ## construct expected ranges + expected.ranges = list( + range = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 16:00:00-2016-07-12 16:05:00", + "2016-07-12 16:05:00-2016-10-05 09:00:00"), + cumulative = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 15:00:00-2016-07-12 16:05:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + all.ranges = c("2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + project.cumulative = c("2004-10-09 18:38:13-2016-07-12 16:00:00", + "2004-10-09 18:38:13-2016-07-12 16:05:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + project.all.ranges = c("2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + complete = c("2004-10-09 18:38:13-2017-05-23 12:32:39", + "2004-10-09 18:38:13-2017-05-23 12:32:39", + "2004-10-09 18:38:13-2017-05-23 12:32:39") + ) + + ## test the ranges + test.each.network = function(aggregation.level) { + result.data = results[[aggregation.level]] + expected.range.names = expected.ranges[[aggregation.level]] + + lapply(seq_along(result.data), function(i) { + result.entry = result.data[[i]] + + expect_true(igraph::identical_graphs(result.entry[["network"]], input.data.network[[i]])) + expect_equal(result.entry[["data"]]$get.range(), expected.range.names[[i]]) + }) + } + lapply(aggregation.level, test.each.network) +}) diff --git a/util-conf.R b/util-conf.R index 549e21cf..a3fad277 100644 --- a/util-conf.R +++ b/util-conf.R @@ -679,7 +679,7 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, type = "character", allowed = c( # the date - "date", + "date", "date.offset", # author information "author.name", "author.email", # committer information diff --git a/util-core-peripheral.R b/util-core-peripheral.R index a66b0323..083a1ba8 100644 --- 
a/util-core-peripheral.R +++ b/util-core-peripheral.R @@ -6,12 +6,13 @@ ## mitchell.joblin@uni-passau.de ## (c) Sofie Kemper, 2017 ## kemperso@fim.uni-passau.de +## (c) Felix Prasse, 2017 +## prassefe@fim.uni-passau.de ## This file is derived from following Codeface script: ## https://github.com/siemens/codeface/blob/master/codeface/R/developer_classification.r - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Libraries --------------------------------------------------------------- @@ -451,6 +452,58 @@ get.commit.count.threshold = function(range.data) { return(threshold) } +#' Get the commit count per comitter in the given range data, where the committer +#' does not match the author of the respective commits +#' +#' @param range.data The data to count on +#' +#' @return A data frame in descending order by the commit count +get.committer.not.author.commit.count = function(range.data) { + logging::logdebug("get.committer.not.author.commit.count: starting.") + + ## Get commit data + commits.df = get.commit.data(range.data, columns = c("committer.name", "author.name"))[[1]] + + ## Return NA in case no commit data is available + if(all(is.na(commits.df))) { + return(NA) + } + + ## Execute a query to get the commit count per author + res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` + WHERE committer.name <> author.name + GROUP BY `committer.name`,`author.name` + ORDER BY `freq` DESC") + + logging::logdebug("get.committer.not.author.commit.count: finished.") + return(res) +} + +#' Get the commit count per comitter in the given range data, where the committer +#' may match the author of the respective commits +#' +#' @param range.data The data to count on +#' +#' @return A data frame in descending order by the commit count. +get.committer.commit.count = function(range.data) { + logging::logdebug("get.committer.commit.count: starting.") + + ## Get commit data + commits.df = get.commit.data(range.data, columns = c("committer.name", "committer.email"))[[1]] + + ## Return NA in case no commit data is available + if(all(is.na(commits.df))) { + return(NA) + } + + ## Execute a query to get the commit count per author + res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` + GROUP BY `committer.name` ORDER BY `freq` DESC") + + logging::logdebug("get.committer.commit.count: finished.") + return(res) +} + ## Get the commit count per author of the specified version range ## as a data frame ordered by the commit count. 
get.author.commit.count = function(range.data) { @@ -465,7 +518,8 @@ get.author.commit.count = function(range.data) { } ## Execute a query to get the commit count per author - res = sqldf::sqldf("select *, COUNT(*) as `freq` from `commits.df` group by `author.name` order by `freq` desc") + res = sqldf::sqldf("SELECT *, COUNT(*) AS `freq` FROM `commits.df` + GROUP BY `author.name` ORDER BY `freq` DESC") logging::logdebug("get.author.commit.count: finished.") return(res) @@ -508,8 +562,10 @@ get.author.loc.count = function(range.data) { logging::logdebug("get.author.loc.count: starting.") ## Get commit data - commits.df = get.commit.data(range.data, - columns = c("author.name", "author.email", "added.lines", "deleted.lines"))[[1]] + commits.df = get.commit.data( + range.data, + columns = c("author.name", "author.email", "added.lines", "deleted.lines") + )[[1]] ## Return NA in case no commit data is available if(all(is.na(commits.df))) { @@ -517,8 +573,10 @@ get.author.loc.count = function(range.data) { } ## Execute a query to get the changed lines per author - res = sqldf::sqldf("select `author.name`, `author.email`, SUM(`added.lines`) + SUM(`deleted.lines`) as `loc` - from `commits.df` group by `author.name` order by `loc` desc") + res = sqldf::sqldf("SELECT `author.name`, `author.email`, + SUM(`added.lines`) + SUM(`deleted.lines`) AS `loc` + FROM `commits.df` + GROUP BY `author.name` ORDER BY `loc` DESC") logging::logdebug("get.author.loc.count: finished.") return(res) @@ -565,8 +623,10 @@ get.author.class.activity = function(range.data = NULL, split = split) ## Build the query string to group commits by the author name - commits.query = "select `author.name`, SUM(`added.lines`) + SUM(`deleted.lines`) as `loc.count`, - COUNT(*) as `commit.count` from `commits.df` group by `author.name`" + commits.query = "SELECT `author.name`, SUM(`added.lines`) + SUM(`deleted.lines`) AS `loc.count`, + COUNT(*) AS `commit.count` + FROM `commits.df` + GROUP BY `author.name`" ## Get the authors with their commit count and corresponding class for each splitted range commits.dev.list = list() @@ -960,89 +1020,3 @@ get.threshold = function(data.list) { logging::logdebug("get.threshold: finished.") return(data.threshold) } - -## Get the commit data with the specified columns for the specified version range as a data frame -## for each specified split range. -## A split interval can be set by defining the number of weeks for each requested range as a vector. 
-get.commit.data = function(range.data, columns = c("author.name", "author.email"), split = c()) { - logging::logdebug("get.commit.data: starting.") - - ## Get commit data - commits.df = range.data$get.commits() - - ## In case no commit data is available, return NA - if(nrow(commits.df) == 0) { - return(NA) - } - - ## Make sure the hash is included in the cut columns vector for grouping - cut.columns = columns - if (!("hash" %in% cut.columns)) { - cut.columns = c(cut.columns, "hash") - } - - ## Make sure the date is included in the cut columns vector for splitting - if (!("date" %in% cut.columns)) { - cut.columns = c(cut.columns, "date") - } - - ## Cut down data to needed minimum - commits.df = commits.df[cut.columns] - - ## Group by hash to get a line per commit - commits.df = sqldf::sqldf("select * from `commits.df` group by `hash`") - - ## Remove hash column if not wanted as it now contains nonsensical data - if (!("hash" %in% columns)) { - commits.df["hash"] = NULL - } - - ## Order commits by date column - commits.df = commits.df[order(commits.df$date),] - - ## Fetch the date range info - date.first = as.Date(commits.df$date[1]) - date.last = as.Date(commits.df$date[nrow(commits.df)]) + 1 # +1 since findInterval is right-exclusive - - ## Calc the split dates depending on the specified intervals - date.split = c(date.last) - if (!is.null(split)) { - for (i in 1:length(split)) { - ## substract split[i] number of weeks (i.e., split[i] * 7 days) - ## TODO use lubridate package here to substract a week from POSIXct? - date.calc = date.split[i] - (split[i] * 7) - - ## Check if calculated date is still after the first commit date of the range - if (date.calc > date.first) { - date.split = c(date.split, date.calc) - } else { - date.split = c(date.split, date.first) - break - } - } - } else { - date.split = c(date.split, date.first) - } - - date.split = rev(date.split) - - ## Only keep the commits which were made within the specified split ranges - ## TODO https://github.com/se-passau/codeface-extraction-r/pull/51#discussion_r132924711 - commits.df = commits.df[as.Date(commits.df$date) >= date.split[1],] - - ## Calc group numbers for the commits by the split dates - intervals = findInterval(as.Date(commits.df[["date"]]), date.split, all.inside = FALSE) - - ## Remove date column if not wanted - if (!("date" %in% columns)) { - commits.df["date"] = NULL - } - - ## Split the commits by the calculated groups - res = split.data.by.bins(commits.df, intervals) - names(res) = construct.ranges(date.split) - attr(res, "bins") = date.split - - logging::logdebug("get.commit.data: finished.") - return(res) -} diff --git a/util-data.R b/util-data.R index 2c451900..dda7065d 100644 --- a/util-data.R +++ b/util-data.R @@ -145,7 +145,7 @@ ProjectData = R6::R6Class("ProjectData", logging::logdebug("filter.commits: finished.") }, - ## * * pasta data ------------------------------------------- + ## * * pasta data -------------------------------------------------- #' Add the pasta data to the given data.frame for further analysis. #' @@ -173,7 +173,7 @@ ProjectData = R6::R6Class("ProjectData", return(data) }, - ## * * timestamps ------------------------------------------- + ## * * timestamps -------------------------------------------------- #' Call the getters of the specified data sources in order to #' initialize the sources and extract the timestamps. 
@@ -565,6 +565,7 @@ ProjectData = R6::R6Class("ProjectData", #' @return the list of artifacts get.artifacts = function() { ## FIXME the artifacts determination should be dependent on the artifact.relation + ## (see also get.author2artifact) logging::loginfo("Getting artifact data.") ## if artifacts are not read already, do this @@ -611,20 +612,28 @@ ProjectData = R6::R6Class("ProjectData", } }, - ## * * data cutting ----------------------------------------- + ## * * data cutting ------------------------------------------------ - #' Get the timestamps (earliest and latest date) of the specified data sources. - #' If 'simple' is TRUE, return the overall latest start and earliest end date - #' in order to cut the specified data sources to the same date ranges. + #' Get the timestamps (earliest and latest date of activity) of the specified + #' data sources. #' - #' If there are no actual data available for a data source, the result indicates NA + #' If there are no data available for a data source, the result indicates NA. #' - #' @param data.sources the specified data sources - #' @param simple whether or not the timestamps get simplified + #' @param data.sources The specified data sources. One of \code{"mails"}, + #' \code{"commits"}, and \code{"issues"}. + #' @param simple If TRUE, return the overall latest start and earliest end date + #' across all data sources in a one-row data.frame; otherwise, return + #' the first and last activities of all data sources individually. + #' Can be overridden by \code{outermost}. + #' @param outermost Whether the very first and the very last activity across all data + #' sources is to be returned in a one-row data.frame; ignored otherwise. + #' This overrides any value given via \code{simple}. #' - #' @return a data.frame with the timestamps of each data source as columns "start" and "end", - #' with the data source as corresponding row name - get.data.timestamps = function(data.sources = c("mails", "commits", "issues"), simple = FALSE) { + #' @return A data.frame with the timestamps of each data source as columns "start" and "end", + #' possibly with the data source as corresponding row name + get.data.timestamps = function(data.sources = c("mails", "commits", "issues"), simple = FALSE, + outermost = FALSE) { + ## check arguments data.sources = match.arg(arg = data.sources, several.ok = TRUE) @@ -635,8 +644,14 @@ ProjectData = R6::R6Class("ProjectData", subset.timestamps = private$data.timestamps[data.sources, ] ## get the proper subset of timestamps for returning - if(simple) { - ## get minima and maxima across data sources (rows) + if (outermost) { + ## get minimum start date and maximum end date across data sources + timestamps = data.frame( + start = min(subset.timestamps[, "start"], na.rm = TRUE), + end = max(subset.timestamps[, "end"], na.rm = TRUE) + ) + } else if(simple) { + ## get maximum start date and minimum end date across data sources timestamps = data.frame( start = max(subset.timestamps[, "start"], na.rm = TRUE), end = min(subset.timestamps[, "end"], na.rm = TRUE) @@ -692,6 +707,8 @@ ProjectData = R6::R6Class("ProjectData", #' #' @return the list of artifacts for every author get.author2artifact = function() { + ## FIXME the artifacts determination should be dependent on the artifact.relation + ## (see also get.artifacts) logging::loginfo("Getting author--artifact data.") ## store the authors per artifact @@ -967,3 +984,99 @@ get.key.to.value.from.df = function(base.data, key, value, ...) 
{ return(mylist) } + +#' Get the commit data with the specified columns for the specified project-data instance +#' as a data frame for each specified split range. +#' +#' A split interval can be set by defining the number of weeks for each requested range as a vector. +#' +#' @param project.data The project data as source for the commits +#' @param columns The commit-data columns to select and return +#' @param split A list of numerics, indicating numbers of weeks into which the selected data +#' is to be split +#' +#' @return A data.frame indicating the selected \code{columns}, split into the given numbers +#' of weeks (\code{split}) +get.commit.data = function(project.data, columns = c("author.name", "author.email"), split = c()) { + logging::logdebug("get.commit.data: starting.") + + ## Get commit data + commits.df = project.data$get.commits() + + ## In case no commit data is available, return NA + if(nrow(commits.df) == 0) { + return(NA) + } + + ## Make sure the hash is included in the cut columns vector for grouping + cut.columns = columns + if (!("hash" %in% cut.columns)) { + cut.columns = c(cut.columns, "hash") + } + + ## Make sure the date is included in the cut columns vector for splitting + if (!("date" %in% cut.columns)) { + cut.columns = c(cut.columns, "date") + } + + ## Cut down data to needed minimum + commits.df = commits.df[cut.columns] + + ## Group by hash to get a line per commit + commits.df = sqldf::sqldf("SELECT * FROM `commits.df` GROUP BY `hash`") + + ## Remove hash column if not wanted as it now contains nonsensical data + if (!("hash" %in% columns)) { + commits.df["hash"] = NULL + } + + ## Order commits by date column + commits.df = commits.df[order(commits.df$date),] + + ## Fetch the date range info + date.first = as.Date(commits.df$date[1]) + date.last = as.Date(commits.df$date[nrow(commits.df)]) + 1 # +1 since findInterval is right-exclusive + + ## Calc the split dates depending on the specified intervals + date.split = c(date.last) + if (!is.null(split)) { + for (i in 1:length(split)) { + ## substract split[i] number of weeks (i.e., split[i] * 7 days) + ## TODO use lubridate package here to substract a week from POSIXct? 
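+            ## note: date.split holds Date objects, so subtracting an integer subtracts days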
+ date.calc = date.split[i] - (split[i] * 7) + + ## Check if calculated date is still after the first commit date of the range + if (date.calc > date.first) { + date.split = c(date.split, date.calc) + } else { + date.split = c(date.split, date.first) + break + } + } + } else { + date.split = c(date.split, date.first) + } + + date.split = rev(date.split) + + ## Only keep the commits which were made within the specified split ranges + ## TODO https://github.com/se-passau/codeface-extraction-r/pull/51#discussion_r132924711 + commits.df = commits.df[as.Date(commits.df$date) >= date.split[1],] + + ## Calc group numbers for the commits by the split dates + intervals = findInterval(as.Date(commits.df[["date"]]), date.split, all.inside = FALSE) + + ## Remove date column if not wanted + if (!("date" %in% columns)) { + commits.df["date"] = NULL + } + + ## Split the commits by the calculated groups + res = split.data.by.bins(commits.df, intervals) + names(res) = construct.ranges(date.split) + attr(res, "bins") = date.split + + logging::logdebug("get.commit.data: finished.") + return(res) +} + diff --git a/util-init.R b/util-init.R index 0ab03439..87d48ec9 100644 --- a/util-init.R +++ b/util-init.R @@ -24,3 +24,4 @@ source("util-bulk.R") source("util-plot.R") source("util-core-peripheral.R") source("util-networks-metrics.R") +source("util-networks-covariates.R") diff --git a/util-networks-covariates.R b/util-networks-covariates.R new file mode 100644 index 00000000..7c1648a4 --- /dev/null +++ b/util-networks-covariates.R @@ -0,0 +1,603 @@ +## (c) Felix Prasse, 2017 +## prassefe@fim.uni-passau.de + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Utility functions ------------------------------------------------------- + +#' Utility function to compute vertex attributes for a list of network +#' +#' Important: This function only works for lists of networks which have timestamps used in their range names. +#' +#' This method is a wrapper combining the steps of splitting the project data and calculating the attribute. +#' +#' @param list.of.networks The list of networks to add vertex attributes to +#' @param project.data The entire project data +#' @param attr.name The name of the attribute to add +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value +#' @param compute.attr The function to compute the attribute to add. Must return a named list +#' with the names being the name of the vertex. +#' +#' @return A list of networks with the added attribute +split.and.add.vertex.attribute = function(list.of.networks, project.data, attr.name, + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value, compute.attr) { + net.to.range.list = split.data.by.networks(list.of.networks, project.data, aggregation.level) + + nets.with.attr = add.vertex.attribute(net.to.range.list, attr.name, default.value, compute.attr) + return(nets.with.attr) +} + +#' Utility function to compute vertex attributes for a list of network-to-range tuples. +#' +#' @param net.to.range.list A list containing tuples with networks and corresponding range data. 
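+#'                          Each tuple is expected to be a list with the entries "network" and
+#'                          "data", as produced by \code{split.data.by.networks}.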
+#' @param attr.name The name of the attribute to add +#' @param default.value The default value to add if a vertex has no matching value +#' @param compute.attr The function to compute the attribute to add. Must return a named list +#' with the names being the name of the vertex. +#' +#' @return A list of networks with the added attribute +add.vertex.attribute = function(net.to.range.list, attr.name, default.value, compute.attr) { + + nets.with.attr = mapply( + names(net.to.range.list), net.to.range.list, + SIMPLIFY = FALSE, FUN = function(range, net.to.range) { + + current.network = net.to.range[["network"]] + range.data = net.to.range[["data"]] + + attr.df = compute.attr(range, range.data, current.network) + + get.or.default = function(name, data, default) { + if(name %in% names(data)) { + return(data[[name]]) + } else { + return(default) + } + } + + attributes = lapply(igraph::V(current.network)$name, + function(x) get.or.default(x, attr.df, default.value)) + + net.with.attr = igraph::set.vertex.attribute(current.network, attr.name, value = attributes) + + return(net.with.attr) + } + ) + + return (nets.with.attr) +} + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Author network functions ------------------------------------------------ + +## * Commit count ---------------------------------------------------------- + +#' Add commit-count attribute based on author name +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "commit.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.commit.count.author = function(list.of.networks, project.data, name = "commit.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0) { + nets.with.attr = add.vertex.attribute.commit.count.helper( + list.of.networks, project.data, name, aggregation.level, + default.value, get.author.commit.count, "author.name" + ) + + return(nets.with.attr) +} + +#' Add commit-count attribute based on author name where author is not committer +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "commit.count.author.not.committer"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
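+#'                          Roughly: "range" uses only each network's own time range,
+#'                          "cumulative" and "all.ranges" extend it across the other networks'
+#'                          ranges, the "project.*" variants additionally start at the project
+#'                          data's first activity, and "complete" covers the entire project data.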
+#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.commit.count.author.not.committer = function(list.of.networks, project.data, + name = "commit.count.author.not.committer", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0) { + nets.with.attr = add.vertex.attribute.commit.count.helper( + list.of.networks, project.data, name, aggregation.level, + default.value, get.committer.not.author.commit.count, "author.name" + ) + + return(nets.with.attr) +} + +#' Add commit-count attribute based on comitter name +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "commit.count.committer"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.commit.count.committer = function(list.of.networks, project.data, name = "commit.count.committer", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0) { + nets.with.attr = add.vertex.attribute.commit.count.helper( + list.of.networks, project.data, name, aggregation.level, + default.value, get.committer.commit.count, "committer.name" + ) + + return(nets.with.attr) +} + +#' Add commit-count attribute based on comitter name where committer is not author +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "commit.count.committer.not.author"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.commit.count.committer.not.author = function(list.of.networks, project.data, + name = "commit.count.committer.not.author", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0) { + nets.with.attr = add.vertex.attribute.commit.count.helper( + list.of.networks, project.data, name, aggregation.level, + default.value, get.committer.not.author.commit.count, "committer.name" + ) + + return(nets.with.attr) +} + +#' Add commit-count attribute based using \code{commit.count.method} +#' +#' Note: This is a helper function for all other functions adding a commit-count-related +#' vertex attribute. +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "commit.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. 
+#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' @param commit.count.method The method reference for counting the commits +#' @param name.column The name of the author or committer column +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.commit.count.helper = function(list.of.networks, project.data, name = "commit.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0, commit.count.method, name.column) { + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + commit.count.df = commit.count.method(range.data)[c(name.column, "freq")] + + if(!is.data.frame(commit.count.df)) { + return(list()) + } + + commit.count.list = structure(commit.count.df[["freq"]], names = commit.count.df[[name.column]]) + + return(commit.count.list) + } + ) + + return(nets.with.attr) +} + +## * Meta-data ------------------------------------------------------------- + +#' Add author email attribute +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "author.email"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.author.email = function(list.of.networks, project.data, name = "author.email", default.value = NA) { + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, "complete", default.value, + function(range, range.data, net) { + authors = range.data$get.authors() + author.to.mail = structure(names = authors[["author.name"]], + authors[["author.email"]]) + + return(author.to.mail) + } + ) + + return(nets.with.attr) +} + +#' Add unique artifact count attribute +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "artifact.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
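+#'                          The attribute then holds the number of distinct artifacts an author
+#'                          has changed within the data selected by this level.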
+#' @param default.value The default value to add if a vertex has no matching value [default: 0] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.artifact.count = function(list.of.networks, project.data, name = "artifact.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0) { + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + lapply(range.data$get.author2artifact(), function(x) { + length(unique(x[["artifact"]])) + }) + } + ) + + return(nets.with.attr) +} + +## * Activity -------------------------------------------------------------- + +#' Add first activity attribute +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param activity.type The kind of activity to use as basis. +#' One of \code{mails}, \code{commits}, and \code{issues}. +#' @param name The attribute name to add [default: "first.activity"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.first.activity = function(list.of.networks, project.data, + activity.type = c("mails", "commits", "issues"), + name = "first.activity", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = NA) { + activity.type = match.arg(activity.type) + function.suffix = substr(activity.type, 1, nchar(activity.type) - 1) + activity.type.function = paste0("get.author2", function.suffix) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + lapply(range.data[[activity.type.function]](), + function(x) min(x[["date"]]) + ) + } + ) + + return(nets.with.attr) +} + +#' Add active ranges attribute +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "active.ranges"] +#' @param default.value The default value to add if a vertex has no matching value [default: list()] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.active.ranges = function(list.of.networks, project.data, name = "active.ranges", + default.value = list()) { + net.to.range.list = split.data.by.networks(list.of.networks, project.data, "range") + + range.to.authors = lapply( + net.to.range.list, + function(net.to.range) { + ## FIXME support data-source-specific method AND all-sources method + ## (we only use 'commits' source here) + names(net.to.range[["data"]]$get.author2commit()) + } + ) + + author.names = unique(unlist(range.to.authors)) + + active.ranges = lapply( + author.names, + function(author) { + filter.by.author = Filter(function(range) author %in% range, + range.to.authors) + + active.ranges.of.author = names(filter.by.author) + + return(active.ranges.of.author) + } + ) + + names(active.ranges) = author.names + + nets.with.attr = add.vertex.attribute( + net.to.range.list, 
name, default.value,
+        function(range, range.data, net) {
+            active.ranges
+        }
+    )
+
+    return(nets.with.attr)
+}
+
+## * Role ------------------------------------------------------------------
+
+#' Add author role attribute, using the classification method
+#' \code{get.author.class.by.type} to provide the attribute values
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param name The attribute name to add [default: "author.role"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
+#' @param type The type of author classification. One of \code{"network.degree"},
+#'             \code{"network.eigen"}, \code{"commit.count"}, and \code{"loc.count"}.
+#' @param default.value The default value to add if a vertex has no matching value [default: NA]
+#'
+#' @return A list of networks with the added attribute
+add.vertex.attribute.author.role.simple = function(list.of.networks, project.data, name = "author.role",
+                                                   aggregation.level = c("range", "cumulative", "all.ranges",
+                                                                         "project.cumulative", "project.all.ranges",
+                                                                         "complete"),
+                                                   type = c("network.degree", "network.eigen",
+                                                            "commit.count", "loc.count"),
+                                                   default.value = NA) {
+
+    classification.function = function(network, range.data) {
+        classification = get.author.class.by.type(network, range.data, type)
+        return(classification)
+    }
+
+    nets.with.attr = add.vertex.attribute.author.role.function(
+        list.of.networks, project.data, classification.function, name,
+        aggregation.level, default.value
+    )
+
+    return(nets.with.attr)
+}
+
+#' Add author role attribute using a specified classification function
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param classification.function The author-classification function to apply. Must return a tuple
+#'                                of two lists containing the authors named "core" and "peripheral".
+#'                                See the functions \code{get.author.class.*}.
+#' @param name The attribute name to add [default: "author.role"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
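+#'                          The classification function is applied once per network, to the
+#'                          network and the range data selected by this level.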
+#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.author.role.function = function(list.of.networks, project.data, classification.function, + name = "author.role", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = NA) { + + net.to.range.list = split.data.by.networks(list.of.networks, project.data, aggregation.level) + + classification.results = lapply( + net.to.range.list, + function(net.to.range) { + author.class = classification.function(net.to.range[["network"]], net.to.range[["data"]]) + return(author.class) + } + ) + + nets.with.attr = add.vertex.attribute.author.role( + list.of.networks, project.data, classification.results, name, + aggregation.level, default.value + ) + + return(nets.with.attr) +} + +#' Add author role attribute by classification results +#' +#' Important: The lists \code{list.of.networks} and \code{classification.results} needs to be of the same +#' length for this to work properly. +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param classification.results A list of author-classification results. Each item needs to contain +#' a tuple of two lists containing the authors named "core" and "peripheral" +#' (see the functions \code{get.author.class.*}). +#' The list needs to be of the same length as \code{list.of.networks} and use +#' the same names. +#' @param name The attribute name to add [default: "author.role"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.author.role = function(list.of.networks, project.data, classification.results, + name = "author.role", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = NA) { + + if (length(list.of.networks) != length(classification.results) || + !identical(names(list.of.networks), names(classification.results))) { + logging::logwarn(paste("Adding author-classification vertex attribute: The classification", + "results do not match with the list of networks. 
+#' Add author role attribute by classification results
+#'
+#' Important: The lists \code{list.of.networks} and \code{classification.results} need to be of the same
+#' length for this to work properly.
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param classification.results A list of author-classification results. Each item needs to contain
+#'                               a tuple of two lists containing the authors, named "core" and "peripheral"
+#'                               (see the functions \code{get.author.class.*}).
+#'                               The list needs to be of the same length as \code{list.of.networks} and use
+#'                               the same names.
+#' @param name The attribute name to add [default: "author.role"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
+#' @param default.value The default value to add if a vertex has no matching value [default: NA]
+#'
+#' @return A list of networks with the added attribute
+add.vertex.attribute.author.role = function(list.of.networks, project.data, classification.results,
+                                            name = "author.role",
+                                            aggregation.level = c("range", "cumulative", "all.ranges",
+                                                                  "project.cumulative", "project.all.ranges",
+                                                                  "complete"),
+                                            default.value = NA) {
+
+    if (length(list.of.networks) != length(classification.results) ||
+        !identical(names(list.of.networks), names(classification.results))) {
+        logging::logwarn(paste("Adding author-classification vertex attribute: The classification",
+                               "results do not match with the list of networks. Please see the",
+                               "documentation of the function 'add.vertex.attribute.author.role'."))
+    }
+
+    nets.with.attr = split.and.add.vertex.attribute(
+        list.of.networks, project.data, name, aggregation.level, default.value,
+        function(range, range.data, net) {
+            classification = classification.results[[range]]
+            author.class = plyr::ldply(classification, .id = NA)
+
+            author.to.role = structure(author.class[[".id"]], names = author.class[["author.name"]])
+            return(author.to.role)
+        }
+    )
+
+    return(nets.with.attr)
+}
+
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Artifact network functions ----------------------------------------------
+
+## * Change count ----------------------------------------------------------
+
+#' Add the count of unique editors (i.e., authors) that worked on an artifact
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param name The attribute name to add [default: "editor.count"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
+#' @param default.value The default value to add if a vertex has no matching value [default: 0]
+#'
+#' @return A list of networks with the added attribute
+add.vertex.attribute.artifact.editor.count = function(list.of.networks, project.data, name = "editor.count",
+                                                       aggregation.level = c("range", "cumulative", "all.ranges",
+                                                                             "project.cumulative", "project.all.ranges",
+                                                                             "complete"),
+                                                       default.value = 0) {
+
+    nets.with.attr = split.and.add.vertex.attribute(
+        list.of.networks, project.data, name, aggregation.level, default.value,
+        function(range, range.data, net) {
+            lapply(range.data$get.artifact2author(), function(x) {
+                length(unique(x[["author.name"]]))
+            })
+        }
+    )
+
+    return(nets.with.attr)
+}
+
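+## Usage sketch (hypothetical, commented out): `my.artifact.networks` is assumed to be a named
+## list of artifact networks (one per range, e.g., built via `NetworkBuilder$get.artifact.network()`),
+## with `x.data` as the corresponding project data; the names are placeholders.
+#
+# my.artifact.networks.editors = add.vertex.attribute.artifact.editor.count(
+#     my.artifact.networks, x.data, aggregation.level = "range"
+# )
+# ## artifacts without matching data in a range keep the default value 0
+# igraph::vertex_attr(my.artifact.networks.editors[[1]], "editor.count")
+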
+#' Add the number of times an artifact was changed
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param name The attribute name to add [default: "change.count"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
+#' @param default.value The default value to add if a vertex has no matching value [default: 0]
+#'
+#' @return A list of networks with the added attribute
+add.vertex.attribute.artifact.change.count = function(list.of.networks, project.data, name = "change.count",
+                                                       aggregation.level = c("range", "cumulative", "all.ranges",
+                                                                             "project.cumulative", "project.all.ranges",
+                                                                             "complete"),
+                                                       default.value = 0) {
+    nets.with.attr = split.and.add.vertex.attribute(
+        list.of.networks, project.data, name, aggregation.level, default.value,
+        function(range, range.data, net) {
+            artifact.to.commit = get.key.to.value.from.df(range.data$get.commits.filtered.empty(), "artifact", "hash")
+            artifact.change.count = lapply(artifact.to.commit, function(x) {
+                length(unique(x[["hash"]]))
+            })
+
+            return(artifact.change.count)
+        }
+    )
+
+    return(nets.with.attr)
+}
+
+## * Activity --------------------------------------------------------------
+
+#' Add the date of the artifact's first occurrence
+#'
+#' @param list.of.networks The network list
+#' @param project.data The project data
+#' @param name The attribute name to add [default: "first.occurrence"]
+#' @param aggregation.level Determines the data to use for the attribute calculation.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See \code{split.data.by.networks} for
+#'                          more details.
+#' @param default.value The default value to add if a vertex has no matching value [default: NA]
+#'
+#' @return A list of networks with the added attribute
+add.vertex.attribute.artifact.first.occurrence = function(list.of.networks, project.data, name = "first.occurrence",
+                                                           aggregation.level = c("range", "cumulative", "all.ranges",
+                                                                                 "project.cumulative", "project.all.ranges",
+                                                                                 "complete"),
+                                                           default.value = NA) {
+    nets.with.attr = split.and.add.vertex.attribute(
+        list.of.networks, project.data, name, aggregation.level, default.value,
+        function(range, range.data, net) {
+            artifact.to.dates = get.key.to.value.from.df(range.data$get.commits.filtered.empty(), "artifact", "date")
+            artifact.to.first = lapply(artifact.to.dates, function(a) {
+                min(a[["date"]])
+            })
+            return(artifact.to.first)
+        }
+    )
+    return(nets.with.attr)
+}
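+
+## Usage sketch (hypothetical, commented out): further artifact attributes on the same placeholder
+## list `my.artifact.networks` as above; chaining works as long as the returned list keeps the
+## range names.
+#
+# my.artifact.networks.attr = add.vertex.attribute.artifact.change.count(
+#     my.artifact.networks, x.data, aggregation.level = "cumulative"
+# )
+# my.artifact.networks.attr = add.vertex.attribute.artifact.first.occurrence(
+#     my.artifact.networks.attr, x.data, aggregation.level = "complete"
+# )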
diff --git a/util-split.R b/util-split.R
index 76b644dd..248bc839 100644
--- a/util-split.R
+++ b/util-split.R
@@ -325,6 +325,119 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
     return(cf.data)
 }
 
+#' Map a list of networks to their corresponding range data, after splitting the
+#' given project data (\code{project.data}) to the time ranges given by the networks'
+#' names. The splitting can be configured in more detail with the parameter
+#' \code{aggregation.level}; see below for more details.
+#'
+#' For this function to work properly, the list of networks needs to be named with
+#' timestamp ranges, which can be split into their bounds using \code{get.range.bounds}.
+#' The easiest way to achieve this is to use one of the \code{split.*} functions in this file.
+#' For example, the time ranges have a format like this:
+#' "2017-01-01 23:57:01-2017-02-15 12:19:37", which can be split by the utility
+#' function \code{get.range.bounds}, obtaining the range bounds as timestamps.
+#'
+#' Using the different aggregation levels given by the parameter \code{aggregation.level},
+#' it is possible to configure the exact treatment of range bounds and, thus, the
+#' splitting of the given project data. The various aggregation levels work as follows:
+#' - \code{"range"}: The project data will be split exactly to the time ranges specified
+#'                   by the networks' names.
+#' - \code{"cumulative"}: The project data will be split exactly to the time ranges
+#'                   specified by the networks' names, but in a cumulative manner.
+#' - \code{"all.ranges"}: The project data will be split exactly to the time range
+#'                   specified by the start of the first network and the end of the last
+#'                   network. All data instances will contain the same data.
+#' - \code{"project.cumulative"}: The same splitting as for \code{"cumulative"}, but all
+#'                   data will start at the beginning of the project data and *not* at
+#'                   the beginning of the first network.
+#' - \code{"project.all.ranges"}: The same splitting as for \code{"all.ranges"}, but all
+#'                   data will start at the beginning of the project data and *not* at
+#'                   the beginning of the first network. All data instances will contain
+#'                   the same data.
+#' - \code{"complete"}: The same splitting as for \code{"all.ranges"}, but all data will
+#'                   start at the beginning of the project data and end at the end of
+#'                   the project data. All data instances will contain the same data.
+#'
+#' @param list.of.networks The network list
+#' @param project.data The entire project data
+#' @param aggregation.level Determines how the project data is split for each network.
+#'                          One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"},
+#'                          \code{"project.cumulative"}, \code{"project.all.ranges"}, and
+#'                          \code{"complete"}. See above for more details.
+#'
+#' @return A list containing tuples with the keys "network" and "data", where, under "network", are
+#'         the respective networks passed via \code{list.of.networks} and, under "data", are the
+#'         split data instances of type \code{RangeData}.
+split.data.by.networks = function(list.of.networks, project.data,
+                                  aggregation.level = c("range", "cumulative", "all.ranges",
+                                                        "project.cumulative", "project.all.ranges",
+                                                        "complete")) {
+    ## get the chosen aggregation level
+    aggregation.level = match.arg(aggregation.level)
+
+    ## get the timestamp data from the project data (needed for some aggregation levels)
+    project.timestamps = project.data$get.data.timestamps(outermost = TRUE)
+
+    ## loop over all ranges and split the data for each range accordingly:
+    list.of.ranges = names(list.of.networks)
+    list.of.range.bounds = lapply(list.of.ranges, get.range.bounds)
+    net.to.range.list = lapply(list.of.ranges, function(range) {
+        ## 1) get the range bounds to work with
+        start.end = get.range.bounds(range)
+
+        ## 2) adjust the range bounds for the respective aggregation levels
+        ## (if nothing else is stated below, the respective range bounds stay unchanged)
+        switch(aggregation.level,
+
+            range = {
+                ## use the exact range bounds
+            },
+            cumulative = {
+                ## the start is always at the first network's start bound
+                start.end[1] = list.of.range.bounds[[1]][1]
+            },
+            all.ranges = {
+                ## the start is always at the first network's start bound
+                start.end[1] = list.of.range.bounds[[1]][1]
+                ## the end is always at the last network's ending bound
+                start.end[2] = list.of.range.bounds[[length(list.of.ranges)]][2]
+            },
+            project.cumulative = {
+                ## the start is always at the project data's start
+                start.end[1] = project.timestamps[["start"]]
+            },
+            project.all.ranges = {
+                ## the start is always at the project data's start
+                start.end[1] = project.timestamps[["start"]]
+                ## the end is always at the last network's ending bound
+                start.end[2] = list.of.range.bounds[[length(list.of.ranges)]][2]
+            },
+            complete = {
+                ## the start is always at the project data's start
+                start.end[1] = project.timestamps[["start"]]
+                ## the end is always at the project data's end
+                start.end[2] = project.timestamps[["end"]]
+            }
+        )
+
+        ## 3) split the data to the ranges
+        range.data = split.data.time.based(project.data, bins = start.end, sliding.window = FALSE)[[1]]
+
+        ## 4) construct return value
+        net.to.range.entry = list(
+            "network" = list.of.networks[[range]],
+            "data" = range.data
+        )
+
+        return(net.to.range.entry)
+    })
+
+    ## properly set names for the result list
+    names(net.to.range.list) = names(list.of.networks)
+
+    return(net.to.range.list)
+}
+
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Split networks ----------------------------------------------------------
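+## Usage sketch (hypothetical, commented out) for `split.data.by.networks` defined above:
+## `my.networks` is assumed to be a named list of range networks and `x.data` the
+## corresponding project data; both names are placeholders.
+#
+# net.to.range.list = split.data.by.networks(my.networks, x.data, aggregation.level = "cumulative")
+# entry = net.to.range.list[[1]]
+# entry[["network"]]  ## the igraph network passed in via `my.networks`
+# entry[["data"]]     ## the RangeData instance covering the first (cumulative) range
+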