Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working version, needs improvement [WIP] #39

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
55 changes: 55 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
"gopkg.in/src-d/enry.v1"
)

var (
Expand Down Expand Up @@ -157,3 +158,57 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
func IsLicenseDirectory(fileName string) bool {
	// The name is lower-cased first, so the pattern is effectively matched
	// case-insensitively against the original file name.
	lowered := strings.ToLower(fileName)
	return licenseDirectoryRe.MatchString(lowered)
}

// ExtractSourceFiles searches for source code files and returns their header comments, when available.
// Enry is used to select the files worth reading: a file is considered only when
// enry.GetLanguageByExtension reports a "safe" language guess for its name.
//
// Each selected file is read through fs; files that fail to read are skipped silently.
// When filePreprocessors has an entry for the file's extension, the content is passed
// through it. The collected contents are then handed to ExtractHeaderComments together
// with the detected languages, and that result is returned. When nothing was collected,
// an empty, non-nil slice is returned.
func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
candidates := [][]byte{}
langs := []string{}
for _, file := range files {
lang, safe := enry.GetLanguageByExtension(file)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use GetLanguage which is more accurate.

if safe == true {
// NOTE(review): the language is recorded before the read is attempted, so a
// failed read leaves langs one entry longer than candidates and the two
// slices no longer line up by index — confirm whether ExtractHeaderComments
// relies on that pairing.
langs = append(langs, lang)
text, err := fs.ReadFile(file)
if err == nil {
// Apply the extension-specific preprocessor, if one is registered.
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
}
}
}
// Only invoke the header extraction when at least one file was read.
if len(candidates) > 0 {
candidates = ExtractHeaderComments(candidates, langs)
}
return candidates
}

// ExtractHeaderComments is meant to search source code files for header
// comments and output the license text found in them.
//
// It is currently a placeholder: candidates and lang are ignored, and an
// empty, non-nil slice is always returned.
func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte {
	// TODO: split code from comments, preferably only header comments.
	return [][]byte{}
}

// InvestigateHeaderComments runs InvestigateHeaderComment on every text and
// merges the results, keeping for each license name the highest similarity
// seen across all texts. Names whose similarity never exceeds zero are left
// out of the result.
//
// The fs argument is accepted but not used by this function.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
	// TODO: split license-comments from description-comments.
	best := make(map[string]float32)
	for i := range texts {
		for license, score := range InvestigateHeaderComment(texts[i]) {
			// A missing key reads as 0, so only strictly positive scores are stored.
			if previous := best[license]; score > previous {
				best[license] = score
			}
		}
	}
	return best
}

// InvestigateHeaderComment converts a single header comment to a string,
// passes it to QueryLicenseText on the global license database, and returns
// that call's result unchanged.
// NOTE(review): presumably the map is license name -> similarity, as the
// caller treats it; confirm against QueryLicenseText.
func InvestigateHeaderComment(text []byte) map[string]float32 {
	db := globalLicenseDatabase()
	return db.QueryLicenseText(string(text))
}
13 changes: 10 additions & 3 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,17 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
}
// Plan B: take the README, find the section about the license and apply NER
candidates = internal.ExtractReadmeFiles(fileNames, fs)
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
if len(candidates) > 0 {
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) > 0 {
return licenses, nil
}
}
// Plan C: look for license texts in the header comments of source code files
candidates = internal.ExtractSourceFiles(fileNames, fs)
if len(candidates) > 0 {
licenses = internal.InvestigateHeaderComments(candidates, fs)
}
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
}
Expand Down