-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into LANTERN-782-curemd-webscraper
- Loading branch information
Showing
7 changed files
with
248 additions
and
5 deletions.
There are no files selected for viewing
34 changes: 34 additions & 0 deletions
34
endpointmanager/pkg/chplendpointquerier/betaAfoundriaWebScraper.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package chplendpointquerier | ||
|
||
import ( | ||
"github.com/PuerkitoBio/goquery" | ||
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
func BetaAfoundriaWebScraper(CHPLURL string, fileToWriteTo string) { | ||
|
||
var lanternEntryList []LanternEntry | ||
var endpointEntryList EndpointList | ||
|
||
doc, err := helpers.ChromedpQueryEndpointList(CHPLURL, ".container") | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
doc.Find(".container h3").Each(func(index int, header *goquery.Selection) { | ||
nextAnchor := header.NextFiltered("a") | ||
href, exists := nextAnchor.Attr("href") | ||
if exists { | ||
var entry LanternEntry | ||
entry.URL = href | ||
lanternEntryList = append(lanternEntryList, entry) | ||
} | ||
}) | ||
|
||
endpointEntryList.Endpoints = lanternEntryList | ||
err = WriteCHPLFile(endpointEntryList, fileToWriteTo) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
endpointmanager/pkg/chplendpointquerier/customcsvparser.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package chplendpointquerier | ||
|
||
import ( | ||
"encoding/csv" | ||
"io" | ||
"os" | ||
"strings" | ||
|
||
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
// CustomCSVParser reads a CSV from a URL or file and writes processed data to an output file. | ||
// Parameters: | ||
// - inputSource: URL or file path of the input CSV. | ||
// - fileToWriteTo: File path to write the processed data. | ||
// - csvFilePath: Temporary file path for storing the downloaded CSV (if applicable). | ||
// - numrecords: Number of records to process (-1 for all records). | ||
// - startrecord: Starting index of records to process. | ||
// - header: Boolean indicating if the CSV has a header to skip. | ||
// - urlIndex: Column index where the URL is located. | ||
// - organizationIndex: Column index where the organization name is located. | ||
func CustomCSVParser(inputSource string, fileToWriteTo string, csvFilePath string, numrecords int, startrecord int, header bool, urlIndex int, organizationIndex int) { | ||
var lanternEntryList []LanternEntry | ||
var endpointEntryList EndpointList | ||
|
||
var csvReader *csv.Reader | ||
var file *os.File | ||
var err error | ||
if strings.HasPrefix(inputSource, "http://") || strings.HasPrefix(inputSource, "https://") { | ||
csvReader, file, err = helpers.QueryAndOpenCSV(inputSource, csvFilePath, header) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
defer file.Close() | ||
} else { | ||
file, err = os.Open(inputSource) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
defer file.Close() | ||
|
||
csvReader = csv.NewReader(file) | ||
if header { | ||
_, err := csvReader.Read() | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
} | ||
} | ||
|
||
records := 0 | ||
for { | ||
rec, err := csvReader.Read() | ||
if err == io.EOF { | ||
break | ||
} | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
if numrecords >= 0 && records >= numrecords+startrecord { | ||
break | ||
} | ||
if records >= startrecord { | ||
var entry LanternEntry | ||
|
||
organizationName := "" | ||
if organizationIndex >= 0 { | ||
organizationName = strings.TrimSpace(rec[organizationIndex]) | ||
} | ||
|
||
if !strings.Contains(strings.ToLower(organizationName), "auth") { | ||
|
||
URL := strings.TrimSpace(rec[urlIndex]) | ||
URL = strings.Replace(URL, "/metadata", "", 1) | ||
|
||
entry.OrganizationName = organizationName | ||
entry.URL = URL | ||
|
||
lanternEntryList = append(lanternEntryList, entry) | ||
|
||
} | ||
} | ||
|
||
records++ | ||
} | ||
|
||
endpointEntryList.Endpoints = lanternEntryList | ||
|
||
err = WriteCHPLFile(endpointEntryList, fileToWriteTo) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
if _, err := os.Stat(csvFilePath); err == nil { | ||
err = os.Remove(csvFilePath) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
} | ||
} |
41 changes: 41 additions & 0 deletions
41
endpointmanager/pkg/chplendpointquerier/mdlandwebscraper.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package chplendpointquerier | ||
|
||
import ( | ||
"strings" | ||
|
||
"github.com/PuerkitoBio/goquery" | ||
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
func MdlandWebscraper(chplURL string, fileToWriteTo string) { | ||
|
||
var lanternEntryList []LanternEntry | ||
var endpointEntryList EndpointList | ||
|
||
doc, err := helpers.ChromedpQueryEndpointList(chplURL, ".MsoNormal") | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
doc.Find("span").Each(func(index int, spanElem *goquery.Selection) { | ||
if strings.Contains(spanElem.Text(), "https://") && strings.Contains(spanElem.Text(), "metadata") { | ||
str := spanElem.Text() | ||
str = strings.Replace(str, "GET ", "", -1) | ||
str = strings.Replace(str, "/metadata", "", -1) | ||
str = strings.Replace(str, "\nHTTP/1.1", "", -1) | ||
|
||
var entry LanternEntry | ||
entry.URL = str | ||
lanternEntryList = append(lanternEntryList, entry) | ||
} | ||
}) | ||
|
||
endpointEntryList.Endpoints = lanternEntryList | ||
|
||
err = WriteCHPLFile(endpointEntryList, fileToWriteTo) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
36 changes: 36 additions & 0 deletions
36
endpointmanager/pkg/chplendpointquerier/ontadawebscraper.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package chplendpointquerier | ||
|
||
import ( | ||
"strings" | ||
|
||
"github.com/onc-healthit/lantern-back-end/endpointmanager/pkg/helpers" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
func OntadaWebscraper(chplURL string, fileToWriteTo string) { | ||
|
||
var lanternEntryList []LanternEntry | ||
var endpointEntryList EndpointList | ||
var entry LanternEntry | ||
|
||
doc, err := helpers.ChromedpQueryEndpointList(chplURL, ".sc-dTSzeu.dfUAUz") | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
divElem := doc.Find(".sc-dTSzeu.dfUAUz").First() | ||
spanElem := divElem.Find("span").First() | ||
|
||
entryURL := strings.TrimSpace(spanElem.Text()) | ||
entry.URL = entryURL | ||
|
||
lanternEntryList = append(lanternEntryList, entry) | ||
|
||
endpointEntryList.Endpoints = lanternEntryList | ||
|
||
err = WriteCHPLFile(endpointEntryList, fileToWriteTo) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
} |