-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Split crawling and downloading - Added support for bookmarks and channels - Added more options for user - Better folder structure - Code cleanup
- Loading branch information
Showing
5 changed files
with
421 additions
and
174 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
namespace CoubDownloader
{
    /// <summary>
    /// Directory and file names shared by the crawling and downloading steps.
    /// </summary>
    public static class Constants
    {
        /// <summary>Name of the directory that holds the crawled information (URL lists, metadata).</summary>
        public const string CoubInfoDir = "Coubs-info";

        /// <summary>Name of the directory that holds the downloaded coubs.</summary>
        public const string CoubDataDir = "Coubs";

        /// <summary>File name of the per-category list of coub URLs.</summary>
        public const string UrlListFileName = "url_list.txt";

        /// <summary>File name of the per-category metadata listing.</summary>
        public const string MetaDataFileName = "metadata.txt";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
using System; | ||
using System.Net; | ||
using System.Text; | ||
using Newtonsoft.Json.Linq; | ||
using System.Linq; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
|
||
namespace CoubDownloader
{
    /// <summary>
    /// Crawls the coub.com timeline API and stores, for every requested category,
    /// a list of coub URLs and a metadata file in a sub-directory of <see cref="InfoPath"/>.
    /// </summary>
    public class Crawler
    {
        private const string LikedCategory = "liked";
        private const string BookmarksCategory = "bookmarks";

        // "{0}" is the page-number placeholder filled in by string.Format for every request.
        private const string LikedUrl =
            "https://coub.com/api/v2/timeline/likes?all=true&order_by=date&page={0}&per_page=25";
        private const string BookmarksUrl =
            "https://coub.com/api/v2/timeline/favourites?all=true&order_by=date&page={0}&per_page=25";
        private const string ChannelUrlPrefix = "https://coub.com/api/v2/timeline/channel/";
        private const string CoubViewUrlPrefix = "https://coub.com/view/";

        // Token entered by the user; cached so the prompt is shown only once per run.
        private string _usersAccessToken;

        /// <summary>
        /// Directory (below the current working directory) that receives the crawled information.
        /// </summary>
        public string InfoPath => Path.Combine(Environment.CurrentDirectory, Constants.CoubInfoDir);

        /// <summary>
        /// Crawls every given category. "liked" and "bookmarks" are the user-specific
        /// categories; any other value is treated as a channel name.
        /// </summary>
        /// <param name="categories">Categories to crawl. A null array or blank entries are ignored.</param>
        public void CrawlUrls(string[] categories)
        {
            if (categories == null)
            {
                return;
            }

            foreach (var category in categories)
            {
                if (string.IsNullOrWhiteSpace(category))
                {
                    // A blank entry can neither name a directory nor a channel.
                    continue;
                }

                switch (category)
                {
                    case LikedCategory:
                        DownloadLikedCoubs();
                        break;
                    case BookmarksCategory:
                        DownloadBookmarkedCoubs();
                        break;
                    default:
                        DownloadChannelCoubs(category);
                        break;
                }
            }
        }

        /// <summary>
        /// Crawls all coubs of a channel unless a URL list for it already exists.
        /// Errors raised while crawling are not caught here and propagate to the caller.
        /// </summary>
        /// <param name="channel">Channel name; also used as the name of the output sub-directory.</param>
        /// <exception cref="ArgumentNullException"><paramref name="channel"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="channel"/> is empty or whitespace.</exception>
        public void DownloadChannelCoubs(string channel)
        {
            if (channel == null)
            {
                throw new ArgumentNullException(nameof(channel));
            }

            if (string.IsNullOrWhiteSpace(channel))
            {
                throw new ArgumentException("Channel name must not be empty.", nameof(channel));
            }

            if (AlreadyDownloaded(channel))
            {
                return;
            }

            // Escape the name: it becomes part of the URL path and the URL is later used as a
            // string.Format template, so reserved characters and braces must not leak through.
            var url = ChannelUrlPrefix + Uri.EscapeDataString(channel) + "?page={0}&per_page=25";
            DownloadLinks(url, channel);
        }

        /// <summary>
        /// Crawls the coubs liked by the user. Requires an access token, which is requested
        /// on the console when none has been entered yet.
        /// </summary>
        public void DownloadLikedCoubs()
        {
            DownloadUserCategory(LikedCategory, LikedUrl);
        }

        /// <summary>
        /// Crawls the coubs bookmarked by the user. Requires an access token, which is requested
        /// on the console when none has been entered yet.
        /// </summary>
        public void DownloadBookmarkedCoubs()
        {
            DownloadUserCategory(BookmarksCategory, BookmarksUrl);
        }

        /// <summary>
        /// Shared implementation for the token-protected categories: skips the work when a URL list
        /// exists, obtains the token, crawls, and reports (instead of propagating) any error.
        /// </summary>
        private void DownloadUserCategory(string dir, string url)
        {
            if (AlreadyDownloaded(dir))
            {
                return;
            }

            var token = GetAccessToken();

            if (string.IsNullOrWhiteSpace(token))
            {
                Console.Error.WriteLine("Invalid token! Valid token is necessary to download user-specific categories "
                                        + "like 'liked' coubs. Skipping download of " + dir);
                return;
            }

            try
            {
                DownloadLinks(url, dir, token);
            }
            catch (Exception ex) when (IsAccessDenied(ex))
            {
                Console.Error.WriteLine("Invalid access token!");

                // Forget the rejected token so the next user-specific category asks again
                // instead of silently re-using a token that is known not to work.
                _usersAccessToken = null;
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine("Unexpected error: " + ex);
            }
        }

        /// <summary>
        /// Tells whether an exception means the server rejected the credentials: an HTTP 401/403
        /// response, or (as before) any error whose message mentions "Forbidden".
        /// </summary>
        private static bool IsAccessDenied(Exception ex)
        {
            var response = (ex as WebException)?.Response as HttpWebResponse;
            if (response != null
                && (response.StatusCode == HttpStatusCode.Forbidden
                    || response.StatusCode == HttpStatusCode.Unauthorized))
            {
                return true;
            }

            return ex.Message.Contains("Forbidden");
        }

        /// <summary>Returns the path of the URL list file for the given category directory.</summary>
        private string GetUrlListLocation(string dir)
        {
            return Path.Combine(Path.Combine(InfoPath, dir), Constants.UrlListFileName);
        }

        /// <summary>Returns the path of the metadata file for the given category directory.</summary>
        private string GetMetaDataLocation(string dir)
        {
            return Path.Combine(Path.Combine(InfoPath, dir), Constants.MetaDataFileName);
        }

        /// <summary>
        /// Checks whether a URL list already exists for the category; if so, announces on the
        /// console that crawling is skipped.
        /// </summary>
        private bool AlreadyDownloaded(string dir)
        {
            if (!File.Exists(GetUrlListLocation(dir)))
            {
                return false;
            }

            Console.WriteLine($"URL list for '{dir}' found! Skipping crawling.");
            return true;
        }

        /// <summary>
        /// Returns the cached access token, or prompts for one on the console and caches the answer.
        /// A leading "remember_token=" (as copied from a cookie) is stripped.
        /// </summary>
        /// <returns>The token; may be null or empty when nothing usable was entered.</returns>
        private string GetAccessToken()
        {
            if (!string.IsNullOrWhiteSpace(_usersAccessToken))
            {
                // If user already passed token, return it
                return _usersAccessToken;
            }

            Console.WriteLine("Write/paste your access token. Read README if you don't know how to get it");
            Console.Write("Access Token: ");

            // ReadLine returns null when the input stream is closed, hence the null-conditional.
            var token = Console.ReadLine()?.Replace("remember_token=", "").Trim();

            _usersAccessToken = token;
            return token;
        }

        /// <summary>
        /// Crawls all pages of <paramref name="baseUrl"/> and writes the URL list and the metadata
        /// file of the category into its sub-directory of <see cref="InfoPath"/>.
        /// </summary>
        /// <param name="baseUrl">Request URL containing a "{0}" placeholder for the page number.</param>
        /// <param name="dir">Category name, used as sub-directory name.</param>
        /// <param name="token">Optional access token sent as "remember_token" cookie.</param>
        private void DownloadLinks(string baseUrl, string dir, string token = null)
        {
            Console.WriteLine($"Starting gathering links for '{dir}'...");
            var links = GetLinks(baseUrl, 1, token);

            var metaDataPath = GetMetaDataLocation(dir);
            var urlsPath = GetUrlListLocation(dir);

            // CreateDirectory creates missing parents and does nothing when the directory exists.
            Directory.CreateDirectory(Path.Combine(InfoPath, dir));

            Console.WriteLine("Writing crawled URLs...");
            var urlLinks = string.Join("\n", links.Select(x => x.Link));
            File.WriteAllText(urlsPath, urlLinks, Encoding.UTF8);

            Console.WriteLine("Saving metadata details...");
            var formattedLinks = string.Join("\n", links.Select(x => x.FormattedData));
            File.WriteAllText(metaDataPath, formattedLinks, Encoding.UTF8);
        }

        /// <summary>
        /// Requests one page after another, starting at <paramref name="page"/>, and collects link and
        /// metadata line of every coub found, until the reported number of pages is reached.
        /// </summary>
        /// <param name="baseUrl">Request URL containing a "{0}" placeholder for the page number.</param>
        /// <param name="page">Number of the first page to request.</param>
        /// <param name="token">Optional access token; null or blank sends no cookie.</param>
        /// <returns>One entry per crawled coub, in the order delivered by the server.</returns>
        private static List<CoubDownloadResult> GetLinks(string baseUrl, int page, string token)
        {
            var downloadedData = new List<CoubDownloadResult>();

            // Iterating (rather than recursing once per page) keeps the stack flat for long timelines.
            while (true)
            {
                var data = FetchPage(baseUrl, page, token);

                var coubs = data["coubs"];
                if (coubs != null)
                {
                    foreach (var coub in coubs)
                    {
                        var tags = new List<string>();
                        var tagsEncoded = new List<string>();

                        var coubTags = coub["tags"];
                        if (coubTags != null)
                        {
                            foreach (var tag in coubTags)
                            {
                                tagsEncoded.Add(tag["value"]?.ToString() ?? string.Empty);
                                tags.Add(tag["title"]?.ToString() ?? string.Empty);
                            }
                        }

                        var link = CoubViewUrlPrefix + coub["permalink"]?.ToString();

                        // The metadata file holds one coub per line, so line breaks are removed from
                        // the whole entry - title and tag titles included, not only the encoded tags.
                        var formatted = RemoveLineBreaks(string.Join(" | ",
                            link,
                            coub["title"]?.ToString(),
                            string.Join(",", tags),
                            string.Join(",", tagsEncoded)));

                        downloadedData.Add(new CoubDownloadResult
                        {
                            Link = link,
                            FormattedData = formatted
                        });
                    }
                }

                var totalPages = (int?)data["total_pages"] ?? 0;
                var currentPage = (int?)data["page"] ?? page;

                // ">=" instead of "!=": an empty timeline (zero pages) or an overshooting page number
                // must end the crawl as well; checking the requested page guarantees termination.
                if (currentPage >= totalPages || page >= totalPages)
                {
                    Console.WriteLine("Reached last page... gathering results...");
                    break;
                }

                Console.WriteLine($"Crawling page {currentPage} out of {totalPages}");
                page++;
            }

            return downloadedData;
        }

        /// <summary>
        /// Downloads a single page and parses the response body as JSON. Response, stream and reader
        /// are disposed before returning.
        /// </summary>
        private static JObject FetchPage(string baseUrl, int page, string token)
        {
            var request = (HttpWebRequest)WebRequest.Create(string.Format(baseUrl, page));

            if (!string.IsNullOrWhiteSpace(token))
            {
                // Add cookies header with access token to retrieve user-specific data
                request.Headers["Cookie"] = $"remember_token={token}";
            }

            using (var response = (HttpWebResponse)request.GetResponse())
            using (var stream = response.GetResponseStream())
            using (var reader = new StreamReader(stream, Encoding.UTF8))
            {
                return JObject.Parse(reader.ReadToEnd());
            }
        }

        /// <summary>Removes all carriage-return and line-feed characters from the text.</summary>
        private static string RemoveLineBreaks(string text)
        {
            return text.Replace("\r", "").Replace("\n", "");
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
using System; | ||
using System.Net; | ||
using System.Text; | ||
using Newtonsoft.Json.Linq; | ||
using System.Linq; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
|
||
namespace CoubDownloader
{
    /// <summary>
    /// Downloads the coubs of every crawled category by running the external
    /// <c>coub_v2.py</c> script on the category's URL list.
    /// </summary>
    public class Downloader
    {
        // Root directory of all downloads, resolved against the working directory at construction time.
        private readonly string _coubsDir = Path.Combine(Environment.CurrentDirectory, Constants.CoubDataDir);

        /// <summary>
        /// Downloads every category found below <paramref name="path"/>. Each sub-directory is one
        /// category; sub-directories without a URL list are skipped with a console message.
        /// </summary>
        /// <param name="path">Directory containing one sub-directory per crawled category.</param>
        public void DownloadCoubs(string path)
        {
            // The directory does not exist when nothing has been crawled successfully;
            // Directory.GetDirectories would throw in that case.
            if (!Directory.Exists(path))
            {
                Console.Error.WriteLine($"Directory '{path}' not found, nothing to download.");
                return;
            }

            foreach (var directory in Directory.GetDirectories(path))
            {
                var dir = new DirectoryInfo(directory).Name;
                if (!HasUrlList(path, dir))
                {
                    Console.WriteLine($"No URL list for category '{dir}' found, skipping download...");
                    continue;
                }

                DownloadCoubsCategory(dir);
            }
        }

        /// <summary>Checks whether the category directory below <paramref name="path"/> has a URL list file.</summary>
        private bool HasUrlList(string path, string dir)
        {
            var filename = Path.Combine(Path.Combine(path, dir), Constants.UrlListFileName);
            return File.Exists(filename);
        }

        /// <summary>
        /// Prepares the target directories and runs <c>coub_v2.py</c> for one category.
        /// Any exception is reported on the error stream instead of being propagated.
        /// </summary>
        /// <param name="dir">Category (directory) name.</param>
        private void DownloadCoubsCategory(string dir)
        {
            Console.WriteLine($"Starting download of '{dir}'...");

            EnsureInput(dir);

            // Both paths are relative to the working directory the command is started in.
            var urlList = $"{Constants.CoubInfoDir}\\{dir}\\{Constants.UrlListFileName}";
            var outputTemplate = $"{Constants.CoubDataDir}\\{dir}\\%id%_%title%";

            try
            {
                // Both paths are quoted so that category names containing spaces
                // do not split the argument.
                Run.RunCommand(
                    $"python.exe -X utf8 coub_v2.py -l \"{urlList}\" -o \"{outputTemplate}\"",
                    Environment.CurrentDirectory);

                Console.WriteLine("DONE");
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine("Fatal error: " + ex);
            }
        }

        /// <summary>Returns the directory the coubs of the given category are downloaded to.</summary>
        private string GetDownloadLocation(string dir)
        {
            return Path.Combine(_coubsDir, dir);
        }

        /// <summary>
        /// Removes a stale temporary list file and makes sure the download directory of the
        /// category exists.
        /// </summary>
        private void EnsureInput(string dir)
        {
            // coub_v2.py is creating temp list for data
            // It might be left behind if something happens, clean that up before starting
            var tmpList = Path.Combine(Environment.CurrentDirectory, "list.txt");
            if (File.Exists(tmpList))
            {
                File.Delete(tmpList);
            }

            // Creates the main coub directory and the category directory in one go;
            // CreateDirectory does nothing for directories that already exist.
            Directory.CreateDirectory(GetDownloadLocation(dir));
        }
    }
}
Oops, something went wrong.