Skip to content

Commit

Permalink
Improved coub downloader
Browse files Browse the repository at this point in the history
- Split crawling and downloading
- Added support for bookmarks and channels
- Added more options for user
- Better folder structure
- Code cleanup
  • Loading branch information
Bukk94 committed Mar 17, 2022
1 parent c06a6c5 commit 368384a
Show file tree
Hide file tree
Showing 5 changed files with 421 additions and 174 deletions.
11 changes: 11 additions & 0 deletions CoubDownloader/Constants.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
namespace CoubDownloader
{
    /// <summary>
    /// Directory and file names shared by the crawling and the downloading step.
    /// </summary>
    public static class Constants
    {
        /// <summary>Name of the directory that holds the crawled URL lists and metadata.</summary>
        public const string CoubInfoDir = "Coubs-info";

        /// <summary>Name of the directory the downloaded coubs are stored in.</summary>
        public const string CoubDataDir = "Coubs";

        /// <summary>File name of the list of crawled coub URLs (one per category directory).</summary>
        public const string UrlListFileName = "url_list.txt";

        /// <summary>File name of the per-coub metadata written next to the URL list.</summary>
        public const string MetaDataFileName = "metadata.txt";
    }
}
255 changes: 255 additions & 0 deletions CoubDownloader/Crawler.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
using System;
using System.Net;
using System.Text;
using Newtonsoft.Json.Linq;
using System.Linq;
using System.Collections.Generic;
using System.IO;

namespace CoubDownloader
{
public class Crawler
{
/// <summary>Category name that selects the coubs liked by the user.</summary>
private const string LikedCategory = "liked";

/// <summary>Category name that selects the coubs bookmarked by the user.</summary>
private const string BookmarksCategory = "bookmarks";

/// <summary>
/// Access token most recently entered by the user; reused by GetAccessToken while it is non-blank.
/// </summary>
private string _usersAccessToken;

/// <summary>
/// Directory below the current working directory in which crawled URL lists and metadata are stored.
/// </summary>
public string InfoPath
{
    get { return Path.Combine(Environment.CurrentDirectory, Constants.CoubInfoDir); }
}

/// <summary>
/// Crawls the coub URLs of every requested category.
/// </summary>
/// <param name="categories">
/// Category names; "liked" and "bookmarks" select the user-specific lists,
/// every other value is treated as a channel name.
/// </param>
public void CrawlUrls(string[] categories)
{
    foreach (var name in categories)
    {
        switch (name)
        {
            case LikedCategory:
                DownloadLikedCoubs();
                break;

            case BookmarksCategory:
                DownloadBookmarkedCoubs();
                break;

            default:
                // Anything that is not a known user-specific category is a channel.
                DownloadChannelCoubs(name);
                break;
        }
    }
}

/// <summary>
/// Crawls the coubs of a single channel, unless a URL list for that channel already exists.
/// </summary>
/// <param name="channel">
/// Channel name as it appears in the coub.com timeline API URL. It is also used as the name
/// of the sub-directory the results are written to. Blank values are rejected with a message.
/// </param>
public void DownloadChannelCoubs(string channel)
{
    if (string.IsNullOrWhiteSpace(channel))
    {
        Console.Error.WriteLine("Empty channel name! Skipping download.");
        return;
    }

    if (AlreadyDownloaded(channel))
    {
        return;
    }

    try
    {
        // The URL is used as a string.Format template later on (the page number fills "{0}"),
        // so the channel has to be escaped: EscapeDataString encodes '{' and '}' as well as
        // reserved URL characters that would otherwise break the template or the request.
        var url = "https://coub.com/api/v2/timeline/channel/" + Uri.EscapeDataString(channel) + "?page={0}&per_page=25";
        DownloadLinks(url, channel);
    }
    catch (Exception ex)
    {
        // Same best-effort handling as the 'liked' and 'bookmarks' categories:
        // report the failure and let the caller continue with the next category.
        Console.Error.WriteLine($"Unexpected error while crawling channel '{channel}': " + ex);
    }
}

/// <summary>
/// Builds the path of the URL list file of a category.
/// </summary>
/// <param name="dir">Name of the category sub-directory below <see cref="InfoPath"/>.</param>
/// <returns>InfoPath, <paramref name="dir"/> and the URL list file name combined into one path.</returns>
private string GetUrlListLocation(string dir)
{
    var urlListFile = Path.Combine(InfoPath, dir, Constants.UrlListFileName);
    return urlListFile;
}

/// <summary>
/// Builds the path of the metadata file of a category.
/// </summary>
/// <param name="dir">Name of the category sub-directory below <see cref="InfoPath"/>.</param>
/// <returns>InfoPath, <paramref name="dir"/> and the metadata file name combined into one path.</returns>
private string GetMetaDataLocation(string dir)
{
    var metaDataFile = Path.Combine(InfoPath, dir, Constants.MetaDataFileName);
    return metaDataFile;
}

/// <summary>
/// Tells whether a category was crawled before, i.e. whether its URL list file exists.
/// Writes a notice to the console when it does.
/// </summary>
/// <param name="dir">Name of the category sub-directory.</param>
/// <returns><c>true</c> when the URL list file exists; otherwise <c>false</c>.</returns>
private bool AlreadyDownloaded(string dir)
{
    var listExists = File.Exists(GetUrlListLocation(dir));

    if (listExists)
    {
        Console.WriteLine($"URL list for '{dir}' found! Skipping crawling.");
    }

    return listExists;
}

/// <summary>
/// Crawls the coubs the user has liked, unless a URL list for them already exists.
/// Requires an access token, which is requested from the user when none is cached.
/// Failures are reported on the error stream and do not propagate to the caller.
/// </summary>
public void DownloadLikedCoubs()
{
    var dir = LikedCategory;
    if (AlreadyDownloaded(dir))
    {
        return;
    }

    var token = GetAccessToken();

    if (string.IsNullOrWhiteSpace(token))
    {
        Console.Error.WriteLine("Invalid token! Valid token is necessary to download user-specific categories "
                                + "like 'liked' coubs. Skipping download of " + dir);
        return;
    }

    try
    {
        var url = "https://coub.com/api/v2/timeline/likes?all=true&order_by=date&page={0}&per_page=25";
        DownloadLinks(url, dir, token);
    }
    catch (Exception ex)
    {
        // Prefer the HTTP status code over the exception text to recognise a rejected token.
        // The text match is kept as a fallback so that previously recognised failures
        // are still reported the same way.
        var httpResponse = (ex as WebException)?.Response as HttpWebResponse;
        var forbidden = (httpResponse != null && httpResponse.StatusCode == HttpStatusCode.Forbidden)
                        || ex.Message.Contains("Forbidden");

        if (forbidden)
        {
            Console.Error.WriteLine("Invalid access token!");

            // Forget the rejected token so the next user-specific category asks for a new one
            // instead of silently failing with the same value.
            _usersAccessToken = null;
        }
        else
        {
            Console.Error.WriteLine("Unexpected error: " + ex);
        }
    }
}

/// <summary>
/// Crawls the coubs the user has bookmarked, unless a URL list for them already exists.
/// Requires an access token, which is requested from the user when none is cached.
/// Failures are reported on the error stream and do not propagate to the caller.
/// </summary>
public void DownloadBookmarkedCoubs()
{
    var dir = BookmarksCategory;
    if (AlreadyDownloaded(dir))
    {
        return;
    }

    var token = GetAccessToken();

    if (string.IsNullOrWhiteSpace(token))
    {
        Console.Error.WriteLine("Invalid token! Valid token is necessary to download user-specific categories "
                                + "like 'liked' coubs. Skipping download of " + dir);
        return;
    }

    try
    {
        var url = "https://coub.com/api/v2/timeline/favourites?all=true&order_by=date&page={0}&per_page=25";
        DownloadLinks(url, dir, token);
    }
    catch (Exception ex)
    {
        // Prefer the HTTP status code over the exception text to recognise a rejected token.
        // The text match is kept as a fallback so that previously recognised failures
        // are still reported the same way.
        var httpResponse = (ex as WebException)?.Response as HttpWebResponse;
        var forbidden = (httpResponse != null && httpResponse.StatusCode == HttpStatusCode.Forbidden)
                        || ex.Message.Contains("Forbidden");

        if (forbidden)
        {
            Console.Error.WriteLine("Invalid access token!");

            // Forget the rejected token so the next user-specific category asks for a new one
            // instead of silently failing with the same value.
            _usersAccessToken = null;
        }
        else
        {
            Console.Error.WriteLine("Unexpected error: " + ex);
        }
    }
}

/// <summary>
/// Returns the user's access token, prompting for it on the console when no
/// non-blank token has been stored yet.
/// </summary>
/// <returns>
/// The stored or newly entered token with any "remember_token=" text removed and
/// surrounding whitespace trimmed; <c>null</c> when the console input yields no line.
/// </returns>
private string GetAccessToken()
{
    var cached = _usersAccessToken;
    if (string.IsNullOrWhiteSpace(cached))
    {
        Console.WriteLine("Write/paste your access token. Read README if you don't know how to get it");
        Console.Write("Access Token: ");

        var entered = Console.ReadLine();
        if (entered != null)
        {
            // Accept the value with or without the cookie name in front of it.
            entered = entered.Replace("remember_token=", "").Trim();
        }

        _usersAccessToken = entered;
        return entered;
    }

    // A token was already provided earlier, reuse it.
    return cached;
}

/// <summary>
/// Gathers all coub links of a category and writes the URL list and the metadata file
/// into the category's sub-directory of <see cref="InfoPath"/>.
/// </summary>
/// <param name="baseUrl">Timeline API URL with a <c>{0}</c> placeholder for the page number.</param>
/// <param name="dir">Name of the category; used for console output and as sub-directory name.</param>
/// <param name="token">Optional access token needed for user-specific categories.</param>
private void DownloadLinks(string baseUrl, string dir, string token = null)
{
    Console.WriteLine($"Starting gathering links for '{dir}'...");
    var results = GetLinks(baseUrl, 1, token);

    var metaDataFile = GetMetaDataLocation(dir);
    var urlListFile = GetUrlListLocation(dir);

    // CreateDirectory does nothing for a directory that already exists,
    // so no separate existence check is needed.
    Directory.CreateDirectory(InfoPath);
    Directory.CreateDirectory(Path.Combine(InfoPath, dir));

    Console.WriteLine("Writing crawled URLs...");
    var urls = results.Select(result => result.Link);
    File.WriteAllText(urlListFile, string.Join("\n", urls), Encoding.UTF8);

    Console.WriteLine("Saving metadata details...");
    var metaDataLines = results.Select(result => result.FormattedData);
    File.WriteAllText(metaDataFile, string.Join("\n", metaDataLines), Encoding.UTF8);
}

/// <summary>
/// Collects the coubs of every timeline page, starting at <paramref name="page"/>.
/// </summary>
/// <param name="baseUrl">Timeline API URL with a <c>{0}</c> placeholder for the page number.</param>
/// <param name="page">Number of the first page to request.</param>
/// <param name="token">
/// Optional access token, sent as the <c>remember_token</c> cookie; may be null or blank.
/// </param>
/// <returns>One result per coub, in the order the pages were received.</returns>
private static List<CoubDownloadResult> GetLinks(string baseUrl, int page, string token)
{
    var downloadedData = new List<CoubDownloadResult>();

    // Iterate instead of recursing so the call depth does not grow with the number of pages.
    while (true)
    {
        var data = FetchPage(baseUrl, page, token);

        var totalPages = (int?)data["total_pages"] ?? 0;
        var currentPage = (int?)data["page"] ?? page;

        var coubs = data["coubs"] as JArray;
        if (coubs != null)
        {
            foreach (var coub in coubs)
            {
                downloadedData.Add(BuildResult(coub));
            }
        }

        // ">=" instead of "!=": a response reporting zero (or no) total pages, as an empty
        // timeline may do, must end the crawl rather than request pages forever.
        if (currentPage >= totalPages)
        {
            Console.WriteLine("Reached last page... gathering results...");
            return downloadedData;
        }

        Console.WriteLine($"Crawling page {currentPage} out of {totalPages}");
        page++;
    }
}

/// <summary>Requests one timeline page and parses the response body as a JSON object.</summary>
private static JObject FetchPage(string baseUrl, int page, string token)
{
    var request = (HttpWebRequest)WebRequest.Create(string.Format(baseUrl, page));

    if (!string.IsNullOrWhiteSpace(token))
    {
        // Add cookies header with access token to retrieve user-specific data
        request.Headers["Cookie"] = $"remember_token={token}";
    }

    // Dispose the response as well as the reader so the connection is released promptly.
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
    {
        return JObject.Parse(reader.ReadToEnd());
    }
}

/// <summary>Converts one coub JSON entry into its link and its single-line metadata record.</summary>
private static CoubDownloadResult BuildResult(JToken coub)
{
    var tags = new List<string>();
    var tagsEncoded = new List<string>();

    var coubTags = coub["tags"] as JArray;
    if (coubTags != null)
    {
        foreach (var tag in coubTags)
        {
            tagsEncoded.Add(AsText(tag["value"]));
            tags.Add(AsText(tag["title"]));
        }
    }

    var link = "https://coub.com/view/" + AsText(coub["permalink"]);
    var formatted = string.Join(" | ",
        link,
        AsText(coub["title"]),
        string.Join(",", tags),
        string.Join(",", tagsEncoded));

    return new CoubDownloadResult
    {
        Link = link,
        // Strip line breaks from the WHOLE record: the metadata file holds one record per
        // line, and titles as well as tag titles may contain line breaks.
        FormattedData = StripLineBreaks(formatted)
    };
}

/// <summary>Returns the text of a JSON token, or an empty string when the token is absent.</summary>
private static string AsText(JToken token)
{
    return token == null ? "" : token.ToString();
}

/// <summary>Removes every carriage return and line feed from <paramref name="text"/>.</summary>
private static string StripLineBreaks(string text)
{
    return text.Replace("\r", "").Replace("\n", "");
}
}
}
85 changes: 85 additions & 0 deletions CoubDownloader/Downloader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
using System;
using System.Net;
using System.Text;
using Newtonsoft.Json.Linq;
using System.Linq;
using System.Collections.Generic;
using System.IO;

namespace CoubDownloader
{
public class Downloader
{
/// <summary>Directory below the current working directory that receives the downloaded coubs.</summary>
private readonly string coubsDir = Path.Combine(Environment.CurrentDirectory, Constants.CoubDataDir);

/// <summary>
/// Downloads the coubs of every category directory found directly below <paramref name="path"/>.
/// Directories without a URL list are skipped with a console notice.
/// </summary>
/// <param name="path">Directory that contains one sub-directory per crawled category.</param>
public void DownloadCoubs(string path)
{
    // The directory does not exist when nothing has been crawled yet;
    // Directory.GetDirectories would throw in that case.
    if (!Directory.Exists(path))
    {
        Console.WriteLine($"Directory '{path}' not found, nothing to download...");
        return;
    }

    var directoriesToDownload = Directory.GetDirectories(path);
    foreach (var directory in directoriesToDownload)
    {
        // Only the last path segment identifies the category.
        var dir = new DirectoryInfo(directory).Name;
        if (!HasUrlList(path, dir))
        {
            Console.WriteLine($"No URL list for category '{dir}' found, skipping download...");
            continue;
        }

        DownloadCoubsCategory(dir);
    }
}

/// <summary>
/// Checks whether the category directory contains a URL list file.
/// </summary>
/// <param name="path">Directory that contains the category directories.</param>
/// <param name="dir">Name of the category directory.</param>
/// <returns><c>true</c> when the URL list file exists; otherwise <c>false</c>.</returns>
private bool HasUrlList(string path, string dir)
{
    return File.Exists(Path.Combine(path, dir, Constants.UrlListFileName));
}

/// <summary>
/// Runs the coub_v2.py script for one category, using the category's URL list as input.
/// Errors raised while running the command are reported on the error stream.
/// </summary>
/// <param name="dir">Name of the category directory.</param>
private void DownloadCoubsCategory(string dir)
{
    Console.WriteLine($"Starting download of '{dir}'...");

    EnsureInput(dir);

    try
    {
        var listFile = $"{Constants.CoubInfoDir}\\{dir}\\{Constants.UrlListFileName}";
        var outputPattern = $"{Constants.CoubDataDir}\\{dir}\\%id%_%title%";

        // Both paths contain the category name, which may include spaces, so both are quoted
        // (previously only the -o value was).
        // NOTE(review): -l / -o are presumably the link list and the output name template of
        // coub_v2.py - confirm against the script.
        Run.RunCommand(
            $"python.exe -X utf8 coub_v2.py -l \"{listFile}\" -o \"{outputPattern}\"",
            Environment.CurrentDirectory);

        Console.WriteLine("DONE");
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine("Fatal error: " + ex);
    }
}

/// <summary>
/// Returns the directory the coubs of the given category are downloaded to.
/// </summary>
/// <param name="dir">Name of the category directory.</param>
private string GetDownloadLocation(string dir) => Path.Combine(coubsDir, dir);

/// <summary>
/// Prepares the file system for downloading a category: removes a leftover temporary
/// list file and makes sure the target directories exist.
/// </summary>
/// <param name="dir">Name of the category directory.</param>
private void EnsureInput(string dir)
{
    // coub_v2.py creates a temporary list for its data. It might be left behind
    // if something goes wrong, so clean it up before starting.
    var leftoverList = Path.Combine(Environment.CurrentDirectory, "list.txt");
    if (File.Exists(leftoverList))
    {
        File.Delete(leftoverList);
    }

    // CreateDirectory does nothing for a directory that already exists,
    // so no separate existence check is needed.
    Directory.CreateDirectory(coubsDir);                  // main coub directory
    Directory.CreateDirectory(GetDownloadLocation(dir));  // directory of this category
}
}
}
Loading

0 comments on commit 368384a

Please sign in to comment.