Skip to content

Commit

Permalink
Resolves #1183 - Wikimedia duplicate images creates bad links
Browse files Browse the repository at this point in the history
  • Loading branch information
HarelM committed Mar 22, 2020
1 parent 9f8773a commit 499aa09
Show file tree
Hide file tree
Showing 23 changed files with 490 additions and 69 deletions.
31 changes: 22 additions & 9 deletions IsraelHiking.API/Controllers/PointsOfInterestController.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using IsraelHiking.API.Converters;
using IsraelHiking.API.Executors;
using IsraelHiking.API.Services;
using IsraelHiking.API.Services.Poi;
using IsraelHiking.Common;
Expand All @@ -13,6 +14,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Threading.Tasks;

namespace IsraelHiking.API.Controllers
Expand All @@ -28,6 +30,7 @@ public class PointsOfInterestController : ControllerBase
private readonly IWikimediaCommonGateway _wikimediaCommonGateway;
private readonly IPointsOfInterestProvider _pointsOfInterestProvider;
private readonly IBase64ImageStringToFileConverter _base64ImageConverter;
private readonly IImagesUrlsStorageExecutor _imageUrlStoreExecutor;
private readonly ConfigurationData _options;
private readonly LruCache<string, TokenAndSecret> _cache;

Expand All @@ -39,20 +42,23 @@ public class PointsOfInterestController : ControllerBase
/// <param name="wikimediaCommonGateway"></param>
/// <param name="pointsOfInterestProvider"></param>
/// <param name="base64ImageConverter"></param>
/// <param name="imageUrlStoreExecutor"></param>
/// <param name="options"></param>
/// <param name="cache"></param>
public PointsOfInterestController(IClientsFactory clientsFactory,
ITagsHelper tagsHelper,
IWikimediaCommonGateway wikimediaCommonGateway,
IPointsOfInterestProvider pointsOfInterestProvider,
IBase64ImageStringToFileConverter base64ImageConverter,
IImagesUrlsStorageExecutor imageUrlStoreExecutor,
IOptions<ConfigurationData> options,
LruCache<string, TokenAndSecret> cache)
{
_clientsFactory = clientsFactory;
_tagsHelper = tagsHelper;
_cache = cache;
_base64ImageConverter = base64ImageConverter;
_imageUrlStoreExecutor = imageUrlStoreExecutor;
_pointsOfInterestProvider = pointsOfInterestProvider;
_wikimediaCommonGateway = wikimediaCommonGateway;
_options = options.Value;
Expand Down Expand Up @@ -147,24 +153,31 @@ public async Task<IActionResult> UploadPointOfInterest([FromBody]PointOfInterest
var imageUrls = pointOfInterest.ImagesUrls ?? new string[0];
for (var urlIndex = 0; urlIndex < imageUrls.Length; urlIndex++)
{
var url = imageUrls[urlIndex];
var fileName = string.IsNullOrWhiteSpace(pointOfInterest.Title)
? pointOfInterest.Icon.Replace("icon-", "")
: pointOfInterest.Title;
var file = _base64ImageConverter.ConvertToFile(url, fileName);
var file = _base64ImageConverter.ConvertToFile(imageUrls[urlIndex], fileName);
if (file == null)
{
continue;
}
using (var memoryStream = new MemoryStream(file.Content))
using (var md5 = MD5.Create())
{
var imageName = await _wikimediaCommonGateway.UploadImage(pointOfInterest.Title,
pointOfInterest.Description, user.DisplayName, file.FileName, memoryStream,
pointOfInterest.Location.ToCoordinate());
url = await _wikimediaCommonGateway.GetImageUrl(imageName);
imageUrls[urlIndex] = url;
var imageUrl = await _imageUrlStoreExecutor.GetImageUrlIfExists(md5, file.Content);
if (imageUrl != null)
{
imageUrls[urlIndex] = imageUrl;
continue;
}
using (var memoryStream = new MemoryStream(file.Content))
{
var imageName = await _wikimediaCommonGateway.UploadImage(pointOfInterest.Title,
pointOfInterest.Description, user.DisplayName, file.FileName, memoryStream,
pointOfInterest.Location.ToCoordinate());
imageUrls[urlIndex] = await _wikimediaCommonGateway.GetImageUrl(imageName);
await _imageUrlStoreExecutor.StoreImage(md5, file.Content, imageUrls[urlIndex]);
}
}

}

if (string.IsNullOrWhiteSpace(pointOfInterest.Id))
Expand Down
12 changes: 8 additions & 4 deletions IsraelHiking.API/Controllers/UpdateController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,25 @@ public async Task<IActionResult> PostUpdateData(UpdateRequest request)
request.Routing == false &&
request.Highways == false &&
request.PointsOfInterest == false &&
request.OsmFile == false &&
request.UpdateOsmFile == false &&
request.DownloadOsmFile == false &&
request.Images == false &&
request.SiteMap == false)
{
request = new UpdateRequest
{
Routing = true,
Highways = true,
PointsOfInterest = true,
OsmFile = true,
SiteMap = true
UpdateOsmFile = true,
DownloadOsmFile = true,
SiteMap = true,
Images = true
};
_logger.LogInformation("No specific filters were applied, updating all databases.");
}
_logger.LogInformation("Starting updating site's databases according to request: " + JsonConvert.SerializeObject(request));
await _osmLatestFileFetcherExecutor.Update(request.OsmFile);
await _osmLatestFileFetcherExecutor.Update(request.DownloadOsmFile, request.UpdateOsmFile);
_logger.LogInformation("Update OSM file completed.");

await _databasesUpdaterService.Rebuild(request);
Expand Down
36 changes: 36 additions & 0 deletions IsraelHiking.API/Executors/IImagesUrlsStorageExecutor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using System.Collections.Generic;
using System.Security.Cryptography;
using System.Threading.Tasks;

namespace IsraelHiking.API.Executors
{
/// <summary>
/// Stores image hashes and urls in order to avoid uploading the same image to Wikimedia twice.
/// </summary>
public interface IImagesUrlsStorageExecutor
{
/// <summary>
/// Downloads the content from the given urls, calculates a hash for each image and stores it in the database.
/// NOTE(review): the ImagesUrlsStorageExecutor implementation also removes stored entries
/// whose url is not part of <paramref name="imagesUrls"/> - confirm other implementations do the same.
/// </summary>
/// <param name="imagesUrls">The urls of all the images that should be stored</param>
/// <returns>A task that completes when the operation is done</returns>
Task DownloadAndStoreUrls(List<string> imagesUrls);

/// <summary>
/// Gets an image url if an image with the same content hash exists in the repository.
/// </summary>
/// <param name="md5">The hash algorithm instance used to compute the hash of <paramref name="content"/></param>
/// <param name="content">The raw image bytes to look for</param>
/// <returns>The image url or null</returns>
Task<string> GetImageUrlIfExists(MD5 md5, byte[] content);

/// <summary>
/// Stores an image in the repository after computing its hash and resizing it.
/// </summary>
/// <param name="md5">The hash algorithm instance used to compute the hash of <paramref name="content"/></param>
/// <param name="content">The raw image bytes</param>
/// <param name="imageUrl">The url the image can be found at</param>
/// <returns>A task that completes when the image is stored</returns>
Task StoreImage(MD5 md5, byte[] content, string imageUrl);
}
}
3 changes: 2 additions & 1 deletion IsraelHiking.API/Executors/IOsmLatestFileFetcherExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ public interface IOsmLatestFileFetcherExecutor
/// <summary>
/// Updates the osm file to latest version
/// </summary>
/// <param name="downloadFile">Should the operation download the daily OSM file</param>
/// <param name="updateFile">Should the operation download updates for daily OSM file</param>
/// <returns></returns>
Task Update(bool updateFile = true);
Task Update(bool downloadFile = true, bool updateFile = true);

/// <summary>
/// Gets a stream to the OSM file
Expand Down
139 changes: 139 additions & 0 deletions IsraelHiking.API/Executors/ImagesUrlsStorageExecutor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
using IsraelHiking.API.Gpx;
using IsraelHiking.Common;
using IsraelHiking.DataAccessInterfaces;
using Microsoft.Extensions.Logging;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.Processing;
using SixLabors.Primitives;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Threading;
using System.Threading.Tasks;

namespace IsraelHiking.API.Executors
{
/// <summary>
/// Keeps the images repository in sync with a list of image urls and allows looking up
/// an already stored image by the MD5 hash of its content.
/// </summary>
public class ImagesUrlsStorageExecutor : IImagesUrlsStorageExecutor
{
    private const int MAX_DEGREE_OF_PARALLELISM = 20;
    private const int DOWNLOAD_RETRIES = 3;
    private const int DELAY_BETWEEN_RETRIES_MILLISECONDS = 200;
    private const int PROGRESS_REPORT_INTERVAL = 100;
    private const int THUMBNAIL_SIZE_IN_PIXELS = 200;

    private readonly IImagesRepository _imagesRepository;
    private readonly IRemoteFileSizeFetcherGateway _remoteFileFetcherGateway;
    private readonly ILogger _logger;

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="imagesRepository">The repository that holds the image hashes, urls and thumbnails</param>
    /// <param name="remoteFileFetcherGateway">Used to query the size and content of remote images</param>
    /// <param name="logger">Logger for progress and failures</param>
    public ImagesUrlsStorageExecutor(IImagesRepository imagesRepository,
        IRemoteFileSizeFetcherGateway remoteFileFetcherGateway,
        ILogger logger)
    {
        _imagesRepository = imagesRepository;
        _remoteFileFetcherGateway = remoteFileFetcherGateway;
        _logger = logger;
    }

    /// <inheritdoc/>
    public async Task DownloadAndStoreUrls(List<string> imagesUrls)
    {
        var exitingUrls = await _imagesRepository.GetAllUrls();
        var needToRemove = exitingUrls.Except(imagesUrls).ToList();
        _logger.LogInformation($"Need to remove {needToRemove.Count} images that are no longer relevant");
        foreach (var imageUrlToRemove in needToRemove)
        {
            await _imagesRepository.DeleteImageByUrl(imageUrlToRemove);
        }
        _logger.LogInformation($"Finished removing images");

        // A set keeps the per-image membership check cheap regardless of the repository's collection type.
        var existingUrlsSet = new HashSet<string>(exitingUrls);
        var counter = 0;
        // MD5 (like every HashAlgorithm) is not safe for concurrent use, so each worker
        // gets its own instance through the thread-local overload instead of sharing one.
        // NOTE: the loop body blocks on asynchronous calls; this is kept from the original design.
        Parallel.ForEach<string, MD5>(
            imagesUrls,
            new ParallelOptions { MaxDegreeOfParallelism = MAX_DEGREE_OF_PARALLELISM },
            () => MD5.Create(),
            (imageUrl, loopState, md5) =>
            {
                try
                {
                    // Use the value returned by Increment - re-reading the shared counter is racy.
                    var indexed = Interlocked.Increment(ref counter);
                    if (indexed % PROGRESS_REPORT_INTERVAL == 0)
                    {
                        _logger.LogInformation($"Indexed {indexed} images of {imagesUrls.Count}");
                    }
                    DownloadAndStoreSingleUrl(md5, imageUrl, existingUrlsSet);
                }
                catch (Exception ex)
                {
                    _logger.LogWarning("There was a problem with the following image url: " + imageUrl + " " + ex.ToString());
                }
                return md5;
            },
            md5 => md5.Dispose());
    }

    /// <summary>
    /// Handles a single url: skips it when it is already stored and still reachable,
    /// deletes it from the repository when it can't be downloaded, and stores it otherwise.
    /// </summary>
    private void DownloadAndStoreSingleUrl(MD5 md5, string imageUrl, HashSet<string> existingUrls)
    {
        if (existingUrls.Contains(imageUrl))
        {
            var size = _remoteFileFetcherGateway.GetFileSize(imageUrl).Result;
            if (size > 0)
            {
                // Already stored and the remote file still exists - nothing to do.
                return;
            }
        }
        var content = DownloadContentWithRetries(imageUrl);
        if (content.Length == 0)
        {
            _imagesRepository.DeleteImageByUrl(imageUrl).Wait();
            return;
        }
        StoreImage(md5, content, imageUrl).Wait();
    }

    /// <summary>
    /// Tries to download the content of the url a few times, returns an empty array if all attempts failed.
    /// </summary>
    private byte[] DownloadContentWithRetries(string imageUrl)
    {
        for (var retryIndex = 0; retryIndex < DOWNLOAD_RETRIES; retryIndex++)
        {
            try
            {
                return _remoteFileFetcherGateway.GetFileContent(imageUrl).Result.Content;
            }
            catch
            {
                // Best effort: wait a little and try again, the caller handles the empty result.
                Thread.Sleep(DELAY_BETWEEN_RETRIES_MILLISECONDS);
            }
        }
        return new byte[0];
    }

    /// <summary>
    /// Resizes the image in place so that its longer side is <paramref name="newSizeInPixels"/>
    /// while keeping the aspect ratio, and encodes the result as JPEG.
    /// </summary>
    /// <param name="originalImage">The image to resize, it is mutated by this method</param>
    /// <param name="newSizeInPixels">The required length of the longer side</param>
    /// <returns>The JPEG encoded bytes of the resized image</returns>
    private byte[] ResizeImage(Image originalImage, int newSizeInPixels)
    {
        var ratio = originalImage.Width > originalImage.Height
            ? newSizeInPixels * 1.0 / originalImage.Width
            : newSizeInPixels * 1.0 / originalImage.Height;
        var newSize = new Size((int)(originalImage.Width * ratio), (int)(originalImage.Height * ratio));
        originalImage.Mutate(x => x.Resize(newSize));

        using (var memoryStream = new MemoryStream())
        {
            originalImage.SaveAsJpeg(memoryStream);
            return memoryStream.ToArray();
        }
    }

    /// <inheritdoc/>
    public Task StoreImage(MD5 md5, byte[] content, string imageUrl)
    {
        // The hash is computed on the original bytes, before resizing.
        var hash = md5.ComputeHash(content).ToHashString();
        byte[] thumbnail;
        // The decoded image holds pixel buffers - release them as soon as the thumbnail is ready.
        using (var image = Image.Load(content, out var _))
        {
            thumbnail = ResizeImage(image, THUMBNAIL_SIZE_IN_PIXELS);
        }
        return _imagesRepository.StoreImage(new ImageItem
        {
            ImageUrl = imageUrl,
            Data = $"data:image/jpeg;base64," + Convert.ToBase64String(thumbnail),
            Hash = hash
        });
    }

    /// <inheritdoc/>
    public async Task<string> GetImageUrlIfExists(MD5 md5, byte[] content)
    {
        var hash = md5.ComputeHash(content).ToHashString();
        var imageItem = await _imagesRepository.GetImageByHash(hash);
        var imageUrl = imageItem?.ImageUrl;
        if (imageUrl != null)
        {
            _logger.LogInformation($"Found exiting image with url: {imageUrl}");
        }
        return imageUrl;
    }
}
}
7 changes: 5 additions & 2 deletions IsraelHiking.API/Executors/OsmLatestFileFetcherExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public OsmLatestFileFetcherExecutor(IFileSystemHelper fileSystemHelper,
}

/// <inheritdoc />
public async Task Update(bool updateFile = true)
public async Task Update(bool downloadFile = true, bool updateFile = true)
{
_logger.LogInformation("Starting updating to latest OSM file.");
var workingDirectory = Path.Combine(_options.BinariesFolder, OSM_C_TOOLS_FOLDER);
Expand All @@ -60,7 +60,10 @@ public async Task Update(bool updateFile = true)
{
_fileSystemHelper.CreateDirectory(workingDirectory);
}
await DownloadDailyOsmFile(workingDirectory);
if (downloadFile || updateFile)
{
await DownloadDailyOsmFile(workingDirectory);
}
if (updateFile)
{
UpdateFileToLatestVersion(workingDirectory);
Expand Down
Loading

0 comments on commit 499aa09

Please sign in to comment.