diff --git a/collab-document/src/importer/md_importer.rs b/collab-document/src/importer/md_importer.rs
index 5c5f2d90a..3a7f24b1a 100644
--- a/collab-document/src/importer/md_importer.rs
+++ b/collab-document/src/importer/md_importer.rs
@@ -295,7 +295,7 @@ fn create_paragraph_block(document_data: &mut DocumentData, parent_id: &str) -> String {
   paragraph_block_id
 }
 
-fn create_image_block(block_id: &str, url: String, parent_id: &str) -> Block {
+pub fn create_image_block(block_id: &str, url: String, parent_id: &str) -> Block {
   let mut data = BlockData::new();
   data.insert(URL_FIELD.to_string(), url.into());
   data.insert(IMAGE_TYPE_FIELD.to_string(), EXTERNAL_IMAGE_TYPE.into());
diff --git a/collab-importer/src/notion/importer.rs b/collab-importer/src/notion/importer.rs
index 5a3f2f079..f9c7164e6 100644
--- a/collab-importer/src/notion/importer.rs
+++ b/collab-importer/src/notion/importer.rs
@@ -90,6 +90,12 @@ impl NotionImporter {
       .await
       .unwrap_or_default();
 
+    let no_subpages = !has_subdirectories(&self.path, 1);
+    let notion_export = NotionExportContext {
+      csv_relation,
+      no_subpages,
+    };
+
     let path = self.path.clone();
     let host = self.host.clone();
     let workspace_id = self.workspace_id.clone();
@@ -97,7 +103,7 @@ impl NotionImporter {
     // Process entries and track whether we have spaces (directories) and pages (non-directories)
     let mut notion_pages: Vec<NotionPage> = vec![];
    for entry in walk_sub_dir(&path) {
-      if let Some(view) = process_entry(&host, &workspace_id, &entry, false, &csv_relation) {
+      if let Some(view) = process_entry(&host, &workspace_id, &entry, false, &notion_export) {
         has_spaces |= view.is_dir;
         has_pages |= !view.is_dir;
         notion_pages.push(view);
@@ -309,6 +315,15 @@ async fn convert_notion_page_to_parent_child(
   view_builder.build()
 }
 
+pub struct NotionExportContext {
+  pub csv_relation: CSVRelation,
+  pub no_subpages: bool,
+}
+
+/// [CSVRelation] manages parent-child relationships between CSV files exported in zip format from Notion.
+/// The zip export may contain multiple CSV files that are partial views of the main `*_all.csv` file.
+/// When such a partial CSV file is encountered, it is replaced with the main `*_all.csv` file and made
+/// to reference that file by the given ID.
 #[derive(Default, Debug, Clone)]
 pub struct CSVRelation {
   inner: Arc>,
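// Aside (not part of the patch): a minimal sketch of the new context. The
// re-export path `collab_importer::notion` is an assumption based on the
// `use crate::notion::NotionExportContext` imports further down in this diff;
// CSVRelation::default() stands in for the relation the importer actually
// builds from the unzipped export.
use collab_importer::notion::{CSVRelation, NotionExportContext};

fn demo_context(export_is_flat: bool) -> NotionExportContext {
  NotionExportContext {
    // Maps partial CSV views back to their main *_all.csv file.
    csv_relation: CSVRelation::default(),
    // True when the unzipped export has no subdirectories at depth 1, i.e.
    // every page sits next to its resources instead of owning a directory
    // named after itself.
    no_subpages: export_is_flat,
  }
}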
@@ -481,6 +496,15 @@ fn extract_file_name(input: &str) -> String {
   normalized
 }
 
+fn has_subdirectories(path: &PathBuf, max_depth: usize) -> bool {
+  WalkDir::new(path)
+    .max_depth(max_depth)
+    .into_iter()
+    .filter_map(Result::ok)
+    .any(|entry| entry.file_type().is_dir() && entry.path() != path)
+}
+
 #[cfg(test)]
 mod test_csv_relation {
   use super::*;
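// Aside (not part of the patch): how the has_subdirectories helper behaves,
// as a standalone program. walkdir is already a dependency of this crate;
// the export path below is hypothetical.
use std::path::PathBuf;
use walkdir::WalkDir;

fn has_subdirectories(path: &PathBuf, max_depth: usize) -> bool {
  WalkDir::new(path)
    .max_depth(max_depth)
    .into_iter()
    .filter_map(Result::ok)
    // WalkDir yields the root itself, so it must be excluded explicitly.
    .any(|entry| entry.file_type().is_dir() && entry.path() != path)
}

fn main() {
  let export_root = PathBuf::from("./blog_post_no_subpages");
  // With max_depth = 1 only direct children are inspected, which is how the
  // importer decides whether a Notion export is "flat".
  println!("no_subpages = {}", !has_subdirectories(&export_root, 1));
}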
diff --git a/collab-importer/src/notion/page.rs b/collab-importer/src/notion/page.rs
index 455db78b7..97b00801a 100644
--- a/collab-importer/src/notion/page.rs
+++ b/collab-importer/src/notion/page.rs
@@ -6,12 +6,12 @@ use collab_database::template::csv::{CSVResource, CSVTemplate};
 use collab_document::blocks::{mention_block_data, mention_block_delta, TextDelta};
 use collab_document::document::Document;
 use collab_document::importer::define::{BlockType, URL_FIELD};
-use collab_document::importer::md_importer::MDImporter;
+use collab_document::importer::md_importer::{create_image_block, MDImporter};
 use collab_entity::CollabType;
 use futures::stream::{self, StreamExt};
 
 use crate::notion::file::NotionFile;
-use crate::notion::walk_dir::extract_external_links;
+use crate::notion::walk_dir::{extract_delta_link, extract_external_links};
 use crate::notion::{CSVRelation, ImportedCollabInfoStream};
 use crate::util::{upload_file_url, FileId};
 use collab_database::rows::RowId;
@@ -133,7 +133,7 @@ impl NotionPage {
     let external_link_views = self.get_external_link_notion_view();
     match &self.notion_file {
       NotionFile::Markdown { file_path, .. } => {
-        let mut file_paths = self.notion_file.upload_files();
+        let resource_paths = self.notion_file.upload_files();
         let md_importer = MDImporter::new(None);
         let content = fs::read_to_string(file_path).await?;
         let document_data = md_importer.import(&self.view_id, content)?;
@@ -149,12 +149,30 @@
           ))
         };
         let parent_path = file_path.parent().unwrap();
-        self.replace_link_views(&mut document, external_link_views);
-        self
-          .replace_resources(&mut document, &mut file_paths, parent_path, url_builder)
+        let valid_delta_resources = self
+          .replace_link_views(
+            parent_path,
+            &mut document,
+            &resource_paths,
+            external_link_views,
+            &url_builder,
+          )
+          .await;
+        let valid_block_resources = self
+          .replace_block_resources_recursively(
+            parent_path,
+            &mut document,
+            &resource_paths,
+            url_builder,
+          )
           .await;
-        let files = file_paths
+        let all_resources = valid_block_resources
+          .into_iter()
+          .chain(valid_delta_resources.into_iter())
+          .collect::<HashSet<_>>();
+
+        let files = all_resources
           .iter()
           .filter_map(|p| p.to_str().map(|s| s.to_string()))
           .collect();
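// Aside (not part of the patch): the two replacement passes can report the
// same file (an image may appear both as an inline delta and as an image
// block), so the patch funnels both lists through a HashSet before building
// the upload list. The element type below matches the PathBuf lists used
// above; the concrete paths are hypothetical.
use std::collections::HashSet;
use std::path::PathBuf;

fn main() {
  let valid_block_resources = vec![PathBuf::from("img/a.png"), PathBuf::from("img/b.png")];
  let valid_delta_resources = vec![PathBuf::from("img/a.png")];

  let all_resources: HashSet<PathBuf> = valid_block_resources
    .into_iter()
    .chain(valid_delta_resources)
    .collect();

  assert_eq!(all_resources.len(), 2); // "img/a.png" is counted once
}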
@@ -173,40 +191,67 @@
     }
   }
 
-  async fn replace_resources<'a, B, O>(
+  async fn replace_block_resources_recursively<'a, B, O>(
     &'a self,
-    document: &mut Document,
-    resources: &mut Vec<PathBuf>,
     parent_path: &Path,
+    document: &mut Document,
+    resources: &[PathBuf],
     file_url_builder: B,
-  ) where
+  ) -> Vec<PathBuf>
+  where
     B: Fn(&'a str, PathBuf) -> O + Send + Sync + 'a,
     O: Future<Output = Option<String>> + Send + 'a,
   {
     let mut document_resources = HashSet::new();
     if let Some(page_id) = document.get_page_id() {
-      let block_ids = document.get_block_children_ids(&page_id);
-      for block_id in block_ids.iter() {
-        if let Some((block_type, mut block_data)) = document.get_block_data(block_id) {
-          if matches!(block_type, BlockType::Image) {
-            if let Some(image_url) = block_data
-              .get(URL_FIELD)
-              .and_then(|v| v.as_str())
-              .and_then(|s| percent_decode_str(s).decode_utf8().ok())
-            {
-              let full_image_url = parent_path.join(image_url.to_string());
-              let pos = resources.iter().position(|r| r == &full_image_url);
-              if let Some(pos) = pos {
-                if let Some(url) = file_url_builder(&self.view_id, full_image_url).await {
-                  document_resources.insert(resources.remove(pos));
-                  block_data.insert(URL_FIELD.to_string(), json!(url));
-                  if let Err(err) = document.update_block(block_id, block_data) {
-                    error!(
-                      "Failed to update block when trying to replace image. error:{:?}",
-                      err
-                    );
-                  }
-                }
+      // Start the recursive processing with the root block (page_id)
+      self
+        .process_block_and_children(
+          parent_path,
+          document,
+          &page_id,
+          resources,
+          &file_url_builder,
+          &mut document_resources,
+        )
+        .await;
+    }
+
+    document_resources.into_iter().collect()
+  }
+
+  #[async_recursion::async_recursion(?Send)]
+  async fn process_block_and_children<'a, B, O>(
+    &'a self,
+    parent_path: &Path,
+    document: &mut Document,
+    block_id: &str,
+    resources: &[PathBuf],
+    file_url_builder: &B,
+    document_resources: &mut HashSet<PathBuf>,
+  ) where
+    B: Fn(&'a str, PathBuf) -> O + Send + Sync + 'a,
+    O: Future<Output = Option<String>> + Send + 'a,
+  {
+    // Process the current block
+    if let Some((block_type, mut block_data)) = document.get_block_data(block_id) {
+      if matches!(block_type, BlockType::Image) {
+        if let Some(image_url) = block_data
+          .get(URL_FIELD)
+          .and_then(|v| v.as_str())
+          .and_then(|s| percent_decode_str(s).decode_utf8().ok())
+        {
+          let full_image_url = parent_path.join(image_url.to_string());
+          let pos = resources.iter().position(|r| r == &full_image_url);
+          if let Some(pos) = pos {
+            if let Some(url) = file_url_builder(&self.view_id, full_image_url).await {
+              document_resources.insert(resources[pos].clone());
+              block_data.insert(URL_FIELD.to_string(), json!(url));
+              if let Err(err) = document.update_block(block_id, block_data) {
+                error!(
+                  "Failed to update block when trying to replace image. error:{:?}",
+                  err
+                );
              }
            }
          }
@@ -214,84 +259,207 @@
       }
     }
 
-    *resources = document_resources.into_iter().collect();
+    // Recursively process each child block
+    let block_children_ids = document.get_block_children_ids(block_id);
+    for child_id in block_children_ids.iter() {
+      self
+        .process_block_and_children(
+          parent_path,
+          document,
+          child_id,
+          resources,
+          file_url_builder,
+          document_resources,
+        )
+        .await;
+    }
   }
 
-  fn replace_link_views(
-    &self,
+  async fn replace_link_views<'a, 'b, B, O>(
+    &'b self,
+    parent_path: &Path,
     document: &mut Document,
+    resources: &[PathBuf],
     external_link_views: HashMap<String, NotionPage>,
-  ) {
-    if let Some(page_id) = document.get_page_id() {
-      // Get all block children and process them
-      let block_ids = document.get_block_children_ids(&page_id);
-      for block_id in block_ids.iter() {
-        if let Some((block_type, deltas)) = document.get_block_delta(block_id) {
-          self.process_block_deltas(document, block_id, block_type, deltas, &external_link_views);
-        }
-      }
+    file_url_builder: &'a B,
+  ) -> Vec<PathBuf>
+  where
+    B: Fn(&'a str, PathBuf) -> O + Send + Sync + 'a,
+    O: Future<Output = Option<String>> + Send + 'a,
+    'b: 'a,
+  {
+    let mut delta_resources = HashSet::new();
+    if let Some(first_page_id) = document.get_page_id() {
+      // Start the recursive processing with the first page's root block
+      self
+        .process_link_views_recursive(
+          parent_path,
+          document,
+          &first_page_id,
+          resources,
+          &external_link_views,
+          file_url_builder,
+          &mut delta_resources,
+        )
+        .await;
     }
+
+    delta_resources.into_iter().collect()
   }
 
-  /// Process the deltas for a block, looking for links to replace
-  fn process_block_deltas(
-    &self,
+  #[async_recursion::async_recursion]
+  #[allow(clippy::too_many_arguments)]
+  async fn process_link_views_recursive<'a, 'b, B, O>(
+    &'b self,
+    parent_path: &Path,
     document: &mut Document,
     block_id: &str,
-    block_type: BlockType,
-    deltas: Vec<TextDelta>,
     external_link_views: &HashMap<String, NotionPage>,
-  ) {
-    for delta in deltas {
-      if let TextDelta::Inserted(_v, Some(attrs)) = delta {
-        if let Some(href_value) = attrs.get("href") {
-          let delta_str = href_value.to_string();
-          if let Ok(links) = extract_external_links(&delta_str) {
-            self.replace_links_in_deltas(document, block_id, &links, external_link_views);
-            self.update_paragraph_block(
-              document,
-              block_id,
-              &block_type,
-              &links,
-              external_link_views,
-            );
-          }
+    resources: &[PathBuf],
+    file_url_builder: &'a B,
+    delta_resources: &mut HashSet<PathBuf>,
+  ) where
+    B: Fn(&'a str, PathBuf) -> O + Send + Sync + 'a,
+    O: Future<Output = Option<String>> + Send + 'a,
+    'b: 'a,
+  {
+    if let Some((block_type, deltas)) = document.get_block_delta(block_id) {
+      let block_deltas_result = self
+        .process_block_deltas(
+          parent_path,
+          document,
+          block_id,
+          block_type,
+          deltas,
+          external_link_views,
+          resources,
+          file_url_builder,
+        )
+        .await;
+
+      // Collect resources from this block
+      delta_resources.extend(block_deltas_result.delta_resources);
+
+      // Update the document deltas if new ones were created
+      if let Some(new_deltas) = block_deltas_result.new_deltas {
+        if let Err(err) = document.set_block_delta(block_id, new_deltas) {
+          error!(
+            "Failed to set block delta when trying to replace ref link. error: {:?}",
+            err
+          );
+        }
+      }
+
+      // Insert new image blocks if any were created
+      for image_url in block_deltas_result.new_delta_image_blocks {
+        let new_block_id = collab_document::document_data::generate_id();
+        let image_block = create_image_block(&new_block_id, image_url, block_id);
+        if let Err(err) = document.insert_block(image_block, Some(block_id.to_string())) {
+          error!(
+            "Failed to insert image block when trying to replace delta link. error: {:?}",
+            err
+          );
        }
      }
    }
+
+    // Recursively process each child block
+    let block_children_ids = document.get_block_children_ids(block_id);
+    for child_id in block_children_ids.iter() {
+      self
+        .process_link_views_recursive(
+          parent_path,
+          document,
+          child_id,
+          resources,
+          external_link_views,
+          file_url_builder,
+          delta_resources,
+        )
+        .await;
+    }
   }
 
-  /// Replace links in the deltas with the corresponding view IDs
-  fn replace_links_in_deltas(
-    &self,
+  /// Process the deltas for a block, looking for links to replace
+  #[allow(clippy::too_many_arguments)]
+  async fn process_block_deltas<'a, 'b, B, O>(
+    &'b self,
+    parent_path: &Path,
     document: &mut Document,
     block_id: &str,
-    links: &[ExternalLink],
+    block_type: BlockType,
+    mut deltas: Vec<TextDelta>,
     external_link_views: &HashMap<String, NotionPage>,
-  ) {
-    let mut block_deltas = document
-      .get_block_delta(block_id)
-      .map(|t| t.1)
-      .unwrap_or_default();
-
-    for link in links {
-      if let Some(view) = external_link_views.get(&link.id) {
-        block_deltas.iter_mut().for_each(|d| {
-          if let TextDelta::Inserted(content, _) = d {
-            if content == &link.name {
-              *d = mention_block_delta(&view.view_id);
+    resources: &[PathBuf],
+    file_url_builder: &'a B,
+  ) -> ProcessBlockDeltaResult
+  where
+    B: Fn(&'a str, PathBuf) -> O + Send + Sync + 'a,
+    O: Future<Output = Option<String>> + Send + 'a,
+    'b: 'a,
+  {
+    let mut is_changed = false;
+    let mut new_delta_image_blocks = vec![];
+    let mut delta_resources = HashSet::new();
+    for delta in deltas.iter_mut() {
+      if let TextDelta::Inserted(v, attrs) = delta.clone() {
+        // If the attrs contain an href, try to replace it with the corresponding view id
+        if let Some(attrs) = &attrs {
+          if let Some(href_value) = attrs.get("href") {
+            let delta_str = href_value.to_string();
+
+            // Replace links in the deltas with the corresponding view IDs
+            if let Ok(links) = extract_external_links(&delta_str) {
+              for link in &links {
+                if let Some(view) = external_link_views.get(&link.id) {
+                  if v == link.name {
+                    is_changed = true;
+                    *delta = mention_block_delta(&view.view_id);
+                  }
+                }
+              }
+
+              self.update_paragraph_block(
+                document,
+                block_id,
+                &block_type,
+                &links,
+                external_link_views,
+              );
            }
          }
-        });
+        }
+
+        // Extract an image link from the inserted delta and replace it with the uploaded image URL
+        if let Some(delta_link) = extract_delta_link(&v) {
+          debug_assert!(attrs.is_none(), "attrs should be None for image link");
+          let full_image_url = parent_path.join(delta_link.link);
+          let pos = resources.iter().position(|r| r == &full_image_url);
+          if let Some(pos) = pos {
+            if let Some(url) = file_url_builder(&self.view_id, full_image_url).await {
+              delta_resources.insert(resources[pos].clone());
+
+              // Replace the inserted image link with an empty string
+              *delta = TextDelta::Inserted("".to_string(), None);
+              // Generate an image block for the given image URL
+              new_delta_image_blocks.push(url);
+            }
+          }
+        }
      }
    }
 
-    if let Err(err) = document.set_block_delta(block_id, block_deltas) {
-      error!(
-        "Failed to set block delta when trying to replace ref link. error: {:?}",
-        err
-      );
+    let mut result = ProcessBlockDeltaResult {
+      delta_resources: delta_resources.into_iter().collect(),
+      new_deltas: None,
+      new_delta_image_blocks,
+    };
+
+    if is_changed {
+      result.new_deltas = Some(deltas);
    }
+
+    result
   }
 
   /// Update the paragraph block if the last link points to an external view
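// Aside (not part of the patch): both replacement passes now share the same
// shape — handle the current block, then recurse into
// document.get_block_children_ids(block_id). A toy model of that traversal
// (Doc is a stand-in, not the collab-document type), showing the depth-first
// order in which blocks are visited:
use std::collections::HashMap;

struct Doc {
  children: HashMap<String, Vec<String>>,
}

fn visit(doc: &Doc, block_id: &str, order: &mut Vec<String>) {
  order.push(block_id.to_string()); // "process" the current block first
  for child in doc.children.get(block_id).into_iter().flatten() {
    visit(doc, child, order); // then recurse into each child
  }
}

fn main() {
  let mut children = HashMap::new();
  children.insert("page".to_string(), vec!["a".to_string(), "b".to_string()]);
  children.insert("a".to_string(), vec!["a1".to_string()]);
  let doc = Doc { children };

  let mut order = Vec::new();
  visit(&doc, "page", &mut order);
  assert_eq!(order, ["page", "a", "a1", "b"]); // nested blocks are no longer skipped
}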
@@ -518,6 +686,12 @@ pub async fn build_imported_collab_recursively<'a>(
   Box::pin(initial_stream.chain(child_stream))
 }
 
+pub struct ProcessBlockDeltaResult {
+  pub delta_resources: Vec<PathBuf>,
+  pub new_deltas: Option<Vec<TextDelta>>,
+  pub new_delta_image_blocks: Vec<String>,
+}
+
 #[derive(Debug, Default, Clone, Eq, PartialEq, Serialize)]
 pub struct ExternalLink {
   pub id: String,
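// Aside (not part of the patch): Notion writes image sources percent-encoded
// ("my%20image.png") while the files on disk carry the decoded names, which
// is why the code above runs percent_decode_str before joining onto
// parent_path. Standalone, using the percent-encoding crate this repo
// already depends on:
use percent_encoding::percent_decode_str;
use std::path::Path;

fn main() {
  let encoded = "cover%20photo.png";
  let decoded = percent_decode_str(encoded).decode_utf8().unwrap();
  assert_eq!(decoded, "cover photo.png");

  // Joining the decoded name mirrors parent_path.join(image_url.to_string()).
  let full = Path::new("/tmp/export/My Page").join(decoded.to_string());
  assert!(full.ends_with("cover photo.png"));
}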
diff --git a/collab-importer/src/notion/walk_dir.rs b/collab-importer/src/notion/walk_dir.rs
index 80e6b342b..392a73bef 100644
--- a/collab-importer/src/notion/walk_dir.rs
+++ b/collab-importer/src/notion/walk_dir.rs
@@ -1,4 +1,5 @@
 use crate::error::ImporterError;
+use std::fmt::Display;
 
 use fancy_regex::Regex;
 use markdown::mdast::Node;
@@ -7,7 +8,7 @@ use percent_encoding::percent_decode_str;
 
 use crate::notion::file::{process_row_md_content, NotionFile, Resource};
 use crate::notion::page::{ExternalLink, ExternalLinkType, ImportedRowDocument, NotionPage};
-use crate::notion::CSVRelation;
+use crate::notion::NotionExportContext;
 use crate::util::parse_csv;
 
 use std::fs;
@@ -22,7 +23,6 @@ pub(crate) fn get_file_size(path: &PathBuf) -> std::io::Result<u64> {
 }
 
 pub(crate) fn collect_entry_resources(
-  _workspace_id: &str,
   walk_path: &Path,
   relative_path: Option<&Path>,
 ) -> Vec<Resource> {
@@ -80,7 +80,7 @@ pub(crate) fn process_entry(
   workspace_id: &str,
   current_entry: &DirEntry,
   include_partial_csv: bool,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   // Skip macOS-specific files
   let entry_name = current_entry.file_name().to_str()?;
@@ -92,7 +92,7 @@
   let ext = get_file_extension(path, include_partial_csv);
   if ext.is_file() {
     // Check if there's a corresponding directory for this .md file and skip it if so
-    process_file(host, workspace_id, path, ext, csv_relation)
+    process_file(host, workspace_id, path, ext, notion_export)
   } else if path.is_dir() {
     // If the path is a directory, it should contain a file with the same name but with either a .md or .csv extension.
     // If no such file is found, the directory will be treated as a space.
@@ -113,7 +113,7 @@
       id,
       &md_file_path,
       include_partial_csv,
-      csv_relation,
+      notion_export,
     )
   } else if all_csv_file_path.exists() {
     process_csv_dir(
@@ -125,10 +125,10 @@
       parent_path,
       &all_csv_file_path,
       &csv_file_path,
-      csv_relation,
+      notion_export,
     )
   } else {
-    process_space_dir(host, workspace_id, name, id, path, csv_relation)
+    process_space_dir(host, workspace_id, name, id, path, notion_export)
   }
 } else {
   None
@@ -141,13 +141,13 @@ fn process_space_dir(
   name: String,
   id: Option<String>,
   path: &Path,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   let mut children = vec![];
   // Collect all child entries first, to sort by created time
   let entries: Vec<_> = walk_sub_dir(path);
   for sub_entry in entries {
-    if let Some(child_view) = process_entry(host, workspace_id, &sub_entry, false, csv_relation) {
+    if let Some(child_view) = process_entry(host, workspace_id, &sub_entry, false, notion_export) {
       children.push(child_view);
     }
   }
@@ -162,7 +162,7 @@
     host: host.to_string(),
     workspace_id: workspace_id.to_string(),
     is_dir: true,
-    csv_relation: csv_relation.clone(),
+    csv_relation: notion_export.csv_relation.clone(),
   })
 }
@@ -176,7 +176,7 @@ fn process_csv_dir(
   parent_path: &Path,
   all_csv_file_path: &PathBuf,
   csv_file_path: &PathBuf,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   let mut resources = vec![];
   let file_size = get_file_size(all_csv_file_path).ok()?;
@@ -185,7 +185,7 @@
   // To identify which CSV file contains these resources, we must check each row
   // to see if any paths match the resource path.
   // Currently, we do this in [filter_out_resources].
-  resources.extend(collect_entry_resources(workspace_id, parent_path, None));
+  resources.extend(collect_entry_resources(parent_path, None));
 
   let mut row_documents = vec![];
   // collect all sub entries whose entries are directory
@@ -194,7 +194,7 @@
     let csv_dir = parent_path.join(file_name);
     if csv_dir.exists() {
       for sub_entry in walk_sub_dir(&csv_dir) {
-        if let Some(mut page) = process_entry(host, workspace_id, &sub_entry, true, csv_relation) {
+        if let Some(mut page) = process_entry(host, workspace_id, &sub_entry, true, notion_export) {
           if page.children.iter().any(|c| c.notion_file.is_markdown()) {
             warn!("Only CSV file exist in the database row directory");
           }
@@ -216,7 +216,10 @@
           page.children.retain(|child| {
             if let Some(file_path) = child.notion_file.file_path() {
               if let Ok(file_name) = file_name_from_path(file_path) {
-                return csv_relation.get(&file_name.to_lowercase()).is_none();
+                return notion_export
+                  .csv_relation
+                  .get(&file_name.to_lowercase())
+                  .is_none();
               }
             }
             true
@@ -260,10 +263,12 @@
     host: host.to_string(),
     workspace_id: workspace_id.to_string(),
     is_dir: false,
-    csv_relation: csv_relation.clone(),
+    csv_relation: notion_export.csv_relation.clone(),
   };
 
-  csv_relation.set_page_by_path_buf(all_csv_file_path.clone(), page.clone());
+  notion_export
+    .csv_relation
+    .set_page_by_path_buf(all_csv_file_path.clone(), page.clone());
   Some(page)
 }
@@ -286,7 +291,7 @@ fn process_md_dir(
   id: Option<String>,
   md_file_path: &PathBuf,
   include_partial_csv: bool,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   let mut children = vec![];
   let external_links = get_md_links(md_file_path).unwrap_or_default();
@@ -300,18 +305,14 @@
         workspace_id,
         &sub_entry,
         include_partial_csv,
-        csv_relation,
+        notion_export,
       ) {
         children.push(child_view);
       }
 
       // When traversing the directory, resources like images and files
       // can be found within subdirectories of the current directory.
-      resources.extend(collect_entry_resources(
-        workspace_id,
-        sub_entry.path(),
-        None,
-      ));
+      resources.extend(collect_entry_resources(sub_entry.path(), None));
     }
   }
 
@@ -331,7 +332,7 @@
     host: host.to_string(),
     workspace_id: workspace_id.to_string(),
     is_dir: false,
-    csv_relation: csv_relation.clone(),
+    csv_relation: notion_export.csv_relation.clone(),
   })
 }
@@ -340,14 +341,14 @@ fn process_file(
   workspace_id: &str,
   path: &Path,
   ext: FileExtension,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   match ext {
     FileExtension::Unknown => None,
-    FileExtension::Markdown => process_md_file(host, workspace_id, path, csv_relation),
+    FileExtension::Markdown => process_md_file(host, workspace_id, path, notion_export),
     FileExtension::Csv {
       include_partial_csv,
-    } => process_csv_file(host, workspace_id, path, include_partial_csv, csv_relation),
+    } => process_csv_file(host, workspace_id, path, include_partial_csv, notion_export),
   }
 }
@@ -356,7 +357,7 @@ fn process_csv_file(
   workspace_id: &str,
   path: &Path,
   include_partial_csv: bool,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   let file_name = path.file_name()?.to_str()?;
   // Check if a folder exists with the same name as the CSV file, excluding the "_all.csv" suffix.
@@ -384,7 +385,7 @@
   // to see if any paths match the resource path.
   // Currently, we do this in [filter_out_resources].
   if let Some(parent) = path.parent() {
-    resources.extend(collect_entry_resources(workspace_id, parent, None));
+    resources.extend(collect_entry_resources(parent, None));
   }
 
   let file_path = path.to_path_buf();
@@ -407,7 +408,7 @@
     host: host.to_string(),
     workspace_id: workspace_id.to_string(),
     is_dir: false,
-    csv_relation: csv_relation.clone(),
+    csv_relation: notion_export.csv_relation.clone(),
   })
 }
@@ -415,7 +416,7 @@ fn process_md_file(
   host: &str,
   workspace_id: &str,
   path: &Path,
-  csv_relation: &CSVRelation,
+  notion_export: &NotionExportContext,
 ) -> Option<NotionPage> {
   if let Some(parent) = path.parent() {
     let file_stem = path.file_stem()?.to_str()?;
@@ -427,7 +428,7 @@
 
   // Process the file normally if it doesn't correspond to a directory
   let (name, id) = name_and_id_from_path(path).ok()?;
-  let notion_file = file_type_from_path(path)?;
+  let notion_file = notion_file_from_path(path, notion_export.no_subpages)?;
   let mut external_links = vec![];
   if notion_file.is_markdown() {
     external_links = get_md_links(path).unwrap_or_default();
@@ -447,7 +448,7 @@
     host: host.to_string(),
     workspace_id: workspace_id.to_string(),
     is_dir: false,
-    csv_relation: csv_relation.clone(),
+    csv_relation: notion_export.csv_relation.clone(),
   })
 }
@@ -540,6 +541,43 @@ pub(crate) fn extract_external_links(path_str: &str) -> Result<Vec<ExternalLink>, ImporterError> {
   Ok(result)
 }
 
+pub struct DeltaLink {
+  pub file_name: String,
+  pub link: String,
+  pub start: usize,
+  pub end: usize,
+}
+
+impl Display for DeltaLink {
+  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    write!(
+      f,
+      "DeltaLink {{ file_name: {}, link: {} }}",
+      self.file_name, self.link
+    )
+  }
+}
+
+pub fn extract_delta_link(input: &str) -> Option<DeltaLink> {
+  let re = Regex::new(r"!\[(.*?)\]\((.*?)\)").unwrap();
+
+  if let Some(captures) = re.captures(input).ok().flatten() {
+    let file_name = captures.get(1)?.as_str().to_string();
+    let link = captures.get(2)?.as_str().to_string();
+    let start = captures.get(0)?.start();
+    let end = captures.get(0)?.end();
+
+    return Some(DeltaLink {
+      file_name,
+      link,
+      start,
+      end,
+    });
+  }
+
+  None
+}
+
 enum FileExtension {
   Unknown,
   Markdown,
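// Aside (not part of the patch): a standalone check of the image-delta regex
// used by extract_delta_link above. Note that fancy_regex::Regex::captures
// returns Result<Option<Captures>>, which is why the patch calls
// .ok().flatten().
use fancy_regex::Regex;

fn main() {
  let re = Regex::new(r"!\[(.*?)\]\((.*?)\)").unwrap();
  let input = "![Cover photo](images/cover%20photo.png)";

  if let Some(caps) = re.captures(input).ok().flatten() {
    // Group 1 is the alt text, group 2 the link target.
    assert_eq!(caps.get(1).map(|m| m.as_str()), Some("Cover photo"));
    assert_eq!(caps.get(2).map(|m| m.as_str()), Some("images/cover%20photo.png"));
    // get(0) spans the whole match, giving the start/end offsets stored in DeltaLink.
    assert_eq!(caps.get(0).map(|m| (m.start(), m.end())), Some((0, input.len())));
  }
}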
@@ -590,16 +628,25 @@ fn name_and_id_from_path(path: &Path) -> Result<(String, Option<String>), ImporterError> {
 /// - If the file is a `.csv` and contains `_all`, it's considered a `CSV`.
 /// - Otherwise, if it's a `.csv`, it's considered a `CSVPart`.
 /// - `.md` files are classified as `Markdown`.
-fn file_type_from_path(path: &Path) -> Option<NotionFile> {
+fn notion_file_from_path(path: &Path, no_subpages: bool) -> Option<NotionFile> {
   let extension = path.extension()?.to_str()?;
   let file_size = get_file_size(&path.to_path_buf()).ok()?;
 
   match extension {
-    "md" => Some(NotionFile::Markdown {
-      file_path: path.to_path_buf(),
-      size: file_size,
-      resources: vec![],
-    }),
+    "md" => {
+      let mut resources = vec![];
+      if no_subpages {
+        if let Some(parent_path) = path.parent() {
+          resources = collect_entry_resources(parent_path, None);
+        }
+      }
+
+      Some(NotionFile::Markdown {
+        file_path: path.to_path_buf(),
+        size: file_size,
+        resources,
+      })
+    },
     "csv" => {
       let file_name = path.file_name()?.to_str()?;
       if file_name.contains("_all") {
@@ -629,6 +676,80 @@ pub(crate) fn file_name_from_path(path: &Path) -> Result<String, ImporterError> {
   .map(|s| s.to_string())
 }
 
+#[cfg(test)]
+mod extract_delta_link_tests {
+  use super::*;
+
+  #[test]
+  fn test_extract_info_valid_input() {
+    let input =
+      "![Dishes at Broken Spanish, in Downtown LA.](christine-siracusa-363257-unsplash.jpg)";
+    let result = extract_delta_link(input);
+    assert!(result.is_some());
+    let delta_link = result.unwrap();
+    assert_eq!(
+      delta_link.file_name,
+      "Dishes at Broken Spanish, in Downtown LA."
+    );
+    assert_eq!(delta_link.link, "christine-siracusa-363257-unsplash.jpg");
+    assert_eq!(delta_link.start, 0);
+    assert_eq!(delta_link.end, input.len());
+  }
+
+  #[test]
+  fn test_extract_info_no_alt_text() {
+    let input = "![](christine-siracusa-363257-unsplash.jpg)";
+    let result = extract_delta_link(input);
+    assert!(result.is_some());
+    let delta_link = result.unwrap();
+    assert_eq!(delta_link.file_name, "");
+    assert_eq!(delta_link.link, "christine-siracusa-363257-unsplash.jpg");
+    assert_eq!(delta_link.start, 0);
+    assert_eq!(delta_link.end, input.len());
+  }
+
+  #[test]
+  fn test_extract_info_no_link() {
+    let input = "![Dishes at Broken Spanish, in Downtown LA.]()";
+    let result = extract_delta_link(input);
+    assert!(result.is_some());
+    let delta_link = result.unwrap();
+    assert_eq!(
+      delta_link.file_name,
+      "Dishes at Broken Spanish, in Downtown LA."
+    );
+    assert_eq!(delta_link.link, "");
+    assert_eq!(delta_link.start, 0);
+    assert_eq!(delta_link.end, input.len());
+  }
+ ); + assert_eq!(delta_link.link, ""); + assert_eq!(delta_link.start, 0); + assert_eq!(delta_link.end, input.len()); + } + + #[test] + fn test_extract_info_invalid_format() { + let input = "This is not an image markdown"; + let result = extract_delta_link(input); + assert!(result.is_none()); + } + + #[test] + fn test_extract_info_partial_markdown() { + let input = "![Only alt text]"; + let result = extract_delta_link(input); + assert!(result.is_none()); + } + + #[test] + fn test_extract_info_special_characters() { + let input = "![Special chars & symbols: @#$%^&*()!](file-with-special-chars-@#$%.jpg)"; + let result = extract_delta_link(input); + assert!(result.is_some()); + let delta_link = result.unwrap(); + assert_eq!(delta_link.file_name, "Special chars & symbols: @#$%^&*()!"); + assert_eq!(delta_link.link, "file-with-special-chars-@#$%.jpg"); + assert_eq!(delta_link.start, 0); + assert_eq!(delta_link.end, input.len()); + } +} + #[cfg(test)] mod name_and_id_from_path_tests { use super::*; diff --git a/collab-importer/tests/asset/blog_post_no_subpages.zip b/collab-importer/tests/asset/blog_post_no_subpages.zip new file mode 100644 index 000000000..a3ec66dcd Binary files /dev/null and b/collab-importer/tests/asset/blog_post_no_subpages.zip differ diff --git a/collab-importer/tests/notion_test/import_test.rs b/collab-importer/tests/notion_test/import_test.rs index b013755d9..5048c5de9 100644 --- a/collab-importer/tests/notion_test/import_test.rs +++ b/collab-importer/tests/notion_test/import_test.rs @@ -22,16 +22,16 @@ use collab_importer::notion::page::NotionPage; use collab_importer::notion::{is_csv_contained_cached, CSVContentCache, NotionImporter}; use collab_importer::util::{parse_csv, CSVRow}; +use collab_document::document::Document; use futures::stream::StreamExt; use percent_encoding::percent_decode_str; use std::collections::{HashMap, HashSet}; use std::env::temp_dir; use std::path::PathBuf; use std::sync::Arc; - // #[tokio::test] // async fn import_test() { -// let (_cleaner, file_path) = sync_unzip_asset("appflowy_io_full").await.unwrap(); +// let (_cleaner, file_path) = sync_unzip_asset("d-1").await.unwrap(); // let importer = NotionImporter::new( // 1, // &file_path, @@ -40,6 +40,15 @@ use std::sync::Arc; // ) // .unwrap(); // let info = importer.import().await.unwrap(); +// let view = info.views()[0].as_document().await.unwrap(); +// let document = view.0; +// let block_ids = document.get_all_block_ids(); +// for block_id in block_ids { +// if let Some((block_type, block_data)) = document.get_block_data(&block_id) { +// println!("{:?} {:?}", block_type, block_data); +// } +// } +// // let nested_view = info.build_nested_views().await; // println!("{}", nested_view); // } @@ -255,6 +264,20 @@ async fn import_blog_post_document_test() { assert_blog_post(host, &info.workspace_id, root_view).await; } +#[tokio::test] +async fn import_blog_post_no_subpages_test() { + setup_log(); + let workspace_id = uuid::Uuid::new_v4(); + let (_cleaner, file_path) = sync_unzip_asset("blog_post_no_subpages").await.unwrap(); + let host = "http://test.appflowy.cloud"; + let importer = NotionImporter::new(1, &file_path, workspace_id, host.to_string()).unwrap(); + let info = importer.import().await.unwrap(); + assert_eq!(info.name, "blog_post_no_subpages"); + + let root_view = &info.views()[0]; + assert_blog_post(host, &info.workspace_id, root_view).await; +} + #[tokio::test] async fn import_project_test() { let workspace_id = uuid::Uuid::new_v4(); @@ -433,16 +456,29 @@ async fn 
@@ -433,16 +456,29 @@ async fn assert_blog_post(host: &str, workspace_id: &str, root_view: &NotionPage) {
   let (document, _) = root_view.as_document().await.unwrap();
 
   let page_block_id = document.get_page_id().unwrap();
-  let block_ids = document.get_block_children_ids(&page_block_id);
-  for block_id in block_ids.iter() {
-    if let Some((block_type, block_data)) = document.get_block_data(block_id) {
-      if matches!(block_type, BlockType::Image) {
-        let url = block_data.get(URL_FIELD).unwrap().as_str().unwrap();
+  process_all_blocks_to_find_expected_urls(&document, &page_block_id, &mut expected_urls);
+  assert!(expected_urls.is_empty());
+}
+
+fn process_all_blocks_to_find_expected_urls(
+  document: &Document,
+  block_id: &str,
+  expected_urls: &mut Vec<String>,
+) {
+  // Process the current block
+  if let Some((block_type, block_data)) = document.get_block_data(block_id) {
+    if matches!(block_type, BlockType::Image) {
+      if let Some(url) = block_data.get(URL_FIELD).and_then(|value| value.as_str()) {
         expected_urls.retain(|allowed_url| !url.contains(allowed_url));
      }
    }
  }
-  assert!(expected_urls.is_empty());
+
+  // Recursively process each child block
+  let block_children_ids = document.get_block_children_ids(block_id);
+  for child_id in block_children_ids.iter() {
+    process_all_blocks_to_find_expected_urls(document, child_id, expected_urls);
+  }
 }
 
 async fn check_project_and_task_document(