MarkdownDB: extracting links plus refactor (#472)
Closes #473

## Motivation

As we plan to replace Contentlayer with our MarkdownDB package, we need a solid base on which we can build a full-fledged "content-layer" solution. The code in its current state is hard to reason about and manipulate, and is rather weakly typed, so any new functionality built on top of it would only become harder to engineer, adjust and test. This PR aims to make the development of new features easier and (thanks to stronger typing) less error-prone. Note that there is still quite a lot of room for improvement - TODO comments were left in the code and will probably be resolved after further discussions on the implementation design.

## Changes

- `markdowndb.ts` cleaned up and refactored. Most notably:
  - table creation (and deletion) logic moved out to `schema.ts` and bound to the classes corresponding to the tables (see below)
  - raw batch inserts on the knex db wrapped in class methods, to prevent SQL errors in the first place instead of debugging them after they occur
  - `indexFolder` bound to the `MarkdownDB` class as a method
  - removed duplicated db initialization -> there is now only one `init` method on `MarkdownDB`
  - replaced the ambiguous `query` method with `getFiles`
  - commented out querying by folder for now, as I'm not sure it's even needed, at least not in its current form
  - added `getFileByUrl` and `getFileById` methods to retrieve a single file from the db (e.g. for finding its backlinks)
  - added a `getLinks` method which supports querying backward or forward links
  - removed `types.ts` - the classes in `schema.ts` can also be used as types. Other types, e.g. `DatabaseQuery`, moved directly into the method signatures, as they are strictly bound to them and there is no other use case that would justify a separate file
  - cleaned up and extended the tests of the `markdowndb` lib and split them into smaller unit tests grouped by tested functionality
- `schema.ts` created with four classes: `File`, `Link`, `Tag` and `FileTag`
  - each class is a representation of a table (`files`, `links`, `tags`, `file_tags`): it describes the fields (columns) of that table and has methods for creating the table, deleting it and batch-inserting data into it
  - it is the single source of truth about the tables and the data types stored in them
- `utils` folder created with `recursiveWalkDir.ts`, `parseFile.ts` and `extractWikiLinks.ts`, each with its own separate test suite
- stronger typing
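For orientation, here is a minimal usage sketch of the refactored API, pieced together from the updated `markdowndb.spec.ts` in this diff. The relative import path and the exact shape of the returned records (`file_path`, `url_path`, `_id`) are taken from those tests; anything beyond that is an assumption rather than a guarantee of this PR.

```ts
// Sketch only: mirrors how the updated spec exercises the API.
import { MarkdownDB } from "./markdowndb"; // import path as used inside the package; a published entry point may differ

async function main() {
  // The config object is handed to knex; the tests use sqlite3.
  const mddb = new MarkdownDB({
    client: "sqlite3",
    connection: { filename: "markdown.db" },
  });

  // A single init step creates the files, tags, file_tags and links tables.
  await mddb.init();

  // Walk a content folder and index every file into the db.
  await mddb.indexFolder({ folderPath: "packages/markdowndb/__mocks__/content" });

  // getFiles replaces the old `query` method; filters can be combined.
  const posts = await mddb.getFiles({
    filetypes: ["blog"],
    tags: ["economy"],
    extensions: ["md", "mdx"],
  });
  console.log(posts.map((f) => f.file_path));

  // Look up a single file by its url path (or by id) ...
  const file = await mddb.getFileByUrl("blog/blog2");

  // ... and query its forward or backward links.
  const forwardLinks = await mddb.getLinks({ fileId: file._id });
  const backlinks = await mddb.getLinks({ fileId: file._id, direction: "backward" });
  console.log(forwardLinks.length, backlinks.length);

  // Tear down the underlying knex connection.
  mddb._destroyDb();
}

main();
```

The same calls appear in the test suite further down this diff, grouped into `describe` blocks per functionality.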
Showing 30 changed files with 922 additions and 377 deletions.
@@ -3,3 +3,5 @@ title: My Test Mdx Blog 1
---

# My Test Mdx Blog 1

[[Blog2]]
@@ -1,7 +1,10 @@
---
title: My Test Mdx Blog 2
type: blog
tags:
- economy
---

# My Test Mdx Blog 2

[[../Blog0]]
@@ -1,8 +1,11 @@
---
title: My Test Mdx Blog 2
type: blog
tags:
- politics
- economy
---

# My Test Mdx Blog 2

[[/blog/Blog1]]
@@ -4,3 +4,5 @@ type: blog
---

# My Test Mdx Blog 1

[[blog/Blog2]]
@@ -0,0 +1,6 @@
---
type: news
---

# Document Title
@@ -0,0 +1,7 @@
---
type: news
tags:
- culture
---

# Document Title
@@ -0,0 +1,8 @@
---
type: news
tags:
- sports
---

# Document Title
@@ -1,16 +1,14 @@
/* eslint-disable */
export default {
import type { JestConfigWithTsJest } from "ts-jest";

const jestConfig: JestConfigWithTsJest = {
  displayName: "markdowndb",
  preset: "../../jest.preset.js",
  globals: {
    "ts-jest": {
      tsconfig: "<rootDir>/tsconfig.spec.json",
    },
  },
  testEnvironment: "node",
  transform: {
    "^.+\\.[tj]sx?$": "ts-jest",
    "^.+\\.[tj]s?$": "ts-jest",
  },
  moduleFileExtensions: ["ts", "tsx", "js", "jsx"],
  coverageDirectory: "../../coverage/packages/markdowndb",
  transformIgnorePatterns: ["<rootDir>/node_modules/(?!remark-parse)"],
  moduleFileExtensions: ["ts", "js"],
};

export default jestConfig;
@@ -1,101 +1,186 @@
import knex from "knex";
import * as markdowndb from "./markdowndb";
import * as fs from "fs";
// import knex from "knex";
import { MarkdownDB } from "./markdowndb";
import { Table } from "./schema";
import { recursiveWalkDir } from "../utils";

/**
 * @jest-environment node
 */
describe("MarkdownDB lib", () => {
  it("builds a new MarkdownDB", async () => {
    const pathToFixturesFolder = "packages/markdowndb/__mocks__/content";

// TODO test index files
describe("MarkdownDB", () => {
  const pathToContentFixture = "packages/markdowndb/__mocks__/content";
  let mddb: MarkdownDB;

  beforeAll(async () => {
    const dbConfig = {
      client: "sqlite3",
      connection: {
        filename: "markdown.db",
      },
    };

    const db = knex(dbConfig);
    mddb = new MarkdownDB(dbConfig);
    await mddb.init();
    await mddb.indexFolder({ folderPath: pathToContentFixture });
  });

    // Index folder
    await markdowndb.indexFolder("markdown.db", pathToFixturesFolder);
  afterAll(async () => {
    // TODO why we have to call this twice?
    mddb.db.destroy();
    mddb._destroyDb();
  });

    // Ensure there is a "files" table
    expect(await db.schema.hasTable("files")).toBe(true);
  describe("correct startup and indexing", () => {
    test("adds tables to db", async () => {
      expect(await mddb.db.schema.hasTable(Table.Files)).toBe(true);
      expect(await mddb.db.schema.hasTable(Table.Tags)).toBe(true);
      expect(await mddb.db.schema.hasTable(Table.FileTags)).toBe(true);
      expect(await mddb.db.schema.hasTable(Table.Links)).toBe(true);
    });

    // Ensure there is a "tags" table
    expect(await db.schema.hasTable("tags")).toBe(true);
    test("indexes all files in folder", async () => {
      const allFiles = recursiveWalkDir(pathToContentFixture);
      const allIndexedFiles = await mddb.getFiles();
      expect(allIndexedFiles).toHaveLength(allFiles.length);
    });
  });

    // Ensure there is a "file_tags" table
    expect(await db.schema.hasTable("file_tags")).toBe(true);
  describe("querying files", () => {
    test("can get all files", async () => {
      const dbFiles = await mddb.getFiles();
      const dbFilesPaths = dbFiles.map((f) => f.file_path);
      const allFilesPaths = recursiveWalkDir(pathToContentFixture);

    const myMdDb = markdowndb.Database("markdown.db");
      expect(dbFiles).toHaveLength(allFilesPaths.length);
      dbFilesPaths.forEach((p) => {
        expect(allFilesPaths).toContain(p);
      });
    });

    // Check if all files were indexed
    const allFiles = walk(pathToFixturesFolder);
    const allFilesCount = allFiles.length;
    test("can query by file type", async () => {
      const dbFiles = await mddb.getFiles({ filetypes: ["blog"] });
      const dbFilesPaths = dbFiles.map((f) => f.file_path);

    const allIndexedFiles = await myMdDb.query();
    expect(allIndexedFiles.length).toBe(allFilesCount);
      const expectedPaths = [
        `${pathToContentFixture}/blog/blog3.mdx`,
        `${pathToContentFixture}/blog/blog2.mdx`,
        `${pathToContentFixture}/blog0.mdx`,
      ];

    // Check if querying by folder is working
    const blogFiles = allFiles.filter((p) =>
      p.startsWith(`${pathToFixturesFolder}/blog/`)
    );
    const blogFilesCount = blogFiles.length;
      expect(dbFilesPaths).toHaveLength(expectedPaths.length);
      dbFilesPaths.forEach((p) => {
        expect(expectedPaths).toContain(p);
      });
    });

    const indexedBlogFiles = await myMdDb.query({
      folder: "blog",
      filetypes: ["md", "mdx"],
    test("can query by tags", async () => {
      const dbFiles = await mddb.getFiles({ tags: ["economy", "politics"] });
      const dbFilesPaths = dbFiles.map((f) => f.file_path);

      const expectedPaths = [
        `${pathToContentFixture}/blog/blog3.mdx`,
        `${pathToContentFixture}/blog/blog2.mdx`,
      ];

      expect(dbFilesPaths).toHaveLength(expectedPaths.length);
      dbFilesPaths.forEach((p) => {
        expect(expectedPaths).toContain(p);
      });
    });

    test("can query by extensions", async () => {
      const dbFiles = await mddb.getFiles({ extensions: ["png"] });
      const dbFilesPaths = dbFiles.map((f) => f.file_path);

      const expectedPaths = [
        `${pathToContentFixture}/assets/datopian-logo.png`,
      ];

      expect(dbFilesPaths).toHaveLength(expectedPaths.length);
      dbFilesPaths.forEach((p) => {
        expect(expectedPaths).toContain(p);
      });
    });

    expect(indexedBlogFiles.length).toBe(blogFilesCount);
    test("can query by tags AND filetypes AND extensions", async () => {
      const dbFiles = await mddb.getFiles({
        tags: ["culture"],
        filetypes: ["news"],
        extensions: ["md", "mdx"],
      });
      const dbFilesPaths = dbFiles.map((f) => f.file_path);
      const expectedPaths = [`${pathToContentFixture}/news/news1.mdx`];

      expect(dbFilesPaths).toHaveLength(expectedPaths.length);
      dbFilesPaths.forEach((p) => {
        expect(expectedPaths).toContain(p);
      });
    });

    // Check if querying by tags is working
    const economyFiles = await myMdDb.query({ tags: ["economy"] });
    const economyFilesPaths = economyFiles.map((f) => f._path);
    test("can find file by url path", async () => {
      const dbFile = await mddb.getFileByUrl("blog/blog2");
      expect(dbFile.url_path).toBe("blog/blog2");
    });

    const expectedPaths = [
      `${pathToFixturesFolder}/blog/blog3.mdx`,
      `${pathToFixturesFolder}/blog/blog2.mdx`,
    ];
    test("can find file by id", async () => {
      const dbFile = await mddb.getFileByUrl("blog/blog2");
      const dbFileById = await mddb.getFileById(dbFile._id);
      expect(dbFileById.url_path).toBe("blog/blog2");
    });
  });

    expect(economyFilesPaths).toHaveLength(expectedPaths.length);
    economyFilesPaths.forEach((p) => {
      expect(expectedPaths).toContain(p);
  describe("getTags", () => {
    // TODO the list of tags in db should be defined in some config file instead of being extracted from all the files
    test("can get all tags", async () => {
      const dbTags = await mddb.getTags();
      const extectedTags = [
        { name: "economy" },
        { name: "politics" },
        { name: "sports" },
        { name: "culture" },
      ];

      expect(dbTags).toHaveLength(extectedTags.length);
      dbTags.forEach((t) => {
        expect(extectedTags).toContainEqual(t);
      });
    });
  });

    // Check if querying by filetypes is working
    const pngFiles = await myMdDb.query({ filetypes: ["png"] });
    expect(
      pngFiles
        .map((f) => f.filetype)
        // Filter out duplicates
        .filter((v, i, s) => {
          return s.indexOf(v) === i;
        })
    ).toEqual(["png"]);

    db.destroy();
    myMdDb._destroyDb();
  describe("getLinks", () => {
    test("can get all forward links of a file", async () => {
      const fromFile = await mddb.getFileByUrl("blog/blog2");
      const toFile = await mddb.getFileByUrl("blog0");

      const forwardLinks = await mddb.getLinks({
        fileId: fromFile._id,
      });
      expect(forwardLinks.length).toBe(1);
      expect(forwardLinks[0].to).toBe(toFile._id);
    });

    test("can get all backward links of a file", async () => {
      const toFile = await mddb.getFileByUrl("blog/blog2");
      const fromFile1 = await mddb.getFileByUrl("blog0");
      const fromFile2 = await mddb.getFileByUrl("blog/blog1");

      const backwardLinks = await mddb.getLinks({
        fileId: toFile._id,
        direction: "backward",
      });
      const backwardLinksFileIds = backwardLinks.map((l) => l.from);
      expect(backwardLinksFileIds).toHaveLength(2);
      expect(backwardLinksFileIds).toContain(fromFile1._id);
      expect(backwardLinksFileIds).toContain(fromFile2._id);
    });
  });
});

const walk = (dir: fs.PathLike) => {
  let files: string[] = [];
  for (const item of fs.readdirSync(dir)) {
    if (!(dir as string).endsWith("/")) {
      dir += "/";
    }

    const fullPath = dir + item;
    const stat = fs.statSync(fullPath);

    if (stat.isDirectory()) {
      files = files.concat(walk(fullPath));
    } else if (stat.isFile()) {
      files.push(fullPath);
    }
  }
  return files;
};
test("can query by folder", async () => {
  const allBlogFiles = recursiveWalkDir(`${pathToContentFixture}/blog`);
  const indexedBlogFiles = await mddb.getFiles({
    folder: "blog",
  });
  expect(indexedBlogFiles.length).toBe(allBlogFiles.length);
});
});