jupyter-naas · FlorentLvr · May 7, 2024 · May 2, 2024 · May 2, 2024 · May 2, 2024
diff --git a/__pipeline__.ipynb b/__pipeline__.ipynb
@@ -105,6 +105,9 @@
     "- `abi_spreadsheet`: Google Sheets spreadsheet URL\n",
     "- `sheet_entity`: Entity sheet name that stores all your personal data\n",
     "- `long_lived_token`: Long lived token to connect to Naas API.\n",
+    "- `limit_linkedin`: Maximum number of calls to a specific LinkedIn endpoint (profile top card, company info).\n",
+    "- `limit_llm`: Maximum number of LLM calls (Naas API) used to enrich specific data from Growth OBT (PEOPLE, ORGANIZATIONS, CONTACTS).\n",
+    "- `cron`: Cron expression defining the schedule of your notebook (see https://crontab.guru/).\n",
     "\n",
     "**Outputs**\n",
     "- `datalake_dir`: Datalake directory"
@@ -125,6 +128,9 @@
     "long_lived_token = naas.secret.get('NAAS_API_TOKEN')\n",
     "entity_start = 0\n",
     "entity_end = None\n",
+    "limit_linkedin = 30\n",
+    "limit_llm = 50\n",
+    "cron = \"0 12 * * *\"\n",
     "\n",
     "# Outputs\n",
     "datalake_dir = naas.secret.get(\"ABI_DATALAKE_DIR\")"
@@ -228,9 +234,9 @@
     "            if index == 0:\n",
     "                naas.secret.add(x, value)\n",
     "    if li_at == \"NA\":\n",
-    "        li_at = naas.secret.get(\"li_at\")\n",
+    "        li_at = naas.secret.get(\"li_at\") or naas.secret.get(\"LINKEDIN_LI_AT\")\n",
     "    if JSESSIONID == \"NA\":\n",
-    "        JSESSIONID = naas.secret.get(\"JSESSIONID\").replace('\"', '')\n",
+    "        JSESSIONID = naas.secret.get(\"JSESSIONID\") or naas.secret.get(\"LINKEDIN_JSESSIONID\")\n",
     "    print(\"- LinkedIn li_at:\", li_at)\n",
     "    print(\"- LinkedIn JSESSIONID:\", JSESSIONID)\n",
     "    \n",
@@ -281,7 +287,9 @@
     "            \"linkedin_url\": linkedin_url,\n",
     "            \"li_at\": li_at,\n",
     "            \"JSESSIONID\": JSESSIONID,\n",
-    "            \"entity_dir\": entity_dir\n",
+    "            \"entity_dir\": entity_dir,\n",
+    "            \"limit_linkedin\": limit_linkedin,\n",
+    "            \"limit_llm\": limit_llm\n",
     "        }\n",
     "    )\n",
     "    sales = NotebookStep(\n",
@@ -361,7 +369,6 @@
    "outputs": [],
    "source": [
     "# Schedule pipeline\n",
-    "cron = \"0 8 * * *\"\n",
     "print(\"⏰ Scheduler:\", cron)\n",
     "naas.scheduler.add(cron=cron)\n",
     "# naas.scheduler.delete()"

diff --git a/models/growth-engine/__pipeline__.ipynb b/models/growth-engine/__pipeline__.ipynb
@@ -129,8 +129,8 @@
     "entity_name = pload(os.path.join(datalake_dir, \"entities\", entity_index), \"entity_name\") or \"\"\n",
     "emails = pload(os.path.join(datalake_dir, \"entities\", entity_index), \"emails\") or []\n",
     "linkedin_url = pload(os.path.join(datalake_dir, \"entities\", entity_index), \"linkedin_url\") or \"\"\n",
-    "li_at = naas.secret.get(\"LINKEDIN_LI_AT\")\n",
-    "JSESSIONID = naas.secret.get(\"LINKEDIN_JSESSIONID\")\n",
+    "li_at = naas.secret.get(\"li_at\") or naas.secret.get(\"LINKEDIN_LI_AT\")\n",
+    "JSESSIONID = naas.secret.get(\"JSESSIONID\") or naas.secret.get(\"LINKEDIN_JSESSIONID\")\n",
     "entity_dir = pload(os.path.join(datalake_dir, \"entities\", entity_index), \"entity_dir\") or \"\"\n",
     "\n",
     "# Google Sheets\n",
@@ -145,7 +145,9 @@
     "# Engine\n",
     "engine_name = \"growth-engine\"\n",
     "assistant_name = \"Growth Assistant\"\n",
-    "custom_pipeline_path = os.path.join(naas_data_product.MODELS_PATH, engine_name, \"custom\", \"__pipeline__.ipynb\")"
+    "custom_pipeline_path = os.path.join(naas_data_product.MODELS_PATH, engine_name, \"custom\", \"__pipeline__.ipynb\")\n",
+    "limit_linkedin = 0\n",
+    "limit_llm = 5"
    ]
   },
   {
@@ -269,6 +271,8 @@
     "        \"sheet_people\": sheet_people,\n",
     "        \"output_dir\": engine_dir,\n",
     "        \"datalake_dir\": datalake_dir,\n",
+    "        \"limit_linkedin\": limit_linkedin,\n",
+    "        \"limit_llm\": limit_llm\n",
     "    }\n",
     ")\n",
     "organizations = NotebookStep(\n",
@@ -284,6 +288,7 @@
     "        \"sheet_people\": sheet_people,\n",
     "        \"output_dir\": engine_dir,\n",
     "        \"datalake_dir\": datalake_dir,\n",
+    "        \"limit_linkedin\": limit_linkedin,\n",
     "    }\n",
     ")\n",
     "contacts = NotebookStep(\n",
@@ -298,6 +303,7 @@
     "        \"linkedin_url\": linkedin_url,\n",
     "        \"datalake_dir\": datalake_dir,\n",
     "        \"output_dir\": engine_dir,\n",
+    "        \"limit_llm\": limit_llm\n",
     "    }\n",
     ")\n",
     "analytics = NotebookStep(\n",

diff --git a/models/growth-engine/core/domain/Growth_Create_contacts_view.ipynb b/models/growth-engine/core/domain/Growth_Create_contacts_view.ipynb
@@ -113,6 +113,7 @@
     "- `file_organizations`: Name of the file storing organization data to be retrieved.\n",
     "- `spreadsheet_url`: Google Sheets spreadsheet URL.\n",
     "- `sheet_contacts`: Google Sheets sheet name storing leads profiles.\n",
+    "- `limit_llm`: Maximum number of LLM calls (Naas API).\n",
     "\n",
     "**Outputs**\n",
     "- `output_dir`: Output directory to save file to.\n",
@@ -142,6 +143,7 @@
     "sheet_contacts = \"CONTACTS\"\n",
     "api_key = os.environ.get(\"NAAS_API_TOKEN\") or naas.secret.get('NAAS_API_TOKEN')\n",
     "linkedin_url = pload(os.path.join(naas_data_product.OUTPUTS_PATH, \"entities\", entity_index), \"linkedin_url\")\n",
+    "limit_llm = 50\n",
     "\n",
     "# Outputs\n",
     "output_dir = os.path.join(entity_dir, \"growth-engine\", date.today().isoformat())\n",
@@ -179,12 +181,11 @@
    "outputs": [],
    "source": [
     "df_init = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_contacts)\n",
-    "df_init = pload(os.path.join(entity_dir, \"growth-engine\", \"2024-04-12\"), file_people)\n",
     "if not isinstance(df_init, pd.DataFrame):\n",
     "    df_init = pd.DataFrame()\n",
     "    messaging_options = {}\n",
     "else:\n",
-    "    messaging_options = get_dict_from_df(df_init, \"MESSAGING_OPTIONS\", \"PEOPLE_PROFILE_URL\", \"messaging_options\", output_dir)\n",
+    "    messaging_options = get_dict_from_df(df_init, \"MESSAGING_OPTIONS\", \"PEOPLE_PROFILE_URL\", \"messaging_options\", output_dir, force_update=True)\n",
     "print(\"- Contacts (init):\", len(df_init))\n",
     "df_init.head(3)"
    ]
@@ -363,7 +364,7 @@
     "    # Loop on people (LinkedIn URL)\n",
     "    for p in people:\n",
     "        tmp_df = df.copy()\n",
-    "        tmp_df = tmp_df[tmp_df[\"PROFILE_URL\"] == p].reset_index(drop=True)\n",
+    "        tmp_df = tmp_df[tmp_df[\"PROFILE_URL\"] == p][:10].reset_index(drop=True)\n",
     "        interactions = []\n",
     "        for row in tmp_df.itertuples():\n",
     "            # Append interaction text to create notes\n",
@@ -384,6 +385,9 @@
     "    entity_name,\n",
     "    limit_llm=50\n",
     "):\n",
+    "    # Init\n",
+    "    limit_llm = int(limit_llm)\n",
+    "    \n",
     "    # Entity\n",
     "    df_entity = df_interactions.copy()\n",
     "    df_entity = df_entity[[\"ENTITY\", \"PROFILE_URL\"]].drop_duplicates(\"PROFILE_URL\", keep=\"first\")\n",
@@ -520,7 +524,8 @@
     "    api_key,\n",
     "    prompt_sales_messagings,\n",
     "    messaging_options,\n",
-    "    entity_name\n",
+    "    entity_name,\n",
+    "    limit_llm=limit_llm\n",
     ")\n",
     "df_contacts.head(1)"
    ]