In [1]:
from json import dumps as json_dumps
import marimo as mo
import plotly.express as px
import polars as pl
from bs4 import BeautifulSoup
from bs4.element import Tag
from httpx import get as http_get
from pydantic import BaseModel
from pydantic_ai.agent import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider
In [2]:
import plotly.offline

# Embed plotly.js so figures render inline in the notebook.
plotly.offline.init_notebook_mode()
In [3]:
_SHAARLI_URL = "https://sebsauvage.net/links/"
class Link(BaseModel):
id: str
title: str
href: str
description: str
tags: list[str]
def extract_link(entry_node: Tag) -> Link:
link_id = entry_node.select_one("a")["id"]
container_node = entry_node.select_one(".linkcontainer")
link_node = container_node.select_one(".linktitle a")
link_href = link_node["href"]
if link_href == "?" + link_id:
link_href = ""
link_title = link_node.text.strip()
if description_node := container_node.select_one(".linkdescription"):
link_description = description_node.text.strip()
else:
link_description = ""
link_tags = [
tag.text.strip() for tag in container_node.select(".linktaglist a")
]
link = Link(
id=link_id,
title=link_title,
href=link_href,
description=link_description,
tags=link_tags,
)
return link
def extract_links(soup: BeautifulSoup) -> list[Link]:
return [extract_link(entry_node) for entry_node in soup.select("#linklist li")]
_r = http_get(_SHAARLI_URL)
_r.raise_for_status()
_soup = BeautifulSoup(_r.text, "html5lib")
_links = extract_links(_soup)
links_df = pl.DataFrame(_links)
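The CSS selectors above assume Shaarli's default markup: each entry is an `li` under `#linklist`, with an anchor carrying the link id, a `.linkcontainer` wrapping the `.linktitle` link, an optional `.linkdescription`, and `.linktaglist` anchors for the tags. The sketch below builds a minimal synthetic entry in that assumed shape (hypothetical data, not content from sebsauvage.net), reusing `extract_links` and the imports from the cell above, to check the extraction without hitting the live site.

```python
# Synthetic entry mimicking the Shaarli markup the selectors assume (hypothetical data).
_SAMPLE_HTML = """
<ul id="linklist">
  <li>
    <a id="abc123"></a>
    <div class="linkcontainer">
      <span class="linktitle"><a href="https://example.org/">Example title</a></span>
      <div class="linkdescription">Example description.</div>
      <div class="linktaglist"><a>web</a> <a>demo</a></div>
    </div>
  </li>
</ul>
"""
print(extract_links(BeautifulSoup(_SAMPLE_HTML, "html5lib")))
# [Link(id='abc123', title='Example title', href='https://example.org/',
#       description='Example description.', tags=['web', 'demo'])]
```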
In [4]:
links_df
Out[4]:
shape: (50, 5)
| id | title | href | description | tags |
|---|---|---|---|---|
| str | str | str | str | list[str] |
| "jYuT9A" | "</> htmx - high power tools fo… | "https://htmx.org/" | "Concevoir des pages dynamiques… | ["web"] |
| "b186Aw" | "Dialogue typique avec une IA" | "" | "👨🦰 J'ai besoin de la réponse … | [] |
| "6gW5Dw" | "Tiling terminal multiplexers f… | "https://www.theregister.com/20… | "Si vous cherchez des multiplex… | ["Linux", "LogicielLibre"] |
| "3TTqyQ" | "Économisez 100 % sur SteamWorl… | "https://store.steampowered.com… | "Jeu gratuit sur Steam." | ["gratuit", "jeux"] |
| "HVNakw" | "En Alsace, le data center de M… | "https://www.rue89strasbourg.co… | "Sachant que l'Alsace c'est moi… | [] |
| … | … | … | … | … |
| "lWStRA" | "Piratage sportif : l’Arcom sur… | "https://next.ink/188426/pirata… | "Ah bah oui après avoir censuré… | ["copyreich"] |
| "7bTyYA" | "WhatsApp Introduces Ads in Its… | "https://www.nytimes.com/2025/0… | "La publicité arrive dans Whats… | ["merdification"] |
| "FZk9Tg" | "Refonder la solidarité queer :… | "https://friction-magazine.fr/r… | "Excellent article sur les diss… | ["LGBT"] |
| "_EKv9g" | "Recrutements automatisés : déf… | "https://framablog.org/2025/06/… | "(En complément de https://sebs… | [] |
| "AfBPtA" | "KAIST Succeeds in Real-Time Ca… | "https://news.kaist.ac.kr/newse… | "Un capteur de CO² qui ne néces… | ["technologie"] |
In [5]:
_OLLAMA_URL = "http://edemaruh:11434/v1"
_LLM_MODEL = "ollama:qwen3:8b"
# LLM_MODEL = "google-gla:gemini-2.5-flash"
_SYSTEM_PROMPT = """\
You are a data classifier. for each content item, determines whether it talks about AI/LLM (True) or not (False).
"""
_provider_name, _model_name = _LLM_MODEL.split(":", maxsplit=1)
if _provider_name == "ollama":
_provider = OpenAIProvider(
base_url=_OLLAMA_URL,
)
_model = OpenAIModel(
model_name=_model_name,
provider=_provider,
)
else:
_model = _LLM_MODEL
_agent = Agent(model=_model, output_type=bool, system_prompt=_SYSTEM_PROMPT)
classified_links_df = links_df.with_columns(
pl.Series(
"is_ai",
[
(await _agent.run(json_dumps(link))).output
for link in mo.status.progress_bar(
links_df.iter_rows(), total=links_df.height
)
],
)
)
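Top-level `await` works here because marimo runs cells on an asyncio event loop. In a plain synchronous script, the same agent could be driven with `Agent.run_sync`; a minimal sketch, reusing the `_agent` defined above with a made-up payload:

```python
# Minimal sketch, assuming the _agent defined above; the item below is a hypothetical example.
sample_item = {"title": "Dialogue typique avec une IA", "description": "", "tags": []}
result = _agent.run_sync(json_dumps(sample_item))
print(result.output)  # True or False, as decided by the model
```

Note that `links_df.iter_rows()` yields plain tuples, so the JSON sent to the model contains values without column names; `iter_rows(named=True)` would pass dicts with the field names included, which may give the classifier more context.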
In [6]:
classified_links_df
Out[6]:
shape: (50, 6)
| id | title | href | description | tags | is_ai |
|---|---|---|---|---|---|
| str | str | str | str | list[str] | bool |
| "jYuT9A" | "</> htmx - high power tools fo… | "https://htmx.org/" | "Concevoir des pages dynamiques… | ["web"] | false |
| "b186Aw" | "Dialogue typique avec une IA" | "" | "👨🦰 J'ai besoin de la réponse … | [] | true |
| "6gW5Dw" | "Tiling terminal multiplexers f… | "https://www.theregister.com/20… | "Si vous cherchez des multiplex… | ["Linux", "LogicielLibre"] | false |
| "3TTqyQ" | "Économisez 100 % sur SteamWorl… | "https://store.steampowered.com… | "Jeu gratuit sur Steam." | ["gratuit", "jeux"] | false |
| "HVNakw" | "En Alsace, le data center de M… | "https://www.rue89strasbourg.co… | "Sachant que l'Alsace c'est moi… | [] | true |
| … | … | … | … | … | … |
| "lWStRA" | "Piratage sportif : l’Arcom sur… | "https://next.ink/188426/pirata… | "Ah bah oui après avoir censuré… | ["copyreich"] | false |
| "7bTyYA" | "WhatsApp Introduces Ads in Its… | "https://www.nytimes.com/2025/0… | "La publicité arrive dans Whats… | ["merdification"] | false |
| "FZk9Tg" | "Refonder la solidarité queer :… | "https://friction-magazine.fr/r… | "Excellent article sur les diss… | ["LGBT"] | false |
| "_EKv9g" | "Recrutements automatisés : déf… | "https://framablog.org/2025/06/… | "(En complément de https://sebs… | [] | false |
| "AfBPtA" | "KAIST Succeeds in Real-Time Ca… | "https://news.kaist.ac.kr/newse… | "Un capteur de CO² qui ne néces… | ["technologie"] | false |
In [7]:
px.pie(
    classified_links_df,
    names="is_ai",
    title="AI/LLM Links Classification",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hole=0.3,
).show()
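Beyond the overall proportions shown in the pie chart, the flagged rows can be inspected directly with a Polars filter; a small sketch:

```python
# List the links the model classified as AI/LLM-related and compute their share.
ai_links_df = classified_links_df.filter(pl.col("is_ai"))
print(ai_links_df.select("title", "href"))
print(f"{ai_links_df.height / classified_links_df.height:.0%} of the links mention AI/LLM")
```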